/******************************************************************************

       File: mtregex.c
    Package:
Description: thread-safe regular expressions

-------------------------------------------------------------------------------
Copyright (c) 1986 by University of Toronto.
Written by Henry Spencer.  Not derived from licensed software.

Permission is granted to anyone to use this software for any
purpose on any computer system, and to redistribute it freely,
subject to the following restrictions:

1. The author is not responsible for the consequences of use of
   this software, no matter how awful, even if they arise
   from defects in it.

2. The origin of this software must not be misrepresented, either
   by explicit claim or by omission.

3. Altered versions must be plainly marked as such, and must not
   be misrepresented as being the original software.

Beware that some of this code is subtly aware of the way operator
precedence is structured in regular expressions.  Serious changes in
regular-expression syntax might require a total rethink.
-------------------------------------------------------------------------------

Cleaned up by Alex Meyer (a_l_e_x at i-n-k-t-o-m-i dot c_o_m) 5/5/98
1. Consolidated into one header and one source file
2. Converted to ANSI C
3. Removed globals to make thread-safe
4. Normalized error returns (got rid of regerror)
5. Removed C++ reserved words

$Id$

******************************************************************************/

static const char rcsid[] = "$Id$";


#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "mtregex.h"

/* The "internal use only" fields in regexp.h are present to pass info from
   compile to execute that permits the execute phase to run lots faster on
   simple cases.  They are:

   regstart     char that must begin a match; '\0' if none obvious
   reganch      is the match anchored (at beginning-of-line only)?
   regmust      string (pointer into program) that match must include, or NULL
   regmlen      length of regmust string

   Regstart and reganch permit very fast decisions on suitable starting points
   for a match, cutting down the work a lot.  Regmust permits fast rejection
   of lines that cannot possibly match.  The regmust tests are costly enough
   that regcomp() supplies a regmust only if the r.e. contains something
   potentially expensive (at present, the only such thing detected is   or +
   at the start of the r.e., which can involve a lot of backup).  Regmlen is
   supplied because the test in regexec() needs it and regcomp() is computing
   it anyway. */

/* Structure for regexp "program".  This is essentially a linear encoding
   of a nondeterministic finite-state machine (aka syntax charts or
   "railroad normal form" in parsing technology).  Each node is an opcode
   plus a "next" pointer, possibly plus an operand.  "Next" pointers of
   all nodes except BRANCH implement concatenation; a "next" pointer with
   a BRANCH on both ends of it is connecting two alternatives.  (Here we
   have one of the subtle syntax dependencies:  an individual BRANCH (as
   opposed to a collection of them) is never concatenated with anything
   because of operator precedence.)  The operand of some types of node is
   a literal string; for others, it is a node leading into a sub-FSM.  In
   particular, the operand of a BRANCH node is the first node of the branch.
   (NB this is *not* a tree structure:  the tail of the branch connects
   to the thing following the set of BRANCHes.)  The opcodes are: */

/* definition    number    opnd?  meaning */
#define END      0      /* no     End of program */
#define BOL      1      /* no     Match "" at beginning of line */
#define EOL      2      /* no     Match "" at end of line */
#define ANY      3      /* no     Match any one character */
#define ANYOF    4      /* str    Match any character in this string */
#define ANYBUT   5      /* str    Match any character not in this string */
#define BRANCH   6      /* node   Match this alternative, or the next... */
#define BACK     7      /* no     Match "", "next" ptr points backward */
#define EXACTLY  8      /* str    Match this string */
#define NOTHING  9      /* no     Match empty string */
#define STAR     10     /* node   Match this (simple) thing 0 or more times */
#define PLUS     11     /* node   Match this (simple) thing 1 or more times */
#define OPEN     20     /* no     Mark this point in input as start of #n */
                        /*        OPEN+1 is number 1, etc */
#define CLOSE    30     /* no     Analogous to OPEN */

#if ((OPEN + NSUBEXP) > CLOSE)
# error NSUBEXP too big
#endif

/* Opcode notes:

   BRANCH       The set of branches constituting a single choice are hooked
                together with their "next" pointers, since precedence prevents
                anything being concatenated to any individual branch.  The
                "next" pointer of the last BRANCH in a choice points to the
                thing following the whole choice.  This is also where the
                final "next" pointer of each individual branch points; each
                branch starts with the operand node of a BRANCH node.

   BACK         Normal "next" pointers all implicitly point forward; BACK
                exists to make loop structures possible.

   STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
                BRANCH structures using BACK.  Simple cases (one character
                per match) are implemented with STAR and PLUS for speed
                and to minimize recursive plunges.

   OPEN,CLOSE   ...are numbered at compile time. */

/* A node is one char of opcode followed by two chars of "next" pointer.
   "Next" pointers are stored as two 8-bit pieces, high order first.  The
   value is a positive offset from the opcode of the node containing it.
   An operand, if any, simply follows the node.  (Note that much of the
   code generation knows about this implicit relationship.)

   Using two bytes for the "next" pointer is vast overkill for most things,
   but allows patterns to get big without disasters. */

#define OP(p)       (*(p))
#define NEXT(p)     (((unsigned char *) p)[1] << 8 | ((unsigned char *) p)[2])
#define OPERAND(p)  ((p) + 3)

/* The first byte of the regexp internal "program" is actually this magic
   number; the start node begins in the second byte. */
#define MAGIC   0234

/* Utility definitions */
#define UCHARAT(p)  ((int)*(unsigned char *)(p))

#define ISMULT(c)  ((c) == '*' || (c) == '+' || (c) == '?')
#define META       "^$.[()|?+*\\"

/* Flags to be passed up and down */
#define HASWIDTH  0x0001  /* Known never to match null string */
#define SIMPLE    0x0002  /* Simple enough to be STAR/PLUS operand */
#define SPSTART   0x0004  /* Starts with * or + */
#define WORST     0x0000  /* Worst case */


/* compilation context */
typedef struct {
  const char  *parse;  /* Input-scan pointer */
  int          npar;   /* () count */
  char        *code;   /* Code-emit pointer; NOEMIT means don't */
  size_t       size;   /* Code size */
  const char  *errmsg;
} cCtxT;

#define NOEMIT ((char *) ~0)


/* execution context */
typedef struct {
  const char   *input;   /* String-input pointer */
  const char   *bol;     /* Beginning of input, for ^ check */
  const char  **startp;  /* Pointer to startp array */
  const char  **endp;    /* Ditto for endp */
  const char  *errmsg;
} xCtxT;

/******************************************************************************

STATIC COMPILATION FUNCTIONS

******************************************************************************/

static char *reg(cCtxT *cCtx, int paren, int *flagp);


/* regc - emit (if appropriate) a byte of code */
static void
regc(cCtxT *cCtx, char b)
{
  if (cCtx->code != NOEMIT)
    *cCtx->code++ = b;
  else
    cCtx->size++;
}


/* regnode - emit a node */
static char *                   /* Location */
regnode(cCtxT *cCtx, char op)
{
  char *ret;
  char *ptr;

  ret = cCtx->code;
  if (ret == NOEMIT)
    cCtx->size += 3;
  else {
    ptr = ret;
    *ptr++ = op;
    *ptr++ = '\0';          /* Null "next" pointer */
    *ptr++ = '\0';
    cCtx->code = ptr;
  }

  return ret;
}


/* reginsert - insert an operator in front of already-emitted operand

   Means relocating the operand */
static void
reginsert(cCtxT *cCtx, char op, char *opnd)
{
  char *p;
  char *src;
  char *dst;
  char *place;

  p = cCtx->code;
  if (p == NOEMIT)
    cCtx->size += 3;
  else {
    src = p;
    dst = cCtx->code + 3;
    cCtx->code = dst;
    while (src > opnd)
      *--dst = *--src;

    place = opnd;           /* Op node, where operand used to be */
    *place++ = op;
    *place++ = '\0';
    *place++ = '\0';
  }
}


/* regatom - the lowest level

   Optimization:  gobbles an entire sequence of ordinary characters so that
   it can turn them into a single node, which is smaller to store and
   faster to run.  Backslashed characters are exceptions, each becoming a
   separate node; the code is simpler that way and it's not worth fixing. */
static char *
regatom(cCtxT *cCtx, int *flagp)
{
  char *ret;
  int flags;

  *flagp = WORST;         /* Tentatively */

  switch (*cCtx->parse++) {
  case '^':
    ret = regnode(cCtx, BOL);
    break;
  case '$':
    ret = regnode(cCtx, EOL);
    break;
  case '.':
    ret = regnode(cCtx, ANY);
    *flagp |= HASWIDTH|SIMPLE;
    break;
  case '[':
    {
      int cclass;
      int cclassend;

      if (*cCtx->parse == '^') {   /* Complement of range */
        ret = regnode(cCtx, ANYBUT);
        cCtx->parse++;
      }
      else
        ret = regnode(cCtx, ANYOF);
      if (*cCtx->parse == ']' || *cCtx->parse == '-')
        regc(cCtx, *cCtx->parse++);
      while (*cCtx->parse != '\0' && *cCtx->parse != ']') {
        if (*cCtx->parse == '-') {
          cCtx->parse++;
          if (*cCtx->parse == ']' || *cCtx->parse == '\0')
            regc(cCtx, '-');
          else {
            cclass = UCHARAT(cCtx->parse-2)+1;
            cclassend = UCHARAT(cCtx->parse);
            if (cclass > cclassend+1) {
              cCtx->errmsg = "invalid [] range";
              return NULL;
            }
            for (; cclass <= cclassend; cclass++)
              regc(cCtx, cclass);
            cCtx->parse++;
          }
        }
        else
          regc(cCtx, *cCtx->parse++);
      }
      regc(cCtx, '\0');
      if (*cCtx->parse != ']') {
        cCtx->errmsg = "unmatched []";
        return NULL;
      }
      cCtx->parse++;
      *flagp |= HASWIDTH|SIMPLE;
      break;
    }
  case '(':
    ret = reg(cCtx, 1, &flags);
    if (ret == NULL)
      return NULL;
    *flagp |= flags&(HASWIDTH|SPSTART);
    break;
  case '\0':
  case '|':
  case ')':
    cCtx->errmsg = "internal urp";   /* Supposed to be caught earlier */
    return NULL;
    break;
  case '?':
  case '+':
  case '*':
    cCtx->errmsg = "?+* follows nothing";
    return NULL;
    break;
  case '\\':
    if (*cCtx->parse == '\0') {
      cCtx->errmsg = "trailing \\";
      return NULL;
    }
    ret = regnode(cCtx, EXACTLY);
    regc(cCtx, *cCtx->parse++);
    regc(cCtx, '\0');
    *flagp |= HASWIDTH|SIMPLE;
    break;
  default:
    {
      int len;
      char ender;
      
      cCtx->parse--;
      len = strcspn(cCtx->parse, META);
      if (len <= 0) {
        cCtx->errmsg = "internal disaster";
        return NULL;
      }
      ender = *(cCtx->parse+len);
      if (len > 1 && ISMULT(ender))
        len--;          /* Back off clear of ?+* operand */
      *flagp |= HASWIDTH;
      if (len == 1)
        *flagp |= SIMPLE;
      ret = regnode(cCtx, EXACTLY);
      while (len > 0) {
        regc(cCtx, *cCtx->parse++);
        len--;
      }
      regc(cCtx, '\0');
      break;
    }
  }

  return ret;
}


/* regnext - dig the "next" pointer out of a node */
static const char *
regnext(const char *p)
{
  int offset;

  if (p == NOEMIT)
    return NULL;

  offset = NEXT(p);
  if (offset == 0)
    return NULL;

  if (OP(p) == BACK)
    return (p-offset);
  else
    return (p+offset);
}


/* regtail - set the next-pointer at the end of a node chain */
static void
regtail(char *p, char *val)
{
  char *scan;
  char *temp;
  int offset;

  if (p == NOEMIT)
    return;

  /* Find last node */
  scan = p;
  for (;;) {
    temp = (char *) regnext(scan); /* !!! cast away const */
    if (temp == NULL)
      break;
    scan = temp;
  }

  if (OP(scan) == BACK)
    offset = scan - val;
  else
    offset = val - scan;
  *(scan+1) = (offset>>8)&0377;
  *(scan+2) = offset&0377;
}


/* regoptail - regtail on operand of first argument; nop if operandless */
static void
regoptail(char *p, char *val)
{
  /* "Operandless" and "op != BRANCH" are synonymous in practice */
  if (p == NULL || p == NOEMIT || OP(p) != BRANCH)
    return;
  regtail(OPERAND(p), val);
}


/* regpiece - something followed by possible [*+?]

   Note that the branching code sequences used for ? and the general cases
   of * and + are somewhat optimized:  they use the same NOTHING node as
   both the endmarker for their branch list and the body of the last branch.
   It might seem that this node could be dispensed with entirely, but the
   endmarker role is not redundant. */
static char *
regpiece(cCtxT *cCtx, int *flagp)
{
  char *ret;
  char op;
  char *next;
  int flags;

  ret = regatom(cCtx, &flags);
  if (ret == NULL)
    return NULL;

  op = *cCtx->parse;
  if (!ISMULT(op)) {
    *flagp = flags;
    return ret;
  }

  if (!(flags&HASWIDTH) && op != '?') {
    cCtx->errmsg = "*+ operand could be empty";
    return NULL;
  }
  *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);

  if (op == '*' && (flags&SIMPLE))
    reginsert(cCtx, STAR, ret);
  else if (op == '*') {
    /* Emit x* as (x&|), where & means "self" */
    reginsert(cCtx, BRANCH, ret);                   /* Either x */
    regoptail(ret, regnode(cCtx, BACK));      /* and loop */
    regoptail(ret, ret);                      /* back */
    regtail(ret, regnode(cCtx, BRANCH));      /* or */
    regtail(ret, regnode(cCtx, NOTHING));     /* null */
  }
  else if (op == '+' && (flags&SIMPLE))
    reginsert(cCtx, PLUS, ret);
  else if (op == '+') {
    /* Emit x+ as x(&|), where & means "self" */
    next = regnode(cCtx, BRANCH);                   /* Either */
    regtail(ret, next);
    regtail(regnode(cCtx, BACK), ret);        /* loop back */
    regtail(next, regnode(cCtx, BRANCH));     /* or */
    regtail(ret, regnode(cCtx, NOTHING));     /* null */
  }
  else if (op == '?') {
    /* Emit x? as (x|) */
    reginsert(cCtx, BRANCH, ret);                   /* Either x */
    regtail(ret, regnode(cCtx, BRANCH));      /* or */
    next = regnode(cCtx, NOTHING);                  /* null */
    regtail(ret, next);
    regoptail(ret, next);
  }
  cCtx->parse++;
  if (ISMULT(*cCtx->parse)) {
    cCtx->errmsg = "nested *?+";
    return NULL;
  }

  return ret;
}


/* regbranch - one alternative of an | operator

   Implements the concatenation operator */
static char *
regbranch(cCtxT *cCtx, int *flagp)
{
  char *ret;
  char *chain;
  char *latest;
  int flags;

  *flagp = WORST;         /* Tentatively */

  ret = regnode(cCtx, BRANCH);
  chain = NULL;
  while (*cCtx->parse != '\0' && *cCtx->parse != '|' && *cCtx->parse != ')') {
    latest = regpiece(cCtx, &flags);
    if (latest == NULL)
      return NULL;
    *flagp |= flags&HASWIDTH;
    if (chain == NULL)      /* First piece */
      *flagp |= flags&SPSTART;
    else
      regtail(chain, latest);
    chain = latest;
  }
  if (chain == NULL)      /* Loop ran zero times */
    regnode(cCtx, NOTHING);

  return ret;
}


/* reg - regular expression, i.e. main body or parenthesized thing

   Caller must absorb opening parenthesis.

   Combining parenthesis handling with the base level of regular expression
   is a trifle forced, but the need to tie the tails of the branches to what
   follows makes it hard to avoid. */
static char *
reg(cCtxT  *cCtx,
    int     paren,  /* Parenthesized? */
    int    *flagp)
{
  char *ret;
  char *br;
  char *ender;
  int parno  = 0;  /* init to avoid warning */
  int flags;

  *flagp = HASWIDTH;      /* Tentatively */

  /* Make an OPEN node, if parenthesized */
  if (paren) {
    if (cCtx->npar >= NSUBEXP) {
      cCtx->errmsg = "too many ()";
      return NULL;
    }
    parno = cCtx->npar;
    cCtx->npar++;
    ret = regnode(cCtx, OPEN+parno);
  }
  else
    ret = NULL;

  /* Pick up the branches, linking them together */
  br = regbranch(cCtx, &flags);
  if (br == NULL)
    return NULL;
  if (ret != NULL)
    regtail(ret, br); /* OPEN -> first */
  else
    ret = br;
  if (!(flags&HASWIDTH))
    *flagp &= ~HASWIDTH;
  *flagp |= flags&SPSTART;
  while (*cCtx->parse == '|') {
    cCtx->parse++;
    br = regbranch(cCtx, &flags);
    if (br == NULL)
      return NULL;
    regtail(ret, br); /* BRANCH -> BRANCH */
    if (!(flags&HASWIDTH))
      *flagp &= ~HASWIDTH;
    *flagp |= flags&SPSTART;
  }

  /* Make a closing node, and hook it on the end */
  ender = regnode(cCtx, (paren) ? CLOSE+parno : END);     
  regtail(ret, ender);

  /* Hook the tails of the branches to the closing node */
  for (br = ret; br != NULL; br = (char *) regnext(br))/* !!! cast off const */
    regoptail(br, ender);

  /* Check for proper termination */
  if (paren && *cCtx->parse++ != ')') {
    cCtx->errmsg = "unmatched ()";
    return NULL;
  }
  else if (!paren && *cCtx->parse != '\0') {
    if (*cCtx->parse == ')') {
      cCtx->errmsg = "unmatched ()";
      return NULL;
    }
    else {
      cCtx->errmsg = "junk on end";    /* "Can't happen" */
      return NULL;
    }
  }

  return ret;
}

/******************************************************************************

STATIC MATCHING FUNCTIONS

******************************************************************************/

/* regrepeat - repeatedly match something simple, report how many */
static int
regrepeat(xCtxT *xCtx, const char *p)
{
  int count = 0;
  const char *scan;
  const char *opnd;

  scan = xCtx->input;
  opnd = OPERAND(p);
  switch (OP(p)) {
  case ANY:
    count = strlen(scan);
    scan += count;
    break;
  case EXACTLY:
    while (*opnd == *scan) {
      count++;
      scan++;
    }
    break;
  case ANYOF:
    while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
      count++;
      scan++;
    }
    break;
  case ANYBUT:
    while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
      count++;
      scan++;
    }
    break;
  default:                /* Oh dear.  Called inappropriately */
    xCtx->errmsg = "internal foulup";
    count = 0;      /* Best compromise */
    break;
  }
  xCtx->input = scan;

  return count;
}


/* regmatch - main matching routine

   Conceptually the strategy is simple:  check to see whether the current
   node matches, call self recursively to see whether the rest matches,
   and then act accordingly.  In practice we make some effort to avoid
   recursion, in particular by going through "ordinary" nodes (that don't
   need to know whether the rest of the match failed) by a loop instead of
   by recursion. */
static int
regmatch(xCtxT *xCtx, const char *prog)
{
  const char  *scan;     /* Current node */
  const char  *next;     /* Next node */

  scan = prog;
  while (scan != NULL) {
    next = regnext(scan);

    switch (OP(scan)) {
    case BOL:
      if (xCtx->input != xCtx->bol)
        return MTREGEX_FAIL;
      break;
    case EOL:
      if (*xCtx->input != '\0')
        return MTREGEX_FAIL;
      break;
    case ANY:
      if (*xCtx->input == '\0')
        return MTREGEX_FAIL;
      xCtx->input++;
      break;
    case EXACTLY:
      {
        size_t len;
        const char *opnd;

        opnd = OPERAND(scan);
        /* Inline the first character, for speed */
        if (*opnd != *xCtx->input)
          return MTREGEX_FAIL;
        len = strlen(opnd);
        if (len > 1 && strncmp(opnd, xCtx->input, len) != 0)
          return MTREGEX_FAIL;
        xCtx->input += len;
        break;
      }
    case ANYOF:
      if (*xCtx->input == '\0' || strchr(OPERAND(scan), *xCtx->input) == NULL)
        return MTREGEX_FAIL;
      xCtx->input++;
      break;
    case ANYBUT:
      if (*xCtx->input == '\0' || strchr(OPERAND(scan), *xCtx->input) != NULL)
        return MTREGEX_FAIL;
      xCtx->input++;
      break;
    case NOTHING:
      break;
    case BACK:
      break;
    case OPEN+1:
    case OPEN+2:
    case OPEN+3:
    case OPEN+4:
    case OPEN+5:
    case OPEN+6:
    case OPEN+7:
    case OPEN+8:
    case OPEN+9:
      {
        int no;
        const char *save;
      
        no = OP(scan) - OPEN;
        save = xCtx->input;

        if (regmatch(xCtx, next)) {
          /* Don't set startp if some later invocation of the same parentheses
             already has */
          if (xCtx->startp[no] == NULL)
            xCtx->startp[no] = save;
          return MTREGEX_SUCCESS;
        }
        else
          return MTREGEX_FAIL;
        break;
      }
    case CLOSE+1:
    case CLOSE+2:
    case CLOSE+3:
    case CLOSE+4:
    case CLOSE+5:
    case CLOSE+6:
    case CLOSE+7:
    case CLOSE+8:
    case CLOSE+9:
      {
        int no;
        const char *save;

        no = OP(scan) - CLOSE;
        save = xCtx->input;

        if (regmatch(xCtx, next)) {
          /* Don't set endp if some later invocation of the same parentheses
             already has */
          if (xCtx->endp[no] == NULL)
            xCtx->endp[no] = save;
          return MTREGEX_SUCCESS;
        }
        else
          return MTREGEX_FAIL;
        break;
      }
    case BRANCH:
      {
        const char *save;

        if (OP(next) != BRANCH)   /* No choice */
          next = OPERAND(scan);   /* Avoid recursion */
        else {
          do {
            save = xCtx->input;
            if (regmatch(xCtx, OPERAND(scan)))
              return MTREGEX_SUCCESS;
            xCtx->input = save;
            scan = regnext(scan);
          } while (scan != NULL && OP(scan) == BRANCH);
          return MTREGEX_FAIL;
        }
        break;
      }
    case STAR:
    case PLUS:
      {
        char nextch;
        int no;
        const char *save;
        int min;

        /* Lookahead to avoid useless match attempts
           when we know what character comes next */
        nextch = '\0';
        if (OP(next) == EXACTLY)
          nextch = *OPERAND(next);
        min = (OP(scan) == STAR) ? 0 : 1;
        save = xCtx->input;
        no = regrepeat(xCtx, OPERAND(scan));
        while (no >= min) {
          /* If it could work, try it */
          if (nextch == '\0' || *xCtx->input == nextch)
            if (regmatch(xCtx, next))
              return MTREGEX_SUCCESS;
          /* Couldn't or didn't -- back up */
          no--;
          xCtx->input = save + no;
        }
        return MTREGEX_FAIL;
        break;
      }
    case END:
      return MTREGEX_SUCCESS;
      break;
    default:
      xCtx->errmsg = "memory corruption";
      return MTREGEX_FAIL;
      break;
    }

    scan = next;
  }

  /* We get here only if there's trouble -- normally "case END" is
     the terminating point */
  xCtx->errmsg = "corrupted pointers";
  return MTREGEX_FAIL;
}


/* regtry - try match at specific point */
static int
regtry(xCtxT           *xCtx,
       const MtRegexT  *prog,
       MtRegexMatchT   *matches,
       const char      *string)
{
  int           i;
  const char  **sp;
  const char  **ep;

  xCtx->input = string;
  xCtx->startp = matches->startp;
  xCtx->endp = matches->endp;

  sp = matches->startp;
  ep = matches->endp;
  for (i = NSUBEXP; i > 0; i--) {
    *sp++ = NULL;
    *ep++ = NULL;
  }
  if (regmatch(xCtx, prog->program + 1)) {
    matches->startp[0] = string;
    matches->endp[0] = xCtx->input;
    return MTREGEX_SUCCESS;
  }
  else
    return MTREGEX_FAIL;
}

/******************************************************************************

PUBLIC FUNCTIONS

******************************************************************************/

/* regcomp - compile a regular expression into internal code

   We can't allocate space until we know how big the compiled form will be,
   but we can't compile it (and thus know how big it is) until we've got a
   place to put the code.  So we cheat:  we compile it twice, once with code
   generation turned off and size counting turned on, and once "for real".
   This also means that we don't allocate space until we are sure that the
   thing really will compile successfully, and we never have to move the
   code and thus invalidate pointers into it.  (Note that it has to be in
   one piece because free() must be able to free it all.)

   Beware that the optimization-preparation code in here knows about some
   of the structure of the compiled regexp. */
MtRegexT *
MtRegexCompile(const char *exp, const char **errmsgP)
{
  cCtxT     cCtx;
  MtRegexT  *r;
  char     *scan;
  char     *longest;
  size_t    len;
  int       flags;

  if (!errmsgP)
    return NULL;
  *errmsgP = NULL;

  if (exp == NULL) {
    *errmsgP = "NULL argument";
    return NULL;
  }

  /* First pass: determine size, legality */
  cCtx.parse = exp;
  cCtx.npar = 1;
  cCtx.size = 0;
  cCtx.code = NOEMIT;
  cCtx.errmsg = NULL;
  regc(&cCtx, MAGIC);
  if (reg(&cCtx, 0, &flags) == NULL) {
    *errmsgP = cCtx.errmsg;
    return NULL;
  }

  /* Small enough for pointer-storage convention? */
  if (cCtx.size >= 0x7fff) {           /* Probably could be 0xffff */
    *errmsgP = "regexp too big";
    return NULL;
  }

  /* Allocate space */
  r = (MtRegexT *)malloc(sizeof(MtRegexT) + (unsigned)cCtx.size);
  if (r == NULL) {
    *errmsgP = "out of space";
    return NULL;
  }

  /* Second pass: emit code */
  cCtx.parse = exp;
  cCtx.npar = 1;
  cCtx.code = r->program;
  cCtx.errmsg = NULL;
  regc(&cCtx, MAGIC);
  if (reg(&cCtx, 0, &flags) == NULL) {
    *errmsgP = cCtx.errmsg;
    return NULL;
  }

  /* Dig out information for optimizations */
  r->regstart = '\0';     /* Worst-case defaults */
  r->reganch = 0;
  r->regmust = NULL;
  r->regmlen = 0;
  scan = r->program+1;                    /* First BRANCH */
  if (OP(regnext(scan)) == END) { /* Only one top-level choice */
    scan = OPERAND(scan);

    /* Starting-point info */
    if (OP(scan) == EXACTLY)
      r->regstart = *OPERAND(scan);
    else if (OP(scan) == BOL)
      r->reganch++;

    /* If there's something expensive in the r.e., find the
       longest literal string that must appear and make it the
       regmust.  Resolve ties in favor of later strings, since
       the regstart check works with the beginning of the r.e.
       and avoiding duplication strengthens checking.  Not a
       strong reason, but sufficient in the absence of others. */
    if (flags&SPSTART) {
      longest = NULL;
      len = 0;
      for (; scan != NULL; scan = (char *) regnext(scan)) /* !!! lose const */
        if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
          longest = OPERAND(scan);
          len = strlen(OPERAND(scan));
        }
      r->regmust = longest;
      r->regmlen = len;
    }
  }

  return r;
}


/* regexec - match a regexp against a string */
int
MtRegexMatch(const MtRegexT   *prog,
             MtRegexMatchT    *matches,
             const char       *string,
             const char      **errmsgP)
{
  xCtxT        xCtx;
  const char  *s;
  int          res;

  if (!errmsgP)
    return MTREGEX_FAIL;
  *errmsgP = NULL;

  /* Be paranoid... */
  if (prog == NULL || matches == NULL || string == NULL) {
    *errmsgP = "NULL parameter";
    return MTREGEX_FAIL;
  }

  /* Check validity of program */
  if (UCHARAT(prog->program) != MAGIC) {
    *errmsgP = "corrupted program";
    return MTREGEX_FAIL;
  }

  /* If there is a "must appear" string, look for it */
  if (prog->regmust != NULL) {
    s = string;
    while ((s = strchr(s, prog->regmust[0])) != NULL) {
      if (strncmp(s, prog->regmust, prog->regmlen) == 0)
        break;  /* Found it */
      s++;
    }
    if (s == NULL)  /* Not present */
      return MTREGEX_FAIL;
  }

  xCtx.errmsg = NULL;

  /* Mark beginning of line for ^  */
  xCtx.bol = string;

  /* Simplest case:  anchored match need be tried only once */
  if (prog->reganch) {
    res = regtry(&xCtx, prog, matches, string);
    if (!res)
      *errmsgP = xCtx.errmsg;
    return res;
  }

  /* Messy cases:  unanchored match */
  s = string;
  if (prog->regstart != '\0')
    /* We know what char it must start with */
    while ((s = strchr(s, prog->regstart)) != NULL) {
      if (regtry(&xCtx, prog, matches, s))
        return MTREGEX_SUCCESS;
      s++;
    }
  else
    /* We don't -- general case */
    do {
      if (regtry(&xCtx, prog, matches, s))
        return MTREGEX_SUCCESS;
    } while (*s++ != '\0');

  /* Failure */
  *errmsgP = xCtx.errmsg;
  return MTREGEX_FAIL;
}


/* regsub - perform substitutions after a regexp match */
int
MtRegexSubstitute(const MtRegexT        *prog,
                  const MtRegexMatchT   *matches,
                  const char            *source,
                  char                  *dest,
                  const char           **errmsgP)
{
  const char  *src;
  char        *dst;
  char         c;
  int          no;
  size_t       len;

  if (!errmsgP)
    return MTREGEX_FAIL;
  *errmsgP = NULL;

  if (prog == NULL || matches == NULL || source == NULL || dest == NULL) {
    *errmsgP = "NULL parm to regsub";
    return MTREGEX_FAIL;
  }
  if (UCHARAT(prog->program) != MAGIC) {
    *errmsgP = "damaged regexp fed to regsub";
    return MTREGEX_FAIL;
  }

  src = source;
  dst = dest;
  while ((c = *src++) != '\0') {
    if (c == '&')
      no = 0;
    else if (c == '\\' && '0' <= *src && *src <= '9')
      no = *src++ - '0';
    else
      no = -1;

    if (no < 0) {   /* Ordinary character */
      if (c == '\\' && (*src == '\\' || *src == '&'))
        c = *src++;
      *dst++ = c;
    }
    else if (matches->startp[no] != NULL && matches->endp[no] != NULL) {
      len = matches->endp[no] - matches->startp[no];
      strncpy(dst, matches->startp[no], len);
      dst += len;
      if (len != 0 && *(dst-1) == '\0') {     /* strncpy hit NUL */
        *errmsgP = "damaged match string";
        return MTREGEX_FAIL;
      }
    }
  }
  *dst++ = '\0';

  return MTREGEX_SUCCESS;
}
