/*############################################################################## Copyright (C) 2011 HPCC Systems. All rights reserved. This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . ############################################################################## */ #include "jlib.hpp" #include "jmisc.hpp" #include "jregexp.hpp" #define FAIL(s) { /*assert(!s);*/ return NULL; } #define SUBPARENL '{' #define SUBPARENR '}' // NB Meta must change if the above change typedef unsigned REGFLAGS; #define MAXEXPR 0xf000 #define MAXSUBST 0xf000 #define NSUBEXP 10 class RECOMP { public: const char *s_start[NSUBEXP]; const char *s_end[NSUBEXP]; const char *s_save; // save text after find const char *s_savestr[NSUBEXP]; size32_t s_savelen[NSUBEXP]; char start; char anch; char *must; size32_t mlen; char *program; const char *parse; /* Input-scan pointer. */ int npar; /* {} count. */ char *code; /* Code-emit pointer. */ const char *input; /* String-input pointer. */ const char *bol; /* Beginning of input, for ^ check. */ const char **startp; /* Pointer to startp array. */ const char **endp; /* Ditto for end. */ char * reg(bool sub, REGFLAGS &); char *branch (REGFLAGS &); char *piece (REGFLAGS &); char *atom (REGFLAGS &); char *rnode (char); #ifdef _DEBUG void regc (char b) { if (code-program>=MAXEXPR-3) assertex(!"regular expression too complicated!"); *code++ = b; }; #endif void insert (char, char *); void tail (char *, char *); void optail (char *, char *); char *next (char *); bool match (char *prog); char *rtry (const char *); int repeat (char *); const char * findstart; const char * findend; size32_t replacemax; // needed for replace (clarion) bool nocase; const char *rstrchr(const char *s,int c); bool strnequ(const char *s1,const char *s2,size32_t n); bool eob(const char *s) { return s==findend; }; bool claeol(const char *s); bool equch(int c1,int c2) { if (nocase) return (toupper(c1)==toupper(c2)); return c1==c2; }; void adjustptrs(const char *e,size32_t lo,size32_t lr); size32_t rstrlen(const char *s) {size32_t l=0; while (s++!=findend) l++; return l; }; bool clarion; bool suboverlap(int n,int m); RECOMP() { _clear(s_start); s_save=NULL; program = NULL; }; ~RECOMP() { free((void *)s_save); free((void *)program); } }; #ifndef _DEBUG #define REGC(b) *code++ = b #else #define REGC(b) regc(b) #endif //---- /* * The "internal use only" fields in regexp.h are present to pass info from * compile to execute that permits the execute phase to run lots faster on * simple cases. They are: * * regstart char that must begin a match; '\0' if none obvious * reganch is the match anchored (at beginning-of-line only)? * regmust string (pointer into program) that match must include, or NULL * regmlen length of regmust string * * Regstart and reganch permit very fast decisions on suitable starting points * for a match, cutting down the work a lot. Regmust permits fast rejection * of lines that cannot possibly match. The regmust tests are costly enough * that regcomp() supplies a regmust only if the r.e. contains something * potentially expensive (at present, the only such thing detected is * or + * at the start of the r.e., which can involve a lot of backup). Regmlen is * supplied because the test in regexec() needs it and regcomp() is computing * it anyway. */ /* * Structure for regexp "program". This is essentially a linear encoding * of a nondeterministic finite-state machine (aka syntax charts or * "railroad normal form" in parsing technology). Each node is an opcode * plus a "next" pointer, possibly plus an operand. "Next" pointers of * all nodes except BRANCH implement concatenation; a "next" pointer with * a BRANCH on both ends of it is connecting two alternatives. (Here we * have one of the subtle syntax dependencies: an individual BRANCH (as * opposed to a collection of them) is never concatenated with anything * because of operator precedence.) The operand of some types of node is * a literal string; for others, it is a node leading into a sub-FSM. In * particular, the operand of a BRANCH node is the first node of the branch. * (NB this is *not* a tree structure: the tail of the branch connects * to the thing following the set of BRANCHes.) The opcodes are: */ /* definition number opnd? meaning */ #define END 0 /* no End of program. */ #define BOL 1 /* no Match "" at beginning of line. */ #define EOL 2 /* no Match "" at end of line. */ #define ANY 3 /* no Match any one character. */ #define ANYOF 4 /* str Match any character in this string. */ #define ANYBUT 5 /* str Match any character not in this string. */ #define BRANCH 6 /* node Match this alternative, or the next... */ #define BACK 7 /* no Match "", "next" ptr points backward. */ #define EXACTLY 8 /* str Match this string. */ #define NOTHING 9 /* no Match empty string. */ #define STAR 10 /* node Match this (simple) thing 0 or more times. */ #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ #define OPEN 20 /* no Mark this point in input as start of #n. */ /* OPEN+1 is number 1, etc. */ #define CLOSE 30 /* no Analogous to OPEN. */ /* * Opcode notes: * * BRANCH The set of branches constituting a single choice are hooked * together with their "next" pointers, since precedence prevents * anything being concatenated to any individual branch. The * "next" pointer of the last BRANCH in a choice points to the * thing following the whole choice. This is also where the * final "next" pointer of each individual branch points; each * branch starts with the operand node of a BRANCH node. * * BACK Normal "next" pointers all implicitly point forward; BACK * exists to make loop structures possible. * * STAR,PLUS '?', and complex '*' and '+', are implemented as circular * BRANCH structures using BACK. Simple cases (one character * per match) are implemented with STAR and PLUS for speed * and to minimize recursive plunges. * * OPEN,CLOSE ...are numbered at compile time. */ /* * A node is one char of opcode followed by two chars of "next" pointer. * "Next" pointers are stored as two 8-bit pieces, high order first. The * value is a positive offset from the opcode of the node containing it. * An operand, if any, simply follows the node. (Note that much of the * code generation knows about this implicit relationship.) * * Using two bytes for the "next" pointer is vast overkill for most things, * but allows patterns to get big without disasters. */ #define OP(p) (*(p)) #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) #define OPERAND(p) ((p) + 3) // Utility definitions. #define UCHARAT(p) ((int)*(unsigned char *)(p)) #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') #define META "^$.[{}|?+*\\" // Flags to be passed up and down. #define HASWIDTH 01 /* Known never to match null string. */ #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ #define SPSTART 04 /* Starts with * or +. */ #define WORST 0 /* Worst case. */ RegExpr::RegExpr() { re = NULL; } RegExpr::~RegExpr() { delete re; } RegExpr::RegExpr(const char *r,bool nocase) { re = NULL; init(r,nocase); } bool RegExpr::init(const char *exp,bool nocase) // Compiles the regular expression ready for Find // if nocase = 1 the matching is case insensitive (where possible) { kill(); char *scan; char *longest; memsize_t len; REGFLAGS flags; assertex(exp!=NULL); delete re; re = new RECOMP; re->clarion = false; re->parse = exp; re->npar = 1; re->program = (char *)malloc(MAXEXPR); re->code = re->program; re->nocase = nocase; re->start = '\0'; /* Worst-case defaults. */ re->anch = 0; re->must = NULL; re->mlen = 0; if (re->reg(0, flags) == NULL) return false; /* Dig out information for optimizations. */ scan = re->program; /* First BRANCH. */ if (OP(re->next(scan)) == END) { /* Only one top-level choice. */ scan = OPERAND(scan); /* Starting-point info. */ if (OP(scan) == EXACTLY) re->start = *OPERAND(scan); else if (OP(scan) == BOL) re->anch++; /* * If there's something expensive in the r.e., find the * longest literal string that must appear and make it the * regmust. Resolve ties in favor of later strings, since * the regstart check works with the beginning of the r.e. * and avoiding duplication strengthens checking. Not a * strong reason, but sufficient in the absence of others. */ if (flags&SPSTART) { longest = NULL; len = 0; for (; scan != NULL; scan = re->next(scan)) if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= (size32_t) len) { longest = OPERAND(scan); len = strlen(OPERAND(scan)); } re->must = longest; re->mlen = (size32_t)len; } } int reloc; if (re->must) reloc = re->must-re->program; re->program = (char *)realloc(re->program,(re->code-re->program)+1); if (re->must) re->must = (char *)(re->program + reloc); return true; } const char * RegExpr::find(const char *string,size32_t from,size32_t len,size32_t maxlen) // finds the first occurrence of the RE in string { /* Be paranoid... */ if (re == NULL || re->program== NULL) { assertex(!"NULL parameter"); return NULL; } if (!string) { // find again if (re->findstart == NULL) { assertex(!"NULL parameter"); return NULL; } string = (char *)re->s_end[0]; if ((string==NULL)||(string == re->findend)) return NULL; } else { re->findstart = string; string+=from; re->replacemax = maxlen; if (maxlen) { re->clarion = true; re->findend = re->findstart+maxlen; } else { const char *s=string; for (size32_t l=0;*s&&(lfindend = s; } } free((void *)re->s_save); re->s_save = NULL; // clear saved (replaced string) /* If there is a "must appear" string, look for it. */ if (re->must != NULL) { const char *s = string; while ((s = re->rstrchr(s, re->must[0])) != NULL) { if (re->strnequ(s, re->must, re->mlen)) break; /* Found it. */ s++; } if (s == NULL) /* Not present. */ return 0; } /* Mark beginning of line for ^ . */ re->bol = string; /* Simplest case: anchored match need be tried only once. */ char *ret = NULL; if (re->anch) ret = re->rtry(string); else { /* Messy cases: unanchored match. */ const char *s = string; if (re->start != '\0') /* We know what char it must start with. */ while ((s = re->rstrchr(s, re->start)) != NULL) { if ((ret=re->rtry(s))!=NULL) break; s++; } else /* We don't -- general case. */ do { if ((ret=re->rtry(s))!=NULL) break; } while (!re->eob(s++)); } if (ret) { const char *ss = re->s_start[0]; size32_t tl = re->s_end[0]-ss; char *ds=(char *)malloc(tl+1); memcpy(ds,ss,tl); ds[tl] = 0; for (int i = 0;is_start[i]; if (ss2) { re->s_savelen[i] = re->s_end[i]-ss2; re->s_savestr[i] = ds+(ss2-ss); } else { re->s_savelen[i] = 0; re->s_savestr[i] = NULL; } } re->s_save = ds; } else { _clear(re->s_start); } return ret; } size32_t RegExpr::findlen(unsigned n) // size of string last matched using find { if (re == NULL || re->program== NULL) { assertex(!"NULL parameter"); return 0; } assertex(n=NSUBEXP)) return 0; if (re->s_save) return re->s_savelen[n]; return 0; } const char *RegExpr::findstr(StringBuffer &s,unsigned n) // returns string last matched (n = 0) or substring n (n>0) { size32_t l = findlen(n); if (l && re->s_save) { s.append(l,re->s_savestr[n]); } return s.str(); } const char *RegExpr::findnext() { return find(NULL); } void RegExpr::replace(const char *rs,unsigned maxlen,unsigned n) // replaces string (n = 0), or substring (n>0) by 's' // in string at position previously found by Find or FindNext. // Multiple replaces may be called on a single Find/FindNext { if (re == NULL || re->program== NULL) { assertex(!"NULL parameter"); return; } assertex(n=NSUBEXP)) return; const char *s=re->s_start[n]; if (!s) return; if (maxlen==0) maxlen = re->replacemax; if (maxlen==0) return; const char *e=re->s_end[n]; if (!e) return; size32_t lo=e-s; size32_t lt=re->rstrlen(e); size32_t lr = (size32_t)strlen(rs); if (lr>lo) { size32_t d = lr-lo; // delta size32_t l = ((e-re->findstart)+lt); // total length size32_t r = maxlen; if (l>r) { assertex(!"replace - maxlen too small for passed string!"); return; } r-=l; // r = max gap left if (!re->clarion) r--; // for null if (rclarion) *((char *)(s+(lr+lt))) = 0; // terminate } else if (lrclarion) { memmove((void *)(e-d2),e,lt); memset((void *)(e+(lt-d2)),' ',d2); } else memmove((void *)(e-lo+lr),e,lt+1); } memcpy((void *)s,rs,lr); re->adjustptrs(e,lo,lr); } static void dosubstitute(RegExpr *re,StringBuffer &out,char *s) { for (char* b= strchr(s,'#');b;b=strchr(b+1,'#')) { char c = b[1]; if (c=='#') { *(++b)=0; out.append(s); s = b+1; } else if (!isdigit(c)) continue; else { (*b++)=0; out.append(s); re->findstr(out,c-'0'); } s = b+1; // reset start } out.append(s); } const char * RegExpr::substitute(StringBuffer &out,const char *mask,...) { char * str = (char *)malloc(MAXSUBST); va_list ap; va_start(ap,mask); vsprintf(str,mask,ap); va_end(ap); dosubstitute(this,out,str); free(str); return (char*)out.str(); } void RegExpr::kill() // releases extra storage used by RegularExpressionClass { delete re; re = NULL; } char *RECOMP::reg (bool paren, REGFLAGS &flags) { char *ret; char *br; char *ender; int parno; flags = HASWIDTH; /* Tentatively. */ /* Make an OPEN node, if parenthesized. */ if (paren) { if (npar >= NSUBEXP) FAIL("too many {}"); parno = npar; npar++; ret = rnode((char)(OPEN+parno)); } else ret = NULL; /* Pick up the branches, linking them together. */ REGFLAGS bflags; br = branch(bflags); if (br == NULL) return(NULL); if (ret != NULL) tail(ret, br); /* OPEN -> first. */ else ret = br; if (!(bflags&HASWIDTH)) flags &= ~HASWIDTH; flags |= bflags&SPSTART; while (*parse == '|') { parse++; br = branch(bflags); if (br == NULL) return(NULL); tail(ret, br); /* BRANCH -> BRANCH. */ if (!(bflags&HASWIDTH)) flags &= ~HASWIDTH; flags |= bflags&SPSTART; } /* Make a closing node, and hook it on the end. */ ender = rnode((char)((paren) ? CLOSE+parno : END)); tail(ret, ender); /* Hook the tails of the branches to the closing node. */ for (br = ret; br != NULL; br = next(br)) optail(br, ender); /* Check for proper termination. */ if (paren && *parse++ != SUBPARENR) { FAIL("unmatched {}"); } else if (!paren && *parse != '\0') { if (*parse == SUBPARENR) { FAIL("unmatched {}"); } else FAIL("junk on end"); /* "Can't happen". */ /* NOTREACHED */ } return(ret); } char *RECOMP::branch (REGFLAGS &flags) /* - branch - one alternative of an | operator * * Implements the concatenation operator. */ { char *ret; char *chain; char *latest; flags = WORST; /* Tentatively. */ ret = rnode(BRANCH); chain = NULL; REGFLAGS pflags; while (*parse != '\0' && *parse != '|' && *parse != SUBPARENR) { latest = piece(pflags); if (latest == NULL) return(NULL); flags |= pflags&HASWIDTH; if (chain == NULL) /* First piece. */ flags |= pflags&SPSTART; else tail(chain, latest); chain = latest; } if (chain == NULL) /* Loop ran zero times. */ (void) rnode(NOTHING); return(ret); } char *RECOMP::piece (REGFLAGS &flags) /* - piece - something followed by possible [*+?] * * Note that the branching code sequences used for ? and the general cases * of * and + are somewhat optimized: they use the same NOTHING node as * both the endmarker for their branch list and the body of the last branch. * It might seem that this node could be dispensed with entirely, but the * endmarker role is not redundant. */ { char *ret; char op; char *next; REGFLAGS aflags; ret = atom(aflags); if (ret == NULL) return(NULL); op = *parse; if (!ISMULT(op)) { flags = aflags; return(ret); } if (!(aflags&HASWIDTH) && op != '?') FAIL("*+ operand could be empty"); flags = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); if (op == '*' && (aflags&SIMPLE)) insert(STAR, ret); else if (op == '*') { /* Emit x* as (x&|), where & means "self". */ insert(BRANCH, ret); /* Either x */ optail(ret, rnode(BACK)); /* and loop */ optail(ret, ret); /* back */ tail(ret, rnode(BRANCH)); /* or */ tail(ret, rnode(NOTHING)); /* null. */ } else if (op == '+' && (aflags&SIMPLE)) insert(PLUS, ret); else if (op == '+') { /* Emit x+ as x(&|), where & means "self". */ next = rnode(BRANCH); /* Either */ tail(ret, next); tail(rnode(BACK), ret); /* loop back */ tail(next, rnode(BRANCH)); /* or */ tail(ret, rnode(NOTHING)); /* null. */ } else if (op == '?') { /* Emit x? as (x|) */ insert(BRANCH, ret); /* Either x */ tail(ret, rnode(BRANCH)); /* or */ next = rnode(NOTHING); /* null. */ tail(ret, next); optail(ret, next); } parse++; if (ISMULT(*parse)) FAIL("nested *?+"); return(ret); } char *RECOMP::atom (REGFLAGS &flags) /* - atom - the lowest level * * Optimization: gobbles an entire sequence of ordinary characters so that * it can turn them into a single node, which is smaller to store and * faster to run. Backslashed characters are exceptions, each becoming a * separate node; the code is simpler that way and it's not worth fixing. */ { char *ret; flags = WORST; /* Tentatively. */ switch (*parse++) { case '^': ret = rnode(BOL); break; case '$': ret = rnode(EOL); break; case '.': ret = rnode(ANY); flags |= HASWIDTH|SIMPLE; break; case '[': { int clss; int clssend; if ((*parse == '^') || (*parse == '~')) { /* Complement of range. */ ret = rnode(ANYBUT); parse++; } else ret = rnode(ANYOF); if (*parse == ']' || *parse == '-') REGC(*parse++); while (*parse != '\0' && *parse != ']') { if (*parse == '-') { parse++; if (*parse == ']' || *parse == '\0') REGC('-'); else { clss = UCHARAT(parse-2)+1; clssend = UCHARAT(parse); if (clss > clssend+1) FAIL("invalid [] range"); for (; clss <= clssend; clss++) REGC((char) clss); parse++; } } else REGC(*parse++); } REGC('\0'); if (*parse != ']') FAIL("unmatched []"); parse++; flags |= HASWIDTH|SIMPLE; } break; case SUBPARENL: { REGFLAGS sflags; ret = reg(1, sflags); if (ret == NULL) return(NULL); flags |= sflags&(HASWIDTH|SPSTART); } break; case '\0': case '|': case SUBPARENR: FAIL("internal urp"); /* Supposed to be caught earlier. */ break; case '?': case '+': case '*': FAIL("?+* follows nothing"); break; case '\\': if (*parse == '\0') FAIL("trailing \\"); ret = rnode(EXACTLY); REGC(*parse++); REGC('\0'); flags |= HASWIDTH|SIMPLE; break; default: { memsize_t len; char ender; parse--; len = strcspn(parse, META); if (len <= 0) FAIL("internal disaster"); ender = *(parse+len); if (len > 1 && ISMULT(ender)) len--; /* Back off clear of ?+* operand. */ flags |= HASWIDTH; if (len == 1) flags |= SIMPLE; ret = rnode(EXACTLY); while (len > 0) { REGC(*parse++); len--; } REGC('\0'); } break; } return(ret); } char *RECOMP::rnode (char op) /* - rnode - emit a node */ { char *ret = code; REGC(op); REGC(0); REGC(0); return(ret); } void RECOMP::insert (char op, char *opnd) /* - insert - insert an operator in front of already-emitted operand * * Means relocating the operand. */ { char *src=code; code+=3; char *dst=code; while (src > opnd) *--dst = *--src; char *place = (char *)opnd; /* Op node, where operand used to be. */ *place++ = op; *place++ = '\0'; *place++ = '\0'; } void RECOMP::tail (char *p, char *val) /* - tail - set the next-pointer at the end of a node chain */ { char *scan; char *temp; int offset; /* Find last node. */ scan = p; for (;;) { temp = next(scan); if (temp == NULL) break; scan = temp; } if (OP(scan) == BACK) offset = scan - val; else offset = val - scan; *(scan+1) = (char) ((offset>>8)&0377); *(scan+2) = (char) (offset&0377); } void RECOMP::optail (char *p, char *val) /* - optail - tail on operand of first argument; nop if operandless */ { /* "Operandless" and "op != BRANCH" are synonymous in practice. */ if (p == NULL || OP(p) != BRANCH) return; tail(OPERAND(p), val); } char *RECOMP::next (char *p) /* - next - dig the "next" pointer out of a node */ { int offset = NEXT(p); if (offset == 0) return(NULL); if (OP(p) == BACK) return(p-offset); else return(p+offset); } /* - rtry - try match at specific point */ char * RECOMP::rtry (const char *string) { input = string; startp = s_start; endp = s_end; _clear(s_start); _clear(s_end); if (match(program)) { s_start[0] = string; s_end[0] = input; return (char *)string; } return NULL; } bool RECOMP::match (char *prog) { char *scan; /* Current node. */ char *n; /* Next node. */ scan = prog; while (scan != NULL) { n = next(scan); switch (OP(scan)) { case BOL: if (input != bol) return false; break; case EOL: if (clarion) { if (!claeol(input)) return false; } else if (!eob(input)) return false; break; case ANY: if (eob(input)) return false; input++; break; case EXACTLY: { memsize_t len; char *opnd; opnd = OPERAND(scan); /* Inline the first character, for speed. */ if (!equch(*opnd,*input)) return false; len = strlen(opnd); if (len > 1 && !strnequ(input, opnd, (size32_t)len)) return false; input += len; } break; case ANYOF: if (eob(input) || strchr(OPERAND(scan), *input) == NULL) return false; input++; break; case ANYBUT: if (eob(input) || strchr(OPERAND(scan), *input) != NULL) return false; input++; break; case NOTHING: break; case BACK: break; case OPEN+1: case OPEN+2: case OPEN+3: case OPEN+4: case OPEN+5: case OPEN+6: case OPEN+7: case OPEN+8: case OPEN+9: { int no; const char *save; no = OP(scan) - OPEN; save = input; if (match(n)) { /* * Don't set start if some later * invocation of the same parentheses * already has. */ if (startp[no] == NULL) startp[no] = save; return true; } else return false; } break; case CLOSE+1: case CLOSE+2: case CLOSE+3: case CLOSE+4: case CLOSE+5: case CLOSE+6: case CLOSE+7: case CLOSE+8: case CLOSE+9: { int no; const char *save; no = OP(scan) - CLOSE; save = input; if (match(n)) { /* * Don't set end if some later * invocation of the same parentheses * already has. */ if (endp[no] == NULL) endp[no] = save; return true; } else return false; } break; case BRANCH: { const char *save; if (OP(n) != BRANCH) /* No choice. */ n = OPERAND(scan); /* Avoid recursion. */ else { do { save = input; if (match(OPERAND(scan))) return true; input = save; scan = next(scan); } while (scan != NULL && OP(scan) == BRANCH); return false; /* NOTREACHED */ } } break; case STAR: case PLUS: { char nextch; int no; const char *save; int min; /* * Lookahead to avoid useless match attempts * when we know what character comes n. */ nextch = '\0'; if (OP(n) == EXACTLY) nextch = *OPERAND(n); min = (OP(scan) == STAR) ? 0 : 1; save = input; no = repeat(OPERAND(scan)); while (no >= min) { /* If it could work, try it. */ if (nextch == '\0' || equch(*input,nextch)) if (match(n)) return true; /* Couldn't or didn't -- back up. */ no--; input = save + no; } return false; } break; case END: return true; /* Success! */ break; default: assertex(!"memory corruption"); return false; break; } scan = n; } /* * We get here only if there's trouble -- normally "case END" is * the terminating point. */ assertex(!"corrupted pointers"); return false; } int RECOMP::repeat (char *p) /* - regrepeat - repeatedly match something simple, report how many */ { int count = 0; const char *scan = input; char *opnd = OPERAND(p); switch (OP(p)) { case ANY: count = rstrlen(scan); scan += count; break; case EXACTLY: while (equch(*opnd,*scan)) { count++; scan++; } break; case ANYOF: while (!eob(scan) && strchr(opnd, *scan) != NULL) { // NB not rstrchr! count++; scan++; } break; case ANYBUT: while (!eob(scan) && strchr(opnd, *scan) == NULL) { // NB nnot rstrchr count++; scan++; } break; default: /* Oh dear. Called inappropriately. */ assertex(!"internal foulup"); count = 0; /* Best compromise. */ break; } input = scan; return(count); } void RECOMP::adjustptrs(const char *e,size32_t lo,size32_t lr) { if (lo==lr) return; size32_t o=e-findstart; for (int i=0;i=o) { s_start[i]=s+lr-lo; } so = s_end[i]-findstart; if (so>=o) s_end[i]+=lr-lo; } } findend+=(lr-lo); } const char *RECOMP::rstrchr(const char *s,int c) { if (nocase) { c = toupper(c); while (s!=findend) { if (toupper(*s)==c) return s; s++; } } else { while (s!=findend) { if (*s==c) return s; s++; } } return NULL; } bool RECOMP::strnequ(const char *s1,const char *s2,size32_t n) { // s1 is in regsearch if (findend-s1 < (signed)n) return false; if (nocase) { while (n--) { if (toupper(*s1)!=toupper(*s2)) return false; s1++; s2++; } } else { while (n--) { if (*s1!=*s2) return false; s1++; s2++; } } return true; } bool RECOMP::claeol(const char *s) { do { if (eob(s)) return true; } while ((*(s++))==' '); return false; } unsigned char xlat[26]={0,'1','2','3',0,'1','2',0,0,'2','2','4','5','5',0,'1','2','6','2','3',0,'1',0,'2',0,'2'}; static char *SoundexCode(const char *s,int l,char *res) { char *r=res; r[1] = '0'; r[2] = '0'; r[3] = '0'; r[4] = 0; loop { if (!l||!*s) { *r = '!'; return res; } if (isalpha(*s)) break; s++; l--; } char c=toupper(*s); *r = c; r++; s++; char dl = ((c>='A')&&(c<='Z'))?xlat[c-'A']:0; while (l&&*s) { c = toupper(*s); s++; l--; if ((c>='A')&&(c<='Z')) { char d = xlat[c-'A']; if (d) { if(d!=dl) { *r = d; r++; if (!*r) break; } } else if (c != 'H' && c != 'W') dl = d; } } return res; } static bool WildMatchN ( const char *src, int srclen, int srcidx, const char *pat, int patlen, int patidx,int nocase) { char next_char; for (;;) { if (patidx == patlen) return (srcidx == srclen); next_char = pat[patidx++]; if (next_char == '?') { if (srcidx == srclen) return false; srcidx++; } else if (next_char != '*') { if (nocase) { if ((srcidx == srclen) || (toupper(src[srcidx])!=toupper(next_char))) return false; } else if ((srcidx == srclen) || (src[srcidx]!=next_char)) return false; srcidx++; } else { loop { if (patidx == patlen) return true; if (pat[patidx] != '*') break; patidx++; } loop { //No need to guard patLen since guaranteed to contain an ASTERISK const char tail_char = pat[patlen-1]; if (tail_char == '*') break; if (srcidx == srclen) return false; if (tail_char != '?') { if (nocase) { if ((toupper(tail_char)!=toupper(src[srclen-1]))) return false; } else { if ((tail_char!=src[srclen-1])) return false; } } patlen--; srclen--; if (patidx == patlen) return true; } while (srcidx < srclen) { if (WildMatchN(src,srclen,srcidx, pat, patlen, patidx,nocase)) return true; srcidx++; } return false; } } } bool jlib_decl WildMatch(const char *src, int srclen, const char *pat, int patlen,bool nocase) { return WildMatchN(src,srclen,0,pat,patlen,0,nocase); } bool jlib_decl WildMatch(const char *src, const char *pat, bool nocase) { return WildMatch(src,(size32_t)strlen(src),pat,(size32_t)strlen(pat),nocase); } static bool WildMatchNreplace ( const char *src, int srclen, int srcidx, const char *pat, int patlen, int patidx, int nocase, UnsignedArray &wild, UnsignedArray &wildlen ) { unsigned wl = wild.ordinality(); char next_char; for (;;) { if (patidx == patlen) { if (srcidx == srclen) return true; goto Fail; } next_char = pat[patidx++]; if (next_char == '?') { if (srcidx == srclen) goto Fail; wild.append(srcidx++); wildlen.append(1); } else if (next_char != '*') { if (nocase) { if ((srcidx == srclen) || (toupper(src[srcidx])!=toupper(next_char))) goto Fail; } else if ((srcidx == srclen) || (src[srcidx]!=next_char)) goto Fail; srcidx++; } else { loop { if (patidx == patlen) { wild.append(srcidx); wildlen.append(srclen-srcidx); return true; } wild.append(srcidx); wildlen.append(0); // placemarker if (pat[patidx] != '*') break; patidx++; // someone being silly! } while (srcidx < srclen) { if (WildMatchNreplace(src,srclen,srcidx, pat, patlen, patidx,nocase, wild,wildlen)) return true; wildlen.append(wildlen.pop()+1); // prob can do a bit better than this! srcidx++; } break; // fail } } Fail: unsigned pn = wild.ordinality()-wl; if (pn) { wild.popn(pn); wildlen.popn(pn); } return false; } bool jlib_decl WildMatchReplace(const char *src, const char *pat, const char *repl, bool nocase, StringBuffer &out) { UnsignedArray wild; UnsignedArray wildlen; if (!WildMatchNreplace(src,(size32_t)strlen(src),0,pat,(size32_t)strlen(pat),0,nocase,wild,wildlen)) return false; loop { char c = *(repl++); if (!c) break; if (c!='$') out.append(c); else { char c2 = *(repl++); if (!c2) { out.append(c); break; } if (c2=='$') out.append(c); else { unsigned idx = c2-'0'; if (idx==0) out.append(src); else if ((idx<=9)&&(idx<=wild.ordinality())) out.append(wildlen.item(idx-1),src+wild.item(idx-1)); else out.append(c).append(c2); } } } return true; } bool jlib_decl SoundexMatch(const char *src, const char *pat) { char s1[5]; char s2[5]; return memcmp(SoundexCode(src,(size32_t)strlen(src),s1),SoundexCode(pat,(size32_t)strlen(pat),s2),4)==0; } //--------------------------------------------------------------------------- StringMatcher::StringMatcher() { for (unsigned idx=0; idx < 256; idx++) { firstLevel[idx].value = 0; firstLevel[idx].table = NULL; } } void StringMatcher::freeLevel(entry * elems) { for (unsigned idx = 0; idx < 256; idx++) { if (elems[idx].table) { freeLevel(elems[idx].table); free(elems[idx].table); elems[idx].table = NULL; } elems[idx].value = 0; } } StringMatcher::~StringMatcher() { freeLevel(firstLevel); } void StringMatcher::addEntry(const char * text, unsigned action) { if (!queryAddEntry((size32_t)strlen(text), text, action)) throw MakeStringException(-1, "Duplicate entry \"%s\" added to string matcher", text); } void StringMatcher::addEntry(unsigned len, const char * text, unsigned action) { if (!queryAddEntry(len, text, action)) throw MakeStringException(-1, "Duplicate entry \"%*s\" added to string matcher", len, text); } bool StringMatcher::queryAddEntry(unsigned len, const char * text, unsigned action) { if (len == 0) return false; entry * curTable = firstLevel; loop { byte c = *text++; entry & curElement = curTable[c]; if (--len == 0) { if (curElement.value != 0) return false; curElement.value = action; return true; } if (!curElement.table) curElement.table = (entry *)calloc(sizeof(entry), 256); curTable = curElement.table; } } unsigned StringMatcher::getMatch(unsigned maxLen, const char * text, unsigned & matchLen) { unsigned bestValue = 0; unsigned bestLen = 0; if (maxLen) { const byte * start = (const byte *)text; const byte * cursor = (const byte *)text; const byte * end = (const byte *)cursor+maxLen; byte next = *cursor++; entry * cur = &firstLevel[next]; while ((cursor != end) && cur->table) { if (cur->value) { bestValue = cur->value; bestLen = cursor-start; } next = *cursor++; cur = &cur->table[next]; } if (cur->value) { matchLen = cursor-start; return cur->value; } } matchLen = bestLen; return bestValue; } void addActionList(StringMatcher & matcher, const char * text, unsigned action, unsigned * maxElementLength) { if (!text) return; unsigned idx=0; while (*text) { StringBuffer str; while (*text) { char next = *text++; if (next == ',') break; if (next == '\\' && *text) { next = *text++; switch (next) { case 'r': next = '\r'; break; case 'n': next = '\n'; break; case 't': next = '\t'; break; case 'x': //hex constant - at least we can define spaces then... if (text[0] && text[1]) { next = (hex2num(*text) << 4) | hex2num(text[1]); text+=2; } break; default: break; //otherwise \ just quotes the character e.g. \, } } str.append(next); } if (str.length()) { matcher.addEntry(str.str(), action+(idx++<<8)); if (maxElementLength && (str.length() > *maxElementLength)) *maxElementLength = str.length(); } } } //--------------------------------------------------------------------------- #if 0 #define MATCHsimple 0 #define MATCHwild 1 #define MATCHregular 2 #define MATCHsoundex 3 #define MATCHmask 0x0f #define MATCHnocase 0x10 static class _MatchRE : public RegExpr { public: bool init(const char *rexpr,int nocase) { if (!cachere||(cachenocase!=nocase)||!strequ(cachere,rexpr)) { free(cachere); cachere = strdup(rexpr); cachenocase = (unsigned char)nocase; return RegExpr::init(cachere,cachenocase); } return true; } _MatchRE() { cachere=NULL; } ~_MatchRE() { free(cachere); } private: char *cachere; unsigned char cachenocase; } MatchRE; extern "Pascal" long Cla$MATCH(/* STRING s, STRING p, */ unsigned char flags) { StringBuffer s; s.PopString(); s.clip(); StringBuffer p; p.PopString(); p.clip(); int nocase = flags&MATCHnocase?1:0; switch(flags&MATCHmask) { case MATCHsimple: { if (s.strlen()!=p.strlen()) return false; if (nocase) return striequ((char *)s,(char *)p); return strequ((char *)s,(char *)p); } break; case MATCHwild: return WildMatch(s,s.strlen(),p,p.strlen(),nocase); case MATCHregular: { if (!MatchRE.init(p,nocase)) return false; const char *fs=MatchRE.find(s,0,RE_ALL,0); return (fs!=NULL); } break; case MATCHsoundex: { char s1[5]; char s2[5]; return memcmp(SoundexCode(s,s.strlen(),s1),SoundexCode(p,p.strlen(),s2),4)==0; } break; } return false; } #endif //-------------------------------- // Testing #ifdef _DEBUG void tr (RegExpr &RE,const char *s,const char *p,const char *r=NULL,size32_t n=0,bool nocase=false) { PrintLog("Test '%s','%s','%s',%d,%s\r\n",s,p,r?r:"-",n,nocase?"NOCASE":""); char l[256]; strcpy(l,s); StringBuffer ds; RE.init(p,nocase); const char * f = RE.find(l,0,RE_ALL); if (!f) PrintLog("Not Found\r\n"); else { PrintLog("Found '%s'\r\n",RE.findstr(ds)); for (int i = 1;i<=9;i++) { size32_t s = RE.findlen(i); if (s) { ds.clear(); PrintLog("Found Sub %d,%d,'%s'\r\n",i,s,RE.findstr(ds,i)); } } } if (r) { RE.replace(r,sizeof(l)); PrintLog("Replace to '%s'\r\n",l); } } #define QBF "the quick brown fox jumped over the lazy dogs" #define RS "Nigel Was Here" void RE$Test() { RegExpr RE; tr(RE,QBF,"the",RS); tr(RE,QBF,"quick",RS); tr(RE,QBF,"dogs",RS); tr(RE,QBF,"lazz",RS); tr(RE,QBF,"laz.",RS); tr(RE,QBF,"^laz.",RS); tr(RE,QBF,"laz.$",RS); tr(RE,QBF,"^the.",RS); tr(RE,QBF,"dogs^",RS); tr(RE,QBF,"^t.*dogs^",RS); tr(RE,QBF,"dog.",RS); tr(RE,QBF,".the",RS); tr(RE,QBF,".he",RS); tr(RE,QBF,".*",RS); tr(RE,QBF,"q.*",RS); tr(RE,QBF,"q.*z",RS); tr(RE,QBF,"qu.*k",RS); tr(RE,QBF,"d{[oa]g}s",RS); tr(RE,QBF,"laz[a-z]",RS); tr(RE,QBF,"{laz[a-z]} {dogs}",RS); tr(RE,QBF,"the",RS,50); tr(RE,QBF,"quick",RS,50); tr(RE,QBF,"dogs",RS,50); tr(RE,QBF,"lazz",RS,50); tr(RE,QBF,"laz.",RS,50); tr(RE,QBF,"^laz.",RS,50); tr(RE,QBF,"laz.$",RS,50); tr(RE,QBF,"^the.",RS,50); tr(RE,QBF,"dogs^",RS,50); tr(RE,QBF,"^t.*dogs^",RS,50); tr(RE,QBF,"dog.",RS,50); tr(RE,QBF,".the",RS,50); tr(RE,QBF,".he",RS,50); tr(RE,QBF,".*",RS,50); tr(RE,QBF,"q.*",RS,50); tr(RE,QBF,"q.*z",RS,50); tr(RE,QBF,"qu.*k",RS,50); tr(RE,QBF,"d{[oa]g}s",RS,50); tr(RE,QBF,"laz[a-z]",RS,50); tr(RE,QBF,"{laz[a-z]} {dogs}",RS,50); tr(RE,"INRANGE (a,b)","[ ,]INRANGE *({.+},{.+})"); tr(RE," inrange (a,b)","[ ,]INRANGE *({.+},{.+})",NULL,0,true); tr(RE,",INRANGE(aa,bb)","[ ,]INRANGE *({.+},{.+})"); tr(RE,",INRANGE(,bb)","[ ,]INRANGE *({.+},{.+})"); tr(RE,"10/14/1998","^{.*}/{.*}/{.*}$"); tr(RE,"freddy","^{|{.+}\\@}{.+}$"); tr(RE,"myname@freddy","^{|{.+}\\@}{.+}$"); tr(RE,"freddy","^{{.+}\\@|}{.+}$"); tr(RE,"myname@freddy","^{{.+}\\@|}{.+}$"); tr(RE,"freddy","^{[^@]+}@?{[^@]+}$"); tr(RE,"myname@freddy","^{.+}@{.+}$|{.+}$"); tr(RE,"freddy","^{|{.+}@}{[^@]+}$"); char str[256]; strcpy(str,"myname@freddy"); RE.init("^{|{.+}@}{[^@]+}$"); RE.find(str); StringBuffer t; PrintLog("Substitute to '%s'\r\n",RE.substitute(t,"#'#3###2'#")); t.clear(); RE.replace(RE.findstr(t,2),sizeof(str),3); t.clear(); RE.replace(RE.findstr(t,3),sizeof(str),2); PrintLog("Replace to '%s'\r\n",str); } void FixDate(char *buff,size32_t max) { RegExpr RE("[0-9]+/[0-9]+/{[0-9]+}"); const char *s = RE.find(buff); while (s) { StringBuffer ys; int y=atoi(RE.findstr(ys,1)); if (y<1000) { ys.clear(); ys.append(y+1900); RE.replace(ys.str(),max,1); } s = RE.findnext(); } } #endif