jregexp.cpp 46 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #include "jlib.hpp"
  14. #include "jmisc.hpp"
  15. #include "jregexp.hpp"
  16. #define FAIL(s) { /*assert(!s);*/ return NULL; }
  17. #define SUBPARENL '{'
  18. #define SUBPARENR '}'
  19. // NB Meta must change if the above change
  20. typedef unsigned REGFLAGS;
  21. #define MAXEXPR 0xf000
  22. #define MAXSUBST 0xf000
  23. #define NSUBEXP 10
  24. class RECOMP
  25. {
  26. public:
  27. const char *s_start[NSUBEXP];
  28. const char *s_end[NSUBEXP];
  29. const char *s_save; // save text after find
  30. const char *s_savestr[NSUBEXP];
  31. size32_t s_savelen[NSUBEXP];
  32. char start;
  33. char anch;
  34. char *must;
  35. size32_t mlen;
  36. char *program;
  37. const char *parse; /* Input-scan pointer. */
  38. int npar; /* {} count. */
  39. char *code; /* Code-emit pointer. */
  40. const char *input; /* String-input pointer. */
  41. const char *bol; /* Beginning of input, for ^ check. */
  42. const char **startp; /* Pointer to startp array. */
  43. const char **endp; /* Ditto for end. */
  44. char * reg(bool sub, REGFLAGS &);
  45. char *branch (REGFLAGS &);
  46. char *piece (REGFLAGS &);
  47. char *atom (REGFLAGS &);
  48. char *rnode (char);
  49. #ifdef _DEBUG
  50. void regc (char b) {
  51. if (code-program>=MAXEXPR-3)
  52. assertex(!"regular expression too complicated!");
  53. *code++ = b;
  54. };
  55. #endif
  56. void insert (char, char *);
  57. void tail (char *, char *);
  58. void optail (char *, char *);
  59. char *next (char *);
  60. bool match (char *prog);
  61. char *rtry (const char *);
  62. int repeat (char *);
  63. const char * findstart;
  64. const char * findend;
  65. size32_t replacemax; // needed for replace (clarion)
  66. bool nocase;
  67. const char *rstrchr(const char *s,int c);
  68. bool strnequ(const char *s1,const char *s2,size32_t n);
  69. bool eob(const char *s) { return s==findend; };
  70. bool claeol(const char *s);
  71. bool equch(int c1,int c2) { if (nocase) return (toupper(c1)==toupper(c2)); return c1==c2; };
  72. void adjustptrs(const char *e,size32_t lo,size32_t lr);
  73. size32_t rstrlen(const char *s) {size32_t l=0; while (s++!=findend) l++; return l; };
  74. bool clarion;
  75. bool suboverlap(int n,int m);
  76. RECOMP() { _clear(s_start); s_save=NULL; program = NULL; };
  77. ~RECOMP() {
  78. free((void *)s_save);
  79. free((void *)program);
  80. }
  81. };
  82. #ifndef _DEBUG
  83. #define REGC(b) *code++ = b
  84. #else
  85. #define REGC(b) regc(b)
  86. #endif
  87. //----
  88. /*
  89. * The "internal use only" fields in regexp.h are present to pass info from
  90. * compile to execute that permits the execute phase to run lots faster on
  91. * simple cases. They are:
  92. *
  93. * regstart char that must begin a match; '\0' if none obvious
  94. * reganch is the match anchored (at beginning-of-line only)?
  95. * regmust string (pointer into program) that match must include, or NULL
  96. * regmlen length of regmust string
  97. *
  98. * Regstart and reganch permit very fast decisions on suitable starting points
  99. * for a match, cutting down the work a lot. Regmust permits fast rejection
  100. * of lines that cannot possibly match. The regmust tests are costly enough
  101. * that regcomp() supplies a regmust only if the r.e. contains something
  102. * potentially expensive (at present, the only such thing detected is * or +
  103. * at the start of the r.e., which can involve a lot of backup). Regmlen is
  104. * supplied because the test in regexec() needs it and regcomp() is computing
  105. * it anyway.
  106. */
  107. /*
  108. * Structure for regexp "program". This is essentially a linear encoding
  109. * of a nondeterministic finite-state machine (aka syntax charts or
  110. * "railroad normal form" in parsing technology). Each node is an opcode
  111. * plus a "next" pointer, possibly plus an operand. "Next" pointers of
  112. * all nodes except BRANCH implement concatenation; a "next" pointer with
  113. * a BRANCH on both ends of it is connecting two alternatives. (Here we
  114. * have one of the subtle syntax dependencies: an individual BRANCH (as
  115. * opposed to a collection of them) is never concatenated with anything
  116. * because of operator precedence.) The operand of some types of node is
  117. * a literal string; for others, it is a node leading into a sub-FSM. In
  118. * particular, the operand of a BRANCH node is the first node of the branch.
  119. * (NB this is *not* a tree structure: the tail of the branch connects
  120. * to the thing following the set of BRANCHes.) The opcodes are:
  121. */
  122. /* definition number opnd? meaning */
  123. #define END 0 /* no End of program. */
  124. #define BOL 1 /* no Match "" at beginning of line. */
  125. #define EOL 2 /* no Match "" at end of line. */
  126. #define ANY 3 /* no Match any one character. */
  127. #define ANYOF 4 /* str Match any character in this string. */
  128. #define ANYBUT 5 /* str Match any character not in this string. */
  129. #define BRANCH 6 /* node Match this alternative, or the next... */
  130. #define BACK 7 /* no Match "", "next" ptr points backward. */
  131. #define EXACTLY 8 /* str Match this string. */
  132. #define NOTHING 9 /* no Match empty string. */
  133. #define STAR 10 /* node Match this (simple) thing 0 or more times. */
  134. #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
  135. #define OPEN 20 /* no Mark this point in input as start of #n. */
  136. /* OPEN+1 is number 1, etc. */
  137. #define CLOSE 30 /* no Analogous to OPEN. */
  138. /*
  139. * Opcode notes:
  140. *
  141. * BRANCH The set of branches constituting a single choice are hooked
  142. * together with their "next" pointers, since precedence prevents
  143. * anything being concatenated to any individual branch. The
  144. * "next" pointer of the last BRANCH in a choice points to the
  145. * thing following the whole choice. This is also where the
  146. * final "next" pointer of each individual branch points; each
  147. * branch starts with the operand node of a BRANCH node.
  148. *
  149. * BACK Normal "next" pointers all implicitly point forward; BACK
  150. * exists to make loop structures possible.
  151. *
  152. * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
  153. * BRANCH structures using BACK. Simple cases (one character
  154. * per match) are implemented with STAR and PLUS for speed
  155. * and to minimize recursive plunges.
  156. *
  157. * OPEN,CLOSE ...are numbered at compile time.
  158. */
  159. /*
  160. * A node is one char of opcode followed by two chars of "next" pointer.
  161. * "Next" pointers are stored as two 8-bit pieces, high order first. The
  162. * value is a positive offset from the opcode of the node containing it.
  163. * An operand, if any, simply follows the node. (Note that much of the
  164. * code generation knows about this implicit relationship.)
  165. *
  166. * Using two bytes for the "next" pointer is vast overkill for most things,
  167. * but allows patterns to get big without disasters.
  168. */
  169. #define OP(p) (*(p))
  170. #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
  171. #define OPERAND(p) ((p) + 3)
  172. // Utility definitions.
  173. #define UCHARAT(p) ((int)*(unsigned char *)(p))
  174. #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
  175. #define META "^$.[{}|?+*\\"
  176. // Flags to be passed up and down.
  177. #define HASWIDTH 01 /* Known never to match null string. */
  178. #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
  179. #define SPSTART 04 /* Starts with * or +. */
  180. #define WORST 0 /* Worst case. */
  181. RegExpr::RegExpr()
  182. {
  183. re = NULL;
  184. }
  185. RegExpr::~RegExpr()
  186. {
  187. delete re;
  188. }
  189. RegExpr::RegExpr(const char *r,bool nocase)
  190. {
  191. re = NULL;
  192. init(r,nocase);
  193. }
  194. bool RegExpr::init(const char *exp,bool nocase)
  195. // Compiles the regular expression ready for Find
  196. // if nocase = 1 the matching is case insensitive (where possible)
  197. {
  198. kill();
  199. char *scan;
  200. char *longest;
  201. memsize_t len;
  202. REGFLAGS flags;
  203. assertex(exp!=NULL);
  204. delete re;
  205. re = new RECOMP;
  206. re->clarion = false;
  207. re->parse = exp;
  208. re->npar = 1;
  209. re->program = (char *)malloc(MAXEXPR);
  210. re->code = re->program;
  211. re->nocase = nocase;
  212. re->start = '\0'; /* Worst-case defaults. */
  213. re->anch = 0;
  214. re->must = NULL;
  215. re->mlen = 0;
  216. if (re->reg(0, flags) == NULL)
  217. return false;
  218. /* Dig out information for optimizations. */
  219. scan = re->program; /* First BRANCH. */
  220. if (OP(re->next(scan)) == END) { /* Only one top-level choice. */
  221. scan = OPERAND(scan);
  222. /* Starting-point info. */
  223. if (OP(scan) == EXACTLY)
  224. re->start = *OPERAND(scan);
  225. else if (OP(scan) == BOL)
  226. re->anch++;
  227. /*
  228. * If there's something expensive in the r.e., find the
  229. * longest literal string that must appear and make it the
  230. * regmust. Resolve ties in favor of later strings, since
  231. * the regstart check works with the beginning of the r.e.
  232. * and avoiding duplication strengthens checking. Not a
  233. * strong reason, but sufficient in the absence of others.
  234. */
  235. if (flags&SPSTART) {
  236. longest = NULL;
  237. len = 0;
  238. for (; scan != NULL; scan = re->next(scan))
  239. if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= (size32_t) len) {
  240. longest = OPERAND(scan);
  241. len = strlen(OPERAND(scan));
  242. }
  243. re->must = longest;
  244. re->mlen = (size32_t)len;
  245. }
  246. }
  247. int reloc = 0;
  248. if (re->must)
  249. reloc = re->must-re->program;
  250. re->program = (char *)realloc(re->program,(re->code-re->program)+1);
  251. if (re->must)
  252. re->must = (char *)(re->program + reloc);
  253. return true;
  254. }
  255. const char * RegExpr::find(const char *string,size32_t from,size32_t len,size32_t maxlen)
  256. // finds the first occurrence of the RE in string
  257. {
  258. /* Be paranoid... */
  259. if (re == NULL || re->program== NULL) {
  260. assertex(!"NULL parameter");
  261. return NULL;
  262. }
  263. if (!string) { // find again
  264. if (re->findstart == NULL) {
  265. assertex(!"NULL parameter");
  266. return NULL;
  267. }
  268. string = (char *)re->s_end[0];
  269. if ((string==NULL)||(string == re->findend))
  270. return NULL;
  271. }
  272. else {
  273. re->findstart = string;
  274. string+=from;
  275. re->replacemax = maxlen;
  276. if (maxlen) {
  277. re->clarion = true;
  278. re->findend = re->findstart+maxlen;
  279. }
  280. else {
  281. const char *s=string;
  282. for (size32_t l=0;*s&&(l<len);l++) s++;
  283. re->findend = s;
  284. }
  285. }
  286. free((void *)re->s_save);
  287. re->s_save = NULL; // clear saved (replaced string)
  288. /* If there is a "must appear" string, look for it. */
  289. if (re->must != NULL) {
  290. const char *s = string;
  291. while ((s = re->rstrchr(s, re->must[0])) != NULL) {
  292. if (re->strnequ(s, re->must, re->mlen))
  293. break; /* Found it. */
  294. s++;
  295. }
  296. if (s == NULL) /* Not present. */
  297. return 0;
  298. }
  299. /* Mark beginning of line for ^ . */
  300. re->bol = string;
  301. /* Simplest case: anchored match need be tried only once. */
  302. char *ret = NULL;
  303. if (re->anch)
  304. ret = re->rtry(string);
  305. else {
  306. /* Messy cases: unanchored match. */
  307. const char *s = string;
  308. if (re->start != '\0')
  309. /* We know what char it must start with. */
  310. while ((s = re->rstrchr(s, re->start)) != NULL) {
  311. if ((ret=re->rtry(s))!=NULL)
  312. break;
  313. s++;
  314. }
  315. else
  316. /* We don't -- general case. */
  317. do {
  318. if ((ret=re->rtry(s))!=NULL)
  319. break;
  320. } while (!re->eob(s++));
  321. }
  322. if (ret) {
  323. const char *ss = re->s_start[0];
  324. size32_t tl = re->s_end[0]-ss;
  325. char *ds=(char *)malloc(tl+1);
  326. memcpy(ds,ss,tl);
  327. ds[tl] = 0;
  328. for (int i = 0;i<NSUBEXP;i++) {
  329. const char *ss2 = re->s_start[i];
  330. if (ss2) {
  331. re->s_savelen[i] = re->s_end[i]-ss2;
  332. re->s_savestr[i] = ds+(ss2-ss);
  333. }
  334. else {
  335. re->s_savelen[i] = 0;
  336. re->s_savestr[i] = NULL;
  337. }
  338. }
  339. re->s_save = ds;
  340. }
  341. else {
  342. _clear(re->s_start);
  343. }
  344. return ret;
  345. }
  346. size32_t RegExpr::findlen(unsigned n)
  347. // size of string last matched using find
  348. {
  349. if (re == NULL || re->program== NULL) {
  350. assertex(!"NULL parameter");
  351. return 0;
  352. }
  353. assertex(n<NSUBEXP);
  354. if ((n>=NSUBEXP)) return 0;
  355. if (re->s_save)
  356. return re->s_savelen[n];
  357. return 0;
  358. }
  359. const char *RegExpr::findstr(StringBuffer &s,unsigned n)
  360. // returns string last matched (n = 0) or substring n (n>0)
  361. {
  362. size32_t l = findlen(n);
  363. if (l && re->s_save) {
  364. s.append(l,re->s_savestr[n]);
  365. }
  366. return s.str();
  367. }
  368. const char *RegExpr::findnext()
  369. {
  370. return find(NULL);
  371. }
  372. void RegExpr::replace(const char *rs,unsigned maxlen,unsigned n)
  373. // replaces string (n = 0), or substring (n>0) by 's'
  374. // in string at position previously found by Find or FindNext.
  375. // Multiple replaces may be called on a single Find/FindNext
  376. {
  377. if (re == NULL || re->program== NULL) {
  378. assertex(!"NULL parameter");
  379. return;
  380. }
  381. assertex(n<NSUBEXP);
  382. if ((n>=NSUBEXP)) return;
  383. const char *s=re->s_start[n];
  384. if (!s) return;
  385. if (maxlen==0) maxlen = re->replacemax;
  386. if (maxlen==0) return;
  387. const char *e=re->s_end[n];
  388. if (!e) return;
  389. size32_t lo=e-s;
  390. size32_t lt=re->rstrlen(e);
  391. size32_t lr = (size32_t)strlen(rs);
  392. if (lr>lo) {
  393. size32_t d = lr-lo; // delta
  394. size32_t l = ((e-re->findstart)+lt); // total length
  395. size32_t r = maxlen;
  396. if (l>r) {
  397. assertex(!"replace - maxlen too small for passed string!");
  398. return;
  399. }
  400. r-=l; // r = max gap left
  401. if (!re->clarion) r--; // for null
  402. if (r<d) {
  403. if (lt<d-r) {
  404. lr+=lt;
  405. if (lr<=(d-r)) return; // can't really happen
  406. lr-=(d-r); // new string too big
  407. lt=0; // lose tail
  408. }
  409. else
  410. lt-=(d-r); // losing part of tail
  411. }
  412. if (lt) memmove((void *)(e+d),e,lt);
  413. if (!re->clarion) *((char *)(s+(lr+lt))) = 0; // terminate
  414. }
  415. else if (lr<lo) {
  416. size32_t d2 = lo-lr; // delta
  417. // must be enough room so fairly simple
  418. if (re->clarion) {
  419. memmove((void *)(e-d2),e,lt);
  420. memset((void *)(e+(lt-d2)),' ',d2);
  421. }
  422. else
  423. memmove((void *)(e-lo+lr),e,lt+1);
  424. }
  425. memcpy((void *)s,rs,lr);
  426. re->adjustptrs(e,lo,lr);
  427. }
  428. static void dosubstitute(RegExpr *re,StringBuffer &out,char *s)
  429. {
  430. for (char* b= strchr(s,'#');b;b=strchr(b+1,'#')) {
  431. char c = b[1];
  432. if (c=='#') {
  433. *(++b)=0;
  434. out.append(s);
  435. s = b+1;
  436. }
  437. else if (!isdigit(c))
  438. continue;
  439. else {
  440. (*b++)=0;
  441. out.append(s);
  442. re->findstr(out,c-'0');
  443. }
  444. s = b+1; // reset start
  445. }
  446. out.append(s);
  447. }
  448. const char * RegExpr::substitute(StringBuffer &out,const char *mask,...)
  449. {
  450. char * str = (char *)malloc(MAXSUBST);
  451. va_list ap;
  452. va_start(ap,mask);
  453. vsprintf(str,mask,ap);
  454. va_end(ap);
  455. dosubstitute(this,out,str);
  456. free(str);
  457. return (char*)out.str();
  458. }
  459. void RegExpr::kill()
  460. // releases extra storage used by RegularExpressionClass
  461. {
  462. delete re;
  463. re = NULL;
  464. }
  465. char *RECOMP::reg (bool paren, REGFLAGS &flags)
  466. {
  467. char *ret;
  468. char *br;
  469. char *ender;
  470. int parno = 0;
  471. flags = HASWIDTH; /* Tentatively. */
  472. /* Make an OPEN node, if parenthesized. */
  473. if (paren) {
  474. if (npar >= NSUBEXP)
  475. FAIL("too many {}");
  476. parno = npar;
  477. npar++;
  478. ret = rnode((char)(OPEN+parno));
  479. } else
  480. ret = NULL;
  481. /* Pick up the branches, linking them together. */
  482. REGFLAGS bflags;
  483. br = branch(bflags);
  484. if (br == NULL)
  485. return(NULL);
  486. if (ret != NULL)
  487. tail(ret, br); /* OPEN -> first. */
  488. else
  489. ret = br;
  490. if (!(bflags&HASWIDTH))
  491. flags &= ~HASWIDTH;
  492. flags |= bflags&SPSTART;
  493. while (*parse == '|') {
  494. parse++;
  495. br = branch(bflags);
  496. if (br == NULL)
  497. return(NULL);
  498. tail(ret, br); /* BRANCH -> BRANCH. */
  499. if (!(bflags&HASWIDTH))
  500. flags &= ~HASWIDTH;
  501. flags |= bflags&SPSTART;
  502. }
  503. /* Make a closing node, and hook it on the end. */
  504. ender = rnode((char)((paren) ? CLOSE+parno : END));
  505. tail(ret, ender);
  506. /* Hook the tails of the branches to the closing node. */
  507. for (br = ret; br != NULL; br = next(br))
  508. optail(br, ender);
  509. /* Check for proper termination. */
  510. if (paren && *parse++ != SUBPARENR) {
  511. FAIL("unmatched {}");
  512. } else if (!paren && *parse != '\0') {
  513. if (*parse == SUBPARENR) {
  514. FAIL("unmatched {}");
  515. } else
  516. FAIL("junk on end"); /* "Can't happen". */
  517. /* NOTREACHED */
  518. }
  519. return(ret);
  520. }
  521. char *RECOMP::branch (REGFLAGS &flags)
  522. /*
  523. - branch - one alternative of an | operator
  524. *
  525. * Implements the concatenation operator.
  526. */
  527. {
  528. char *ret;
  529. char *chain;
  530. char *latest;
  531. flags = WORST; /* Tentatively. */
  532. ret = rnode(BRANCH);
  533. chain = NULL;
  534. REGFLAGS pflags;
  535. while (*parse != '\0' && *parse != '|' && *parse != SUBPARENR) {
  536. latest = piece(pflags);
  537. if (latest == NULL)
  538. return(NULL);
  539. flags |= pflags&HASWIDTH;
  540. if (chain == NULL) /* First piece. */
  541. flags |= pflags&SPSTART;
  542. else
  543. tail(chain, latest);
  544. chain = latest;
  545. }
  546. if (chain == NULL) /* Loop ran zero times. */
  547. (void) rnode(NOTHING);
  548. return(ret);
  549. }
  550. char *RECOMP::piece (REGFLAGS &flags)
  551. /*
  552. - piece - something followed by possible [*+?]
  553. *
  554. * Note that the branching code sequences used for ? and the general cases
  555. * of * and + are somewhat optimized: they use the same NOTHING node as
  556. * both the endmarker for their branch list and the body of the last branch.
  557. * It might seem that this node could be dispensed with entirely, but the
  558. * endmarker role is not redundant.
  559. */
  560. {
  561. char *ret;
  562. char op;
  563. char *next;
  564. REGFLAGS aflags;
  565. ret = atom(aflags);
  566. if (ret == NULL)
  567. return(NULL);
  568. op = *parse;
  569. if (!ISMULT(op)) {
  570. flags = aflags;
  571. return(ret);
  572. }
  573. if (!(aflags&HASWIDTH) && op != '?')
  574. FAIL("*+ operand could be empty");
  575. flags = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
  576. if (op == '*' && (aflags&SIMPLE))
  577. insert(STAR, ret);
  578. else if (op == '*') {
  579. /* Emit x* as (x&|), where & means "self". */
  580. insert(BRANCH, ret); /* Either x */
  581. optail(ret, rnode(BACK)); /* and loop */
  582. optail(ret, ret); /* back */
  583. tail(ret, rnode(BRANCH)); /* or */
  584. tail(ret, rnode(NOTHING)); /* null. */
  585. } else if (op == '+' && (aflags&SIMPLE))
  586. insert(PLUS, ret);
  587. else if (op == '+') {
  588. /* Emit x+ as x(&|), where & means "self". */
  589. next = rnode(BRANCH); /* Either */
  590. tail(ret, next);
  591. tail(rnode(BACK), ret); /* loop back */
  592. tail(next, rnode(BRANCH)); /* or */
  593. tail(ret, rnode(NOTHING)); /* null. */
  594. } else if (op == '?') {
  595. /* Emit x? as (x|) */
  596. insert(BRANCH, ret); /* Either x */
  597. tail(ret, rnode(BRANCH)); /* or */
  598. next = rnode(NOTHING); /* null. */
  599. tail(ret, next);
  600. optail(ret, next);
  601. }
  602. parse++;
  603. if (ISMULT(*parse))
  604. FAIL("nested *?+");
  605. return(ret);
  606. }
  607. char *RECOMP::atom (REGFLAGS &flags)
  608. /*
  609. - atom - the lowest level
  610. *
  611. * Optimization: gobbles an entire sequence of ordinary characters so that
  612. * it can turn them into a single node, which is smaller to store and
  613. * faster to run. Backslashed characters are exceptions, each becoming a
  614. * separate node; the code is simpler that way and it's not worth fixing.
  615. */
  616. {
  617. char *ret;
  618. flags = WORST; /* Tentatively. */
  619. switch (*parse++) {
  620. case '^':
  621. ret = rnode(BOL);
  622. break;
  623. case '$':
  624. ret = rnode(EOL);
  625. break;
  626. case '.':
  627. ret = rnode(ANY);
  628. flags |= HASWIDTH|SIMPLE;
  629. break;
  630. case '[': {
  631. int clss;
  632. int clssend;
  633. if ((*parse == '^') || (*parse == '~')) { /* Complement of range. */
  634. ret = rnode(ANYBUT);
  635. parse++;
  636. } else
  637. ret = rnode(ANYOF);
  638. if (*parse == ']' || *parse == '-')
  639. REGC(*parse++);
  640. while (*parse != '\0' && *parse != ']') {
  641. if (*parse == '-') {
  642. parse++;
  643. if (*parse == ']' || *parse == '\0')
  644. REGC('-');
  645. else {
  646. clss = UCHARAT(parse-2)+1;
  647. clssend = UCHARAT(parse);
  648. if (clss > clssend+1)
  649. FAIL("invalid [] range");
  650. for (; clss <= clssend; clss++)
  651. REGC((char) clss);
  652. parse++;
  653. }
  654. } else
  655. REGC(*parse++);
  656. }
  657. REGC('\0');
  658. if (*parse != ']')
  659. FAIL("unmatched []");
  660. parse++;
  661. flags |= HASWIDTH|SIMPLE;
  662. }
  663. break;
  664. case SUBPARENL: {
  665. REGFLAGS sflags;
  666. ret = reg(1, sflags);
  667. if (ret == NULL)
  668. return(NULL);
  669. flags |= sflags&(HASWIDTH|SPSTART);
  670. }
  671. break;
  672. case '\0':
  673. case '|':
  674. case SUBPARENR:
  675. FAIL("internal urp"); /* Supposed to be caught earlier. */
  676. break;
  677. case '?':
  678. case '+':
  679. case '*':
  680. FAIL("?+* follows nothing");
  681. break;
  682. case '\\':
  683. if (*parse == '\0')
  684. FAIL("trailing \\");
  685. ret = rnode(EXACTLY);
  686. REGC(*parse++);
  687. REGC('\0');
  688. flags |= HASWIDTH|SIMPLE;
  689. break;
  690. default: {
  691. memsize_t len;
  692. char ender;
  693. parse--;
  694. len = strcspn(parse, META);
  695. if (len <= 0)
  696. FAIL("internal disaster");
  697. ender = *(parse+len);
  698. if (len > 1 && ISMULT(ender))
  699. len--; /* Back off clear of ?+* operand. */
  700. flags |= HASWIDTH;
  701. if (len == 1)
  702. flags |= SIMPLE;
  703. ret = rnode(EXACTLY);
  704. while (len > 0) {
  705. REGC(*parse++);
  706. len--;
  707. }
  708. REGC('\0');
  709. }
  710. break;
  711. }
  712. return(ret);
  713. }
  714. char *RECOMP::rnode (char op)
  715. /*
  716. - rnode - emit a node
  717. */
  718. {
  719. char *ret = code;
  720. REGC(op);
  721. REGC(0);
  722. REGC(0);
  723. return(ret);
  724. }
  725. void RECOMP::insert (char op, char *opnd)
  726. /*
  727. - insert - insert an operator in front of already-emitted operand
  728. *
  729. * Means relocating the operand.
  730. */
  731. {
  732. char *src=code;
  733. code+=3;
  734. char *dst=code;
  735. while (src > opnd)
  736. *--dst = *--src;
  737. char *place = (char *)opnd; /* Op node, where operand used to be. */
  738. *place++ = op;
  739. *place++ = '\0';
  740. *place++ = '\0';
  741. }
  742. void RECOMP::tail (char *p, char *val)
  743. /*
  744. - tail - set the next-pointer at the end of a node chain
  745. */
  746. {
  747. char *scan;
  748. char *temp;
  749. int offset;
  750. /* Find last node. */
  751. scan = p;
  752. for (;;) {
  753. temp = next(scan);
  754. if (temp == NULL)
  755. break;
  756. scan = temp;
  757. }
  758. if (OP(scan) == BACK)
  759. offset = scan - val;
  760. else
  761. offset = val - scan;
  762. *(scan+1) = (char) ((offset>>8)&0377);
  763. *(scan+2) = (char) (offset&0377);
  764. }
  765. void RECOMP::optail (char *p, char *val)
  766. /*
  767. - optail - tail on operand of first argument; nop if operandless
  768. */
  769. {
  770. /* "Operandless" and "op != BRANCH" are synonymous in practice. */
  771. if (p == NULL || OP(p) != BRANCH)
  772. return;
  773. tail(OPERAND(p), val);
  774. }
  775. char *RECOMP::next (char *p)
  776. /*
  777. - next - dig the "next" pointer out of a node
  778. */
  779. {
  780. int offset = NEXT(p);
  781. if (offset == 0)
  782. return(NULL);
  783. if (OP(p) == BACK)
  784. return(p-offset);
  785. else
  786. return(p+offset);
  787. }
  788. /*
  789. - rtry - try match at specific point
  790. */
  791. char * RECOMP::rtry (const char *string)
  792. {
  793. input = string;
  794. startp = s_start;
  795. endp = s_end;
  796. _clear(s_start);
  797. _clear(s_end);
  798. if (match(program)) {
  799. s_start[0] = string;
  800. s_end[0] = input;
  801. return (char *)string;
  802. }
  803. return NULL;
  804. }
  805. bool RECOMP::match (char *prog)
  806. {
  807. char *scan; /* Current node. */
  808. char *n; /* Next node. */
  809. scan = prog;
  810. while (scan != NULL) {
  811. n = next(scan);
  812. switch (OP(scan)) {
  813. case BOL:
  814. if (input != bol)
  815. return false;
  816. break;
  817. case EOL:
  818. if (clarion) {
  819. if (!claeol(input))
  820. return false;
  821. }
  822. else if (!eob(input))
  823. return false;
  824. break;
  825. case ANY:
  826. if (eob(input))
  827. return false;
  828. input++;
  829. break;
  830. case EXACTLY: {
  831. memsize_t len;
  832. char *opnd;
  833. opnd = OPERAND(scan);
  834. /* Inline the first character, for speed. */
  835. if (!equch(*opnd,*input))
  836. return false;
  837. len = strlen(opnd);
  838. if (len > 1 && !strnequ(input, opnd, (size32_t)len))
  839. return false;
  840. input += len;
  841. }
  842. break;
  843. case ANYOF:
  844. if (eob(input) || strchr(OPERAND(scan), *input) == NULL)
  845. return false;
  846. input++;
  847. break;
  848. case ANYBUT:
  849. if (eob(input) || strchr(OPERAND(scan), *input) != NULL)
  850. return false;
  851. input++;
  852. break;
  853. case NOTHING:
  854. break;
  855. case BACK:
  856. break;
  857. case OPEN+1:
  858. case OPEN+2:
  859. case OPEN+3:
  860. case OPEN+4:
  861. case OPEN+5:
  862. case OPEN+6:
  863. case OPEN+7:
  864. case OPEN+8:
  865. case OPEN+9: {
  866. int no;
  867. const char *save;
  868. no = OP(scan) - OPEN;
  869. save = input;
  870. if (match(n)) {
  871. /*
  872. * Don't set start if some later
  873. * invocation of the same parentheses
  874. * already has.
  875. */
  876. if (startp[no] == NULL)
  877. startp[no] = save;
  878. return true;
  879. } else
  880. return false;
  881. }
  882. break;
  883. case CLOSE+1:
  884. case CLOSE+2:
  885. case CLOSE+3:
  886. case CLOSE+4:
  887. case CLOSE+5:
  888. case CLOSE+6:
  889. case CLOSE+7:
  890. case CLOSE+8:
  891. case CLOSE+9: {
  892. int no;
  893. const char *save;
  894. no = OP(scan) - CLOSE;
  895. save = input;
  896. if (match(n)) {
  897. /*
  898. * Don't set end if some later
  899. * invocation of the same parentheses
  900. * already has.
  901. */
  902. if (endp[no] == NULL)
  903. endp[no] = save;
  904. return true;
  905. } else
  906. return false;
  907. }
  908. break;
  909. case BRANCH: {
  910. const char *save;
  911. if (OP(n) != BRANCH) /* No choice. */
  912. n = OPERAND(scan); /* Avoid recursion. */
  913. else {
  914. do {
  915. save = input;
  916. if (match(OPERAND(scan)))
  917. return true;
  918. input = save;
  919. scan = next(scan);
  920. } while (scan != NULL && OP(scan) == BRANCH);
  921. return false;
  922. /* NOTREACHED */
  923. }
  924. }
  925. break;
  926. case STAR:
  927. case PLUS: {
  928. char nextch;
  929. int no;
  930. const char *save;
  931. int min;
  932. /*
  933. * Lookahead to avoid useless match attempts
  934. * when we know what character comes n.
  935. */
  936. nextch = '\0';
  937. if (OP(n) == EXACTLY)
  938. nextch = *OPERAND(n);
  939. min = (OP(scan) == STAR) ? 0 : 1;
  940. save = input;
  941. no = repeat(OPERAND(scan));
  942. while (no >= min) {
  943. /* If it could work, try it. */
  944. if (nextch == '\0' || equch(*input,nextch))
  945. if (match(n))
  946. return true;
  947. /* Couldn't or didn't -- back up. */
  948. no--;
  949. input = save + no;
  950. }
  951. return false;
  952. }
  953. break;
  954. case END:
  955. return true; /* Success! */
  956. break;
  957. default:
  958. assertex(!"memory corruption");
  959. return false;
  960. break;
  961. }
  962. scan = n;
  963. }
  964. /*
  965. * We get here only if there's trouble -- normally "case END" is
  966. * the terminating point.
  967. */
  968. assertex(!"corrupted pointers");
  969. return false;
  970. }
  971. int RECOMP::repeat (char *p)
  972. /*
  973. - regrepeat - repeatedly match something simple, report how many
  974. */
  975. {
  976. int count = 0;
  977. const char *scan = input;
  978. char *opnd = OPERAND(p);
  979. switch (OP(p)) {
  980. case ANY:
  981. count = rstrlen(scan);
  982. scan += count;
  983. break;
  984. case EXACTLY:
  985. while (equch(*opnd,*scan)) {
  986. count++;
  987. scan++;
  988. }
  989. break;
  990. case ANYOF:
  991. while (!eob(scan) && strchr(opnd, *scan) != NULL) { // NB not rstrchr!
  992. count++;
  993. scan++;
  994. }
  995. break;
  996. case ANYBUT:
  997. while (!eob(scan) && strchr(opnd, *scan) == NULL) { // NB nnot rstrchr
  998. count++;
  999. scan++;
  1000. }
  1001. break;
  1002. default: /* Oh dear. Called inappropriately. */
  1003. assertex(!"internal foulup");
  1004. count = 0; /* Best compromise. */
  1005. break;
  1006. }
  1007. input = scan;
  1008. return(count);
  1009. }
  1010. void RECOMP::adjustptrs(const char *e,size32_t lo,size32_t lr)
  1011. {
  1012. if (lo==lr) return;
  1013. size32_t o=e-findstart;
  1014. for (int i=0;i<NSUBEXP;i++) {
  1015. const char *s= s_start[i];
  1016. if (s) {
  1017. size32_t so = s-findstart;
  1018. if (so>=o) {
  1019. s_start[i]=s+lr-lo;
  1020. }
  1021. so = s_end[i]-findstart;
  1022. if (so>=o)
  1023. s_end[i]+=lr-lo;
  1024. }
  1025. }
  1026. findend+=(lr-lo);
  1027. }
  1028. const char *RECOMP::rstrchr(const char *s,int c)
  1029. {
  1030. if (nocase) {
  1031. c = toupper(c);
  1032. while (s!=findend) {
  1033. if (toupper(*s)==c) return s;
  1034. s++;
  1035. }
  1036. }
  1037. else {
  1038. while (s!=findend) {
  1039. if (*s==c) return s;
  1040. s++;
  1041. }
  1042. }
  1043. return NULL;
  1044. }
  1045. bool RECOMP::strnequ(const char *s1,const char *s2,size32_t n)
  1046. { // s1 is in regsearch
  1047. if (findend-s1 < (signed)n) return false;
  1048. if (nocase) {
  1049. while (n--) {
  1050. if (toupper(*s1)!=toupper(*s2)) return false;
  1051. s1++;
  1052. s2++;
  1053. }
  1054. }
  1055. else {
  1056. while (n--) {
  1057. if (*s1!=*s2) return false;
  1058. s1++;
  1059. s2++;
  1060. }
  1061. }
  1062. return true;
  1063. }
  1064. bool RECOMP::claeol(const char *s)
  1065. {
  1066. do {
  1067. if (eob(s)) return true;
  1068. }
  1069. while ((*(s++))==' ');
  1070. return false;
  1071. }
  1072. unsigned char xlat[26]={0,'1','2','3',0,'1','2',0,0,'2','2','4','5','5',0,'1','2','6','2','3',0,'1',0,'2',0,'2'};
  1073. static char *SoundexCode(const char *s,int l,char *res)
  1074. {
  1075. char *r=res;
  1076. r[1] = '0';
  1077. r[2] = '0';
  1078. r[3] = '0';
  1079. r[4] = 0;
  1080. for (;;) {
  1081. if (!l||!*s) {
  1082. *r = '!';
  1083. return res;
  1084. }
  1085. if (isalpha(*s)) break;
  1086. s++;
  1087. l--;
  1088. }
  1089. char c=toupper(*s);
  1090. *r = c;
  1091. r++; s++;
  1092. char dl = ((c>='A')&&(c<='Z'))?xlat[c-'A']:0;
  1093. while (l&&*s) {
  1094. c = toupper(*s);
  1095. s++;
  1096. l--;
  1097. if ((c>='A')&&(c<='Z')) {
  1098. char d = xlat[c-'A'];
  1099. if (d) {
  1100. if(d!=dl) {
  1101. *r = d;
  1102. r++;
  1103. if (!*r) break;
  1104. }
  1105. }
  1106. else if (c != 'H' && c != 'W')
  1107. dl = d;
  1108. }
  1109. }
  1110. return res;
  1111. }
  1112. //---------------------------------------------------------------------------------------------------------------------
  1113. inline bool matches(char cur, char next, bool nocase)
  1114. {
  1115. return (nocase ? (toupper(cur)==toupper(next)) : cur == next);
  1116. }
  1117. /* Search for a pattern pat anywhere within the search string src */
  1118. static bool WildSubStringMatch(const char *src, size_t srclen, const char *pat, size_t patlen, bool nocase)
  1119. {
  1120. //On entry the pattern to match contains at least one leading non '*' character
  1121. char pat0 = pat[0];
  1122. if (nocase)
  1123. pat0 = toupper(pat[0]);
  1124. //Could special case '?' at the start of the string, but fairly unlikely.
  1125. for (size_t srcdelta=0; srcdelta < srclen; srcdelta++)
  1126. {
  1127. size_t patidx=0;
  1128. size_t srcidx = srcdelta;
  1129. if (likely(pat0 != '?'))
  1130. {
  1131. //Quick scan to find a match for the first character
  1132. if (!nocase)
  1133. {
  1134. for (;;)
  1135. {
  1136. if (unlikely(src[srcdelta] == pat0))
  1137. break;
  1138. srcdelta++;
  1139. if (unlikely(srcdelta == srclen))
  1140. return false;
  1141. }
  1142. }
  1143. else
  1144. {
  1145. for (;;)
  1146. {
  1147. if (unlikely(toupper(src[srcdelta]) == pat0))
  1148. break;
  1149. srcdelta++;
  1150. if (unlikely(srcdelta == srclen))
  1151. return false;
  1152. }
  1153. }
  1154. patidx=1;
  1155. srcidx = srcdelta+1;
  1156. }
  1157. for (;;)
  1158. {
  1159. if (patidx == patlen)
  1160. return true;
  1161. char next = pat[patidx];
  1162. if (next == '*')
  1163. {
  1164. do
  1165. {
  1166. patidx++;
  1167. }
  1168. while ((patidx < patlen) && (pat[patidx] == '*'));
  1169. dbgassertex((patidx != patlen)); // pattern should never finish with a '*'
  1170. if (WildSubStringMatch(src+srcidx, srclen-srcidx, pat+patidx, patlen-patidx, nocase))
  1171. return true;
  1172. break; // retry at next position
  1173. }
  1174. if (srcidx == srclen)
  1175. break; // retry at next position
  1176. if (next != '?')
  1177. {
  1178. char cur = src[srcidx];
  1179. if (!matches(cur, next, nocase))
  1180. break; // retry at next position
  1181. }
  1182. patidx++;
  1183. srcidx++;
  1184. }
  1185. }
  1186. return false;
  1187. }
  1188. static bool WildMatchN ( const char *src, size_t srclen, size_t srcidx,
  1189. const char *pat, size_t patlen, size_t patidx, bool nocase)
  1190. {
  1191. //First check for matching prefix
  1192. char next;
  1193. while (patidx < patlen)
  1194. {
  1195. next = pat[patidx];
  1196. if (next == '*')
  1197. break;
  1198. if (srcidx >= srclen)
  1199. return false;
  1200. if (next != '?')
  1201. {
  1202. if (!matches(src[srcidx], next, nocase))
  1203. return false;
  1204. }
  1205. srcidx++;
  1206. patidx++;
  1207. }
  1208. //Now check for matching suffix
  1209. while (patidx < patlen)
  1210. {
  1211. next = pat[patlen-1];
  1212. if (next == '*')
  1213. break;
  1214. if (srcidx >= srclen)
  1215. return false;
  1216. if (next != '?')
  1217. {
  1218. if (!matches(src[srclen-1], next, nocase))
  1219. return false;
  1220. }
  1221. srclen--;
  1222. patlen--;
  1223. }
  1224. //String contains no wildcards...
  1225. if (patidx == patlen)
  1226. return (srcidx == srclen);
  1227. dbgassertex(pat[patidx] == '*');
  1228. dbgassertex(pat[patlen-1] == '*');
  1229. //Skip multiple wildcards on the prefix and suffix.
  1230. while (patidx < patlen && pat[patidx] == '*')
  1231. patidx++;
  1232. while (patidx < patlen && pat[patlen-1] == '*')
  1233. patlen--;
  1234. //abc*def
  1235. if (patidx == patlen)
  1236. return true;
  1237. //Must match at least one character, if no characters left in the search string, then it fails to match
  1238. if (srcidx == srclen)
  1239. return false;
  1240. //Search for the remaining pattern at an arbitrary position with the search string
  1241. return WildSubStringMatch(src+srcidx, srclen-srcidx, pat+patidx, patlen-patidx, nocase);
  1242. }
  1243. bool jlib_decl WildMatch(const char *src, size_t srclen, const char *pat, size_t patlen, bool nocase)
  1244. {
  1245. return WildMatchN(src,srclen,0,pat,patlen,0,nocase);
  1246. }
  1247. bool jlib_decl WildMatch(const char *src, const char *pat, bool nocase)
  1248. {
  1249. //This could match constant prefixes before calling strlen(), but unlikely to be very significant
  1250. return WildMatchN(src, strlen(src), 0, pat, strlen(pat), 0, nocase);
  1251. }
  1252. bool jlib_decl containsWildcard(const char * pattern)
  1253. {
  1254. for (;;)
  1255. {
  1256. char c = *pattern++;
  1257. switch (c)
  1258. {
  1259. case 0:
  1260. return false;
  1261. case '?':
  1262. case '*':
  1263. return true;
  1264. }
  1265. }
  1266. }
  1267. static bool WildMatchNreplace ( const char *src, int srclen, int srcidx,
  1268. const char *pat, int patlen, int patidx,
  1269. int nocase,
  1270. UnsignedArray &wild, UnsignedArray &wildlen
  1271. )
  1272. {
  1273. unsigned wl = wild.ordinality();
  1274. char next_char;
  1275. for (;;) {
  1276. if (patidx == patlen) {
  1277. if (srcidx == srclen)
  1278. return true;
  1279. goto Fail;
  1280. }
  1281. next_char = pat[patidx++];
  1282. if (next_char == '?') {
  1283. if (srcidx == srclen)
  1284. goto Fail;
  1285. wild.append(srcidx++);
  1286. wildlen.append(1);
  1287. }
  1288. else if (next_char != '*') {
  1289. if (nocase) {
  1290. if ((srcidx == srclen) ||
  1291. (toupper(src[srcidx])!=toupper(next_char)))
  1292. goto Fail;
  1293. }
  1294. else {
  1295. if ((srcidx == srclen) || (src[srcidx]!=next_char))
  1296. goto Fail;
  1297. }
  1298. srcidx++;
  1299. }
  1300. else {
  1301. for (;;) {
  1302. if (patidx == patlen) {
  1303. wild.append(srcidx);
  1304. wildlen.append(srclen-srcidx);
  1305. return true;
  1306. }
  1307. wild.append(srcidx);
  1308. wildlen.append(0); // placemarker
  1309. if (pat[patidx] != '*')
  1310. break;
  1311. patidx++; // someone being silly!
  1312. }
  1313. while (srcidx < srclen) {
  1314. if (WildMatchNreplace(src,srclen,srcidx,
  1315. pat, patlen, patidx,nocase, wild,wildlen))
  1316. return true;
  1317. wildlen.append(wildlen.popGet()+1); // prob can do a bit better than this!
  1318. srcidx++;
  1319. }
  1320. break; // fail
  1321. }
  1322. }
  1323. Fail:
  1324. unsigned pn = wild.ordinality()-wl;
  1325. if (pn) {
  1326. wild.popn(pn);
  1327. wildlen.popn(pn);
  1328. }
  1329. return false;
  1330. }
  1331. bool jlib_decl WildMatchReplace(const char *src, const char *pat, const char *repl, bool nocase, StringBuffer &out)
  1332. {
  1333. UnsignedArray wild;
  1334. UnsignedArray wildlen;
  1335. if (!WildMatchNreplace(src,(size32_t)strlen(src),0,pat,(size32_t)strlen(pat),0,nocase,wild,wildlen))
  1336. return false;
  1337. for (;;) {
  1338. char c = *(repl++);
  1339. if (!c)
  1340. break;
  1341. if (c!='$')
  1342. out.append(c);
  1343. else {
  1344. char c2 = *(repl++);
  1345. if (!c2) {
  1346. out.append(c);
  1347. break;
  1348. }
  1349. if (c2=='$')
  1350. out.append(c);
  1351. else {
  1352. unsigned idx = c2-'0';
  1353. if (idx==0)
  1354. out.append(src);
  1355. else if ((idx<=9)&&(idx<=wild.ordinality()))
  1356. out.append(wildlen.item(idx-1),src+wild.item(idx-1));
  1357. else
  1358. out.append(c).append(c2);
  1359. }
  1360. }
  1361. }
  1362. return true;
  1363. }
  1364. bool jlib_decl SoundexMatch(const char *src, const char *pat)
  1365. {
  1366. char s1[5];
  1367. char s2[5];
  1368. return memcmp(SoundexCode(src,(size32_t)strlen(src),s1),SoundexCode(pat,(size32_t)strlen(pat),s2),4)==0;
  1369. }
  1370. //---------------------------------------------------------------------------
  1371. StringMatcher::StringMatcher()
  1372. {
  1373. for (unsigned idx=0; idx < 256; idx++)
  1374. {
  1375. firstLevel[idx].value = 0;
  1376. firstLevel[idx].table = NULL;
  1377. }
  1378. }
  1379. void StringMatcher::freeLevel(entry * elems)
  1380. {
  1381. for (unsigned idx = 0; idx < 256; idx++)
  1382. {
  1383. if (elems[idx].table)
  1384. {
  1385. freeLevel(elems[idx].table);
  1386. free(elems[idx].table);
  1387. elems[idx].table = NULL;
  1388. }
  1389. elems[idx].value = 0;
  1390. }
  1391. }
  1392. StringMatcher::~StringMatcher()
  1393. {
  1394. freeLevel(firstLevel);
  1395. }
  1396. void StringMatcher::addEntry(const char * text, unsigned action)
  1397. {
  1398. if (!queryAddEntry((size32_t)strlen(text), text, action))
  1399. throw MakeStringException(-1, "Duplicate entry \"%s\" added to string matcher", text);
  1400. }
  1401. void StringMatcher::addEntry(unsigned len, const char * text, unsigned action)
  1402. {
  1403. if (!queryAddEntry(len, text, action))
  1404. throw MakeStringException(-1, "Duplicate entry \"%*s\" added to string matcher", len, text);
  1405. }
  1406. bool StringMatcher::queryAddEntry(unsigned len, const char * text, unsigned action)
  1407. {
  1408. if (len == 0)
  1409. return false;
  1410. entry * curTable = firstLevel;
  1411. for (;;)
  1412. {
  1413. byte c = *text++;
  1414. entry & curElement = curTable[c];
  1415. if (--len == 0)
  1416. {
  1417. if (curElement.value != 0)
  1418. return false;
  1419. curElement.value = action;
  1420. return true;
  1421. }
  1422. if (!curElement.table)
  1423. curElement.table = (entry *)calloc(sizeof(entry), 256);
  1424. curTable = curElement.table;
  1425. }
  1426. }
  1427. unsigned StringMatcher::getMatch(unsigned maxLen, const char * text, unsigned & matchLen)
  1428. {
  1429. unsigned bestValue = 0;
  1430. unsigned bestLen = 0;
  1431. if (maxLen)
  1432. {
  1433. const byte * start = (const byte *)text;
  1434. const byte * cursor = (const byte *)text;
  1435. const byte * end = (const byte *)cursor+maxLen;
  1436. byte next = *cursor++;
  1437. entry * cur = &firstLevel[next];
  1438. while ((cursor != end) && cur->table)
  1439. {
  1440. if (cur->value)
  1441. {
  1442. bestValue = cur->value;
  1443. bestLen = cursor-start;
  1444. }
  1445. next = *cursor++;
  1446. cur = &cur->table[next];
  1447. }
  1448. if (cur->value)
  1449. {
  1450. matchLen = cursor-start;
  1451. return cur->value;
  1452. }
  1453. }
  1454. matchLen = bestLen;
  1455. return bestValue;
  1456. }
  1457. void addActionList(StringMatcher & matcher, const char * text, unsigned action, unsigned * maxElementLength)
  1458. {
  1459. if (!text)
  1460. return;
  1461. unsigned idx=0;
  1462. while (*text)
  1463. {
  1464. StringBuffer str;
  1465. while (*text)
  1466. {
  1467. char next = *text++;
  1468. if (next == ',')
  1469. break;
  1470. if (next == '\\' && *text)
  1471. {
  1472. next = *text++;
  1473. switch (next)
  1474. {
  1475. case 'r': next = '\r'; break;
  1476. case 'n': next = '\n'; break;
  1477. case 't': next = '\t'; break;
  1478. case 'x':
  1479. //hex constant - at least we can define spaces then...
  1480. if (text[0] && text[1])
  1481. {
  1482. next = (hex2num(*text) << 4) | hex2num(text[1]);
  1483. text+=2;
  1484. }
  1485. break;
  1486. default:
  1487. break; //otherwise \ just quotes the character e.g. \,
  1488. }
  1489. }
  1490. str.append(next);
  1491. }
  1492. if (str.length())
  1493. {
  1494. matcher.addEntry(str.str(), action+(idx++<<8));
  1495. if (maxElementLength && (str.length() > *maxElementLength))
  1496. *maxElementLength = str.length();
  1497. }
  1498. }
  1499. }
  1500. //---------------------------------------------------------------------------
  1501. #if 0
  1502. #define MATCHsimple 0
  1503. #define MATCHwild 1
  1504. #define MATCHregular 2
  1505. #define MATCHsoundex 3
  1506. #define MATCHmask 0x0f
  1507. #define MATCHnocase 0x10
  1508. static class _MatchRE : public RegExpr
  1509. {
  1510. public:
  1511. bool init(const char *rexpr,int nocase) {
  1512. if (!cachere||(cachenocase!=nocase)||!strequ(cachere,rexpr)) {
  1513. free(cachere);
  1514. cachere = strdup(rexpr);
  1515. cachenocase = (unsigned char)nocase;
  1516. return RegExpr::init(cachere,cachenocase);
  1517. }
  1518. return true;
  1519. }
  1520. _MatchRE() { cachere=NULL; }
  1521. ~_MatchRE() { free(cachere); }
  1522. private:
  1523. char *cachere;
  1524. unsigned char cachenocase;
  1525. } MatchRE;
  1526. extern "Pascal"
  1527. long Cla$MATCH(/* STRING s, STRING p, */ unsigned char flags)
  1528. {
  1529. StringBuffer s;
  1530. s.PopString();
  1531. s.clip();
  1532. StringBuffer p;
  1533. p.PopString();
  1534. p.clip();
  1535. int nocase = flags&MATCHnocase?1:0;
  1536. switch(flags&MATCHmask) {
  1537. case MATCHsimple: {
  1538. if (s.strlen()!=p.strlen()) return false;
  1539. if (nocase)
  1540. return striequ((char *)s,(char *)p);
  1541. return strequ((char *)s,(char *)p);
  1542. }
  1543. break;
  1544. case MATCHwild:
  1545. return WildMatch(s,s.strlen(),p,p.strlen(),nocase);
  1546. case MATCHregular: {
  1547. if (!MatchRE.init(p,nocase))
  1548. return false;
  1549. const char *fs=MatchRE.find(s,0,RE_ALL,0);
  1550. return (fs!=NULL);
  1551. }
  1552. break;
  1553. case MATCHsoundex: {
  1554. char s1[5];
  1555. char s2[5];
  1556. return memcmp(SoundexCode(s,s.strlen(),s1),SoundexCode(p,p.strlen(),s2),4)==0;
  1557. }
  1558. break;
  1559. }
  1560. return false;
  1561. }
  1562. #endif
  1563. //--------------------------------
  1564. // Testing
  1565. #ifdef _DEBUG
  1566. void tr (RegExpr &RE,const char *s,const char *p,const char *r=NULL,size32_t n=0,bool nocase=false)
  1567. {
  1568. PrintLog("Test '%s','%s','%s',%d,%s\r\n",s,p,r?r:"-",n,nocase?"NOCASE":"");
  1569. char l[256];
  1570. strcpy(l,s);
  1571. StringBuffer ds;
  1572. RE.init(p,nocase);
  1573. const char * f = RE.find(l,0,RE_ALL);
  1574. if (!f)
  1575. PrintLog("Not Found\r\n");
  1576. else {
  1577. PrintLog("Found '%s'\r\n",RE.findstr(ds));
  1578. for (int i = 1;i<=9;i++) {
  1579. size32_t s = RE.findlen(i);
  1580. if (s) {
  1581. ds.clear();
  1582. PrintLog("Found Sub %d,%d,'%s'\r\n",i,s,RE.findstr(ds,i));
  1583. }
  1584. }
  1585. }
  1586. if (r) {
  1587. RE.replace(r,sizeof(l));
  1588. PrintLog("Replace to '%s'\r\n",l);
  1589. }
  1590. }
  1591. #define QBF "the quick brown fox jumped over the lazy dogs"
  1592. #define RS "Nigel Was Here"
  1593. void RE$Test()
  1594. {
  1595. RegExpr RE;
  1596. tr(RE,QBF,"the",RS);
  1597. tr(RE,QBF,"quick",RS);
  1598. tr(RE,QBF,"dogs",RS);
  1599. tr(RE,QBF,"lazz",RS);
  1600. tr(RE,QBF,"laz.",RS);
  1601. tr(RE,QBF,"^laz.",RS);
  1602. tr(RE,QBF,"laz.$",RS);
  1603. tr(RE,QBF,"^the.",RS);
  1604. tr(RE,QBF,"dogs^",RS);
  1605. tr(RE,QBF,"^t.*dogs^",RS);
  1606. tr(RE,QBF,"dog.",RS);
  1607. tr(RE,QBF,".the",RS);
  1608. tr(RE,QBF,".he",RS);
  1609. tr(RE,QBF,".*",RS);
  1610. tr(RE,QBF,"q.*",RS);
  1611. tr(RE,QBF,"q.*z",RS);
  1612. tr(RE,QBF,"qu.*k",RS);
  1613. tr(RE,QBF,"d{[oa]g}s",RS);
  1614. tr(RE,QBF,"laz[a-z]",RS);
  1615. tr(RE,QBF,"{laz[a-z]} {dogs}",RS);
  1616. tr(RE,QBF,"the",RS,50);
  1617. tr(RE,QBF,"quick",RS,50);
  1618. tr(RE,QBF,"dogs",RS,50);
  1619. tr(RE,QBF,"lazz",RS,50);
  1620. tr(RE,QBF,"laz.",RS,50);
  1621. tr(RE,QBF,"^laz.",RS,50);
  1622. tr(RE,QBF,"laz.$",RS,50);
  1623. tr(RE,QBF,"^the.",RS,50);
  1624. tr(RE,QBF,"dogs^",RS,50);
  1625. tr(RE,QBF,"^t.*dogs^",RS,50);
  1626. tr(RE,QBF,"dog.",RS,50);
  1627. tr(RE,QBF,".the",RS,50);
  1628. tr(RE,QBF,".he",RS,50);
  1629. tr(RE,QBF,".*",RS,50);
  1630. tr(RE,QBF,"q.*",RS,50);
  1631. tr(RE,QBF,"q.*z",RS,50);
  1632. tr(RE,QBF,"qu.*k",RS,50);
  1633. tr(RE,QBF,"d{[oa]g}s",RS,50);
  1634. tr(RE,QBF,"laz[a-z]",RS,50);
  1635. tr(RE,QBF,"{laz[a-z]} {dogs}",RS,50);
  1636. tr(RE,"INRANGE (a,b)","[ ,]INRANGE *({.+},{.+})");
  1637. tr(RE," inrange (a,b)","[ ,]INRANGE *({.+},{.+})",NULL,0,true);
  1638. tr(RE,",INRANGE(aa,bb)","[ ,]INRANGE *({.+},{.+})");
  1639. tr(RE,",INRANGE(,bb)","[ ,]INRANGE *({.+},{.+})");
  1640. tr(RE,"10/14/1998","^{.*}/{.*}/{.*}$");
  1641. tr(RE,"freddy","^{|{.+}\\@}{.+}$");
  1642. tr(RE,"myname@freddy","^{|{.+}\\@}{.+}$");
  1643. tr(RE,"freddy","^{{.+}\\@|}{.+}$");
  1644. tr(RE,"myname@freddy","^{{.+}\\@|}{.+}$");
  1645. tr(RE,"freddy","^{[^@]+}@?{[^@]+}$");
  1646. tr(RE,"myname@freddy","^{.+}@{.+}$|{.+}$");
  1647. tr(RE,"freddy","^{|{.+}@}{[^@]+}$");
  1648. char str[256];
  1649. strcpy(str,"myname@freddy");
  1650. RE.init("^{|{.+}@}{[^@]+}$");
  1651. RE.find(str);
  1652. StringBuffer t;
  1653. PrintLog("Substitute to '%s'\r\n",RE.substitute(t,"#'#3###2'#"));
  1654. t.clear();
  1655. RE.replace(RE.findstr(t,2),sizeof(str),3);
  1656. t.clear();
  1657. RE.replace(RE.findstr(t,3),sizeof(str),2);
  1658. PrintLog("Replace to '%s'\r\n",str);
  1659. }
  1660. void FixDate(char *buff,size32_t max)
  1661. {
  1662. RegExpr RE("[0-9]+/[0-9]+/{[0-9]+}");
  1663. const char *s = RE.find(buff);
  1664. while (s) {
  1665. StringBuffer ys;
  1666. int y=atoi(RE.findstr(ys,1));
  1667. if (y<1000) {
  1668. ys.clear();
  1669. ys.append(y+1900);
  1670. RE.replace(ys.str(),max,1);
  1671. }
  1672. s = RE.findnext();
  1673. }
  1674. }
  1675. #endif