hqlregex.cpp 88 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #ifndef U_OVERRIDE_CXX_ALLOCATION
  14. #define U_OVERRIDE_CXX_ALLOCATION 0 // Enabling this forces all allocation of ICU objects to ICU's heap, but is incompatible with jmemleak
  15. #endif
  16. #include <algorithm>
  17. #include "jliball.hpp"
  18. #include "eclrtl.hpp"
  19. #include "hqlexpr.hpp"
  20. #include "hqlcerrors.hpp"
  21. #include "hqlutil.hpp"
  22. #include "hqlregex.ipp"
  23. #include "hqlhtcpp.ipp"
  24. #include "hqlcatom.hpp"
  25. #include "hqlcpputil.hpp"
  26. #include "thorralgo.ipp"
  27. #include "hqlthql.hpp"
  28. #ifdef _USE_ICU
  29. #include "unicode/uchar.h"
  30. #endif
  31. //#define NEW_DFA_CALC
  32. //#define DEFAULT_DFA_COMPLEXITY 0
  33. #define DEFAULT_UNICODE_DFA_COMPLEXITY 0
  34. #define DEFAULT_UTF8_DFA_COMPLEXITY 2000
  35. #define DEFAULT_DFA_COMPLEXITY 10000
  36. inline unsigned addWithoutOverflow(unsigned x, unsigned y)
  37. {
  38. if (UINT_MAX - x <= y)
  39. return UINT_MAX;
  40. return x + y;
  41. }
  42. inline unsigned multiplyWithoutOverflow(unsigned x, unsigned y)
  43. {
  44. if (UINT_MAX / x <= y)
  45. return UINT_MAX;
  46. return x * y;
  47. }
  48. //===========================================================================
  49. static IValue * getCastConstant(IValue * value, type_t tc)
  50. {
  51. switch (tc)
  52. {
  53. case type_unicode:
  54. return value->castTo(unknownUnicodeType);
  55. case type_utf8:
  56. return value->castTo(unknownUtf8Type);
  57. case type_string:
  58. return value->castTo(unknownStringType);
  59. }
  60. throwUnexpected();
  61. }
  62. static HqlRegexExpr * createRegexExpr(const ParseInformation & options, IHqlExpression * expr, bool caseSensitive)
  63. {
  64. switch (expr->getOperator())
  65. {
  66. case no_null:
  67. case no_pat_beginrecursive:
  68. case no_pat_beginpattern:
  69. case no_pat_endpattern:
  70. case no_pat_singlechar:
  71. return new HqlSimpleRegexExpr(options, expr, caseSensitive);
  72. }
  73. return new HqlComplexRegexExpr(options, expr, caseSensitive);
  74. }
  75. static HqlRegexExpr * createRegexExpr(const ParseInformation & options, node_operator op, IHqlExpression * expr, bool caseSensitive)
  76. {
  77. switch (op)
  78. {
  79. case no_null:
  80. case no_pat_beginrecursive:
  81. case no_pat_beginpattern:
  82. case no_pat_endpattern:
  83. case no_pat_singlechar:
  84. return new HqlSimpleRegexExpr(options, op, expr, caseSensitive);
  85. }
  86. return new HqlComplexRegexExpr(options, op, expr, caseSensitive);
  87. }
  88. //---------------------------------------------------------------------------
  89. void HqlRegexHashTable::onAdd(void *et)
  90. {
  91. ((HqlRegexExpr*)et)->Link();
  92. }
  93. void HqlRegexHashTable::onRemove(void *et)
  94. {
  95. ((HqlRegexExpr*)et)->Release();
  96. }
  97. unsigned HqlRegexHashTable::getHashFromElement(const void *et) const
  98. {
  99. return ((HqlRegexExpr*)et)->getHash();
  100. }
  101. unsigned HqlRegexHashTable::getHashFromFindParam(const void *fp) const
  102. {
  103. return ((HqlRegexExpr*)fp)->getHash();
  104. }
  105. const void * HqlRegexHashTable::getFindParam(const void *et) const
  106. {
  107. return et;
  108. }
  109. bool HqlRegexHashTable::matchesFindParam(const void *et, const void *key, unsigned fphash) const
  110. {
  111. return et == key;
  112. }
  113. void HqlRegexUniqueArray::append(HqlRegexExpr & expr)
  114. {
  115. if (!hash.find(&expr))
  116. {
  117. array.append(expr);
  118. hash.add(&expr);
  119. }
  120. else
  121. {
  122. expr.Release();
  123. }
  124. }
  125. //---------------------------------------------------------------------------
  126. inline unsigned limitedAdd(unsigned a, unsigned b)
  127. {
  128. if (PATTERN_UNLIMITED_LENGTH - a <= b)
  129. return PATTERN_UNLIMITED_LENGTH;
  130. return a+b;
  131. }
  132. inline unsigned limitedMult(unsigned a, unsigned b)
  133. {
  134. if (a == 0)
  135. return 0;
  136. if (PATTERN_UNLIMITED_LENGTH / a <= b)
  137. return PATTERN_UNLIMITED_LENGTH;
  138. return a*b;
  139. }
  140. IHqlExpression * RegexIdAllocator::createKey(IHqlExpression * expr, IAtom * name)
  141. {
  142. if (!expr)
  143. return NULL;
  144. IHqlExpression * body = expr->queryBody();
  145. if (name)
  146. return createSymbol(createIdAtom(str(name)), LINK(body), ob_private);
  147. return LINK(body);
  148. }
  149. void RegexIdAllocator::setID(IHqlExpression * expr, IAtom * name, unsigned value)
  150. {
  151. OwnedHqlExpr key = createKey(expr, name);
  152. map.setValue(key, value);
  153. }
  154. regexid_t RegexIdAllocator::queryID(IHqlExpression * expr, IAtom * name)
  155. {
  156. OwnedHqlExpr key = createKey(expr, name);
  157. regexid_t * match = map.getValue(key);
  158. if (match)
  159. return *match;
  160. map.setValue(key, ++nextId);
  161. return nextId;
  162. }
  163. int compareUnsigned(unsigned const * left, unsigned const * right)
  164. {
  165. return (*left < *right) ? -1 : (*left > *right) ? +1 : 0;
  166. }
  167. class SymbolArray
  168. {
  169. friend class SymbolArrayIterator;
  170. public:
  171. void addUniqueRange(unsigned low, unsigned high)
  172. {
  173. unsigned max = symbols.ordinality();
  174. unsigned i;
  175. for (i = 0; i < max; i++)
  176. {
  177. unsigned cur = symbols.item(i);
  178. if (cur == low)
  179. low++;
  180. if (cur > low)
  181. {
  182. if (cur > high)
  183. cur = high+1;
  184. while (low != cur)
  185. symbols.add(low++, i++);
  186. low++;
  187. }
  188. if (cur > high)
  189. return;
  190. }
  191. while (low <= high)
  192. symbols.append(low++);
  193. }
  194. inline void addUnique(unsigned value)
  195. {
  196. bool isNew;
  197. symbols.bAdd(value, compareUnsigned, isNew);
  198. }
  199. protected:
  200. UnsignedArray symbols;
  201. };
  202. class SymbolArrayIterator
  203. {
  204. public:
  205. SymbolArrayIterator(SymbolArray & _table) : table(_table) { cur = 0; }
  206. inline bool first()
  207. {
  208. cur = 0; return table.symbols.isItem(cur);
  209. }
  210. inline bool isValid()
  211. {
  212. return table.symbols.isItem(cur);
  213. }
  214. inline unsigned get()
  215. {
  216. return table.symbols.item(cur);
  217. }
  218. inline bool next()
  219. {
  220. cur++; return table.symbols.isItem(cur);
  221. }
  222. protected:
  223. SymbolArray & table;
  224. unsigned cur;
  225. };
  226. //---------------------------------------------------------------------------
  227. HqlNamedRegex::HqlNamedRegex(IHqlExpression * _expr, IAtom * _name, IHqlExpression * _searchExpr, node_operator _kind, bool _caseSensitive, bool _isMatched)
  228. {
  229. kind = _kind;
  230. expr.set(_expr);
  231. searchExpr.set(_searchExpr);
  232. name = _name;
  233. numUses = 1;
  234. isMatched = _isMatched;
  235. isRecursive = false;
  236. doneCalcDfaScore = false;
  237. doneCreateDFA = false;
  238. doneExpandNamed = false;
  239. doneExpandRecursion = false;
  240. doneMarkDfas = false;
  241. noGenerate = (kind == no_pat_dfa);
  242. caseSensitive = _caseSensitive;
  243. }
  244. HqlNamedRegex::~HqlNamedRegex()
  245. {
  246. cleanup();
  247. }
  248. void HqlNamedRegex::addBeginEnd(const ParseInformation & options)
  249. {
  250. //Add nodes at either end of the pattern - at the start as a placeholder for all the possible first patterns,
  251. //and at the end to mark which paths are terminal.
  252. HqlRegexExpr * follow = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
  253. follow->addOperand(createRegexExpr(options, isRecursive ? no_pat_beginrecursive : no_pat_beginpattern, NULL, caseSensitive));
  254. follow->addOperand(LINK(def));
  255. if (isRecursive)
  256. follow->addOperand(createRegexExpr(options, no_pat_endrecursive, NULL, caseSensitive));
  257. follow->addOperand(createRegexExpr(options, no_pat_endpattern, NULL, caseSensitive));
  258. def.setown(follow);
  259. }
  260. void HqlNamedRegex::analyseNullLeadTrail()
  261. {
  262. def->analyseNullLeadTrail();
  263. limit = def->limit;
  264. }
  265. void HqlNamedRegex::calcDfaScore()
  266. {
  267. if (!doneCalcDfaScore)
  268. {
  269. doneCalcDfaScore = true;
  270. def->calcDfaScore();
  271. }
  272. }
  273. void HqlNamedRegex::calculateNext()
  274. {
  275. def->calculateNext();
  276. }
  277. bool HqlNamedRegex::canBeNull()
  278. {
  279. return limit.canBeNull();
  280. }
  281. void HqlNamedRegex::cleanup()
  282. {
  283. if (def)
  284. def->cleanup();
  285. if (created)
  286. created->dispose();
  287. }
  288. void HqlNamedRegex::createDFAs(RegexContext & ctx)
  289. {
  290. if (!doneCreateDFA)
  291. {
  292. doneCreateDFA = true;
  293. def.setown(def->createDFAs(ctx));
  294. }
  295. }
  296. void HqlNamedRegex::expandNamedSymbols()
  297. {
  298. if (!doneExpandNamed)
  299. {
  300. doneExpandNamed = true;
  301. def.setown(def->expandNamedSymbols());
  302. }
  303. }
  304. void HqlNamedRegex::expandRecursion(RegexContext & ctx, HqlNamedRegex * self)
  305. {
  306. if (!doneExpandRecursion)
  307. {
  308. doneExpandRecursion = true;
  309. if (kind == no_pat_instance)
  310. self = this;
  311. def.setown(def->expandRecursion(ctx, self));
  312. }
  313. }
  314. void HqlNamedRegex::generateDFAs()
  315. {
  316. if (!noGenerate)
  317. def->generateDFAs();
  318. }
  319. void HqlNamedRegex::generateRegex(GenerateRegexCtx & ctx)
  320. {
  321. if (created || noGenerate)
  322. return;
  323. node_operator savedKind = ctx.namedKind;
  324. ctx.namedKind = kind;
  325. created.setown(new RegexNamed(name, ctx.idAllocator.queryID(expr, name)));
  326. def->generateRegex(ctx);
  327. def->connectRegex();
  328. HqlRegexExpr * first = def->queryChild(0);
  329. if ((first->getOperator() == no_pat_beginpattern) && (first->following.ordinality() == 1))
  330. first = &first->following.item(0);
  331. created->setFirst(first->created);
  332. ctx.namedKind = savedKind;
  333. }
  334. unsigned HqlNamedRegex::getDfaScore()
  335. {
  336. if (isRecursive)
  337. return NO_DFA_SCORE;
  338. return def->dfaScore;
  339. }
  340. void HqlNamedRegex::insertSeparators(IHqlExpression * separator, RegexContext * ctx)
  341. {
  342. if (expr->queryType()->getTypeCode() == type_pattern)
  343. return;
  344. def.setown(def->insertSeparators(separator, ctx));
  345. }
  346. void HqlNamedRegex::markDFAs(unsigned complexity)
  347. {
  348. if (!doneMarkDfas)
  349. {
  350. doneMarkDfas = true;
  351. def->markDFAs(complexity);
  352. }
  353. }
  354. bool HqlNamedRegex::matchesDefine(IHqlExpression * name, bool _caseSensitive)
  355. {
  356. return (name->queryBody() == defineName) && (caseSensitive == _caseSensitive);
  357. }
  358. void HqlNamedRegex::mergeCreateSets()
  359. {
  360. def.setown(def->mergeCreateSets());
  361. }
  362. bool HqlNamedRegex::queryExpandInline()
  363. {
  364. expandNamedSymbols();
  365. if (!isMatched)
  366. {
  367. if (numUses == 1)
  368. return true;
  369. //expand if trivial - this should be improved once dfa support is implemented.
  370. switch (def->getOperator())
  371. {
  372. case no_pat_instance:
  373. case no_pat_set:
  374. case no_pat_const:
  375. case no_pat_first:
  376. case no_pat_last:
  377. case no_pat_anychar:
  378. return true;
  379. }
  380. }
  381. return false;
  382. }
  383. RegexPattern * HqlNamedRegex::queryRootPattern()
  384. {
  385. //return a reference to the begin
  386. HqlRegexExpr * first = def->queryChild(0);
  387. if ((first->getOperator() == no_pat_beginpattern) && (first->following.ordinality() == 1))
  388. first = &first->following.item(0);
  389. return first->created;
  390. }
  391. void HqlNamedRegex::setRegexOwn(HqlRegexExpr * _def)
  392. {
  393. def.setown(_def);
  394. IHqlExpression * defExpr = def->expr;
  395. if (def->getOperator() == no_define)
  396. defineName.set(defExpr->queryChild(1)->queryBody());
  397. }
  398. //---------------------------------------------------------------------------
  399. static HqlRegexExpr * expandStringAsChars(IHqlExpression * expr, const ParseInformation & options, bool caseSensitive)
  400. {
  401. Owned<IValue> castValue = getCastConstant(expr->queryChild(0)->queryValue(), options.type);
  402. //convert text strings into a sequence of characters...
  403. ITypeInfo * type = castValue->queryType();
  404. unsigned len = type->getStringLen();
  405. if (len == 0)
  406. return createRegexExpr(options, no_null, NULL, caseSensitive);
  407. HqlRegexExpr * expanded = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
  408. bool readUChar = false;
  409. const void * value = castValue->queryValue();
  410. switch (options.type)
  411. {
  412. case type_unicode:
  413. readUChar = true;
  414. break;
  415. case type_utf8:
  416. //UTF8 is matched as a sequence of characters...
  417. len = rtlUtf8Size(len, value);
  418. break;
  419. }
  420. for (unsigned i = 0; i < len; i++)
  421. {
  422. unsigned nextValue;
  423. if (readUChar)
  424. nextValue = ((UChar *)value)[i];
  425. else
  426. nextValue = ((unsigned char *)value)[i];
  427. OwnedHqlExpr next = getSizetConstant(nextValue);
  428. expanded->addOperand(createRegexExpr(options, no_pat_singlechar, next, caseSensitive));
  429. }
  430. return expanded;
  431. }
  432. //---------------------------------------------------------------------------
  433. HqlRegexExpr::HqlRegexExpr(const ParseInformation & _options, IHqlExpression * _expr, bool _caseSensitive) : options(_options)
  434. {
  435. expr.set(_expr);
  436. op = expr->getOperator();
  437. caseSensitive = _caseSensitive;
  438. init();
  439. }
  440. HqlRegexExpr::HqlRegexExpr(const ParseInformation & _options, node_operator _op, IHqlExpression * _expr, bool _caseSensitive) : options(_options)
  441. {
  442. expr.set(_expr);
  443. op = _op;
  444. caseSensitive = _caseSensitive;
  445. init();
  446. }
  447. HqlRegexExpr::~HqlRegexExpr()
  448. {
  449. ::Release(dfa);
  450. }
  451. void HqlRegexExpr::init()
  452. {
  453. uid = (unsigned)(getUniqueId()-options.uidBase);
  454. cleaned = false;
  455. connected = false;
  456. analysed = false;
  457. createDFA = false;
  458. dfaScore = NO_DFA_SCORE;
  459. dfa = NULL;
  460. added = false;
  461. }
  462. void inherit(HqlRegexUniqueArray & target, const HqlRegexUniqueArray & source)
  463. {
  464. ForEachItemIn(idx, source)
  465. target.append(OLINK(source.item(idx)));
  466. }
  467. void HqlRegexExpr::calcDfaScore()
  468. {
  469. ForEachItemIn(idx, args)
  470. args.item(idx).calcDfaScore();
  471. if (querySubPattern())
  472. querySubPattern()->calcDfaScore();
  473. if (queryNamed())
  474. queryNamed()->calcDfaScore();
  475. //on entry dfaScore = NO_DFA_SCORE
  476. switch (getOperator())
  477. {
  478. case no_null:
  479. dfaScore = 0;
  480. break;
  481. case no_pat_const:
  482. dfaScore = expr->queryChild(0)->queryType()->getStringLen();
  483. break;
  484. case no_pat_anychar:
  485. dfaScore = 1;
  486. break;
  487. case no_pat_set:
  488. {
  489. unsigned score = 0;
  490. ForEachChild(idx, expr)
  491. {
  492. IHqlExpression * child = expr->queryChild(idx);
  493. switch (child->getOperator())
  494. {
  495. case no_range:
  496. {
  497. unsigned low = (unsigned)child->queryChild(0)->queryValue()->getIntValue();
  498. unsigned high = (unsigned)child->queryChild(1)->queryValue()->getIntValue();
  499. score = addWithoutOverflow(score, (high-low)+1);
  500. break;
  501. }
  502. case no_constant:
  503. score = addWithoutOverflow(score, 1);
  504. break;
  505. }
  506. }
  507. dfaScore = score;
  508. break;
  509. }
  510. case no_pat_repeat:
  511. if (isStandardRepeat())
  512. {
  513. unsigned argScore = args.item(0).dfaScore;
  514. if ((getRepeatMax() > 1) && (argScore != NO_DFA_SCORE))
  515. dfaScore = addWithoutOverflow(argScore, argScore);
  516. else
  517. dfaScore = argScore;
  518. }
  519. else if (!expr->hasAttribute(minimalAtom))
  520. {
  521. unsigned max = getRepeatMax();
  522. unsigned namedDfaScore = queryNamed()->getDfaScore();
  523. if (max <= options.dfaRepeatMax)
  524. dfaScore = multiplyWithoutOverflow(max, namedDfaScore);
  525. }
  526. break;
  527. case no_pat_instance:
  528. if (!queryNamed()->isMatched)
  529. dfaScore = queryNamed()->getDfaScore();
  530. break;
  531. case no_pat_or:
  532. {
  533. unsigned totalScore = 0;
  534. ForEachItemIn(idx, args)
  535. {
  536. unsigned score = args.item(idx).dfaScore;
  537. if (score == NO_DFA_SCORE)
  538. {
  539. totalScore = NO_DFA_SCORE;
  540. break;
  541. }
  542. totalScore = addWithoutOverflow(totalScore, score);
  543. }
  544. dfaScore = totalScore;
  545. if (dfaScore == NO_DFA_SCORE)
  546. {
  547. //Work out if there are two or more entires which can be converted into a dfa, and if so combine them
  548. //Also potentially separate simple sting lists into a separate or, so we always generate them
  549. unsigned scoreCount = 0;
  550. unsigned stringListCount = 0;
  551. ForEachItemIn(iCount, args)
  552. {
  553. HqlRegexExpr & cur = args.item(iCount);
  554. if (cur.dfaScore != NO_DFA_SCORE)
  555. {
  556. scoreCount++;
  557. if (cur.isSimpleStringList())
  558. stringListCount++;
  559. }
  560. }
  561. if (scoreCount > 1)
  562. {
  563. HqlRegexExpr * newor = createRegexExpr(options, no_pat_or, NULL, caseSensitive);
  564. HqlRegexExpr * stringListOr = ((stringListCount > 1) && (scoreCount != stringListCount)) ? createRegexExpr(options, no_pat_or, NULL, caseSensitive) : NULL;
  565. unsigned totalScore = 0;
  566. unsigned stringListScore = 0;
  567. for (unsigned i =0; i < args.ordinality(); )
  568. {
  569. HqlRegexExpr & cur = args.item(i);
  570. unsigned score = cur.dfaScore;
  571. if (score != NO_DFA_SCORE)
  572. {
  573. if (stringListOr && cur.isSimpleStringList())
  574. {
  575. stringListScore = addWithoutOverflow(stringListScore, score);
  576. stringListOr->addOperand(&OLINK(cur));
  577. }
  578. else
  579. newor->addOperand(&OLINK(cur));
  580. totalScore = addWithoutOverflow(totalScore, score);
  581. args.remove(i);
  582. }
  583. else
  584. i++;
  585. }
  586. if (stringListOr)
  587. {
  588. stringListOr->dfaScore = stringListScore;
  589. newor->addOperand(stringListOr);
  590. }
  591. newor->dfaScore = totalScore;
  592. //add the dfa first - since it will probably be quickest
  593. args.add(*newor, 0);
  594. }
  595. }
  596. break;
  597. }
  598. case no_pat_follow:
  599. {
  600. //MORE: Should extract applicable subranges out of the sequences
  601. //MORE: This should multiple rather than add.
  602. dfaScore = 0;
  603. ForEachItemIn(idx, args)
  604. {
  605. unsigned score = args.item(idx).dfaScore;
  606. if (score == NO_DFA_SCORE)
  607. {
  608. dfaScore = NO_DFA_SCORE;
  609. break;
  610. }
  611. dfaScore = addWithoutOverflow(dfaScore, score);
  612. }
  613. if (dfaScore == NO_DFA_SCORE)
  614. {
  615. //Try and find ranges that are valid, and extract those into sub-elements.
  616. for (unsigned i =0; i < args.ordinality(); i++)
  617. {
  618. unsigned score = args.item(i).dfaScore;
  619. if (score != NO_DFA_SCORE)
  620. {
  621. unsigned max = args.ordinality();
  622. unsigned j;
  623. for (j=i+1; j < max; j++)
  624. {
  625. unsigned thisScore = args.item(j).dfaScore;
  626. if (thisScore == NO_DFA_SCORE)
  627. break;
  628. score = addWithoutOverflow(score, thisScore);
  629. }
  630. if (j != i+1)
  631. {
  632. HqlRegexExpr * follow = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
  633. for (unsigned k=i; k < j; k++)
  634. follow->addOperand(&OLINK(args.item(k)));
  635. follow->dfaScore = score;
  636. while (j != i)
  637. args.remove(--j);
  638. args.add(*follow, i);
  639. }
  640. }
  641. }
  642. }
  643. break;
  644. }
  645. }
  646. }
  647. void HqlRegexExpr::calculateNext()
  648. {
  649. //Dragon p138
  650. //if a concatenation (a,b), then all the trailing from (a) have leading(b) in their following set
  651. //if a repeat [max > 1] then all positions in leading(x) are also in the following list
  652. //Limited/minimal repeats aren't done the same way - so they aren't added in the same way
  653. //Because I am cheating the order is important - repeats need to be done first since they are greedy.
  654. switch (getOperator())
  655. {
  656. case no_pat_repeat:
  657. if (isStandardRepeat() && getRepeatMax() > 1)
  658. {
  659. //All the trailing elements have the leading elements as possible next.
  660. unsigned max = numTrailing();
  661. for (unsigned idx=0; idx < max; idx++)
  662. gatherLeading(queryTrailing(idx).following);
  663. }
  664. break;
  665. }
  666. ForEachItemIn(idx, args)
  667. args.item(idx).calculateNext();
  668. switch (getOperator())
  669. {
  670. case no_pat_follow:
  671. case no_pat_separator:
  672. {
  673. //Internally connect up following items within the no_pat_follow.
  674. unsigned max = args.ordinality();
  675. assertex(max);
  676. for (unsigned pairs=0; pairs<max-1; pairs++)
  677. {
  678. HqlRegexExpr * left = queryChild(pairs);
  679. for (unsigned other=pairs+1; other<max; other++)
  680. {
  681. HqlRegexExpr * right = queryChild(other);
  682. unsigned maxTrail = left->numTrailing();
  683. for (unsigned trailIdx=0; trailIdx<maxTrail; trailIdx++)
  684. {
  685. HqlRegexExpr & cur = left->queryTrailing(trailIdx);
  686. right->gatherLeading(cur.following);
  687. }
  688. if (!right->canBeNull())
  689. break;
  690. }
  691. }
  692. break;
  693. }
  694. }
  695. }
  696. #if 0
  697. //convert strings/unicode to correct kind depending on what is being searched, when the expr is built.
  698. UBool U_CALL_CONV noteUCharRange(const void * context, UChar32 start, UChar limit, UCharCategory type)
  699. {
  700. }
  701. void gatherRange(void * context, UCharCategory type)
  702. {
  703. u_enumCharTypes(noteUCharRange range, context);
  704. }
  705. #endif
  706. static bool canConsume(const ParseInformation & options, unsigned nextChar, unsigned matcherChar, bool caseSensitive)
  707. {
  708. if (nextChar == matcherChar)
  709. return true;
  710. if (matcherChar & RegexSpecialMask)
  711. {
  712. assertex(options.type != type_utf8);
  713. if (!(nextChar & RegexSpecialMask))
  714. {
  715. if (options.type != type_string)
  716. return isUnicodeMatch(matcherChar, nextChar);
  717. else
  718. return isAsciiMatch(matcherChar, nextChar);
  719. }
  720. //Should only occur if unicode.
  721. switch (matcherChar)
  722. {
  723. case RCCalnum:
  724. return (nextChar == RCCalpha) || (nextChar == RCClower) || (nextChar == RCCupper) || (nextChar == RCCdigit);
  725. case RCCalpha:
  726. return (nextChar == RCClower) || (nextChar == RCCupper);
  727. case RCCspace:
  728. return (nextChar == RCCblank);
  729. case RCCprint:
  730. return (nextChar == RCCalnum) || (nextChar == RCCalpha) || (nextChar == RCClower) || (nextChar == RCCupper) || (nextChar == RCCdigit) || (nextChar == RCCpunct) || (nextChar == RCCxdigit) || (nextChar == RCCgraph);
  731. case RCCgraph:
  732. return (nextChar == RCCalnum) || (nextChar == RCCalpha) || (nextChar == RCClower) || (nextChar == RCCupper) || (nextChar == RCCdigit) || (nextChar == RCCpunct) || (nextChar == RCCxdigit);
  733. case RCCxdigit:
  734. return (nextChar == RCCdigit);
  735. case RCCany:
  736. return true;
  737. }
  738. return false;
  739. }
  740. else if (nextChar & RegexSpecialMask)
  741. return false;
  742. if (caseSensitive || (options.type == type_utf8))
  743. return false;
  744. #ifdef _USE_ICU
  745. if (options.type == type_unicode)
  746. {
  747. //MORE: This needs improving for full folding.
  748. return u_foldCase(nextChar, U_FOLD_CASE_DEFAULT) == u_foldCase(matcherChar, U_FOLD_CASE_DEFAULT);
  749. }
  750. #else
  751. rtlThrowNoUnicode();
  752. #endif
  753. return (nextChar == tolower(matcherChar)) || (nextChar == toupper(matcherChar));
  754. }
  755. bool HqlRegexExpr::canConsume(unsigned nextChar)
  756. {
  757. switch (getOperator())
  758. {
  759. case no_pat_singlechar:
  760. return ::canConsume(options, nextChar, (unsigned)expr->queryValue()->getIntValue(), caseSensitive);
  761. case no_pat_utf8single:
  762. assertex(!(nextChar & RegexSpecialMask));
  763. return nextChar < 0x80;
  764. case no_pat_utf8lead:
  765. assertex(!(nextChar & RegexSpecialMask));
  766. return nextChar >= 0xc0;
  767. case no_pat_utf8follow:
  768. assertex(!(nextChar & RegexSpecialMask));
  769. return nextChar >= 0x80 && nextChar <= 0xbf;
  770. case no_pat_anychar:
  771. assertex(options.type != type_utf8);
  772. return true;
  773. case no_pat_set:
  774. {
  775. assertex(options.type != type_utf8);
  776. bool invert = expr->hasAttribute(notAtom);
  777. ForEachChild(idx, expr)
  778. {
  779. IHqlExpression * child = expr->queryChild(idx);
  780. switch (child->getOperator())
  781. {
  782. case no_range:
  783. {
  784. unsigned low = (unsigned)child->queryChild(0)->queryValue()->getIntValue();
  785. unsigned high = (unsigned)child->queryChild(1)->queryValue()->getIntValue();
  786. if (!(nextChar & RegexSpecialMask))
  787. {
  788. if (!caseSensitive)
  789. {
  790. if (options.type == type_unicode)
  791. {
  792. #ifdef _USE_ICU
  793. //MORE: Improved unicode
  794. if (u_foldCase(nextChar, U_FOLD_CASE_DEFAULT) >= u_foldCase(low, U_FOLD_CASE_DEFAULT) &&
  795. u_foldCase(nextChar, U_FOLD_CASE_DEFAULT) <= u_foldCase(high, U_FOLD_CASE_DEFAULT))
  796. return !invert;
  797. #else
  798. rtlThrowNoUnicode();
  799. #endif
  800. }
  801. else
  802. {
  803. if (nextChar >= (unsigned)tolower(low) && nextChar <= (unsigned)tolower(high))
  804. return !invert;
  805. if (nextChar >= (unsigned)toupper(low) && nextChar <= (unsigned)toupper(high))
  806. return !invert;
  807. }
  808. }
  809. else
  810. {
  811. if (nextChar >= low && nextChar <= high)
  812. return !invert;
  813. }
  814. }
  815. break;
  816. }
  817. case no_constant:
  818. {
  819. unsigned value = (unsigned)child->queryValue()->getIntValue();
  820. if (::canConsume(options, nextChar, value, caseSensitive))
  821. return !invert;
  822. break;
  823. }
  824. case no_attr:
  825. case no_attr_expr:
  826. case no_attr_link:
  827. break;
  828. default:
  829. UNIMPLEMENTED;
  830. }
  831. }
  832. return invert;
  833. }
  834. case no_pat_endpattern:
  835. return false;
  836. default:
  837. UNIMPLEMENTED;
  838. }
  839. }
  840. void HqlRegexExpr::gatherConsumeSymbols(SymbolArray & symbols)
  841. {
  842. switch (getOperator())
  843. {
  844. case no_pat_singlechar:
  845. {
  846. unsigned charValue = (unsigned)expr->queryValue()->getIntValue();
  847. if (!caseSensitive && (options.type != type_utf8))
  848. {
  849. if (options.type == type_unicode)
  850. {
  851. #ifdef _USE_ICU
  852. //MORE: Unicode - can you have several values, what about case conversion?
  853. //MORE: String might need expanding to ('a'|'A')('ss'|'SS'|'B') or something similar.
  854. symbols.addUnique(u_tolower(charValue));
  855. symbols.addUnique(u_toupper(charValue));
  856. #else
  857. rtlThrowNoUnicode();
  858. #endif
  859. }
  860. else
  861. {
  862. symbols.addUnique(tolower(charValue));
  863. symbols.addUnique(toupper(charValue));
  864. }
  865. }
  866. else
  867. symbols.addUnique(charValue);
  868. break;
  869. }
  870. case no_pat_anychar:
  871. assertex(options.type != type_utf8);
  872. if (options.type == type_unicode)
  873. symbols.addUnique(RCCany);
  874. else
  875. symbols.addUniqueRange(0, 255);
  876. return;
  877. case no_pat_endpattern:
  878. return;
  879. case no_pat_set:
  880. assertex(options.type == type_string);
  881. if (options.type == type_unicode)
  882. {
  883. #ifdef _USE_ICU
  884. //MORE: Need some kind of implementation for unicode - although invert and ranges may not be possible.
  885. if (expr->hasAttribute(notAtom))
  886. throwError(HQLERR_DfaTooComplex);
  887. ForEachChild(idx, expr)
  888. {
  889. IHqlExpression * child = expr->queryChild(idx);
  890. switch (child->getOperator())
  891. {
  892. case no_range:
  893. {
  894. unsigned low = (unsigned)child->queryChild(0)->queryValue()->getIntValue();
  895. unsigned high = (unsigned)child->queryChild(1)->queryValue()->getIntValue();
  896. if (!caseSensitive)
  897. {
  898. //MORE: This really doesn't work very well - I'm not even sure what it means...
  899. symbols.addUniqueRange(u_tolower(low), u_tolower(high));
  900. symbols.addUniqueRange(u_toupper(low), u_toupper(high));
  901. }
  902. else
  903. symbols.addUniqueRange(low, high);
  904. break;
  905. }
  906. case no_constant:
  907. {
  908. unsigned value = (unsigned)child->queryValue()->getIntValue();
  909. if (caseSensitive || (value & RegexSpecialMask))
  910. symbols.addUnique(value);
  911. else
  912. {
  913. symbols.addUnique(u_tolower(value));
  914. symbols.addUnique(u_toupper(value));
  915. }
  916. break;
  917. }
  918. case no_attr:
  919. case no_attr_expr:
  920. case no_attr_link:
  921. break;
  922. default:
  923. UNIMPLEMENTED;
  924. }
  925. }
  926. #else
  927. rtlThrowNoUnicode();
  928. #endif
  929. }
  930. else
  931. {
  932. //This is probably the easiest way to cope with inverted sets and all other complications.
  933. for (unsigned i = 0; i < 256; i++)
  934. if (canConsume(i))
  935. symbols.addUnique(i);
  936. }
  937. return;
  938. case no_pat_utf8single:
  939. {
  940. for (unsigned i = 0; i < 0x80; i++)
  941. symbols.addUnique(i);
  942. break;
  943. }
  944. case no_pat_utf8lead:
  945. {
  946. for (unsigned i = 0xc0; i < 0x100; i++)
  947. symbols.addUnique(i);
  948. break;
  949. }
  950. case no_pat_utf8follow:
  951. {
  952. for (unsigned i = 0x80; i < 0xc0; i++)
  953. symbols.addUnique(i);
  954. break;
  955. }
  956. default:
  957. throwError1(HQLERR_BadDfaOperator, getOpString(getOperator()));
  958. }
  959. }
  960. void HqlRegexExpr::cleanup()
  961. {
  962. if (cleaned)
  963. return;
  964. cleaned = true;
  965. //kill anything that could have a loop.
  966. ForEachItemIn(i0, args)
  967. args.item(i0).cleanup();
  968. if (querySubPattern())
  969. querySubPattern()->cleanup();
  970. if (created)
  971. created->dispose();
  972. following.kill();
  973. args.kill();
  974. }
  975. HqlRegexExpr * HqlRegexExpr::clone()
  976. {
  977. assertex(!created && !analysed);
  978. HqlRegexExpr * cloned = createRegexExpr(options, op, expr, caseSensitive);
  979. ForEachItemIn(idx, args)
  980. cloned->args.append(*args.item(idx).clone());
  981. cloned->setNamed(queryNamed());
  982. cloned->setSubPattern(querySubPattern());
  983. return cloned;
  984. }
  985. void HqlRegexExpr::connectRegex()
  986. {
  987. if (connected)
  988. return;
  989. connected = true;
  990. ForEachItemIn(idx, args)
  991. args.item(idx).connectRegex();
  992. if (created)
  993. {
  994. ForEachItemIn(idx2, following)
  995. created->addLink(following.item(idx2).created);
  996. HqlNamedRegex * named = queryNamed();
  997. if (named && named->created && !named->noGenerate)
  998. created->setBody(named->created);
  999. if (querySubPattern())
  1000. created->setSubPattern(querySubPattern()->queryRootPattern());
  1001. }
  1002. }
  1003. HqlRegexExpr * HqlRegexExpr::createDFAs(RegexContext & ctx)
  1004. {
  1005. if (createDFA)
  1006. {
  1007. OwnedHqlExpr searchExpr = createValue(no_pat_dfa, makePatternType(), createUniqueId());
  1008. HqlNamedRegex * newNamed = new HqlNamedRegex(searchExpr, NULL, searchExpr, no_pat_dfa, caseSensitive, false);
  1009. ctx.named.append(*newNamed);
  1010. newNamed->setRegexOwn(expandAsDFA());
  1011. HqlRegexExpr * dfa = createRegexExpr(options, no_pat_dfa, NULL, caseSensitive);
  1012. dfa->setNamed(newNamed);
  1013. dfa->dfaScore = dfaScore;
  1014. //MORE: Need to save the original as a *named/subPattern* attribute in case conversion to a DFA fails
  1015. return dfa;
  1016. }
  1017. unsigned max = args.ordinality();
  1018. unsigned idx;
  1019. for (idx = 0; idx < max; idx++)
  1020. {
  1021. HqlRegexExpr & cur = args.item(idx);
  1022. HqlRegexExpr * transformed = cur.createDFAs(ctx);
  1023. args.replace(*transformed, idx);
  1024. if (transformed->getOperator() == no_pat_dfa)
  1025. {
  1026. if ((idx != 0) && (idx+1 != max) && (getOperator() == no_pat_follow))
  1027. {
  1028. if ((args.item(idx-1).getOperator() == no_pat_begintoken) &&
  1029. (args.item(idx+1).getOperator() == no_pat_endtoken))
  1030. {
  1031. transformed->addOperand(&OLINK(args.item(idx-1)));
  1032. args.remove(idx+1);
  1033. args.remove(idx-1);
  1034. idx--;
  1035. max -= 2;
  1036. }
  1037. }
  1038. }
  1039. }
  1040. if (querySubPattern())
  1041. querySubPattern()->createDFAs(ctx);
  1042. if (queryNamed())
  1043. queryNamed()->createDFAs(ctx);
  1044. return LINK(this);
  1045. }
  1046. HqlRegexExpr * HqlRegexExpr::expandAsRepeatedDFA(unsigned count)
  1047. {
  1048. if (count == 0)
  1049. return createRegexExpr(options, no_null, NULL, caseSensitive);
  1050. if (count == 1)
  1051. return expandAsDFA();
  1052. HqlRegexExpr * cloned = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
  1053. for (unsigned i=0; i < count; i++)
  1054. cloned->args.append(*expandAsDFA());
  1055. return cloned;
  1056. }
  1057. HqlRegexExpr * HqlRegexExpr::expandAsDFA()
  1058. {
  1059. assertex(!created && !analysed);
  1060. switch (getOperator())
  1061. {
  1062. case no_pat_const:
  1063. return expandStringAsChars(expr, options, caseSensitive);
  1064. case no_pat_instance:
  1065. return queryNamed()->def->expandAsDFA();
  1066. case no_pat_or:
  1067. case no_pat_follow:
  1068. case no_null:
  1069. case no_pat_singlechar:
  1070. break;
  1071. case no_pat_anychar:
  1072. if (options.type == type_utf8)
  1073. {
  1074. // convert to low | follow(lead, tail+);
  1075. Owned<HqlRegexExpr> orRegexExpr = createRegexExpr(options, no_pat_or, NULL, caseSensitive);
  1076. orRegexExpr->addOperand(createRegexExpr(options, no_pat_utf8single, NULL, caseSensitive));
  1077. Owned<HqlRegexExpr> followRegexExpr = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
  1078. followRegexExpr->addOperand(createRegexExpr(options, no_pat_utf8lead, NULL, caseSensitive));
  1079. OwnedHqlExpr trailExpr = createValue(no_pat_utf8follow, makePatternType());
  1080. Owned<HqlRegexExpr> trailRegexExpr = createRegexExpr(options, no_pat_utf8follow, NULL, caseSensitive);
  1081. OwnedHqlExpr repeatExpr = createValue(no_pat_repeat, makePatternType(), LINK(trailExpr), createConstantOne(), createValue(no_any, makeNullType()));
  1082. Owned<HqlRegexExpr> repeatRegexExpr = createRegexExpr(options, no_pat_repeat, repeatExpr, caseSensitive);
  1083. repeatRegexExpr->addOperand(trailRegexExpr.getClear());
  1084. followRegexExpr->addOperand(repeatRegexExpr.getClear());
  1085. orRegexExpr->addOperand(followRegexExpr.getClear());
  1086. return orRegexExpr.getClear();
  1087. }
  1088. break;
  1089. case no_pat_set:
  1090. if (options.type == type_utf8)
  1091. {
  1092. throwUnexpected();
  1093. //expand set values...
  1094. // return low | follow(lead, tail+);
  1095. }
  1096. break;
  1097. case no_pat_dfa:
  1098. return queryNamed()->def->expandAsDFA();
  1099. case no_pat_repeat:
  1100. {
  1101. if (isStandardRepeat())
  1102. break;
  1103. unsigned min = getRepeatMin();
  1104. unsigned max = getRepeatMax();
  1105. if (min == max)
  1106. return queryNamed()->def->expandAsRepeatedDFA(min);
  1107. HqlRegexExpr * cloned = createRegexExpr(options, no_pat_or, NULL, caseSensitive);
  1108. for (unsigned i=min; i <= max; i++)
  1109. cloned->args.append(*queryNamed()->def->expandAsRepeatedDFA(i));
  1110. return cloned;
  1111. }
  1112. default:
  1113. UNIMPLEMENTED;
  1114. }
  1115. HqlRegexExpr * cloned = createRegexExpr(options, op, expr, caseSensitive);
  1116. ForEachItemIn(idx, args)
  1117. cloned->args.append(*args.item(idx).expandAsDFA());
  1118. assertex(!queryNamed());
  1119. assertex(!querySubPattern());
  1120. return cloned;
  1121. }
  1122. HqlRegexExpr * HqlRegexExpr::expandNamedSymbols()
  1123. {
  1124. ForEachItemIn(idx, args)
  1125. args.replace(*args.item(idx).expandNamedSymbols(), idx);
  1126. if (querySubPattern())
  1127. querySubPattern()->expandNamedSymbols();
  1128. if (queryNamed())
  1129. queryNamed()->expandNamedSymbols();
  1130. switch (getOperator())
  1131. {
  1132. case no_pat_instance:
  1133. if (queryNamed()->queryExpandInline())
  1134. {
  1135. queryNamed()->removeUse();
  1136. return queryNamed()->def->clone();
  1137. }
  1138. break;
  1139. case no_pat_follow:
  1140. {
  1141. ForEachItemIn(idx, args)
  1142. {
  1143. node_operator op = args.item(idx).getOperator();
  1144. switch (op)
  1145. {
  1146. //Remove begin_token/end_token around the following, otherwise
  1147. //we might get too many separators which may be non-optional
  1148. case no_pat_first:
  1149. case no_pat_last:
  1150. case no_penalty:
  1151. case no_pat_x_before_y:
  1152. case no_pat_before_y:
  1153. case no_pat_x_after_y:
  1154. case no_pat_after_y:
  1155. case no_pat_guard:
  1156. if ((idx > 0) && args.item(idx-1).getOperator() == no_pat_begintoken)
  1157. {
  1158. assertex(args.item(idx+1).getOperator() == no_pat_endtoken);
  1159. args.remove(idx+1);
  1160. args.remove(idx-1);
  1161. if (args.isItem(idx) && (args.item(idx).getOperator() == no_pat_separator))
  1162. args.remove(idx);
  1163. }
  1164. break;
  1165. }
  1166. }
  1167. break;
  1168. }
  1169. }
  1170. return LINK(this);
  1171. }
  1172. HqlRegexExpr * HqlRegexExpr::expandRecursion(RegexContext & ctx, HqlNamedRegex * self)
  1173. {
  1174. ForEachItemIn(idx, args)
  1175. args.replace(*args.item(idx).expandRecursion(ctx, self), idx);
  1176. if (querySubPattern())
  1177. querySubPattern()->expandRecursion(ctx, self);
  1178. if (queryNamed())
  1179. queryNamed()->expandRecursion(ctx, self);
  1180. HqlNamedRegex * replacement = NULL;
  1181. switch (getOperator())
  1182. {
  1183. case no_self:
  1184. replacement = self;
  1185. break;
  1186. case no_pat_use:
  1187. assertex(expr->queryChild(0)->queryValue());
  1188. replacement = ctx.queryDefine(expr->queryChild(0), caseSensitive);
  1189. break;
  1190. case no_define:
  1191. return LINK(&args.item(0));
  1192. }
  1193. if (replacement)
  1194. {
  1195. replacement->expandRecursion(ctx, self);
  1196. replacement->addRecursiveUse();
  1197. HqlRegexExpr * instance = createRegexExpr(options, no_pat_instance, NULL, caseSensitive);
  1198. instance->setNamed(replacement);
  1199. return instance;
  1200. }
  1201. return LINK(this);
  1202. }
  1203. void HqlRegexExpr::generateDFA(IDfaPattern * pattern)
  1204. {
  1205. //Estimate the number of transitions to avoid lots of reallocs.
  1206. unsigned numTransitions = 0;
  1207. {
  1208. ForEachItemIn(idx, dfa->states)
  1209. {
  1210. HqlDfaState & cur = dfa->states.item(idx);
  1211. unsigned min = 255;
  1212. unsigned max = 0;
  1213. ForEachItemIn(idx2, cur.transitions)
  1214. {
  1215. HqlDfaTransition & curTransition = cur.transitions.item(idx2);
  1216. if (curTransition.code < min) min = curTransition.code;
  1217. if (curTransition.code > max) max = curTransition.code;
  1218. }
  1219. if (min <= max)
  1220. numTransitions += (max-min)+1;
  1221. }
  1222. }
  1223. pattern->init(dfa->states.ordinality(), numTransitions);
  1224. ForEachItemIn(idx, dfa->states)
  1225. {
  1226. HqlDfaState & cur = dfa->states.item(idx);
  1227. assertex(cur.id == idx);
  1228. pattern->beginState(cur.id);
  1229. ForEachItemIn(idx, cur.position)
  1230. {
  1231. HqlRegexExpr & curExpr = cur.position.item(idx);
  1232. if (curExpr.getOperator() == no_pat_endpattern)
  1233. pattern->setStateAccept(curExpr.getAcceptId());
  1234. }
  1235. ForEachItemIn(idx2, cur.transitions)
  1236. {
  1237. HqlDfaTransition & curTransition = cur.transitions.item(idx2);
  1238. pattern->addTransition(curTransition.code, curTransition.next->id);
  1239. }
  1240. pattern->endState();
  1241. }
  1242. pattern->finished();
  1243. }
  1244. void HqlRegexExpr::generateRegex(GenerateRegexCtx & ctx)
  1245. {
  1246. ForEachItemIn(idx, args)
  1247. args.item(idx).generateRegex(ctx);
  1248. assertex(!created);
  1249. if (queryNamed())
  1250. queryNamed()->generateRegex(ctx);
  1251. if (querySubPattern())
  1252. querySubPattern()->generateRegex(ctx);
  1253. switch (getOperator())
  1254. {
  1255. case no_null:
  1256. case no_pat_beginpattern:
  1257. created.setown(new RegexNullPattern);
  1258. break;
  1259. case no_pat_beginrecursive:
  1260. created.setown(new RegexRecursivePattern);
  1261. break;
  1262. case no_pat_beginseparator:
  1263. created.setown(new RegexBeginSeparatorPattern);
  1264. break;
  1265. case no_pat_endseparator:
  1266. created.setown(new RegexEndSeparatorPattern);
  1267. break;
  1268. case no_pat_endrecursive:
  1269. created.setown(new RegexEndRecursivePattern);
  1270. break;
  1271. case no_pat_instance:
  1272. created.setown(new RegexNamedPattern);
  1273. break;
  1274. case no_pat_begincheck:
  1275. case no_pat_beginvalidate:
  1276. created.setown(new RegexBeginCheckPattern);
  1277. break;
  1278. case no_pat_endcheckin:
  1279. {
  1280. bool stripSeparator = ctx.info.addedSeparators && (expr->queryType()->getTypeCode() != type_pattern);
  1281. created.setown(new RegexCheckInPattern(expr->hasAttribute(notAtom), stripSeparator));
  1282. break;
  1283. }
  1284. case no_pat_endchecklength:
  1285. {
  1286. unsigned minLength = 0;
  1287. unsigned maxLength = PATTERN_UNLIMITED_LENGTH;
  1288. getCheckRange(expr->queryChild(1), minLength, maxLength, ctx.info.charSize);
  1289. bool stripSeparator = ctx.info.addedSeparators && (expr->queryType()->getTypeCode() != type_pattern);
  1290. created.setown(new RegexCheckLengthPattern(expr->hasAttribute(notAtom), stripSeparator, minLength, maxLength));
  1291. break;
  1292. }
  1293. case no_pat_endvalidate:
  1294. {
  1295. unsigned idx = ctx.regex.getValidatorIndex(expr);
  1296. assertex(idx != NotFound);
  1297. ValidateKind validatorKind = getValidateKind(ctx.regex.queryValidateExpr(expr));
  1298. bool stripSeparator = ctx.info.addedSeparators && (expr->queryType()->getTypeCode() != type_pattern);
  1299. switch (ctx.info.inputFormat())
  1300. {
  1301. case NlpUnicode:
  1302. if (validatorKind != ValidateIsString)
  1303. created.setown(new RegexValidateUniAsUniPattern(stripSeparator, idx));
  1304. else
  1305. created.setown(new RegexValidateUniAsAscPattern(stripSeparator, idx));
  1306. break;
  1307. case NlpUtf8:
  1308. if (validatorKind != ValidateIsString)
  1309. created.setown(new RegexValidateUtf8AsUniPattern(stripSeparator, idx));
  1310. else
  1311. created.setown(new RegexValidateUtf8AsAscPattern(stripSeparator, idx));
  1312. break;
  1313. case NlpAscii:
  1314. if (validatorKind != ValidateIsUnicode)
  1315. created.setown(new RegexValidateAscAsAscPattern(stripSeparator, idx));
  1316. else
  1317. created.setown(new RegexValidateAscAsUniPattern(stripSeparator, idx));
  1318. break;
  1319. }
  1320. break;
  1321. }
  1322. case no_pat_begintoken:
  1323. created.setown(new RegexBeginTokenPattern);
  1324. break;
  1325. case no_pat_endtoken:
  1326. created.setown(new RegexEndTokenPattern);
  1327. break;
  1328. case no_pat_x_before_y:
  1329. case no_pat_before_y:
  1330. created.setown(new RegexAssertNextPattern(expr->hasAttribute(notAtom)));
  1331. break;
  1332. case no_pat_x_after_y:
  1333. case no_pat_after_y:
  1334. {
  1335. unsigned minSize = limitedMult(querySubPattern()->limit.minLength, ctx.info.charSize);
  1336. unsigned maxSize = limitedMult(querySubPattern()->limit.maxLength, ctx.info.charSize);
  1337. created.setown(new RegexAssertPrevPattern(expr->hasAttribute(notAtom), minSize, maxSize));
  1338. break;
  1339. }
  1340. case no_pat_first:
  1341. created.setown(new RegexStartPattern);
  1342. break;
  1343. case no_pat_last:
  1344. created.setown(new RegexFinishPattern);
  1345. break;
  1346. case no_pat_anychar:
  1347. created.setown(new RegexAnyCharPattern);
  1348. break;
  1349. case no_penalty:
  1350. {
  1351. int penalty = 1;
  1352. if (expr->numChildren() > 0)
  1353. penalty = (int)expr->queryChild(0)->queryValue()->getIntValue();
  1354. created.setown(new RegexPenaltyPattern(penalty));
  1355. }
  1356. break;
  1357. case no_pat_set:
  1358. {
  1359. RegexSetBasePattern * pattern;
  1360. if (ctx.info.type != type_string)
  1361. #ifdef _USE_ICU
  1362. pattern = new RegexUnicodeSetPattern(caseSensitive);
  1363. #else
  1364. rtlThrowNoUnicode();
  1365. #endif
  1366. else if (caseSensitive)
  1367. pattern = new RegexAsciiSetPattern;
  1368. else
  1369. pattern = new RegexAsciiISetPattern;
  1370. created.setown(pattern);
  1371. if (expr->hasAttribute(notAtom))
  1372. pattern->setInvert(true);
  1373. ForEachChild(idx, expr)
  1374. {
  1375. IHqlExpression * child = expr->queryChild(idx);
  1376. switch (child->getOperator())
  1377. {
  1378. case no_range:
  1379. pattern->addRange((unsigned)child->queryChild(0)->queryValue()->getIntValue(), (unsigned)child->queryChild(1)->queryValue()->getIntValue());
  1380. break;
  1381. case no_constant:
  1382. pattern->addRange((unsigned)child->queryValue()->getIntValue(), (unsigned)child->queryValue()->getIntValue());
  1383. break;
  1384. case no_attr:
  1385. case no_attr_expr:
  1386. case no_attr_link:
  1387. break;
  1388. default:
  1389. UNIMPLEMENTED;
  1390. }
  1391. }
  1392. break;
  1393. }
  1394. case no_pat_const:
  1395. {
  1396. IValue * value = expr->queryChild(0)->queryValue();
  1397. Owned<IValue> castValue = getCastConstant(value, ctx.info.type);
  1398. unsigned len = castValue->queryType()->getStringLen();
  1399. switch (ctx.info.type)
  1400. {
  1401. case type_unicode:
  1402. {
  1403. #ifdef _USE_ICU
  1404. const UChar * data = (const UChar *)castValue->queryValue();
  1405. if (caseSensitive)
  1406. created.setown(new RegexUnicodePattern(len, data));
  1407. else
  1408. created.setown(new RegexUnicodeIPattern(len, data));
  1409. break;
  1410. #else
  1411. rtlThrowNoUnicode();
  1412. #endif
  1413. }
  1414. case type_utf8:
  1415. {
  1416. #ifdef _USE_ICU
  1417. const char * data = (const char *)castValue->queryValue();
  1418. if (caseSensitive)
  1419. created.setown(new RegexUtf8Pattern(len, data));
  1420. else
  1421. created.setown(new RegexUtf8IPattern(len, data));
  1422. break;
  1423. #else
  1424. rtlThrowNoUnicode();
  1425. #endif
  1426. }
  1427. case type_string:
  1428. {
  1429. const char * data = (const char *)castValue->queryValue();
  1430. if (caseSensitive)
  1431. created.setown(new RegexAsciiPattern(len, data));
  1432. else
  1433. created.setown(new RegexAsciiIPattern(len, data));
  1434. break;
  1435. }
  1436. default:
  1437. UNIMPLEMENTED;
  1438. }
  1439. break;
  1440. }
  1441. case no_pat_or:
  1442. case no_pat_follow:
  1443. case no_pat_separator:
  1444. //Nothing is generated for these....
  1445. break;
  1446. case no_pat_dfa:
  1447. {
  1448. RegexDfaPattern * pattern;
  1449. bool isToken = false;
  1450. HqlRegexExpr * child = queryChild(0);
  1451. if (child && child->getOperator() == no_pat_begintoken)
  1452. isToken = true;
  1453. if (options.type == type_unicode)
  1454. {
  1455. throwUnexpected();
  1456. pattern = new RegexUnicodeDfaPattern;
  1457. }
  1458. else
  1459. pattern = new RegexAsciiDfaPattern(isToken);
  1460. created.setown(pattern);
  1461. Owned<IDfaPattern> builder = pattern->createBuilder();
  1462. generateDFA(builder);
  1463. }
  1464. break;
  1465. case no_pat_repeat:
  1466. if (!isStandardRepeat())
  1467. {
  1468. if (expr->queryChild(0)->getOperator() == no_pat_anychar)
  1469. {
  1470. created.setown(new RegexRepeatAnyPattern(getRepeatMin(), getRepeatMax(), expr->hasAttribute(minimalAtom)));
  1471. queryNamed()->noGenerate = true;
  1472. }
  1473. else
  1474. created.setown(new RegexRepeatPattern(getRepeatMin(), getRepeatMax(), !expr->hasAttribute(minimalAtom)));
  1475. }
  1476. break;
  1477. case no_pat_endpattern:
  1478. switch (ctx.namedKind)
  1479. {
  1480. case no_pat_instance:
  1481. case no_pat_repeat:
  1482. created.setown(new RegexEndNestedPattern);
  1483. break;
  1484. case no_pat_checkin:
  1485. created.setown(new RegexDonePattern);
  1486. break;
  1487. case no_parse:
  1488. created.setown(new RegexDonePattern);
  1489. break;
  1490. default:
  1491. UNIMPLEMENTED;
  1492. }
  1493. break;
  1494. default:
  1495. UNIMPLEMENTED;
  1496. }
  1497. }
  1498. unsigned HqlRegexExpr::getAcceptId()
  1499. {
  1500. //Overloaded, and a bit nasty, but will do for the moment....
  1501. if (dfaScore == NO_DFA_SCORE)
  1502. return 99;
  1503. return dfaScore;
  1504. }
  1505. unsigned HqlRegexExpr::getRepeatMin()
  1506. {
  1507. return ::getRepeatMin(expr);
  1508. }
  1509. unsigned HqlRegexExpr::getRepeatMax()
  1510. {
  1511. return ::getRepeatMax(expr);
  1512. }
  1513. bool HqlRegexExpr::isStandardRepeat()
  1514. {
  1515. if (::isStandardRepeat(expr))
  1516. {
  1517. if (options.expandRepeatAnyAsDfa || expr->queryChild(0)->getOperator() != no_pat_anychar)
  1518. return true;
  1519. }
  1520. return false;
  1521. }
  1522. HqlRegexExpr * HqlRegexExpr::insertSeparators(IHqlExpression * separator, RegexContext * ctx)
  1523. {
  1524. switch (getOperator())
  1525. {
  1526. case no_pat_follow:
  1527. {
  1528. ForEachItemInRev(idx, args)
  1529. {
  1530. HqlRegexExpr & cur = args.item(idx);
  1531. if (cur.getOperator() == no_pat_endtoken)
  1532. {
  1533. if (args.item(idx-1).getOperator() != no_pat_last)
  1534. {
  1535. //Separators are slightly weird. Because they are implicit, the whole thing should
  1536. //be optional (unlike a name where just the contents are optional).
  1537. //Should probably remove the optionality from the separator's value, and
  1538. //put it on the surrounding separator instead.
  1539. HqlRegexExpr * sepRegex = createRegexExpr(options, no_pat_separator, NULL, caseSensitive);
  1540. sepRegex->addOperand(createRegexExpr(options, no_pat_beginseparator, NULL, caseSensitive));
  1541. sepRegex->addOperand(ctx->createStructure(separator, ctx->isCaseSensitive()));
  1542. sepRegex->addOperand(createRegexExpr(options, no_pat_endseparator, NULL, caseSensitive));
  1543. args.add(*sepRegex, idx+1);
  1544. }
  1545. }
  1546. }
  1547. break;
  1548. }
  1549. }
  1550. ForEachChild(idx, this)
  1551. {
  1552. replaceOperand(queryChild(idx)->insertSeparators(separator, ctx), idx);
  1553. }
  1554. return LINK(this);
  1555. }
  1556. bool HqlRegexExpr::isWorthConvertingToDFA()
  1557. {
  1558. switch (getOperator())
  1559. {
  1560. case no_pat_or: case no_pat_repeat:
  1561. case no_pat_follow:
  1562. return true;
  1563. }
  1564. ForEachItemIn(idx, args)
  1565. if (args.item(idx).isWorthConvertingToDFA())
  1566. return true;
  1567. return false;
  1568. }
  1569. bool HqlRegexExpr::isSimpleStringList()
  1570. {
  1571. switch (getOperator())
  1572. {
  1573. case no_pat_or:
  1574. case no_pat_case:
  1575. case no_pat_nocase:
  1576. break;
  1577. case no_pat_const:
  1578. return true;
  1579. default:
  1580. return false;
  1581. }
  1582. ForEachItemIn(idx, args)
  1583. if (!args.item(idx).isSimpleStringList())
  1584. return false;
  1585. return true;
  1586. }
  1587. void HqlRegexExpr::markDFAs(unsigned complexity)
  1588. {
  1589. if (dfaScore != NO_DFA_SCORE)
  1590. {
  1591. //Don't convert instances - convert the definition so it can be reused.
  1592. //and if not worth converting, then children won't be either.
  1593. if (getOperator() != no_pat_instance)
  1594. {
  1595. if (dfaScore <= complexity)
  1596. {
  1597. createDFA = isWorthConvertingToDFA();
  1598. return;
  1599. }
  1600. else if (isSimpleStringList())
  1601. {
  1602. createDFA = true;
  1603. return;
  1604. }
  1605. }
  1606. }
  1607. ForEachItemIn(idx, args)
  1608. args.item(idx).markDFAs(complexity);
  1609. if (queryNamed())
  1610. queryNamed()->markDFAs(complexity);
  1611. if (querySubPattern())
  1612. querySubPattern()->markDFAs(complexity);
  1613. }
  1614. HqlRegexExpr * HqlRegexExpr::mergeCreateSets()
  1615. {
  1616. ForEachItemIn(idx, args)
  1617. replaceOperand(args.item(idx).mergeCreateSets(), idx);
  1618. if (getOperator() == no_pat_or)
  1619. {
  1620. unsigned singleCount = 0;
  1621. unsigned setCount = 0;
  1622. unsigned nullCount = 0;
  1623. HqlExprArray setArgs;
  1624. HqlRegexExprArray newArgs;
  1625. ForEachItemIn(idx, args)
  1626. {
  1627. HqlRegexExpr * cur = &args.item(idx);
  1628. bool curIsOptional = false;
  1629. if ((cur->getOperator() == no_pat_repeat) && (cur->getRepeatMin() == 0) && (cur->getRepeatMax() == 1))
  1630. {
  1631. curIsOptional = true;
  1632. cur = cur->queryChild(0);
  1633. }
  1634. if (cur->caseSensitive == caseSensitive)
  1635. {
  1636. switch (cur->getOperator())
  1637. {
  1638. case no_null:
  1639. nullCount++;
  1640. break;
  1641. case no_pat_set:
  1642. {
  1643. setCount++;
  1644. ForEachChild(iset, cur->expr)
  1645. {
  1646. IHqlExpression * child = cur->expr->queryChild(iset);
  1647. if (setArgs.find(*child) == NotFound)
  1648. setArgs.append(*LINK(child));
  1649. }
  1650. if (curIsOptional) nullCount++;
  1651. break;
  1652. }
  1653. case no_pat_const:
  1654. {
  1655. ITypeInfo * type = cur->expr->queryChild(0)->queryType();
  1656. unsigned len = type->getStringLen();
  1657. if (len == 0)
  1658. nullCount++;
  1659. else if (len == 1)
  1660. {
  1661. const void * value = cur->expr->queryChild(0)->queryValue()->queryValue();
  1662. unsigned nextValue;
  1663. if (type->getTypeCode() == type_utf8)
  1664. nextValue = rtlUtf8Char(value);
  1665. else if (isUnicodeType(type))
  1666. nextValue = *(UChar *)value;
  1667. else
  1668. nextValue = *(unsigned char *)value;
  1669. OwnedHqlExpr nextExpr = createConstant((int)nextValue);
  1670. if (setArgs.find(*nextExpr) == NotFound)
  1671. setArgs.append(*nextExpr.getClear());
  1672. singleCount++;
  1673. if (curIsOptional) nullCount++;
  1674. }
  1675. else
  1676. newArgs.append(*LINK(cur));
  1677. break;
  1678. }
  1679. default:
  1680. newArgs.append(*LINK(cur));
  1681. break;
  1682. }
  1683. }
  1684. else
  1685. newArgs.append(*LINK(cur));
  1686. }
  1687. if ((setCount > 1) || (setCount == 1 && singleCount > 0) || (singleCount > 1))
  1688. {
  1689. OwnedHqlExpr newSetExpr = createValue(no_pat_set, makePatternType(), setArgs);
  1690. if (nullCount)
  1691. newSetExpr.setown(createValue(no_pat_repeat, makePatternType(), newSetExpr.getClear(), createConstant(0), createConstantOne()));
  1692. HqlRegexExpr * newSetRegex = createRegexExpr(options, newSetExpr, caseSensitive);
  1693. if (newArgs.ordinality() == 0)
  1694. return newSetRegex;
  1695. args.kill();
  1696. ForEachItemIn(idx, newArgs)
  1697. args.append(OLINK(newArgs.item(idx)));
  1698. args.append(*newSetRegex);
  1699. }
  1700. }
  1701. return LINK(this);
  1702. }
  1703. #if 0
  1704. HqlRegexExpr * HqlRegexExpr::removeTrivialInstances()
  1705. {
  1706. node_operator op = expr->getOperator();
  1707. switch (op)
  1708. {
  1709. case no_pat_instance:
  1710. {
  1711. IHqlExpression * def = expr->queryChild(0);
  1712. if (!queryNamed()->isMatched && def->getOperator() == no_pat_instance)
  1713. {
  1714. queryNamed()->removeUse();
  1715. return
  1716. #if 0
  1717. //Do this later - recursion means we can't do it yet
  1718. //Optimization whilst building the tree. If a named symbol is just a definition of another named symbol
  1719. //and this named symbol isn't matched explicitly, then expand it.
  1720. if (!isMatched(def) && def->getOperator() == no_pat_instance)
  1721. return createStructure(def);
  1722. else
  1723. }
  1724. }
  1725. }
  1726. }
  1727. #endif
  1728. #endif
  1729. //---------------------------------------------------------------------------
  1730. void HqlSimpleRegexExpr::analyseNullLeadTrail()
  1731. {
  1732. if (analysed)
  1733. return;
  1734. analysed = true;
  1735. switch (getOperator())
  1736. {
  1737. case no_null:
  1738. limit.minLength = 0;
  1739. limit.maxLength = 0;
  1740. break;
  1741. case no_pat_beginrecursive:
  1742. limit.minLength = 0;
  1743. limit.maxLength = 0;
  1744. limit.containsAssertion = true;
  1745. break;
  1746. case no_pat_beginpattern:
  1747. limit.minLength = 0;
  1748. limit.maxLength = 0;
  1749. break;
  1750. case no_pat_endpattern:
  1751. limit.minLength = 0;
  1752. limit.maxLength = 0;
  1753. limit.containsAssertion = true;
  1754. break;
  1755. case no_pat_singlechar:
  1756. limit.minLength = 1;
  1757. limit.maxLength = 1;
  1758. break;
  1759. }
  1760. }
  1761. void HqlSimpleRegexExpr::gatherLeading(HqlRegexUniqueArray & target)
  1762. {
  1763. switch (op)
  1764. {
  1765. case no_null:
  1766. case no_pat_beginrecursive:
  1767. case no_pat_beginpattern:
  1768. break;
  1769. case no_pat_endpattern:
  1770. case no_pat_singlechar:
  1771. target.append(OLINK(*this));
  1772. break;
  1773. default:
  1774. UNIMPLEMENTED;
  1775. }
  1776. }
  1777. void HqlSimpleRegexExpr::gatherTrailing(HqlRegexUniqueArray & target)
  1778. {
  1779. unsigned max = numTrailing();
  1780. for (unsigned i = 0; i < max; i++)
  1781. target.append(OLINK(queryTrailing(i)));
  1782. }
  1783. unsigned HqlSimpleRegexExpr::numTrailing() const
  1784. {
  1785. switch (getOperator())
  1786. {
  1787. case no_null:
  1788. case no_pat_endpattern:
  1789. return 0;
  1790. case no_pat_beginrecursive:
  1791. case no_pat_beginpattern:
  1792. case no_pat_singlechar:
  1793. return 1;
  1794. default:
  1795. UNIMPLEMENTED;
  1796. }
  1797. }
  1798. HqlRegexExpr & HqlSimpleRegexExpr::queryTrailing(unsigned idx)
  1799. {
  1800. switch (getOperator())
  1801. {
  1802. case no_pat_beginrecursive:
  1803. case no_pat_beginpattern:
  1804. case no_pat_singlechar:
  1805. if (idx == 0)
  1806. return *this;
  1807. break;
  1808. }
  1809. UNIMPLEMENTED;
  1810. }
  1811. //---------------------------------------------------------------------------
  1812. void HqlComplexRegexExpr::analyseNullLeadTrail()
  1813. {
  1814. if (analysed)
  1815. return;
  1816. analysed = true;
  1817. ForEachItemIn(idx, args)
  1818. args.item(idx).analyseNullLeadTrail();
  1819. if (queryNamed())
  1820. named->analyseNullLeadTrail();
  1821. switch (getOperator())
  1822. {
  1823. case no_null:
  1824. case no_pat_beginrecursive:
  1825. case no_pat_beginpattern:
  1826. case no_pat_endpattern:
  1827. case no_pat_singlechar:
  1828. //Should be handled by simpleRegex
  1829. UNIMPLEMENTED;
  1830. break;
  1831. case no_pat_instance:
  1832. limit = named->queryLimit();
  1833. limit.containsAssertion = true; // require us to walk it, even if internally matches nothing.
  1834. leading.append(*LINK(this));
  1835. trailing.append(*LINK(this));
  1836. break;
  1837. case no_pat_dfa:
  1838. limit = named->queryLimit();
  1839. limit.containsAssertion = true; // this means that it will process optional values
  1840. leading.append(*LINK(this));
  1841. trailing.append(*LINK(this));
  1842. break;
  1843. case no_pat_endrecursive:
  1844. case no_pat_x_before_y:
  1845. case no_pat_before_y:
  1846. case no_pat_x_after_y:
  1847. case no_pat_after_y:
  1848. case no_pat_first:
  1849. case no_pat_last:
  1850. case no_pat_begintoken:
  1851. case no_pat_endtoken:
  1852. case no_pat_begincheck:
  1853. case no_pat_endcheckin:
  1854. case no_pat_endchecklength:
  1855. case no_pat_beginseparator:
  1856. case no_pat_endseparator:
  1857. case no_pat_beginvalidate:
  1858. case no_pat_endvalidate:
  1859. case no_penalty:
  1860. limit.containsAssertion = true;
  1861. limit.minLength = 0;
  1862. limit.maxLength = 0;
  1863. leading.append(*LINK(this));
  1864. trailing.append(*LINK(this));
  1865. break;
  1866. case no_pat_anychar:
  1867. case no_pat_set:
  1868. limit.minLength = 1;
  1869. limit.maxLength = 1;
  1870. leading.append(*LINK(this));
  1871. trailing.append(*LINK(this));
  1872. break;
  1873. case no_pat_const:
  1874. {
  1875. unsigned len = expr->queryChild(0)->queryType()->getStringLen();
  1876. limit.minLength = len;
  1877. limit.maxLength = len;
  1878. if (len != 0)
  1879. {
  1880. leading.append(*LINK(this));
  1881. trailing.append(*LINK(this));
  1882. }
  1883. }
  1884. break;
  1885. case no_pat_or:
  1886. {
  1887. limit.minLength = PATTERN_UNLIMITED_LENGTH;
  1888. limit.maxLength = 0;
  1889. limit.containsAssertion = true;
  1890. ForEachItemIn(idx, args)
  1891. {
  1892. HqlRegexExpr & cur = args.item(idx);
  1893. if (cur.limit.minLength < limit.minLength)
  1894. limit.minLength = cur.limit.minLength;
  1895. if (cur.limit.maxLength > limit.maxLength)
  1896. limit.maxLength = cur.limit.maxLength;
  1897. if (cur.limit.canBeNull())
  1898. limit.containsAssertion = false;
  1899. cur.gatherLeading(leading);
  1900. cur.gatherTrailing(trailing);
  1901. }
  1902. break;
  1903. }
  1904. case no_pat_separator:
  1905. {
  1906. assertex(args.ordinality() == 3);
  1907. limit = args.item(1).limit;
  1908. leading.append(OLINK(args.item(0)));
  1909. trailing.append(OLINK(args.item(2)));
  1910. break;
  1911. }
  1912. case no_pat_follow:
  1913. {
  1914. limit.minLength = 0;
  1915. limit.maxLength = 0;
  1916. ForEachItemIn(idx0, args)
  1917. {
  1918. HqlRegexExpr & cur = args.item(idx0);
  1919. limit.minLength = limitedAdd(limit.minLength, cur.limit.minLength);
  1920. limit.maxLength = limitedAdd(limit.maxLength, cur.limit.maxLength);
  1921. if (cur.limit.containsAssertion)
  1922. limit.containsAssertion = true;
  1923. }
  1924. ForEachItemIn(idx1, args)
  1925. {
  1926. HqlRegexExpr & cur = args.item(idx1);
  1927. cur.gatherLeading(leading);
  1928. if (!cur.canBeNull())
  1929. break;
  1930. }
  1931. ForEachItemInRev(idx2, args)
  1932. {
  1933. HqlRegexExpr & cur = args.item(idx2);
  1934. cur.gatherTrailing(trailing);
  1935. if (!cur.canBeNull())
  1936. break;
  1937. }
  1938. break;
  1939. }
  1940. case no_pat_repeat:
  1941. {
  1942. if (isStandardRepeat())
  1943. {
  1944. HqlRegexExpr & arg = args.item(0);
  1945. limit.minLength = limitedMult(getRepeatMin(), arg.limit.minLength);
  1946. limit.maxLength = limitedMult(getRepeatMax(), arg.limit.maxLength);
  1947. if (getRepeatMin() != 0) limit.containsAssertion = arg.limit.containsAssertion;
  1948. arg.gatherLeading(leading);
  1949. arg.gatherTrailing(trailing);
  1950. }
  1951. else
  1952. {
  1953. limit.minLength = limitedMult(getRepeatMin(), named->getMinLength());
  1954. limit.maxLength = limitedMult(getRepeatMax(), named->getMaxLength());
  1955. //if (getRepeatMin() != 0) limit.containsAssertion = named->queryLimit().containsAssertion;
  1956. limit.containsAssertion = true;
  1957. leading.append(*LINK(this));
  1958. trailing.append(*LINK(this));
  1959. }
  1960. break;
  1961. }
  1962. case no_pat_pattern:
  1963. // Should have been converted by the time we get here...
  1964. default:
  1965. UNIMPLEMENTED;
  1966. break;
  1967. }
  1968. }
  1969. void HqlComplexRegexExpr::cleanup()
  1970. {
  1971. if (cleaned)
  1972. return;
  1973. HqlRegexExpr::cleanup();
  1974. trailing.kill();
  1975. leading.kill();
  1976. subPattern.clear();
  1977. }
  1978. void HqlComplexRegexExpr::gatherLeading(HqlRegexUniqueArray & target)
  1979. {
  1980. inherit(target, leading);
  1981. }
  1982. void HqlComplexRegexExpr::gatherTrailing(HqlRegexUniqueArray & target)
  1983. {
  1984. inherit(target, trailing);
  1985. }
  1986. //---------------------------------------------------------------------------
  1987. // DFA helper classes.
  1988. inline int compareHqlRegexExpr(HqlRegexExpr * left, HqlRegexExpr * right)
  1989. {
  1990. unique_id_t idl = left->queryUid();
  1991. unique_id_t idr = right->queryUid();
  1992. return (idl < idr) ? -1 : (idl > idr) ? +1 : 0;
  1993. }
  1994. static int compareHqlRegexExpr(CInterface * const * left, CInterface * const * right)
  1995. {
  1996. return compareHqlRegexExpr((HqlRegexExpr *)*left, (HqlRegexExpr *)*right);
  1997. }
  1998. void HqlRegexExprSet::add(HqlRegexExpr * value)
  1999. {
  2000. bool isNew;
  2001. CInterface * castValue = value;
  2002. values.bAdd(castValue, compareHqlRegexExpr, isNew);
  2003. }
  2004. int HqlRegexExprSet::compare(const HqlRegexExprSet & other) const
  2005. {
  2006. unsigned numThis = values.ordinality();
  2007. unsigned numOther = other.values.ordinality();
  2008. unsigned numCommon = std::min(numThis, numOther);
  2009. for (unsigned i = 0; i < numCommon; i++)
  2010. {
  2011. HqlRegexExpr & left = values.item(i);
  2012. HqlRegexExpr & right = other.values.item(i);
  2013. int c = compareHqlRegexExpr(&left, &right);
  2014. if (c)
  2015. return c;
  2016. }
  2017. if (numThis > numOther)
  2018. return +1;
  2019. else if (numThis < numOther)
  2020. return -1;
  2021. else
  2022. return 0;
  2023. }
  2024. bool HqlRegexExprSet::equals(const HqlRegexExprSet & other) const
  2025. {
  2026. unsigned numThis = values.ordinality();
  2027. unsigned numOther = other.values.ordinality();
  2028. if (numThis != numOther)
  2029. return false;
  2030. for (unsigned i = 0; i < numThis; i++)
  2031. {
  2032. if (&values.item(i) != &other.values.item(i))
  2033. return false;
  2034. }
  2035. return true;
  2036. }
  2037. bool HqlDfaState::isAccepting()
  2038. {
  2039. ForEachItemIn(idx, position)
  2040. if (position.item(idx).getOperator() == no_pat_endpattern)
  2041. return true;
  2042. return false;
  2043. }
  2044. static inline int compareState(HqlDfaState * left, HqlDfaState * right)
  2045. {
  2046. return left->position.compare(right->position);
  2047. }
  2048. static int compareState(CInterface * const * left, CInterface * const * right)
  2049. {
  2050. return compareState((HqlDfaState *)*left, (HqlDfaState *)*right);
  2051. }
  2052. void HqlRegexExpr::generateDFAs()
  2053. {
  2054. if (op != no_pat_dfa)
  2055. {
  2056. ForEachItemIn(idx, args)
  2057. args.item(idx).generateDFAs();
  2058. return;
  2059. }
  2060. //Adapted from Dragon p141 Fig 3.44
  2061. //Main variation is that the set of potential symbols is calculated first, rather than trying the whole alphabet
  2062. //it might also have character-classes e.g., [[:alpha:]] in the stream.
  2063. dfa = new HqlDfaInfo;
  2064. HqlDfaState * firstState = new HqlDfaState(0);
  2065. HqlRegexExpr * first = queryNamed()->def->queryChild(0);
  2066. ForEachItemIn(idx, first->following)
  2067. firstState->position.add(&first->following.item(idx));
  2068. //Store a list of states, and an ordered list. First marks which have been processed,
  2069. //second gives efficient duplicate detection.
  2070. dfa->states.append(*firstState);
  2071. dfa->orderedStates.append(*LINK(firstState));
  2072. unsigned curStateIndex = 0;
  2073. while (curStateIndex < dfa->states.ordinality())
  2074. {
  2075. HqlDfaState & curState = dfa->states.item(curStateIndex++);
  2076. //First gather the potential symbols that come next (otherwise we would die with unicode!)
  2077. //it also speeds up ascii by a fairly large factor on sets of strings.
  2078. SymbolArray nextSymbols;
  2079. ForEachItemIn(ip, curState.position)
  2080. curState.position.item(ip).gatherConsumeSymbols(nextSymbols);
  2081. //For each symbol, work out what combination of states we could become translated to.
  2082. SymbolArrayIterator iter(nextSymbols);
  2083. ForEach(iter)
  2084. {
  2085. unsigned curSymbol = iter.get();
  2086. HqlDfaState * nextState = new HqlDfaState(dfa->states.ordinality());
  2087. //For each NFA state we could be in, what new NFA state could be now be in...
  2088. ForEachItemIn(ip, curState.position)
  2089. {
  2090. HqlRegexExpr & curRegex = curState.position.item(ip);
  2091. if (curRegex.canConsume(curSymbol))
  2092. {
  2093. ForEachItemIn(idx2, curRegex.following)
  2094. nextState->position.add(&curRegex.following.item(idx2));
  2095. }
  2096. }
  2097. //If no states, then don't bother adding a transition - we're done..
  2098. if (nextState->position.ordinality())
  2099. {
  2100. bool isNew = false;
  2101. CInterface * castState = nextState;
  2102. unsigned matchIndex = dfa->orderedStates.bAdd(castState, compareState, isNew);
  2103. if (isNew)
  2104. dfa->states.append(*LINK(nextState));
  2105. else
  2106. nextState->Release();
  2107. HqlDfaState * matchedState = &dfa->orderedStates.item(matchIndex);
  2108. //Finally associate a transition for this symbol...
  2109. curState.transitions.append(*new HqlDfaTransition(curSymbol, matchedState));
  2110. }
  2111. else
  2112. nextState->Release();
  2113. }
  2114. }
  2115. }
  2116. /**
  2117. Some unicode pseudo code
  2118. ForEach(ip in curState.position)
  2119. {
  2120. position(ip).getConsumed(consumeSet);
  2121. positionSet = [ip];
  2122. if (consumeSet.ordinality())
  2123. Merge(consumeSet, 0, positionSet);
  2124. }
  2125. Merge(consumeSet, first, positionSet)
  2126. {
  2127. for (i = first; i < sets.ordinality(); i++)
  2128. {
  2129. diff = intersect(sets(i).consume, consumeSet);
  2130. if (diff.ordinality())
  2131. {
  2132. rightOnly = sets(i).consume - diff;
  2133. if (rightOnly.ordinality() == 0)
  2134. sets(i).positions.union(positionSet);
  2135. else
  2136. {
  2137. sets(i).consume = rightOnly;
  2138. Merge(diff, i+1, union(sets(i).positions, positionSet);
  2139. }
  2140. newOnly = consumeSet - diff;
  2141. if (newOnly = [])
  2142. return;
  2143. consumeSet = newOnly;
  2144. }
  2145. }
  2146. sets.append(consumeSet, positionSet);
  2147. }
  2148. Output would be a set of disjoint sets, together with a lits of positions that are matched by those sets.
  2149. They could then be used to generate the tables required by a dfa matcher list of
  2150. low, high, target-state
  2151. which was binary chopped.
  2152. It might be better to not use the icu sets, except for generating a set of ranges, and handle everything else ourselves.
  2153. Possibly a set of [low,high,positions], or maybe even [low, high, position], sorted by low
  2154. with some clever code to walk through and retain lists of which ones are active.
  2155. **/
  2156. //---------------------------------------------------------------------------
  2157. RegexContext::RegexContext(IHqlExpression * _expr, IWorkUnit * _wu, const HqlCppOptions & _options, byte _algorithm) : NlpParseContext(_expr, _wu, _options), parser(NULL, _algorithm)
  2158. {
  2159. info.addedSeparators = false;
  2160. switch (info.type)
  2161. {
  2162. case type_string: info.dfaComplexity = DEFAULT_DFA_COMPLEXITY; break;
  2163. case type_utf8: info.dfaComplexity = DEFAULT_UTF8_DFA_COMPLEXITY; break;
  2164. case type_unicode: info.dfaComplexity = DEFAULT_UNICODE_DFA_COMPLEXITY; break;
  2165. }
  2166. if (_options.parseDfaComplexity != (unsigned)-1)
  2167. info.dfaComplexity = _options.parseDfaComplexity;
  2168. info.expandRepeatAnyAsDfa = _options.expandRepeatAnyAsDfa;
  2169. createLexer = false;
  2170. }
  2171. RegexContext::~RegexContext()
  2172. {
  2173. ForEachItemIn(idx, named)
  2174. named.item(idx).cleanup();
  2175. }
  2176. HqlNamedRegex * RegexContext::queryNamed(IHqlExpression * defn, IAtom * name, node_operator op, bool caseSensitive)
  2177. {
  2178. ForEachItemIn(idx, named)
  2179. {
  2180. HqlNamedRegex & cur = named.item(idx);
  2181. if (cur.matches(defn, name, caseSensitive))
  2182. {
  2183. assertex(cur.matches(defn, name, op, caseSensitive));
  2184. return &cur;
  2185. }
  2186. }
  2187. return NULL;
  2188. }
  2189. HqlNamedRegex * RegexContext::createNamed(IHqlExpression * expr, IAtom * name, node_operator op, bool caseSensitive)
  2190. {
  2191. LinkedHqlExpr searchExpr = expr;
  2192. if (op != no_pat_instance)
  2193. //Create an expression that should never clash with anything we would find in the parse tree
  2194. searchExpr.setown(createValue(op, expr->getType(), LINK(expr), createAttribute(tempAtom)));
  2195. HqlNamedRegex * match = queryNamed(searchExpr, name, op, caseSensitive);
  2196. if (match)
  2197. match->addUse();
  2198. else
  2199. {
  2200. match = new HqlNamedRegex(expr, name, searchExpr, op, caseSensitive, isMatched(searchExpr, name));
  2201. named.append(*match); // add to list first to avoid recursion problems.
  2202. match->setRegexOwn(createStructure(expr, caseSensitive));
  2203. }
  2204. return match;
  2205. }
  2206. static HqlRegexExpr * createFollow(const ParseInformation & options, HqlRegexExpr * a1, HqlRegexExpr * a2, HqlRegexExpr * a3, bool caseSensitive)
  2207. {
  2208. HqlRegexExpr * follow = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
  2209. follow->addOperand(a1);
  2210. follow->addOperand(a2);
  2211. if (a3)
  2212. follow->addOperand(a3);
  2213. return follow;
  2214. }
  2215. HqlRegexExpr * RegexContext::createFollow(HqlRegexExpr * start, IHqlExpression * body, node_operator endOp, IHqlExpression * endExpr, bool caseSensitive)
  2216. {
  2217. HqlRegexExpr * next = createStructure(body, caseSensitive);
  2218. HqlRegexExpr * finish = createRegexExpr(info, endOp, endExpr, caseSensitive);
  2219. return ::createFollow(info, start, next, finish, caseSensitive);
  2220. }
  2221. HqlRegexExpr * RegexContext::createStructure(IHqlExpression * expr, bool caseSensitive)
  2222. {
  2223. node_operator op = expr->getOperator();
  2224. switch (op)
  2225. {
  2226. case no_pat_production:
  2227. throwError(HQLERR_RegexNoTransformSupport);
  2228. case no_pat_instance:
  2229. {
  2230. IHqlExpression * def = expr->queryChild(0);
  2231. if (createLexer)
  2232. return createStructure(def, caseSensitive);
  2233. Owned<HqlRegexExpr> instance = createRegexExpr(info, expr, caseSensitive);
  2234. IHqlExpression * nameExpr = expr->queryChild(1);
  2235. instance->setNamed(createNamed(def, nameExpr ? nameExpr->queryName() : NULL, op, caseSensitive));
  2236. return instance.getClear();
  2237. }
  2238. case no_pat_guard:
  2239. return createRegexExpr(info, no_null, NULL, caseSensitive);
  2240. case no_attr:
  2241. case no_attr_expr:
  2242. case no_attr_link:
  2243. return NULL;
  2244. case no_implicitcast:
  2245. //can sometimes occur when parameters are bound
  2246. return createStructure(expr->queryChild(0), caseSensitive);
  2247. case no_pat_const:
  2248. if (createLexer)
  2249. return expandStringAsChars(expr, info, caseSensitive);
  2250. return createRegexExpr(info, expr, caseSensitive);
  2251. case no_pat_set:
  2252. case no_penalty:
  2253. //Don't walk children...
  2254. return createRegexExpr(info, expr, caseSensitive);
  2255. case no_pat_x_before_y:
  2256. case no_pat_x_after_y:
  2257. {
  2258. Owned<HqlRegexExpr> pattern = createStructure(expr->queryChild(0), caseSensitive);
  2259. Owned<HqlRegexExpr> check = createRegexExpr(info, expr, caseSensitive);
  2260. check->setSubPattern(createNamed(expr->queryChild(1), NULL, no_pat_checkin, caseSensitive));
  2261. if (op == no_pat_x_before_y)
  2262. return ::createFollow(info, pattern.getClear(), check.getClear(), NULL, caseSensitive);
  2263. else
  2264. return ::createFollow(info, check.getClear(), pattern.getClear(), NULL, caseSensitive);
  2265. }
  2266. case no_pat_before_y:
  2267. case no_pat_after_y:
  2268. {
  2269. Owned<HqlRegexExpr> check = createRegexExpr(info, expr, caseSensitive);
  2270. check->setSubPattern(createNamed(expr->queryChild(0), NULL, no_pat_checkin, caseSensitive));
  2271. return check.getClear();
  2272. }
  2273. case no_pat_checkin:
  2274. {
  2275. Owned<HqlRegexExpr> start = createRegexExpr(info, no_pat_begincheck, expr, caseSensitive);
  2276. Owned<HqlRegexExpr> next = createStructure(expr->queryChild(0), caseSensitive);
  2277. Owned<HqlRegexExpr> finish = createRegexExpr(info, no_pat_endcheckin, expr, caseSensitive);
  2278. finish->setSubPattern(createNamed(expr->queryChild(1), NULL, no_pat_checkin, caseSensitive));
  2279. return ::createFollow(info, start.getClear(), next.getClear(), finish.getClear(), caseSensitive);
  2280. }
  2281. case no_pat_checklength:
  2282. {
  2283. HqlRegexExpr * start = createRegexExpr(info, no_pat_begincheck, expr, caseSensitive);
  2284. return createFollow(start, expr->queryChild(0), no_pat_endchecklength, expr, caseSensitive);
  2285. }
  2286. case no_pat_validate:
  2287. {
  2288. HqlRegexExpr * start = createRegexExpr(info, no_pat_beginvalidate, expr, caseSensitive);
  2289. return createFollow(start, expr->queryChild(0), no_pat_endvalidate, expr, caseSensitive);
  2290. }
  2291. case no_pat_token:
  2292. case no_pat_imptoken:
  2293. {
  2294. IHqlExpression * child = expr->queryChild(0);
  2295. //MORE: Covert into a function to allow multiple values of following:
  2296. switch (child->getOperator())
  2297. {
  2298. case no_null:
  2299. case no_pat_first:
  2300. case no_pat_last:
  2301. return createStructure(child, caseSensitive);
  2302. case no_penalty:
  2303. case no_pat_x_before_y:
  2304. case no_pat_before_y:
  2305. case no_pat_x_after_y:
  2306. case no_pat_after_y:
  2307. case no_pat_guard:
  2308. return createStructure(child, caseSensitive);
  2309. case no_pat_const:
  2310. // if (!info.separator)
  2311. // return createStructure(child);
  2312. break;
  2313. }
  2314. HqlRegexExpr * start = createRegexExpr(info, no_pat_begintoken, expr, caseSensitive);
  2315. return createFollow(start, expr->queryChild(0), no_pat_endtoken, expr, caseSensitive);
  2316. }
  2317. case no_pat_repeat:
  2318. {
  2319. HqlRegexExpr * ret = createRegexExpr(info, expr, caseSensitive);
  2320. IHqlExpression * repeated = expr->queryChild(0);
  2321. if (ret->isStandardRepeat())
  2322. ret->addOperand(createStructure(repeated, caseSensitive));
  2323. else
  2324. ret->setNamed(createNamed(repeated, NULL, op, caseSensitive));
  2325. return ret;
  2326. }
  2327. case no_pat_or:
  2328. case no_pat_follow:
  2329. {
  2330. //Expand these out as much as possible....
  2331. HqlExprArray args;
  2332. expr->unwindList(args, op);
  2333. OwnedHqlExpr flatExpr = expr->clone(args);
  2334. Owned<HqlRegexExpr> ret = createRegexExpr(info, flatExpr, caseSensitive);
  2335. ForEachChild(idx, flatExpr)
  2336. {
  2337. HqlRegexExpr * child = createStructure(flatExpr->queryChild(idx), caseSensitive);
  2338. if (child)
  2339. ret->addOperand(child);
  2340. }
  2341. return ret.getClear();
  2342. }
  2343. case no_pat_case:
  2344. case no_pat_nocase:
  2345. return createStructure(expr->queryChild(0), op==no_pat_case);
  2346. case no_pat_featureactual:
  2347. throwError(HQLERR_RegexFeatureNotSupport);
  2348. }
  2349. Owned<HqlRegexExpr> ret = createRegexExpr(info, expr, caseSensitive);
  2350. ForEachChild(idx, expr)
  2351. {
  2352. HqlRegexExpr * child = createStructure(expr->queryChild(idx), caseSensitive);
  2353. if (child)
  2354. ret->addOperand(child);
  2355. }
  2356. return ret.getClear();
  2357. }
  2358. void RegexContext::buildStructure()
  2359. {
  2360. IHqlExpression * grammar = expr->queryChild(2);
  2361. assertex(grammar->getOperator() == no_pat_instance);
  2362. OwnedHqlExpr structure = LINK(grammar);//createValue(no_pat_instance, makeRuleType(NULL), LINK(grammar), LINK(grammar->queryChild(1)));
  2363. HqlRegexExpr * rootRegex = createStructure(structure, isCaseSensitive());
  2364. root.setown(new HqlNamedRegex(structure, internalAtom, structure, no_parse, isCaseSensitive(), false));
  2365. root->setRegexOwn(rootRegex);
  2366. named.append(*LINK(root));
  2367. }
  2368. void RegexContext::expandRecursion()
  2369. {
  2370. //First add all the symbols referenced by the use() attributes on the parse so they can be matched.
  2371. ForEachChild(idx, expr)
  2372. {
  2373. IHqlExpression * cur = expr->queryChild(idx);
  2374. if (cur->getOperator() == no_pat_use)
  2375. {
  2376. //NB: Does not return a linked item.
  2377. HqlNamedRegex * named = createNamed(cur->queryChild(0), NULL, no_pat_instance, true);
  2378. named->removeUse(); // Not actually used at the moment...
  2379. HqlNamedRegex * named2 = createNamed(cur->queryChild(0), NULL, no_pat_instance, false);
  2380. named2->removeUse(); // Not actually used at the moment...
  2381. }
  2382. }
  2383. root->expandRecursion(*this, NULL);
  2384. ForEachItemInRev(idx2, named)
  2385. if (!named.item(idx2).queryExpandedRecursion())
  2386. named.remove(idx2);
  2387. }
  2388. void RegexContext::insertSeparators()
  2389. {
  2390. if (info.separator)
  2391. {
  2392. ForEachItemIn(idx, named)
  2393. named.item(idx).insertSeparators(info.separator, this);
  2394. //add separator/first onto the front of root.
  2395. info.addedSeparators = true;
  2396. }
  2397. }
  2398. void RegexContext::optimizeSpotDFA()
  2399. {
  2400. if (info.dfaComplexity > 0)
  2401. {
  2402. ForEachItemIn(idx, named)
  2403. named.item(idx).calcDfaScore();
  2404. root->markDFAs(info.dfaComplexity);
  2405. ForEachItemIn(idx2, named)
  2406. named.item(idx2).createDFAs(*this);
  2407. }
  2408. }
  2409. void RegexContext::optimizePattern()
  2410. {
  2411. ForEachItemIn(idx1, named)
  2412. named.item(idx1).mergeCreateSets();
  2413. root->expandNamedSymbols();
  2414. ForEachItemInRev(idx2, named)
  2415. {
  2416. if (!named.item(idx2).isUsed())
  2417. {
  2418. //Can't delete it otherwise everything gets cleaned up...
  2419. deadNamed.append(named.item(idx2));
  2420. named.remove(idx2, true);
  2421. }
  2422. }
  2423. optimizeSpotDFA();
  2424. }
  2425. HqlNamedRegex * RegexContext::queryDefine(IHqlExpression * defineName, bool caseSensitive)
  2426. {
  2427. ForEachItemIn(idx, named)
  2428. {
  2429. HqlNamedRegex & cur = named.item(idx);
  2430. if (cur.matchesDefine(defineName,caseSensitive))
  2431. return &cur;
  2432. }
  2433. StringBuffer s;
  2434. defineName->toString(s);
  2435. throwError1(HQLERR_DefineUseStrNotFound, s.str());
  2436. return NULL;
  2437. }
  2438. void RegexContext::analysePattern()
  2439. {
  2440. //This conversion is based around the description in the Dragon book:
  2441. //3.9 From a regular expression to a DFA
  2442. //even though we don't always convert it, the steps form a useful algorithm
  2443. ForEachItemIn(idx0, named)
  2444. named.item(idx0).addBeginEnd(info);
  2445. ForEachItemIn(idx1, named)
  2446. named.item(idx1).analyseNullLeadTrail();
  2447. ForEachItemIn(idx2, named)
  2448. named.item(idx2).calculateNext();
  2449. ForEachItemIn(idx3, named)
  2450. named.item(idx3).generateDFAs();
  2451. }
  2452. void RegexContext::generateRegex()
  2453. {
  2454. parser.addedSeparators = info.addedSeparators;
  2455. setParserOptions(parser);
  2456. GenerateRegexCtx ctx(*this, info, idAllocator);
  2457. ForEachItemIn(idx0, named)
  2458. {
  2459. HqlNamedRegex & cur = named.item(idx0);
  2460. cur.generateRegex(ctx);
  2461. }
  2462. parser.grammar.set(root->queryRootPattern());
  2463. parser.minPatternLength = root->getMinLength();
  2464. }
  2465. //void RegexContext::removeTrivialInstances()
  2466. //{
  2467. // ForEachItemIn(idx, named)
  2468. // named.item(idx).removeTrivialInstances();
  2469. //}
  2470. void RegexContext::compileSearchPattern()
  2471. {
  2472. checkValidMatches();
  2473. buildStructure();
  2474. expandRecursion();
  2475. // removeTrivialInstances();
  2476. insertSeparators();
  2477. optimizePattern();
  2478. analysePattern();
  2479. generateRegex();
  2480. compileMatched(parser);
  2481. }
  2482. void RegexContext::getDebugText(StringBuffer & s, unsigned detail)
  2483. {
  2484. NlpParseContext::getDebugText(s, detail);
  2485. regexToXml(s, parser.grammar, detail);
  2486. }
  2487. NlpParseContext * createRegexContext(IHqlExpression * expr, IWorkUnit * wu, const HqlCppOptions & options, byte algorithm)
  2488. {
  2489. return new RegexContext(expr, wu, options, algorithm);
  2490. }
  2491. //-- Lexer creation
  2492. void RegexContext::beginLexer()
  2493. {
  2494. createLexer = true;
  2495. info.expandRepeatAnyAsDfa = true;
  2496. OwnedHqlExpr searchExpr = createValue(no_pat_dfa, makePatternType(), createUniqueId());
  2497. lexerNamed.setown(new HqlNamedRegex(searchExpr, NULL, searchExpr, no_pat_dfa, isCaseSensitive(), false));
  2498. lexerOr.setown(createRegexExpr(info, no_pat_or, NULL, isCaseSensitive()));
  2499. lexerNamed->setRegexOwn(LINK(lexerOr));
  2500. lexerRoot.setown(createRegexExpr(info, no_pat_dfa, NULL, isCaseSensitive()));
  2501. lexerRoot->setNamed(lexerNamed);
  2502. root.setown(new HqlNamedRegex(expr, NULL, expr, no_parse, isCaseSensitive(), false));
  2503. root->setRegexOwn(LINK(lexerRoot));
  2504. named.append(*LINK(lexerNamed));
  2505. named.append(*LINK(root));
  2506. }
  2507. void RegexContext::addLexerToken(unsigned id, IHqlExpression * pattern)
  2508. {
  2509. HqlRegexExpr * token = createStructure(pattern, isCaseSensitive());
  2510. HqlRegexExpr * end = createRegexExpr(info, no_pat_endpattern, NULL, isCaseSensitive());
  2511. end->setAcceptId(id);
  2512. if (token->getOperator() == no_pat_follow)
  2513. token->addOperand(end);
  2514. else
  2515. {
  2516. HqlRegexExpr * follow = createRegexExpr(info, no_pat_follow, NULL, isCaseSensitive());
  2517. follow->addOperand(token);
  2518. follow->addOperand(end);
  2519. token = follow;
  2520. }
  2521. lexerOr->addOperand(token);
  2522. }
  2523. void RegexContext::generateLexer(IDfaPattern * builder)
  2524. {
  2525. //MORE: Need to call some elements of optimizePattern() to expand limited repeats.
  2526. analysePattern();
  2527. lexerRoot->generateDFA(builder);
  2528. }
  2529. /*
  2530. ToDo:
  2531. o Should move some of the logic from HqlregexExpr to complex + remove the queyNamed() etc. virtuals.
  2532. */