hqlregex.cpp 88 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #ifndef U_OVERRIDE_CXX_ALLOCATION
  14. #define U_OVERRIDE_CXX_ALLOCATION 0 // Enabling this forces all allocation of ICU objects to ICU's heap, but is incompatible with jmemleak
  15. #endif
  16. #include <algorithm>
  17. #include "jliball.hpp"
  18. #include "eclrtl.hpp"
  19. #include "hqlexpr.hpp"
  20. #include "hqlcerrors.hpp"
  21. #include "hqlutil.hpp"
  22. #include "hqlregex.ipp"
  23. #include "hqlhtcpp.ipp"
  24. #include "hqlcatom.hpp"
  25. #include "hqlcpputil.hpp"
  26. #include "thorralgo.ipp"
  27. #include "hqlthql.hpp"
  28. #include "unicode/uchar.h"
  29. //#define NEW_DFA_CALC
  30. //#define DEFAULT_DFA_COMPLEXITY 0
  31. #define DEFAULT_UNICODE_DFA_COMPLEXITY 0
  32. #define DEFAULT_UTF8_DFA_COMPLEXITY 2000
  33. #define DEFAULT_DFA_COMPLEXITY 10000
  34. inline unsigned addWithoutOverflow(unsigned x, unsigned y)
  35. {
  36. if (UINT_MAX - x <= y)
  37. return UINT_MAX;
  38. return x + y;
  39. }
  40. inline unsigned multiplyWithoutOverflow(unsigned x, unsigned y)
  41. {
  42. if (UINT_MAX / x <= y)
  43. return UINT_MAX;
  44. return x * y;
  45. }
  46. //===========================================================================
  47. static IValue * getCastConstant(IValue * value, type_t tc)
  48. {
  49. switch (tc)
  50. {
  51. case type_unicode:
  52. return value->castTo(unknownUnicodeType);
  53. case type_utf8:
  54. return value->castTo(unknownUtf8Type);
  55. case type_string:
  56. return value->castTo(unknownStringType);
  57. }
  58. throwUnexpected();
  59. }
  60. static HqlRegexExpr * createRegexExpr(const ParseInformation & options, IHqlExpression * expr, bool caseSensitive)
  61. {
  62. switch (expr->getOperator())
  63. {
  64. case no_null:
  65. case no_pat_beginrecursive:
  66. case no_pat_beginpattern:
  67. case no_pat_endpattern:
  68. case no_pat_singlechar:
  69. return new HqlSimpleRegexExpr(options, expr, caseSensitive);
  70. }
  71. return new HqlComplexRegexExpr(options, expr, caseSensitive);
  72. }
  73. static HqlRegexExpr * createRegexExpr(const ParseInformation & options, node_operator op, IHqlExpression * expr, bool caseSensitive)
  74. {
  75. switch (op)
  76. {
  77. case no_null:
  78. case no_pat_beginrecursive:
  79. case no_pat_beginpattern:
  80. case no_pat_endpattern:
  81. case no_pat_singlechar:
  82. return new HqlSimpleRegexExpr(options, op, expr, caseSensitive);
  83. }
  84. return new HqlComplexRegexExpr(options, op, expr, caseSensitive);
  85. }
  86. //---------------------------------------------------------------------------
  87. void HqlRegexHashTable::onAdd(void *et)
  88. {
  89. ((HqlRegexExpr*)et)->Link();
  90. }
  91. void HqlRegexHashTable::onRemove(void *et)
  92. {
  93. ((HqlRegexExpr*)et)->Release();
  94. }
  95. unsigned HqlRegexHashTable::getHashFromElement(const void *et) const
  96. {
  97. return ((HqlRegexExpr*)et)->getHash();
  98. }
  99. unsigned HqlRegexHashTable::getHashFromFindParam(const void *fp) const
  100. {
  101. return ((HqlRegexExpr*)fp)->getHash();
  102. }
  103. const void * HqlRegexHashTable::getFindParam(const void *et) const
  104. {
  105. return et;
  106. }
  107. bool HqlRegexHashTable::matchesFindParam(const void *et, const void *key, unsigned fphash) const
  108. {
  109. return et == key;
  110. }
  111. void HqlRegexUniqueArray::append(HqlRegexExpr & expr)
  112. {
  113. if (!hash.find(&expr))
  114. {
  115. array.append(expr);
  116. hash.add(&expr);
  117. }
  118. else
  119. {
  120. expr.Release();
  121. }
  122. }
  123. //---------------------------------------------------------------------------
  124. inline unsigned limitedAdd(unsigned a, unsigned b)
  125. {
  126. if (PATTERN_UNLIMITED_LENGTH - a <= b)
  127. return PATTERN_UNLIMITED_LENGTH;
  128. return a+b;
  129. }
  130. inline unsigned limitedMult(unsigned a, unsigned b)
  131. {
  132. if (a == 0)
  133. return 0;
  134. if (PATTERN_UNLIMITED_LENGTH / a <= b)
  135. return PATTERN_UNLIMITED_LENGTH;
  136. return a*b;
  137. }
  138. IHqlExpression * RegexIdAllocator::createKey(IHqlExpression * expr, IAtom * name)
  139. {
  140. if (!expr)
  141. return NULL;
  142. IHqlExpression * body = expr->queryBody();
  143. if (name)
  144. return createSymbol(createIdAtom(name->str()), LINK(body), ob_private);
  145. return LINK(body);
  146. }
  147. void RegexIdAllocator::setID(IHqlExpression * expr, IAtom * name, unsigned value)
  148. {
  149. OwnedHqlExpr key = createKey(expr, name);
  150. map.setValue(key, value);
  151. }
  152. regexid_t RegexIdAllocator::queryID(IHqlExpression * expr, IAtom * name)
  153. {
  154. OwnedHqlExpr key = createKey(expr, name);
  155. regexid_t * match = map.getValue(key);
  156. if (match)
  157. return *match;
  158. map.setValue(key, ++nextId);
  159. return nextId;
  160. }
  161. int compareUnsigned(unsigned * left, unsigned * right)
  162. {
  163. return (*left < *right) ? -1 : (*left > *right) ? +1 : 0;
  164. }
  165. class SymbolArray
  166. {
  167. friend class SymbolArrayIterator;
  168. public:
  169. void addUniqueRange(unsigned low, unsigned high)
  170. {
  171. unsigned max = symbols.ordinality();
  172. unsigned i;
  173. for (i = 0; i < max; i++)
  174. {
  175. unsigned cur = symbols.item(i);
  176. if (cur == low)
  177. low++;
  178. if (cur > low)
  179. {
  180. if (cur > high)
  181. cur = high+1;
  182. while (low != cur)
  183. symbols.add(low++, i++);
  184. low++;
  185. }
  186. if (cur > high)
  187. return;
  188. }
  189. while (low <= high)
  190. symbols.append(low++);
  191. }
  192. inline void addUnique(unsigned value)
  193. {
  194. bool isNew;
  195. symbols.bAdd(value, compareUnsigned, isNew);
  196. }
  197. protected:
  198. UnsignedArray symbols;
  199. };
  200. class SymbolArrayIterator
  201. {
  202. public:
  203. SymbolArrayIterator(SymbolArray & _table) : table(_table) { cur = 0; }
  204. inline bool first()
  205. {
  206. cur = 0; return table.symbols.isItem(cur);
  207. }
  208. inline bool isValid()
  209. {
  210. return table.symbols.isItem(cur);
  211. }
  212. inline unsigned get()
  213. {
  214. return table.symbols.item(cur);
  215. }
  216. inline bool next()
  217. {
  218. cur++; return table.symbols.isItem(cur);
  219. }
  220. protected:
  221. SymbolArray & table;
  222. unsigned cur;
  223. };
  224. //---------------------------------------------------------------------------
  225. HqlNamedRegex::HqlNamedRegex(IHqlExpression * _expr, IAtom * _name, IHqlExpression * _searchExpr, node_operator _kind, bool _caseSensitive, bool _isMatched)
  226. {
  227. kind = _kind;
  228. expr.set(_expr);
  229. searchExpr.set(_searchExpr);
  230. name = _name;
  231. numUses = 1;
  232. isMatched = _isMatched;
  233. isRecursive = false;
  234. doneCalcDfaScore = false;
  235. doneCreateDFA = false;
  236. doneExpandNamed = false;
  237. doneExpandRecursion = false;
  238. doneMarkDfas = false;
  239. noGenerate = (kind == no_pat_dfa);
  240. caseSensitive = _caseSensitive;
  241. }
  242. HqlNamedRegex::~HqlNamedRegex()
  243. {
  244. cleanup();
  245. }
  246. void HqlNamedRegex::addBeginEnd(const ParseInformation & options)
  247. {
  248. //Add nodes at either end of the pattern - at the start as a placeholder for all the possible first patterns,
  249. //and at the end to mark which paths are terminal.
  250. HqlRegexExpr * follow = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
  251. follow->addOperand(createRegexExpr(options, isRecursive ? no_pat_beginrecursive : no_pat_beginpattern, NULL, caseSensitive));
  252. follow->addOperand(LINK(def));
  253. if (isRecursive)
  254. follow->addOperand(createRegexExpr(options, no_pat_endrecursive, NULL, caseSensitive));
  255. follow->addOperand(createRegexExpr(options, no_pat_endpattern, NULL, caseSensitive));
  256. def.setown(follow);
  257. }
  258. void HqlNamedRegex::analyseNullLeadTrail()
  259. {
  260. def->analyseNullLeadTrail();
  261. limit = def->limit;
  262. }
  263. void HqlNamedRegex::calcDfaScore()
  264. {
  265. if (!doneCalcDfaScore)
  266. {
  267. doneCalcDfaScore = true;
  268. def->calcDfaScore();
  269. }
  270. }
  271. void HqlNamedRegex::calculateNext()
  272. {
  273. def->calculateNext();
  274. }
  275. bool HqlNamedRegex::canBeNull()
  276. {
  277. return limit.canBeNull();
  278. }
  279. void HqlNamedRegex::cleanup()
  280. {
  281. if (def)
  282. def->cleanup();
  283. if (created)
  284. created->dispose();
  285. }
  286. void HqlNamedRegex::createDFAs(RegexContext & ctx)
  287. {
  288. if (!doneCreateDFA)
  289. {
  290. doneCreateDFA = true;
  291. def.setown(def->createDFAs(ctx));
  292. }
  293. }
  294. void HqlNamedRegex::expandNamedSymbols()
  295. {
  296. if (!doneExpandNamed)
  297. {
  298. doneExpandNamed = true;
  299. def.setown(def->expandNamedSymbols());
  300. }
  301. }
  302. void HqlNamedRegex::expandRecursion(RegexContext & ctx, HqlNamedRegex * self)
  303. {
  304. if (!doneExpandRecursion)
  305. {
  306. doneExpandRecursion = true;
  307. if (kind == no_pat_instance)
  308. self = this;
  309. def.setown(def->expandRecursion(ctx, self));
  310. }
  311. }
  312. void HqlNamedRegex::generateDFAs()
  313. {
  314. if (!noGenerate)
  315. def->generateDFAs();
  316. }
  317. void HqlNamedRegex::generateRegex(GenerateRegexCtx & ctx)
  318. {
  319. if (created || noGenerate)
  320. return;
  321. node_operator savedKind = ctx.namedKind;
  322. ctx.namedKind = kind;
  323. created.setown(new RegexNamed(name, ctx.idAllocator.queryID(expr, name)));
  324. def->generateRegex(ctx);
  325. def->connectRegex();
  326. HqlRegexExpr * first = def->queryChild(0);
  327. if ((first->getOperator() == no_pat_beginpattern) && (first->following.ordinality() == 1))
  328. first = &first->following.item(0);
  329. created->setFirst(first->created);
  330. ctx.namedKind = savedKind;
  331. }
  332. unsigned HqlNamedRegex::getDfaScore()
  333. {
  334. if (isRecursive)
  335. return NO_DFA_SCORE;
  336. return def->dfaScore;
  337. }
  338. void HqlNamedRegex::insertSeparators(IHqlExpression * separator, RegexContext * ctx)
  339. {
  340. if (expr->queryType()->getTypeCode() == type_pattern)
  341. return;
  342. def.setown(def->insertSeparators(separator, ctx));
  343. }
  344. void HqlNamedRegex::markDFAs(unsigned complexity)
  345. {
  346. if (!doneMarkDfas)
  347. {
  348. doneMarkDfas = true;
  349. def->markDFAs(complexity);
  350. }
  351. }
  352. bool HqlNamedRegex::matchesDefine(IHqlExpression * name, bool _caseSensitive)
  353. {
  354. return (name->queryBody() == defineName) && (caseSensitive == _caseSensitive);
  355. }
  356. void HqlNamedRegex::mergeCreateSets()
  357. {
  358. def.setown(def->mergeCreateSets());
  359. }
  360. bool HqlNamedRegex::queryExpandInline()
  361. {
  362. expandNamedSymbols();
  363. if (!isMatched)
  364. {
  365. if (numUses == 1)
  366. return true;
  367. //expand if trivial - this should be improved once dfa support is implemented.
  368. switch (def->getOperator())
  369. {
  370. case no_pat_instance:
  371. case no_pat_set:
  372. case no_pat_const:
  373. case no_pat_first:
  374. case no_pat_last:
  375. case no_pat_anychar:
  376. return true;
  377. }
  378. }
  379. return false;
  380. }
  381. RegexPattern * HqlNamedRegex::queryRootPattern()
  382. {
  383. //return a reference to the begin
  384. HqlRegexExpr * first = def->queryChild(0);
  385. if ((first->getOperator() == no_pat_beginpattern) && (first->following.ordinality() == 1))
  386. first = &first->following.item(0);
  387. return first->created;
  388. }
  389. void HqlNamedRegex::setRegexOwn(HqlRegexExpr * _def)
  390. {
  391. def.setown(_def);
  392. IHqlExpression * defExpr = def->expr;
  393. if (def->getOperator() == no_define)
  394. defineName.set(defExpr->queryChild(1)->queryBody());
  395. }
  396. //---------------------------------------------------------------------------
  397. static HqlRegexExpr * expandStringAsChars(IHqlExpression * expr, const ParseInformation & options, bool caseSensitive)
  398. {
  399. Owned<IValue> castValue = getCastConstant(expr->queryChild(0)->queryValue(), options.type);
  400. //convert text strings into a sequence of characters...
  401. ITypeInfo * type = castValue->queryType();
  402. unsigned len = type->getStringLen();
  403. if (len == 0)
  404. return createRegexExpr(options, no_null, NULL, caseSensitive);
  405. HqlRegexExpr * expanded = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
  406. bool readUChar = false;
  407. const void * value = castValue->queryValue();
  408. switch (options.type)
  409. {
  410. case type_unicode:
  411. readUChar = true;
  412. break;
  413. case type_utf8:
  414. //UTF8 is matched as a sequence of characters...
  415. len = rtlUtf8Size(len, value);
  416. break;
  417. }
  418. for (unsigned i = 0; i < len; i++)
  419. {
  420. unsigned nextValue;
  421. if (readUChar)
  422. nextValue = ((UChar *)value)[i];
  423. else
  424. nextValue = ((unsigned char *)value)[i];
  425. OwnedHqlExpr next = getSizetConstant(nextValue);
  426. expanded->addOperand(createRegexExpr(options, no_pat_singlechar, next, caseSensitive));
  427. }
  428. return expanded;
  429. }
  430. //---------------------------------------------------------------------------
  431. HqlRegexExpr::HqlRegexExpr(const ParseInformation & _options, IHqlExpression * _expr, bool _caseSensitive) : options(_options)
  432. {
  433. expr.set(_expr);
  434. op = expr->getOperator();
  435. caseSensitive = _caseSensitive;
  436. init();
  437. }
  438. HqlRegexExpr::HqlRegexExpr(const ParseInformation & _options, node_operator _op, IHqlExpression * _expr, bool _caseSensitive) : options(_options)
  439. {
  440. expr.set(_expr);
  441. op = _op;
  442. caseSensitive = _caseSensitive;
  443. init();
  444. }
  445. HqlRegexExpr::~HqlRegexExpr()
  446. {
  447. ::Release(dfa);
  448. }
  449. void HqlRegexExpr::init()
  450. {
  451. uid = (unsigned)(getUniqueId()-options.uidBase);
  452. cleaned = false;
  453. connected = false;
  454. analysed = false;
  455. createDFA = false;
  456. dfaScore = NO_DFA_SCORE;
  457. dfa = NULL;
  458. added = false;
  459. }
  460. void inherit(HqlRegexUniqueArray & target, const HqlRegexUniqueArray & source)
  461. {
  462. ForEachItemIn(idx, source)
  463. target.append(OLINK(source.item(idx)));
  464. }
  465. void HqlRegexExpr::calcDfaScore()
  466. {
  467. ForEachItemIn(idx, args)
  468. args.item(idx).calcDfaScore();
  469. if (querySubPattern())
  470. querySubPattern()->calcDfaScore();
  471. if (queryNamed())
  472. queryNamed()->calcDfaScore();
  473. //on entry dfaScore = NO_DFA_SCORE
  474. switch (getOperator())
  475. {
  476. case no_null:
  477. dfaScore = 0;
  478. break;
  479. case no_pat_const:
  480. dfaScore = expr->queryChild(0)->queryType()->getStringLen();
  481. break;
  482. case no_pat_anychar:
  483. dfaScore = 1;
  484. break;
  485. case no_pat_set:
  486. {
  487. unsigned score = 0;
  488. ForEachChild(idx, expr)
  489. {
  490. IHqlExpression * child = expr->queryChild(idx);
  491. switch (child->getOperator())
  492. {
  493. case no_range:
  494. {
  495. unsigned low = (unsigned)child->queryChild(0)->queryValue()->getIntValue();
  496. unsigned high = (unsigned)child->queryChild(1)->queryValue()->getIntValue();
  497. score = addWithoutOverflow(score, (high-low)+1);
  498. break;
  499. }
  500. case no_constant:
  501. score = addWithoutOverflow(score, 1);
  502. break;
  503. }
  504. }
  505. dfaScore = score;
  506. break;
  507. }
  508. case no_pat_repeat:
  509. if (isStandardRepeat())
  510. {
  511. unsigned argScore = args.item(0).dfaScore;
  512. if ((getRepeatMax() > 1) && (argScore != NO_DFA_SCORE))
  513. dfaScore = addWithoutOverflow(argScore, argScore);
  514. else
  515. dfaScore = argScore;
  516. }
  517. else if (!expr->hasAttribute(minimalAtom))
  518. {
  519. unsigned max = getRepeatMax();
  520. unsigned namedDfaScore = queryNamed()->getDfaScore();
  521. if (max <= options.dfaRepeatMax)
  522. dfaScore = multiplyWithoutOverflow(max, namedDfaScore);
  523. }
  524. break;
  525. case no_pat_instance:
  526. if (!queryNamed()->isMatched)
  527. dfaScore = queryNamed()->getDfaScore();
  528. break;
  529. case no_pat_or:
  530. {
  531. unsigned totalScore = 0;
  532. ForEachItemIn(idx, args)
  533. {
  534. unsigned score = args.item(idx).dfaScore;
  535. if (score == NO_DFA_SCORE)
  536. {
  537. totalScore = NO_DFA_SCORE;
  538. break;
  539. }
  540. totalScore = addWithoutOverflow(totalScore, score);
  541. }
  542. dfaScore = totalScore;
  543. if (dfaScore == NO_DFA_SCORE)
  544. {
  545. //Work out if there are two or more entires which can be converted into a dfa, and if so combine them
  546. //Also potentially separate simple sting lists into a separate or, so we always generate them
  547. unsigned scoreCount = 0;
  548. unsigned stringListCount = 0;
  549. ForEachItemIn(iCount, args)
  550. {
  551. HqlRegexExpr & cur = args.item(iCount);
  552. if (cur.dfaScore != NO_DFA_SCORE)
  553. {
  554. scoreCount++;
  555. if (cur.isSimpleStringList())
  556. stringListCount++;
  557. }
  558. }
  559. if (scoreCount > 1)
  560. {
  561. HqlRegexExpr * newor = createRegexExpr(options, no_pat_or, NULL, caseSensitive);
  562. HqlRegexExpr * stringListOr = ((stringListCount > 1) && (scoreCount != stringListCount)) ? createRegexExpr(options, no_pat_or, NULL, caseSensitive) : NULL;
  563. unsigned totalScore = 0;
  564. unsigned stringListScore = 0;
  565. for (unsigned i =0; i < args.ordinality(); )
  566. {
  567. HqlRegexExpr & cur = args.item(i);
  568. unsigned score = cur.dfaScore;
  569. if (score != NO_DFA_SCORE)
  570. {
  571. if (stringListOr && cur.isSimpleStringList())
  572. {
  573. stringListScore = addWithoutOverflow(stringListScore, score);
  574. stringListOr->addOperand(&OLINK(cur));
  575. }
  576. else
  577. newor->addOperand(&OLINK(cur));
  578. totalScore = addWithoutOverflow(totalScore, score);
  579. args.remove(i);
  580. }
  581. else
  582. i++;
  583. }
  584. if (stringListOr)
  585. {
  586. stringListOr->dfaScore = stringListScore;
  587. newor->addOperand(stringListOr);
  588. }
  589. newor->dfaScore = totalScore;
  590. //add the dfa first - since it will probably be quickest
  591. args.add(*newor, 0);
  592. }
  593. }
  594. break;
  595. }
  596. case no_pat_follow:
  597. {
  598. //MORE: Should extract applicable subranges out of the sequences
  599. //MORE: This should multiple rather than add.
  600. dfaScore = 0;
  601. ForEachItemIn(idx, args)
  602. {
  603. unsigned score = args.item(idx).dfaScore;
  604. if (score == NO_DFA_SCORE)
  605. {
  606. dfaScore = NO_DFA_SCORE;
  607. break;
  608. }
  609. dfaScore = addWithoutOverflow(dfaScore, score);
  610. }
  611. if (dfaScore == NO_DFA_SCORE)
  612. {
  613. //Try and find ranges that are valid, and extract those into sub-elements.
  614. for (unsigned i =0; i < args.ordinality(); i++)
  615. {
  616. unsigned score = args.item(i).dfaScore;
  617. if (score != NO_DFA_SCORE)
  618. {
  619. unsigned max = args.ordinality();
  620. unsigned j;
  621. for (j=i+1; j < max; j++)
  622. {
  623. unsigned thisScore = args.item(j).dfaScore;
  624. if (thisScore == NO_DFA_SCORE)
  625. break;
  626. score = addWithoutOverflow(score, thisScore);
  627. }
  628. if (j != i+1)
  629. {
  630. HqlRegexExpr * follow = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
  631. for (unsigned k=i; k < j; k++)
  632. follow->addOperand(&OLINK(args.item(k)));
  633. follow->dfaScore = score;
  634. while (j != i)
  635. args.remove(--j);
  636. args.add(*follow, i);
  637. }
  638. }
  639. }
  640. }
  641. break;
  642. }
  643. }
  644. }
  645. void HqlRegexExpr::calculateNext()
  646. {
  647. //Dragon p138
  648. //if a concatenation (a,b), then all the trailing from (a) have leading(b) in their following set
  649. //if a repeat [max > 1] then all positions in leading(x) are also in the following list
  650. //Limited/minimal repeats aren't done the same way - so they aren't added in the same way
  651. //Because I am cheating the order is important - repeats need to be done first since they are greedy.
  652. switch (getOperator())
  653. {
  654. case no_pat_repeat:
  655. if (isStandardRepeat() && getRepeatMax() > 1)
  656. {
  657. //All the trailing elements have the leading elements as possible next.
  658. unsigned max = numTrailing();
  659. for (unsigned idx=0; idx < max; idx++)
  660. gatherLeading(queryTrailing(idx).following);
  661. }
  662. break;
  663. }
  664. ForEachItemIn(idx, args)
  665. args.item(idx).calculateNext();
  666. switch (getOperator())
  667. {
  668. case no_pat_follow:
  669. case no_pat_separator:
  670. {
  671. //Internally connect up following items within the no_pat_follow.
  672. unsigned max = args.ordinality();
  673. assertex(max);
  674. for (unsigned pairs=0; pairs<max-1; pairs++)
  675. {
  676. HqlRegexExpr * left = queryChild(pairs);
  677. for (unsigned other=pairs+1; other<max; other++)
  678. {
  679. HqlRegexExpr * right = queryChild(other);
  680. unsigned maxTrail = left->numTrailing();
  681. for (unsigned trailIdx=0; trailIdx<maxTrail; trailIdx++)
  682. {
  683. HqlRegexExpr & cur = left->queryTrailing(trailIdx);
  684. right->gatherLeading(cur.following);
  685. }
  686. if (!right->canBeNull())
  687. break;
  688. }
  689. }
  690. break;
  691. }
  692. }
  693. }
  694. #if 0
  695. //convert strings/unicode to correct kind depending on what is being searched, when the expr is built.
  696. UBool U_CALL_CONV noteUCharRange(const void * context, UChar32 start, UChar limit, UCharCategory type)
  697. {
  698. }
  699. void gatherRange(void * context, UCharCategory type)
  700. {
  701. u_enumCharTypes(noteUCharRange range, context);
  702. }
  703. #endif
  704. static bool canConsume(const ParseInformation & options, unsigned nextChar, unsigned matcherChar, bool caseSensitive)
  705. {
  706. if (nextChar == matcherChar)
  707. return true;
  708. if (matcherChar & RegexSpecialMask)
  709. {
  710. assertex(options.type != type_utf8);
  711. if (!(nextChar & RegexSpecialMask))
  712. {
  713. if (options.type != type_string)
  714. return isUnicodeMatch(matcherChar, nextChar);
  715. else
  716. return isAsciiMatch(matcherChar, nextChar);
  717. }
  718. //Should only occur if unicode.
  719. switch (matcherChar)
  720. {
  721. case RCCalnum:
  722. return (nextChar == RCCalpha) || (nextChar == RCClower) || (nextChar == RCCupper) || (nextChar == RCCdigit);
  723. case RCCalpha:
  724. return (nextChar == RCClower) || (nextChar == RCCupper);
  725. case RCCspace:
  726. return (nextChar == RCCblank);
  727. case RCCprint:
  728. return (nextChar == RCCalnum) || (nextChar == RCCalpha) || (nextChar == RCClower) || (nextChar == RCCupper) || (nextChar == RCCdigit) || (nextChar == RCCpunct) || (nextChar == RCCxdigit) || (nextChar == RCCgraph);
  729. case RCCgraph:
  730. return (nextChar == RCCalnum) || (nextChar == RCCalpha) || (nextChar == RCClower) || (nextChar == RCCupper) || (nextChar == RCCdigit) || (nextChar == RCCpunct) || (nextChar == RCCxdigit);
  731. case RCCxdigit:
  732. return (nextChar == RCCdigit);
  733. case RCCany:
  734. return true;
  735. }
  736. return false;
  737. }
  738. else if (nextChar & RegexSpecialMask)
  739. return false;
  740. if (caseSensitive || (options.type == type_utf8))
  741. return false;
  742. if (options.type == type_unicode)
  743. {
  744. //MORE: This needs improving for full folding.
  745. return u_foldCase(nextChar, U_FOLD_CASE_DEFAULT) == u_foldCase(matcherChar, U_FOLD_CASE_DEFAULT);
  746. }
  747. return (nextChar == tolower(matcherChar)) || (nextChar == toupper(matcherChar));
  748. }
  749. bool HqlRegexExpr::canConsume(unsigned nextChar)
  750. {
  751. switch (getOperator())
  752. {
  753. case no_pat_singlechar:
  754. return ::canConsume(options, nextChar, (unsigned)expr->queryValue()->getIntValue(), caseSensitive);
  755. case no_pat_utf8single:
  756. assertex(!(nextChar & RegexSpecialMask));
  757. return nextChar < 0x80;
  758. case no_pat_utf8lead:
  759. assertex(!(nextChar & RegexSpecialMask));
  760. return nextChar >= 0xc0;
  761. case no_pat_utf8follow:
  762. assertex(!(nextChar & RegexSpecialMask));
  763. return nextChar >= 0x80 && nextChar <= 0xbf;
  764. case no_pat_anychar:
  765. assertex(options.type != type_utf8);
  766. return true;
  767. case no_pat_set:
  768. {
  769. assertex(options.type != type_utf8);
  770. bool invert = expr->hasAttribute(notAtom);
  771. ForEachChild(idx, expr)
  772. {
  773. IHqlExpression * child = expr->queryChild(idx);
  774. switch (child->getOperator())
  775. {
  776. case no_range:
  777. {
  778. unsigned low = (unsigned)child->queryChild(0)->queryValue()->getIntValue();
  779. unsigned high = (unsigned)child->queryChild(1)->queryValue()->getIntValue();
  780. if (!(nextChar & RegexSpecialMask))
  781. {
  782. if (!caseSensitive)
  783. {
  784. if (options.type == type_unicode)
  785. {
  786. //MORE: Improved unicode
  787. if (u_foldCase(nextChar, U_FOLD_CASE_DEFAULT) >= u_foldCase(low, U_FOLD_CASE_DEFAULT) &&
  788. u_foldCase(nextChar, U_FOLD_CASE_DEFAULT) <= u_foldCase(high, U_FOLD_CASE_DEFAULT))
  789. return !invert;
  790. }
  791. else
  792. {
  793. if (nextChar >= (unsigned)tolower(low) && nextChar <= (unsigned)tolower(high))
  794. return !invert;
  795. if (nextChar >= (unsigned)toupper(low) && nextChar <= (unsigned)toupper(high))
  796. return !invert;
  797. }
  798. }
  799. else
  800. {
  801. if (nextChar >= low && nextChar <= high)
  802. return !invert;
  803. }
  804. }
  805. break;
  806. }
  807. case no_constant:
  808. {
  809. unsigned value = (unsigned)child->queryValue()->getIntValue();
  810. if (::canConsume(options, nextChar, value, caseSensitive))
  811. return !invert;
  812. break;
  813. }
  814. case no_attr:
  815. case no_attr_expr:
  816. case no_attr_link:
  817. break;
  818. default:
  819. UNIMPLEMENTED;
  820. }
  821. }
  822. return invert;
  823. }
  824. case no_pat_endpattern:
  825. return false;
  826. default:
  827. UNIMPLEMENTED;
  828. }
  829. }
  830. void HqlRegexExpr::gatherConsumeSymbols(SymbolArray & symbols)
  831. {
  832. switch (getOperator())
  833. {
  834. case no_pat_singlechar:
  835. {
  836. unsigned charValue = (unsigned)expr->queryValue()->getIntValue();
  837. if (!caseSensitive && (options.type != type_utf8))
  838. {
  839. if (options.type == type_unicode)
  840. {
  841. //MORE: Unicode - can you have several values, what about case conversion?
  842. //MORE: String might need expanding to ('a'|'A')('ss'|'SS'|'B') or something similar.
  843. symbols.addUnique(u_tolower(charValue));
  844. symbols.addUnique(u_toupper(charValue));
  845. }
  846. else
  847. {
  848. symbols.addUnique(tolower(charValue));
  849. symbols.addUnique(toupper(charValue));
  850. }
  851. }
  852. else
  853. symbols.addUnique(charValue);
  854. break;
  855. }
  856. case no_pat_anychar:
  857. assertex(options.type != type_utf8);
  858. if (options.type == type_unicode)
  859. symbols.addUnique(RCCany);
  860. else
  861. symbols.addUniqueRange(0, 255);
  862. return;
  863. case no_pat_endpattern:
  864. return;
  865. case no_pat_set:
  866. assertex(options.type == type_string);
  867. if (options.type == type_unicode)
  868. {
  869. //MORE: Need some kind of implementation for unicode - although invert and ranges may not be possible.
  870. if (expr->hasAttribute(notAtom))
  871. throwError(HQLERR_DfaTooComplex);
  872. ForEachChild(idx, expr)
  873. {
  874. IHqlExpression * child = expr->queryChild(idx);
  875. switch (child->getOperator())
  876. {
  877. case no_range:
  878. {
  879. unsigned low = (unsigned)child->queryChild(0)->queryValue()->getIntValue();
  880. unsigned high = (unsigned)child->queryChild(1)->queryValue()->getIntValue();
  881. if (!caseSensitive)
  882. {
  883. //MORE: This really doesn't work very well - I'm not even sure what it means...
  884. symbols.addUniqueRange(u_tolower(low), u_tolower(high));
  885. symbols.addUniqueRange(u_toupper(low), u_toupper(high));
  886. }
  887. else
  888. symbols.addUniqueRange(low, high);
  889. break;
  890. }
  891. case no_constant:
  892. {
  893. unsigned value = (unsigned)child->queryValue()->getIntValue();
  894. if (caseSensitive || (value & RegexSpecialMask))
  895. symbols.addUnique(value);
  896. else
  897. {
  898. symbols.addUnique(u_tolower(value));
  899. symbols.addUnique(u_toupper(value));
  900. }
  901. break;
  902. }
  903. case no_attr:
  904. case no_attr_expr:
  905. case no_attr_link:
  906. break;
  907. default:
  908. UNIMPLEMENTED;
  909. }
  910. }
  911. }
  912. else
  913. {
  914. //This is probably the easiest way to cope with inverted sets and all other complications.
  915. for (unsigned i = 0; i < 256; i++)
  916. if (canConsume(i))
  917. symbols.addUnique(i);
  918. }
  919. return;
  920. case no_pat_utf8single:
  921. {
  922. for (unsigned i = 0; i < 0x80; i++)
  923. symbols.addUnique(i);
  924. break;
  925. }
  926. case no_pat_utf8lead:
  927. {
  928. for (unsigned i = 0xc0; i < 0x100; i++)
  929. symbols.addUnique(i);
  930. break;
  931. }
  932. case no_pat_utf8follow:
  933. {
  934. for (unsigned i = 0x80; i < 0xc0; i++)
  935. symbols.addUnique(i);
  936. break;
  937. }
  938. default:
  939. throwError1(HQLERR_BadDfaOperator, getOpString(getOperator()));
  940. }
  941. }
  942. void HqlRegexExpr::cleanup()
  943. {
  944. if (cleaned)
  945. return;
  946. cleaned = true;
  947. //kill anything that could have a loop.
  948. ForEachItemIn(i0, args)
  949. args.item(i0).cleanup();
  950. if (querySubPattern())
  951. querySubPattern()->cleanup();
  952. if (created)
  953. created->dispose();
  954. following.kill();
  955. args.kill();
  956. }
  957. HqlRegexExpr * HqlRegexExpr::clone()
  958. {
  959. assertex(!created && !analysed);
  960. HqlRegexExpr * cloned = createRegexExpr(options, op, expr, caseSensitive);
  961. ForEachItemIn(idx, args)
  962. cloned->args.append(*args.item(idx).clone());
  963. cloned->setNamed(queryNamed());
  964. cloned->setSubPattern(querySubPattern());
  965. return cloned;
  966. }
  967. void HqlRegexExpr::connectRegex()
  968. {
  969. if (connected)
  970. return;
  971. connected = true;
  972. ForEachItemIn(idx, args)
  973. args.item(idx).connectRegex();
  974. if (created)
  975. {
  976. ForEachItemIn(idx2, following)
  977. created->addLink(following.item(idx2).created);
  978. HqlNamedRegex * named = queryNamed();
  979. if (named && named->created && !named->noGenerate)
  980. created->setBody(named->created);
  981. if (querySubPattern())
  982. created->setSubPattern(querySubPattern()->queryRootPattern());
  983. }
  984. }
  985. HqlRegexExpr * HqlRegexExpr::createDFAs(RegexContext & ctx)
  986. {
  987. if (createDFA)
  988. {
  989. OwnedHqlExpr searchExpr = createValue(no_pat_dfa, makePatternType(), createUniqueId());
  990. HqlNamedRegex * newNamed = new HqlNamedRegex(searchExpr, NULL, searchExpr, no_pat_dfa, caseSensitive, false);
  991. ctx.named.append(*newNamed);
  992. newNamed->setRegexOwn(expandAsDFA());
  993. HqlRegexExpr * dfa = createRegexExpr(options, no_pat_dfa, NULL, caseSensitive);
  994. dfa->setNamed(newNamed);
  995. dfa->dfaScore = dfaScore;
  996. //MORE: Need to save the original as a *named/subPattern* attribute in case conversion to a DFA fails
  997. return dfa;
  998. }
  999. unsigned max = args.ordinality();
  1000. unsigned idx;
  1001. for (idx = 0; idx < max; idx++)
  1002. {
  1003. HqlRegexExpr & cur = args.item(idx);
  1004. HqlRegexExpr * transformed = cur.createDFAs(ctx);
  1005. args.replace(*transformed, idx);
  1006. if (transformed->getOperator() == no_pat_dfa)
  1007. {
  1008. if ((idx != 0) && (idx+1 != max) && (getOperator() == no_pat_follow))
  1009. {
  1010. if ((args.item(idx-1).getOperator() == no_pat_begintoken) &&
  1011. (args.item(idx+1).getOperator() == no_pat_endtoken))
  1012. {
  1013. transformed->addOperand(&OLINK(args.item(idx-1)));
  1014. args.remove(idx+1);
  1015. args.remove(idx-1);
  1016. idx--;
  1017. max -= 2;
  1018. }
  1019. }
  1020. }
  1021. }
  1022. if (querySubPattern())
  1023. querySubPattern()->createDFAs(ctx);
  1024. if (queryNamed())
  1025. queryNamed()->createDFAs(ctx);
  1026. return LINK(this);
  1027. }
  1028. HqlRegexExpr * HqlRegexExpr::expandAsRepeatedDFA(unsigned count)
  1029. {
  1030. if (count == 0)
  1031. return createRegexExpr(options, no_null, NULL, caseSensitive);
  1032. if (count == 1)
  1033. return expandAsDFA();
  1034. HqlRegexExpr * cloned = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
  1035. for (unsigned i=0; i < count; i++)
  1036. cloned->args.append(*expandAsDFA());
  1037. return cloned;
  1038. }
  1039. HqlRegexExpr * HqlRegexExpr::expandAsDFA()
  1040. {
  1041. assertex(!created && !analysed);
  1042. switch (getOperator())
  1043. {
  1044. case no_pat_const:
  1045. return expandStringAsChars(expr, options, caseSensitive);
  1046. case no_pat_instance:
  1047. return queryNamed()->def->expandAsDFA();
  1048. case no_pat_or:
  1049. case no_pat_follow:
  1050. case no_null:
  1051. case no_pat_singlechar:
  1052. break;
  1053. case no_pat_anychar:
  1054. if (options.type == type_utf8)
  1055. {
  1056. // convert to low | follow(lead, tail+);
  1057. Owned<HqlRegexExpr> orRegexExpr = createRegexExpr(options, no_pat_or, NULL, caseSensitive);
  1058. orRegexExpr->addOperand(createRegexExpr(options, no_pat_utf8single, NULL, caseSensitive));
  1059. Owned<HqlRegexExpr> followRegexExpr = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
  1060. followRegexExpr->addOperand(createRegexExpr(options, no_pat_utf8lead, NULL, caseSensitive));
  1061. OwnedHqlExpr trailExpr = createValue(no_pat_utf8follow, makePatternType());
  1062. Owned<HqlRegexExpr> trailRegexExpr = createRegexExpr(options, no_pat_utf8follow, NULL, caseSensitive);
  1063. OwnedHqlExpr repeatExpr = createValue(no_pat_repeat, makePatternType(), LINK(trailExpr), createConstantOne(), createValue(no_any, makeNullType()));
  1064. Owned<HqlRegexExpr> repeatRegexExpr = createRegexExpr(options, no_pat_repeat, repeatExpr, caseSensitive);
  1065. repeatRegexExpr->addOperand(trailRegexExpr.getClear());
  1066. followRegexExpr->addOperand(repeatRegexExpr.getClear());
  1067. orRegexExpr->addOperand(followRegexExpr.getClear());
  1068. return orRegexExpr.getClear();
  1069. }
  1070. break;
  1071. case no_pat_set:
  1072. if (options.type == type_utf8)
  1073. {
  1074. throwUnexpected();
  1075. //expand set values...
  1076. // return low | follow(lead, tail+);
  1077. }
  1078. break;
  1079. case no_pat_dfa:
  1080. return queryNamed()->def->expandAsDFA();
  1081. case no_pat_repeat:
  1082. {
  1083. if (isStandardRepeat())
  1084. break;
  1085. unsigned min = getRepeatMin();
  1086. unsigned max = getRepeatMax();
  1087. if (min == max)
  1088. return queryNamed()->def->expandAsRepeatedDFA(min);
  1089. HqlRegexExpr * cloned = createRegexExpr(options, no_pat_or, NULL, caseSensitive);
  1090. for (unsigned i=min; i <= max; i++)
  1091. cloned->args.append(*queryNamed()->def->expandAsRepeatedDFA(i));
  1092. return cloned;
  1093. }
  1094. default:
  1095. UNIMPLEMENTED;
  1096. }
  1097. HqlRegexExpr * cloned = createRegexExpr(options, op, expr, caseSensitive);
  1098. ForEachItemIn(idx, args)
  1099. cloned->args.append(*args.item(idx).expandAsDFA());
  1100. assertex(!queryNamed());
  1101. assertex(!querySubPattern());
  1102. return cloned;
  1103. }
  1104. HqlRegexExpr * HqlRegexExpr::expandNamedSymbols()
  1105. {
  1106. ForEachItemIn(idx, args)
  1107. args.replace(*args.item(idx).expandNamedSymbols(), idx);
  1108. if (querySubPattern())
  1109. querySubPattern()->expandNamedSymbols();
  1110. if (queryNamed())
  1111. queryNamed()->expandNamedSymbols();
  1112. switch (getOperator())
  1113. {
  1114. case no_pat_instance:
  1115. if (queryNamed()->queryExpandInline())
  1116. {
  1117. queryNamed()->removeUse();
  1118. return queryNamed()->def->clone();
  1119. }
  1120. break;
  1121. case no_pat_follow:
  1122. {
  1123. ForEachItemIn(idx, args)
  1124. {
  1125. node_operator op = args.item(idx).getOperator();
  1126. switch (op)
  1127. {
  1128. //Remove begin_token/end_token around the following, otherwise
  1129. //we might get too many separators which may be non-optional
  1130. case no_pat_first:
  1131. case no_pat_last:
  1132. case no_penalty:
  1133. case no_pat_x_before_y:
  1134. case no_pat_before_y:
  1135. case no_pat_x_after_y:
  1136. case no_pat_after_y:
  1137. case no_pat_guard:
  1138. if ((idx > 0) && args.item(idx-1).getOperator() == no_pat_begintoken)
  1139. {
  1140. assertex(args.item(idx+1).getOperator() == no_pat_endtoken);
  1141. args.remove(idx+1);
  1142. args.remove(idx-1);
  1143. if (args.isItem(idx) && (args.item(idx).getOperator() == no_pat_separator))
  1144. args.remove(idx);
  1145. }
  1146. break;
  1147. }
  1148. }
  1149. break;
  1150. }
  1151. }
  1152. return LINK(this);
  1153. }
  1154. HqlRegexExpr * HqlRegexExpr::expandRecursion(RegexContext & ctx, HqlNamedRegex * self)
  1155. {
  1156. ForEachItemIn(idx, args)
  1157. args.replace(*args.item(idx).expandRecursion(ctx, self), idx);
  1158. if (querySubPattern())
  1159. querySubPattern()->expandRecursion(ctx, self);
  1160. if (queryNamed())
  1161. queryNamed()->expandRecursion(ctx, self);
  1162. HqlNamedRegex * replacement = NULL;
  1163. switch (getOperator())
  1164. {
  1165. case no_self:
  1166. replacement = self;
  1167. break;
  1168. case no_pat_use:
  1169. assertex(expr->queryChild(0)->queryValue());
  1170. replacement = ctx.queryDefine(expr->queryChild(0), caseSensitive);
  1171. break;
  1172. case no_define:
  1173. return LINK(&args.item(0));
  1174. }
  1175. if (replacement)
  1176. {
  1177. replacement->expandRecursion(ctx, self);
  1178. replacement->addRecursiveUse();
  1179. HqlRegexExpr * instance = createRegexExpr(options, no_pat_instance, NULL, caseSensitive);
  1180. instance->setNamed(replacement);
  1181. return instance;
  1182. }
  1183. return LINK(this);
  1184. }
  1185. void HqlRegexExpr::generateDFA(IDfaPattern * pattern)
  1186. {
  1187. //Estimate the number of transitions to avoid lots of reallocs.
  1188. unsigned numTransitions = 0;
  1189. {
  1190. ForEachItemIn(idx, dfa->states)
  1191. {
  1192. HqlDfaState & cur = dfa->states.item(idx);
  1193. unsigned min = 255;
  1194. unsigned max = 0;
  1195. ForEachItemIn(idx2, cur.transitions)
  1196. {
  1197. HqlDfaTransition & curTransition = cur.transitions.item(idx2);
  1198. if (curTransition.code < min) min = curTransition.code;
  1199. if (curTransition.code > max) max = curTransition.code;
  1200. }
  1201. if (min <= max)
  1202. numTransitions += (max-min)+1;
  1203. }
  1204. }
  1205. pattern->init(dfa->states.ordinality(), numTransitions);
  1206. ForEachItemIn(idx, dfa->states)
  1207. {
  1208. HqlDfaState & cur = dfa->states.item(idx);
  1209. assertex(cur.id == idx);
  1210. pattern->beginState(cur.id);
  1211. ForEachItemIn(idx, cur.position)
  1212. {
  1213. HqlRegexExpr & curExpr = cur.position.item(idx);
  1214. if (curExpr.getOperator() == no_pat_endpattern)
  1215. pattern->setStateAccept(curExpr.getAcceptId());
  1216. }
  1217. ForEachItemIn(idx2, cur.transitions)
  1218. {
  1219. HqlDfaTransition & curTransition = cur.transitions.item(idx2);
  1220. pattern->addTransition(curTransition.code, curTransition.next->id);
  1221. }
  1222. pattern->endState();
  1223. }
  1224. pattern->finished();
  1225. }
  1226. void HqlRegexExpr::generateRegex(GenerateRegexCtx & ctx)
  1227. {
  1228. ForEachItemIn(idx, args)
  1229. args.item(idx).generateRegex(ctx);
  1230. assertex(!created);
  1231. if (queryNamed())
  1232. queryNamed()->generateRegex(ctx);
  1233. if (querySubPattern())
  1234. querySubPattern()->generateRegex(ctx);
  1235. switch (getOperator())
  1236. {
  1237. case no_null:
  1238. case no_pat_beginpattern:
  1239. created.setown(new RegexNullPattern);
  1240. break;
  1241. case no_pat_beginrecursive:
  1242. created.setown(new RegexRecursivePattern);
  1243. break;
  1244. case no_pat_beginseparator:
  1245. created.setown(new RegexBeginSeparatorPattern);
  1246. break;
  1247. case no_pat_endseparator:
  1248. created.setown(new RegexEndSeparatorPattern);
  1249. break;
  1250. case no_pat_endrecursive:
  1251. created.setown(new RegexEndRecursivePattern);
  1252. break;
  1253. case no_pat_instance:
  1254. created.setown(new RegexNamedPattern);
  1255. break;
  1256. case no_pat_begincheck:
  1257. case no_pat_beginvalidate:
  1258. created.setown(new RegexBeginCheckPattern);
  1259. break;
  1260. case no_pat_endcheckin:
  1261. {
  1262. bool stripSeparator = ctx.info.addedSeparators && (expr->queryType()->getTypeCode() != type_pattern);
  1263. created.setown(new RegexCheckInPattern(expr->hasAttribute(notAtom), stripSeparator));
  1264. break;
  1265. }
  1266. case no_pat_endchecklength:
  1267. {
  1268. unsigned minLength = 0;
  1269. unsigned maxLength = PATTERN_UNLIMITED_LENGTH;
  1270. getCheckRange(expr->queryChild(1), minLength, maxLength, ctx.info.charSize);
  1271. bool stripSeparator = ctx.info.addedSeparators && (expr->queryType()->getTypeCode() != type_pattern);
  1272. created.setown(new RegexCheckLengthPattern(expr->hasAttribute(notAtom), stripSeparator, minLength, maxLength));
  1273. break;
  1274. }
  1275. case no_pat_endvalidate:
  1276. {
  1277. unsigned idx = ctx.regex.getValidatorIndex(expr);
  1278. assertex(idx != NotFound);
  1279. ValidateKind validatorKind = getValidateKind(ctx.regex.queryValidateExpr(expr));
  1280. bool stripSeparator = ctx.info.addedSeparators && (expr->queryType()->getTypeCode() != type_pattern);
  1281. switch (ctx.info.inputFormat())
  1282. {
  1283. case NlpUnicode:
  1284. if (validatorKind != ValidateIsString)
  1285. created.setown(new RegexValidateUniAsUniPattern(stripSeparator, idx));
  1286. else
  1287. created.setown(new RegexValidateUniAsAscPattern(stripSeparator, idx));
  1288. break;
  1289. case NlpUtf8:
  1290. if (validatorKind != ValidateIsString)
  1291. created.setown(new RegexValidateUtf8AsUniPattern(stripSeparator, idx));
  1292. else
  1293. created.setown(new RegexValidateUtf8AsAscPattern(stripSeparator, idx));
  1294. break;
  1295. case NlpAscii:
  1296. if (validatorKind != ValidateIsUnicode)
  1297. created.setown(new RegexValidateAscAsAscPattern(stripSeparator, idx));
  1298. else
  1299. created.setown(new RegexValidateAscAsUniPattern(stripSeparator, idx));
  1300. break;
  1301. }
  1302. break;
  1303. }
  1304. case no_pat_begintoken:
  1305. created.setown(new RegexBeginTokenPattern);
  1306. break;
  1307. case no_pat_endtoken:
  1308. created.setown(new RegexEndTokenPattern);
  1309. break;
  1310. case no_pat_x_before_y:
  1311. case no_pat_before_y:
  1312. created.setown(new RegexAssertNextPattern(expr->hasAttribute(notAtom)));
  1313. break;
  1314. case no_pat_x_after_y:
  1315. case no_pat_after_y:
  1316. {
  1317. unsigned minSize = limitedMult(querySubPattern()->limit.minLength, ctx.info.charSize);
  1318. unsigned maxSize = limitedMult(querySubPattern()->limit.maxLength, ctx.info.charSize);
  1319. created.setown(new RegexAssertPrevPattern(expr->hasAttribute(notAtom), minSize, maxSize));
  1320. break;
  1321. }
  1322. case no_pat_first:
  1323. created.setown(new RegexStartPattern);
  1324. break;
  1325. case no_pat_last:
  1326. created.setown(new RegexFinishPattern);
  1327. break;
  1328. case no_pat_anychar:
  1329. created.setown(new RegexAnyCharPattern);
  1330. break;
  1331. case no_penalty:
  1332. {
  1333. int penalty = 1;
  1334. if (expr->numChildren() > 0)
  1335. penalty = (int)expr->queryChild(0)->queryValue()->getIntValue();
  1336. created.setown(new RegexPenaltyPattern(penalty));
  1337. }
  1338. break;
  1339. case no_pat_set:
  1340. {
  1341. RegexSetBasePattern * pattern;
  1342. if (ctx.info.type != type_string)
  1343. pattern = new RegexUnicodeSetPattern(caseSensitive);
  1344. else if (caseSensitive)
  1345. pattern = new RegexAsciiSetPattern;
  1346. else
  1347. pattern = new RegexAsciiISetPattern;
  1348. created.setown(pattern);
  1349. if (expr->hasAttribute(notAtom))
  1350. pattern->setInvert(true);
  1351. ForEachChild(idx, expr)
  1352. {
  1353. IHqlExpression * child = expr->queryChild(idx);
  1354. switch (child->getOperator())
  1355. {
  1356. case no_range:
  1357. pattern->addRange((unsigned)child->queryChild(0)->queryValue()->getIntValue(), (unsigned)child->queryChild(1)->queryValue()->getIntValue());
  1358. break;
  1359. case no_constant:
  1360. pattern->addRange((unsigned)child->queryValue()->getIntValue(), (unsigned)child->queryValue()->getIntValue());
  1361. break;
  1362. case no_attr:
  1363. case no_attr_expr:
  1364. case no_attr_link:
  1365. break;
  1366. default:
  1367. UNIMPLEMENTED;
  1368. }
  1369. }
  1370. break;
  1371. }
  1372. case no_pat_const:
  1373. {
  1374. IValue * value = expr->queryChild(0)->queryValue();
  1375. Owned<IValue> castValue = getCastConstant(value, ctx.info.type);
  1376. unsigned len = castValue->queryType()->getStringLen();
  1377. switch (ctx.info.type)
  1378. {
  1379. case type_unicode:
  1380. {
  1381. const UChar * data = (const UChar *)castValue->queryValue();
  1382. if (caseSensitive)
  1383. created.setown(new RegexUnicodePattern(len, data));
  1384. else
  1385. created.setown(new RegexUnicodeIPattern(len, data));
  1386. break;
  1387. }
  1388. case type_utf8:
  1389. {
  1390. const char * data = (const char *)castValue->queryValue();
  1391. if (caseSensitive)
  1392. created.setown(new RegexUtf8Pattern(len, data));
  1393. else
  1394. created.setown(new RegexUtf8IPattern(len, data));
  1395. break;
  1396. }
  1397. case type_string:
  1398. {
  1399. const char * data = (const char *)castValue->queryValue();
  1400. if (caseSensitive)
  1401. created.setown(new RegexAsciiPattern(len, data));
  1402. else
  1403. created.setown(new RegexAsciiIPattern(len, data));
  1404. break;
  1405. }
  1406. }
  1407. break;
  1408. }
  1409. case no_pat_or:
  1410. case no_pat_follow:
  1411. case no_pat_separator:
  1412. //Nothing is generated for these....
  1413. break;
  1414. case no_pat_dfa:
  1415. {
  1416. RegexDfaPattern * pattern;
  1417. bool isToken = false;
  1418. HqlRegexExpr * child = queryChild(0);
  1419. if (child && child->getOperator() == no_pat_begintoken)
  1420. isToken = true;
  1421. if (options.type == type_unicode)
  1422. {
  1423. throwUnexpected();
  1424. pattern = new RegexUnicodeDfaPattern;
  1425. }
  1426. else
  1427. pattern = new RegexAsciiDfaPattern(isToken);
  1428. created.setown(pattern);
  1429. Owned<IDfaPattern> builder = pattern->createBuilder();
  1430. generateDFA(builder);
  1431. }
  1432. break;
  1433. case no_pat_repeat:
  1434. if (!isStandardRepeat())
  1435. {
  1436. if (expr->queryChild(0)->getOperator() == no_pat_anychar)
  1437. {
  1438. created.setown(new RegexRepeatAnyPattern(getRepeatMin(), getRepeatMax(), expr->hasAttribute(minimalAtom)));
  1439. queryNamed()->noGenerate = true;
  1440. }
  1441. else
  1442. created.setown(new RegexRepeatPattern(getRepeatMin(), getRepeatMax(), !expr->hasAttribute(minimalAtom)));
  1443. }
  1444. break;
  1445. case no_pat_endpattern:
  1446. switch (ctx.namedKind)
  1447. {
  1448. case no_pat_instance:
  1449. case no_pat_repeat:
  1450. created.setown(new RegexEndNestedPattern);
  1451. break;
  1452. case no_pat_checkin:
  1453. created.setown(new RegexDonePattern);
  1454. break;
  1455. case no_parse:
  1456. created.setown(new RegexDonePattern);
  1457. break;
  1458. default:
  1459. UNIMPLEMENTED;
  1460. }
  1461. break;
  1462. default:
  1463. UNIMPLEMENTED;
  1464. }
  1465. }
  1466. unsigned HqlRegexExpr::getAcceptId()
  1467. {
  1468. //Overloaded, and a bit nasty, but will do for the moment....
  1469. if (dfaScore == NO_DFA_SCORE)
  1470. return 99;
  1471. return dfaScore;
  1472. }
  1473. unsigned HqlRegexExpr::getRepeatMin()
  1474. {
  1475. return ::getRepeatMin(expr);
  1476. }
  1477. unsigned HqlRegexExpr::getRepeatMax()
  1478. {
  1479. return ::getRepeatMax(expr);
  1480. }
  1481. bool HqlRegexExpr::isStandardRepeat()
  1482. {
  1483. if (::isStandardRepeat(expr))
  1484. {
  1485. if (options.expandRepeatAnyAsDfa || expr->queryChild(0)->getOperator() != no_pat_anychar)
  1486. return true;
  1487. }
  1488. return false;
  1489. }
  1490. HqlRegexExpr * HqlRegexExpr::insertSeparators(IHqlExpression * separator, RegexContext * ctx)
  1491. {
  1492. switch (getOperator())
  1493. {
  1494. case no_pat_follow:
  1495. {
  1496. ForEachItemInRev(idx, args)
  1497. {
  1498. HqlRegexExpr & cur = args.item(idx);
  1499. if (cur.getOperator() == no_pat_endtoken)
  1500. {
  1501. if (args.item(idx-1).getOperator() != no_pat_last)
  1502. {
  1503. //Separators are slightly weird. Because they are implicit, the whole thing should
  1504. //be optional (unlike a name where just the contents are optional).
  1505. //Should probably remove the optionality from the separator's value, and
  1506. //put it on the surrounding separator instead.
  1507. HqlRegexExpr * sepRegex = createRegexExpr(options, no_pat_separator, NULL, caseSensitive);
  1508. sepRegex->addOperand(createRegexExpr(options, no_pat_beginseparator, NULL, caseSensitive));
  1509. sepRegex->addOperand(ctx->createStructure(separator, ctx->isCaseSensitive()));
  1510. sepRegex->addOperand(createRegexExpr(options, no_pat_endseparator, NULL, caseSensitive));
  1511. args.add(*sepRegex, idx+1);
  1512. }
  1513. }
  1514. }
  1515. break;
  1516. }
  1517. }
  1518. ForEachChild(idx, this)
  1519. {
  1520. replaceOperand(queryChild(idx)->insertSeparators(separator, ctx), idx);
  1521. }
  1522. return LINK(this);
  1523. }
  1524. bool HqlRegexExpr::isWorthConvertingToDFA()
  1525. {
  1526. switch (getOperator())
  1527. {
  1528. case no_pat_or: case no_pat_repeat:
  1529. case no_pat_follow:
  1530. return true;
  1531. }
  1532. ForEachItemIn(idx, args)
  1533. if (args.item(idx).isWorthConvertingToDFA())
  1534. return true;
  1535. return false;
  1536. }
  1537. bool HqlRegexExpr::isSimpleStringList()
  1538. {
  1539. switch (getOperator())
  1540. {
  1541. case no_pat_or:
  1542. case no_pat_case:
  1543. case no_pat_nocase:
  1544. break;
  1545. case no_pat_const:
  1546. return true;
  1547. default:
  1548. return false;
  1549. }
  1550. ForEachItemIn(idx, args)
  1551. if (!args.item(idx).isSimpleStringList())
  1552. return false;
  1553. return true;
  1554. }
  1555. void HqlRegexExpr::markDFAs(unsigned complexity)
  1556. {
  1557. if (dfaScore != NO_DFA_SCORE)
  1558. {
  1559. //Don't convert instances - convert the definition so it can be reused.
  1560. //and if not worth converting, then children won't be either.
  1561. if (getOperator() != no_pat_instance)
  1562. {
  1563. if (dfaScore <= complexity)
  1564. {
  1565. createDFA = isWorthConvertingToDFA();
  1566. return;
  1567. }
  1568. else if (isSimpleStringList())
  1569. {
  1570. createDFA = true;
  1571. return;
  1572. }
  1573. }
  1574. }
  1575. ForEachItemIn(idx, args)
  1576. args.item(idx).markDFAs(complexity);
  1577. if (queryNamed())
  1578. queryNamed()->markDFAs(complexity);
  1579. if (querySubPattern())
  1580. querySubPattern()->markDFAs(complexity);
  1581. }
  1582. HqlRegexExpr * HqlRegexExpr::mergeCreateSets()
  1583. {
  1584. ForEachItemIn(idx, args)
  1585. replaceOperand(args.item(idx).mergeCreateSets(), idx);
  1586. if (getOperator() == no_pat_or)
  1587. {
  1588. unsigned singleCount = 0;
  1589. unsigned setCount = 0;
  1590. unsigned nullCount = 0;
  1591. HqlExprArray setArgs;
  1592. HqlRegexExprArray newArgs;
  1593. ForEachItemIn(idx, args)
  1594. {
  1595. HqlRegexExpr * cur = &args.item(idx);
  1596. bool curIsOptional = false;
  1597. if ((cur->getOperator() == no_pat_repeat) && (cur->getRepeatMin() == 0) && (cur->getRepeatMax() == 1))
  1598. {
  1599. curIsOptional = true;
  1600. cur = cur->queryChild(0);
  1601. }
  1602. if (cur->caseSensitive == caseSensitive)
  1603. {
  1604. switch (cur->getOperator())
  1605. {
  1606. case no_null:
  1607. nullCount++;
  1608. break;
  1609. case no_pat_set:
  1610. {
  1611. setCount++;
  1612. ForEachChild(iset, cur->expr)
  1613. {
  1614. IHqlExpression * child = cur->expr->queryChild(iset);
  1615. if (setArgs.find(*child) == NotFound)
  1616. setArgs.append(*LINK(child));
  1617. }
  1618. if (curIsOptional) nullCount++;
  1619. break;
  1620. }
  1621. case no_pat_const:
  1622. {
  1623. ITypeInfo * type = cur->expr->queryChild(0)->queryType();
  1624. unsigned len = type->getStringLen();
  1625. if (len == 0)
  1626. nullCount++;
  1627. else if (len == 1)
  1628. {
  1629. const void * value = cur->expr->queryChild(0)->queryValue()->queryValue();
  1630. unsigned nextValue;
  1631. if (type->getTypeCode() == type_utf8)
  1632. nextValue = rtlUtf8Char(value);
  1633. else if (isUnicodeType(type))
  1634. nextValue = *(UChar *)value;
  1635. else
  1636. nextValue = *(unsigned char *)value;
  1637. OwnedHqlExpr nextExpr = createConstant((int)nextValue);
  1638. if (setArgs.find(*nextExpr) == NotFound)
  1639. setArgs.append(*nextExpr.getClear());
  1640. singleCount++;
  1641. if (curIsOptional) nullCount++;
  1642. }
  1643. else
  1644. newArgs.append(*LINK(cur));
  1645. break;
  1646. }
  1647. default:
  1648. newArgs.append(*LINK(cur));
  1649. break;
  1650. }
  1651. }
  1652. else
  1653. newArgs.append(*LINK(cur));
  1654. }
  1655. if ((setCount > 1) || (setCount == 1 && singleCount > 0) || (singleCount > 1))
  1656. {
  1657. OwnedHqlExpr newSetExpr = createValue(no_pat_set, makePatternType(), setArgs);
  1658. if (nullCount)
  1659. newSetExpr.setown(createValue(no_pat_repeat, makePatternType(), newSetExpr.getClear(), createConstant(0), createConstantOne()));
  1660. HqlRegexExpr * newSetRegex = createRegexExpr(options, newSetExpr, caseSensitive);
  1661. if (newArgs.ordinality() == 0)
  1662. return newSetRegex;
  1663. args.kill();
  1664. ForEachItemIn(idx, newArgs)
  1665. args.append(OLINK(newArgs.item(idx)));
  1666. args.append(*newSetRegex);
  1667. }
  1668. }
  1669. return LINK(this);
  1670. }
  1671. #if 0
  1672. HqlRegexExpr * HqlRegexExpr::removeTrivialInstances()
  1673. {
  1674. node_operator op = expr->getOperator();
  1675. switch (op)
  1676. {
  1677. case no_pat_instance:
  1678. {
  1679. IHqlExpression * def = expr->queryChild(0);
  1680. if (!queryNamed()->isMatched && def->getOperator() == no_pat_instance)
  1681. {
  1682. queryNamed()->removeUse();
  1683. return
  1684. #if 0
  1685. //Do this later - recursion means we can't do it yet
  1686. //Optimization whilst building the tree. If a named symbol is just a definition of another named symbol
  1687. //and this named symbol isn't matched explicitly, then expand it.
  1688. if (!isMatched(def) && def->getOperator() == no_pat_instance)
  1689. return createStructure(def);
  1690. else
  1691. }
  1692. }
  1693. }
  1694. }
  1695. #endif
  1696. #endif
  1697. //---------------------------------------------------------------------------
  1698. void HqlSimpleRegexExpr::analyseNullLeadTrail()
  1699. {
  1700. if (analysed)
  1701. return;
  1702. analysed = true;
  1703. switch (getOperator())
  1704. {
  1705. case no_null:
  1706. limit.minLength = 0;
  1707. limit.maxLength = 0;
  1708. break;
  1709. case no_pat_beginrecursive:
  1710. limit.minLength = 0;
  1711. limit.maxLength = 0;
  1712. limit.containsAssertion = true;
  1713. break;
  1714. case no_pat_beginpattern:
  1715. limit.minLength = 0;
  1716. limit.maxLength = 0;
  1717. break;
  1718. case no_pat_endpattern:
  1719. limit.minLength = 0;
  1720. limit.maxLength = 0;
  1721. limit.containsAssertion = true;
  1722. break;
  1723. case no_pat_singlechar:
  1724. limit.minLength = 1;
  1725. limit.maxLength = 1;
  1726. break;
  1727. }
  1728. }
  1729. void HqlSimpleRegexExpr::gatherLeading(HqlRegexUniqueArray & target)
  1730. {
  1731. switch (op)
  1732. {
  1733. case no_null:
  1734. case no_pat_beginrecursive:
  1735. case no_pat_beginpattern:
  1736. break;
  1737. case no_pat_endpattern:
  1738. case no_pat_singlechar:
  1739. target.append(OLINK(*this));
  1740. break;
  1741. default:
  1742. UNIMPLEMENTED;
  1743. }
  1744. }
  1745. void HqlSimpleRegexExpr::gatherTrailing(HqlRegexUniqueArray & target)
  1746. {
  1747. unsigned max = numTrailing();
  1748. for (unsigned i = 0; i < max; i++)
  1749. target.append(OLINK(queryTrailing(i)));
  1750. }
  1751. unsigned HqlSimpleRegexExpr::numTrailing() const
  1752. {
  1753. switch (getOperator())
  1754. {
  1755. case no_null:
  1756. case no_pat_endpattern:
  1757. return 0;
  1758. case no_pat_beginrecursive:
  1759. case no_pat_beginpattern:
  1760. case no_pat_singlechar:
  1761. return 1;
  1762. default:
  1763. UNIMPLEMENTED;
  1764. }
  1765. }
  1766. HqlRegexExpr & HqlSimpleRegexExpr::queryTrailing(unsigned idx)
  1767. {
  1768. switch (getOperator())
  1769. {
  1770. case no_pat_beginrecursive:
  1771. case no_pat_beginpattern:
  1772. case no_pat_singlechar:
  1773. if (idx == 0)
  1774. return *this;
  1775. break;
  1776. }
  1777. UNIMPLEMENTED;
  1778. }
  1779. //---------------------------------------------------------------------------
  1780. void HqlComplexRegexExpr::analyseNullLeadTrail()
  1781. {
  1782. if (analysed)
  1783. return;
  1784. analysed = true;
  1785. ForEachItemIn(idx, args)
  1786. args.item(idx).analyseNullLeadTrail();
  1787. if (queryNamed())
  1788. named->analyseNullLeadTrail();
  1789. switch (getOperator())
  1790. {
  1791. case no_null:
  1792. case no_pat_beginrecursive:
  1793. case no_pat_beginpattern:
  1794. case no_pat_endpattern:
  1795. case no_pat_singlechar:
  1796. //Should be handled by simpleRegex
  1797. UNIMPLEMENTED;
  1798. break;
  1799. case no_pat_instance:
  1800. limit = named->queryLimit();
  1801. limit.containsAssertion = true; // require us to walk it, even if internally matches nothing.
  1802. leading.append(*LINK(this));
  1803. trailing.append(*LINK(this));
  1804. break;
  1805. case no_pat_dfa:
  1806. limit = named->queryLimit();
  1807. limit.containsAssertion = true; // this means that it will process optional values
  1808. leading.append(*LINK(this));
  1809. trailing.append(*LINK(this));
  1810. break;
  1811. case no_pat_endrecursive:
  1812. case no_pat_x_before_y:
  1813. case no_pat_before_y:
  1814. case no_pat_x_after_y:
  1815. case no_pat_after_y:
  1816. case no_pat_first:
  1817. case no_pat_last:
  1818. case no_pat_begintoken:
  1819. case no_pat_endtoken:
  1820. case no_pat_begincheck:
  1821. case no_pat_endcheckin:
  1822. case no_pat_endchecklength:
  1823. case no_pat_beginseparator:
  1824. case no_pat_endseparator:
  1825. case no_pat_beginvalidate:
  1826. case no_pat_endvalidate:
  1827. case no_penalty:
  1828. limit.containsAssertion = true;
  1829. limit.minLength = 0;
  1830. limit.maxLength = 0;
  1831. leading.append(*LINK(this));
  1832. trailing.append(*LINK(this));
  1833. break;
  1834. case no_pat_anychar:
  1835. case no_pat_set:
  1836. limit.minLength = 1;
  1837. limit.maxLength = 1;
  1838. leading.append(*LINK(this));
  1839. trailing.append(*LINK(this));
  1840. break;
  1841. case no_pat_const:
  1842. {
  1843. unsigned len = expr->queryChild(0)->queryType()->getStringLen();
  1844. limit.minLength = len;
  1845. limit.maxLength = len;
  1846. if (len != 0)
  1847. {
  1848. leading.append(*LINK(this));
  1849. trailing.append(*LINK(this));
  1850. }
  1851. }
  1852. break;
  1853. case no_pat_or:
  1854. {
  1855. limit.minLength = PATTERN_UNLIMITED_LENGTH;
  1856. limit.maxLength = 0;
  1857. limit.containsAssertion = true;
  1858. ForEachItemIn(idx, args)
  1859. {
  1860. HqlRegexExpr & cur = args.item(idx);
  1861. if (cur.limit.minLength < limit.minLength)
  1862. limit.minLength = cur.limit.minLength;
  1863. if (cur.limit.maxLength > limit.maxLength)
  1864. limit.maxLength = cur.limit.maxLength;
  1865. if (cur.limit.canBeNull())
  1866. limit.containsAssertion = false;
  1867. cur.gatherLeading(leading);
  1868. cur.gatherTrailing(trailing);
  1869. }
  1870. break;
  1871. }
  1872. case no_pat_separator:
  1873. {
  1874. assertex(args.ordinality() == 3);
  1875. limit = args.item(1).limit;
  1876. leading.append(OLINK(args.item(0)));
  1877. trailing.append(OLINK(args.item(2)));
  1878. break;
  1879. }
  1880. case no_pat_follow:
  1881. {
  1882. limit.minLength = 0;
  1883. limit.maxLength = 0;
  1884. ForEachItemIn(idx0, args)
  1885. {
  1886. HqlRegexExpr & cur = args.item(idx0);
  1887. limit.minLength = limitedAdd(limit.minLength, cur.limit.minLength);
  1888. limit.maxLength = limitedAdd(limit.maxLength, cur.limit.maxLength);
  1889. if (cur.limit.containsAssertion)
  1890. limit.containsAssertion = true;
  1891. }
  1892. ForEachItemIn(idx1, args)
  1893. {
  1894. HqlRegexExpr & cur = args.item(idx1);
  1895. cur.gatherLeading(leading);
  1896. if (!cur.canBeNull())
  1897. break;
  1898. }
  1899. ForEachItemInRev(idx2, args)
  1900. {
  1901. HqlRegexExpr & cur = args.item(idx2);
  1902. cur.gatherTrailing(trailing);
  1903. if (!cur.canBeNull())
  1904. break;
  1905. }
  1906. break;
  1907. }
  1908. case no_pat_repeat:
  1909. {
  1910. if (isStandardRepeat())
  1911. {
  1912. HqlRegexExpr & arg = args.item(0);
  1913. limit.minLength = limitedMult(getRepeatMin(), arg.limit.minLength);
  1914. limit.maxLength = limitedMult(getRepeatMax(), arg.limit.maxLength);
  1915. if (getRepeatMin() != 0) limit.containsAssertion = arg.limit.containsAssertion;
  1916. arg.gatherLeading(leading);
  1917. arg.gatherTrailing(trailing);
  1918. }
  1919. else
  1920. {
  1921. limit.minLength = limitedMult(getRepeatMin(), named->getMinLength());
  1922. limit.maxLength = limitedMult(getRepeatMax(), named->getMaxLength());
  1923. //if (getRepeatMin() != 0) limit.containsAssertion = named->queryLimit().containsAssertion;
  1924. limit.containsAssertion = true;
  1925. leading.append(*LINK(this));
  1926. trailing.append(*LINK(this));
  1927. }
  1928. break;
  1929. }
  1930. case no_pat_pattern:
  1931. // Should have been converted by the time we get here...
  1932. default:
  1933. UNIMPLEMENTED;
  1934. break;
  1935. }
  1936. }
  1937. void HqlComplexRegexExpr::cleanup()
  1938. {
  1939. if (cleaned)
  1940. return;
  1941. HqlRegexExpr::cleanup();
  1942. trailing.kill();
  1943. leading.kill();
  1944. subPattern.clear();
  1945. }
  1946. void HqlComplexRegexExpr::gatherLeading(HqlRegexUniqueArray & target)
  1947. {
  1948. inherit(target, leading);
  1949. }
  1950. void HqlComplexRegexExpr::gatherTrailing(HqlRegexUniqueArray & target)
  1951. {
  1952. inherit(target, trailing);
  1953. }
  1954. //---------------------------------------------------------------------------
  1955. // DFA helper classes.
  1956. inline int compareHqlRegexExpr(HqlRegexExpr * left, HqlRegexExpr * right)
  1957. {
  1958. unique_id_t idl = left->queryUid();
  1959. unique_id_t idr = right->queryUid();
  1960. return (idl < idr) ? -1 : (idl > idr) ? +1 : 0;
  1961. }
  1962. int compareHqlRegexExpr(CInterface * * left, CInterface * * right)
  1963. {
  1964. return compareHqlRegexExpr((HqlRegexExpr *)*left, (HqlRegexExpr *)*right);
  1965. }
  1966. void HqlRegexExprSet::add(HqlRegexExpr * value)
  1967. {
  1968. bool isNew;
  1969. CInterface * castValue = value;
  1970. values.bAdd(castValue, compareHqlRegexExpr, isNew);
  1971. }
  1972. int HqlRegexExprSet::compare(const HqlRegexExprSet & other) const
  1973. {
  1974. unsigned numThis = values.ordinality();
  1975. unsigned numOther = other.values.ordinality();
  1976. unsigned numCommon = std::min(numThis, numOther);
  1977. for (unsigned i = 0; i < numCommon; i++)
  1978. {
  1979. HqlRegexExpr & left = values.item(i);
  1980. HqlRegexExpr & right = other.values.item(i);
  1981. int c = compareHqlRegexExpr(&left, &right);
  1982. if (c)
  1983. return c;
  1984. }
  1985. if (numThis > numOther)
  1986. return +1;
  1987. else if (numThis < numOther)
  1988. return -1;
  1989. else
  1990. return 0;
  1991. }
  1992. bool HqlRegexExprSet::equals(const HqlRegexExprSet & other) const
  1993. {
  1994. unsigned numThis = values.ordinality();
  1995. unsigned numOther = other.values.ordinality();
  1996. if (numThis != numOther)
  1997. return false;
  1998. for (unsigned i = 0; i < numThis; i++)
  1999. {
  2000. if (&values.item(i) != &other.values.item(i))
  2001. return false;
  2002. }
  2003. return true;
  2004. }
  2005. bool HqlDfaState::isAccepting()
  2006. {
  2007. ForEachItemIn(idx, position)
  2008. if (position.item(idx).getOperator() == no_pat_endpattern)
  2009. return true;
  2010. return false;
  2011. }
  2012. static inline int compareState(HqlDfaState * left, HqlDfaState * right)
  2013. {
  2014. return left->position.compare(right->position);
  2015. }
  2016. static int compareState(CInterface * * left, CInterface * * right)
  2017. {
  2018. return compareState((HqlDfaState *)*left, (HqlDfaState *)*right);
  2019. }
  2020. void HqlRegexExpr::generateDFAs()
  2021. {
  2022. if (op != no_pat_dfa)
  2023. {
  2024. ForEachItemIn(idx, args)
  2025. args.item(idx).generateDFAs();
  2026. return;
  2027. }
  2028. //Adapted from Dragon p141 Fig 3.44
  2029. //Main variation is that the set of potential symbols is calculated first, rather than trying the whole alphabet
  2030. //it might also have character-classes e.g., [[:alpha:]] in the stream.
  2031. dfa = new HqlDfaInfo;
  2032. HqlDfaState * firstState = new HqlDfaState(0);
  2033. HqlRegexExpr * first = queryNamed()->def->queryChild(0);
  2034. ForEachItemIn(idx, first->following)
  2035. firstState->position.add(&first->following.item(idx));
  2036. //Store a list of states, and an ordered list. First marks which have been processed,
  2037. //second gives efficient duplicate detection.
  2038. dfa->states.append(*firstState);
  2039. dfa->orderedStates.append(*LINK(firstState));
  2040. unsigned curStateIndex = 0;
  2041. while (curStateIndex < dfa->states.ordinality())
  2042. {
  2043. HqlDfaState & curState = dfa->states.item(curStateIndex++);
  2044. //First gather the potential symbols that come next (otherwise we would die with unicode!)
  2045. //it also speeds up ascii by a fairly large factor on sets of strings.
  2046. SymbolArray nextSymbols;
  2047. ForEachItemIn(ip, curState.position)
  2048. curState.position.item(ip).gatherConsumeSymbols(nextSymbols);
  2049. //For each symbol, work out what combination of states we could become translated to.
  2050. SymbolArrayIterator iter(nextSymbols);
  2051. ForEach(iter)
  2052. {
  2053. unsigned curSymbol = iter.get();
  2054. HqlDfaState * nextState = new HqlDfaState(dfa->states.ordinality());
  2055. //For each NFA state we could be in, what new NFA state could be now be in...
  2056. ForEachItemIn(ip, curState.position)
  2057. {
  2058. HqlRegexExpr & curRegex = curState.position.item(ip);
  2059. if (curRegex.canConsume(curSymbol))
  2060. {
  2061. ForEachItemIn(idx2, curRegex.following)
  2062. nextState->position.add(&curRegex.following.item(idx2));
  2063. }
  2064. }
  2065. //If no states, then don't bother adding a transition - we're done..
  2066. if (nextState->position.ordinality())
  2067. {
  2068. bool isNew = false;
  2069. CInterface * castState = nextState;
  2070. unsigned matchIndex = dfa->orderedStates.bAdd(castState, compareState, isNew);
  2071. if (isNew)
  2072. dfa->states.append(*LINK(nextState));
  2073. else
  2074. nextState->Release();
  2075. HqlDfaState * matchedState = &dfa->orderedStates.item(matchIndex);
  2076. //Finally associate a transition for this symbol...
  2077. curState.transitions.append(*new HqlDfaTransition(curSymbol, matchedState));
  2078. }
  2079. else
  2080. nextState->Release();
  2081. }
  2082. }
  2083. }
  2084. /**
  2085. Some unicode pseudo code
  2086. ForEach(ip in curState.position)
  2087. {
  2088. position(ip).getConsumed(consumeSet);
  2089. positionSet = [ip];
  2090. if (consumeSet.ordinality())
  2091. Merge(consumeSet, 0, positionSet);
  2092. }
  2093. Merge(consumeSet, first, positionSet)
  2094. {
  2095. for (i = first; i < sets.ordinality(); i++)
  2096. {
  2097. diff = intersect(sets(i).consume, consumeSet);
  2098. if (diff.ordinality())
  2099. {
  2100. rightOnly = sets(i).consume - diff;
  2101. if (rightOnly.ordinality() == 0)
  2102. sets(i).positions.union(positionSet);
  2103. else
  2104. {
  2105. sets(i).consume = rightOnly;
  2106. Merge(diff, i+1, union(sets(i).positions, positionSet);
  2107. }
  2108. newOnly = consumeSet - diff;
  2109. if (newOnly = [])
  2110. return;
  2111. consumeSet = newOnly;
  2112. }
  2113. }
  2114. sets.append(consumeSet, positionSet);
  2115. }
  2116. Output would be a set of disjoint sets, together with a lits of positions that are matched by those sets.
  2117. They could then be used to generate the tables required by a dfa matcher list of
  2118. low, high, target-state
  2119. which was binary chopped.
  2120. It might be better to not use the icu sets, except for generating a set of ranges, and handle everything else ourselves.
  2121. Possibly a set of [low,high,positions], or maybe even [low, high, position], sorted by low
  2122. with some clever code to walk through and retain lists of which ones are active.
  2123. **/
  2124. //---------------------------------------------------------------------------
  2125. RegexContext::RegexContext(IHqlExpression * _expr, IWorkUnit * _wu, const HqlCppOptions & _options, ITimeReporter * _timeReporter, byte _algorithm) : NlpParseContext(_expr, _wu, _options, _timeReporter), parser(NULL, _algorithm)
  2126. {
  2127. info.addedSeparators = false;
  2128. switch (info.type)
  2129. {
  2130. case type_string: info.dfaComplexity = DEFAULT_DFA_COMPLEXITY; break;
  2131. case type_utf8: info.dfaComplexity = DEFAULT_UTF8_DFA_COMPLEXITY; break;
  2132. case type_unicode: info.dfaComplexity = DEFAULT_UNICODE_DFA_COMPLEXITY; break;
  2133. }
  2134. if (_options.parseDfaComplexity != (unsigned)-1)
  2135. info.dfaComplexity = _options.parseDfaComplexity;
  2136. info.expandRepeatAnyAsDfa = _options.expandRepeatAnyAsDfa;
  2137. createLexer = false;
  2138. }
  2139. RegexContext::~RegexContext()
  2140. {
  2141. ForEachItemIn(idx, named)
  2142. named.item(idx).cleanup();
  2143. }
  2144. HqlNamedRegex * RegexContext::queryNamed(IHqlExpression * defn, IAtom * name, node_operator op, bool caseSensitive)
  2145. {
  2146. ForEachItemIn(idx, named)
  2147. {
  2148. HqlNamedRegex & cur = named.item(idx);
  2149. if (cur.matches(defn, name, caseSensitive))
  2150. {
  2151. assertex(cur.matches(defn, name, op, caseSensitive));
  2152. return &cur;
  2153. }
  2154. }
  2155. return NULL;
  2156. }
  2157. HqlNamedRegex * RegexContext::createNamed(IHqlExpression * expr, IAtom * name, node_operator op, bool caseSensitive)
  2158. {
  2159. LinkedHqlExpr searchExpr = expr;
  2160. if (op != no_pat_instance)
  2161. //Create an expression that should never clash with anything we would find in the parse tree
  2162. searchExpr.setown(createValue(op, expr->getType(), LINK(expr), createAttribute(tempAtom)));
  2163. HqlNamedRegex * match = queryNamed(searchExpr, name, op, caseSensitive);
  2164. if (match)
  2165. match->addUse();
  2166. else
  2167. {
  2168. match = new HqlNamedRegex(expr, name, searchExpr, op, caseSensitive, isMatched(searchExpr, name));
  2169. named.append(*match); // add to list first to avoid recursion problems.
  2170. match->setRegexOwn(createStructure(expr, caseSensitive));
  2171. }
  2172. return match;
  2173. }
  2174. static HqlRegexExpr * createFollow(const ParseInformation & options, HqlRegexExpr * a1, HqlRegexExpr * a2, HqlRegexExpr * a3, bool caseSensitive)
  2175. {
  2176. HqlRegexExpr * follow = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
  2177. follow->addOperand(a1);
  2178. follow->addOperand(a2);
  2179. if (a3)
  2180. follow->addOperand(a3);
  2181. return follow;
  2182. }
  2183. HqlRegexExpr * RegexContext::createFollow(HqlRegexExpr * start, IHqlExpression * body, node_operator endOp, IHqlExpression * endExpr, bool caseSensitive)
  2184. {
  2185. HqlRegexExpr * next = createStructure(body, caseSensitive);
  2186. HqlRegexExpr * finish = createRegexExpr(info, endOp, endExpr, caseSensitive);
  2187. return ::createFollow(info, start, next, finish, caseSensitive);
  2188. }
  2189. HqlRegexExpr * RegexContext::createStructure(IHqlExpression * expr, bool caseSensitive)
  2190. {
  2191. node_operator op = expr->getOperator();
  2192. switch (op)
  2193. {
  2194. case no_pat_production:
  2195. throwError(HQLERR_RegexNoTransformSupport);
  2196. case no_pat_instance:
  2197. {
  2198. IHqlExpression * def = expr->queryChild(0);
  2199. if (createLexer)
  2200. return createStructure(def, caseSensitive);
  2201. Owned<HqlRegexExpr> instance = createRegexExpr(info, expr, caseSensitive);
  2202. IHqlExpression * nameExpr = expr->queryChild(1);
  2203. instance->setNamed(createNamed(def, nameExpr ? nameExpr->queryName() : NULL, op, caseSensitive));
  2204. return instance.getClear();
  2205. }
  2206. case no_pat_guard:
  2207. return createRegexExpr(info, no_null, NULL, caseSensitive);
  2208. case no_attr:
  2209. case no_attr_expr:
  2210. case no_attr_link:
  2211. return NULL;
  2212. case no_implicitcast:
  2213. //can sometimes occur when parameters are bound
  2214. return createStructure(expr->queryChild(0), caseSensitive);
  2215. case no_pat_const:
  2216. if (createLexer)
  2217. return expandStringAsChars(expr, info, caseSensitive);
  2218. return createRegexExpr(info, expr, caseSensitive);
  2219. case no_pat_set:
  2220. case no_penalty:
  2221. //Don't walk children...
  2222. return createRegexExpr(info, expr, caseSensitive);
  2223. case no_pat_x_before_y:
  2224. case no_pat_x_after_y:
  2225. {
  2226. Owned<HqlRegexExpr> pattern = createStructure(expr->queryChild(0), caseSensitive);
  2227. Owned<HqlRegexExpr> check = createRegexExpr(info, expr, caseSensitive);
  2228. check->setSubPattern(createNamed(expr->queryChild(1), NULL, no_pat_checkin, caseSensitive));
  2229. if (op == no_pat_x_before_y)
  2230. return ::createFollow(info, pattern.getClear(), check.getClear(), NULL, caseSensitive);
  2231. else
  2232. return ::createFollow(info, check.getClear(), pattern.getClear(), NULL, caseSensitive);
  2233. }
  2234. case no_pat_before_y:
  2235. case no_pat_after_y:
  2236. {
  2237. Owned<HqlRegexExpr> check = createRegexExpr(info, expr, caseSensitive);
  2238. check->setSubPattern(createNamed(expr->queryChild(0), NULL, no_pat_checkin, caseSensitive));
  2239. return check.getClear();
  2240. }
  2241. case no_pat_checkin:
  2242. {
  2243. Owned<HqlRegexExpr> start = createRegexExpr(info, no_pat_begincheck, expr, caseSensitive);
  2244. Owned<HqlRegexExpr> next = createStructure(expr->queryChild(0), caseSensitive);
  2245. Owned<HqlRegexExpr> finish = createRegexExpr(info, no_pat_endcheckin, expr, caseSensitive);
  2246. finish->setSubPattern(createNamed(expr->queryChild(1), NULL, no_pat_checkin, caseSensitive));
  2247. return ::createFollow(info, start.getClear(), next.getClear(), finish.getClear(), caseSensitive);
  2248. }
  2249. case no_pat_checklength:
  2250. {
  2251. HqlRegexExpr * start = createRegexExpr(info, no_pat_begincheck, expr, caseSensitive);
  2252. return createFollow(start, expr->queryChild(0), no_pat_endchecklength, expr, caseSensitive);
  2253. }
  2254. case no_pat_validate:
  2255. {
  2256. HqlRegexExpr * start = createRegexExpr(info, no_pat_beginvalidate, expr, caseSensitive);
  2257. return createFollow(start, expr->queryChild(0), no_pat_endvalidate, expr, caseSensitive);
  2258. }
  2259. case no_pat_token:
  2260. case no_pat_imptoken:
  2261. {
  2262. IHqlExpression * child = expr->queryChild(0);
  2263. //MORE: Covert into a function to allow multiple values of following:
  2264. switch (child->getOperator())
  2265. {
  2266. case no_null:
  2267. case no_pat_first:
  2268. case no_pat_last:
  2269. return createStructure(child, caseSensitive);
  2270. case no_penalty:
  2271. case no_pat_x_before_y:
  2272. case no_pat_before_y:
  2273. case no_pat_x_after_y:
  2274. case no_pat_after_y:
  2275. case no_pat_guard:
  2276. return createStructure(child, caseSensitive);
  2277. case no_pat_const:
  2278. // if (!info.separator)
  2279. // return createStructure(child);
  2280. break;
  2281. }
  2282. HqlRegexExpr * start = createRegexExpr(info, no_pat_begintoken, expr, caseSensitive);
  2283. return createFollow(start, expr->queryChild(0), no_pat_endtoken, expr, caseSensitive);
  2284. }
  2285. case no_pat_repeat:
  2286. {
  2287. HqlRegexExpr * ret = createRegexExpr(info, expr, caseSensitive);
  2288. IHqlExpression * repeated = expr->queryChild(0);
  2289. if (ret->isStandardRepeat())
  2290. ret->addOperand(createStructure(repeated, caseSensitive));
  2291. else
  2292. ret->setNamed(createNamed(repeated, NULL, op, caseSensitive));
  2293. return ret;
  2294. }
  2295. case no_pat_or:
  2296. case no_pat_follow:
  2297. {
  2298. //Expand these out as much as possible....
  2299. HqlExprArray args;
  2300. expr->unwindList(args, op);
  2301. OwnedHqlExpr flatExpr = expr->clone(args);
  2302. Owned<HqlRegexExpr> ret = createRegexExpr(info, flatExpr, caseSensitive);
  2303. ForEachChild(idx, flatExpr)
  2304. {
  2305. HqlRegexExpr * child = createStructure(flatExpr->queryChild(idx), caseSensitive);
  2306. if (child)
  2307. ret->addOperand(child);
  2308. }
  2309. return ret.getClear();
  2310. }
  2311. case no_pat_case:
  2312. case no_pat_nocase:
  2313. return createStructure(expr->queryChild(0), op==no_pat_case);
  2314. case no_pat_featureactual:
  2315. throwError(HQLERR_RegexFeatureNotSupport);
  2316. }
  2317. Owned<HqlRegexExpr> ret = createRegexExpr(info, expr, caseSensitive);
  2318. ForEachChild(idx, expr)
  2319. {
  2320. HqlRegexExpr * child = createStructure(expr->queryChild(idx), caseSensitive);
  2321. if (child)
  2322. ret->addOperand(child);
  2323. }
  2324. return ret.getClear();
  2325. }
  2326. void RegexContext::buildStructure()
  2327. {
  2328. unsigned startTime = msTick();
  2329. IHqlExpression * grammar = expr->queryChild(2);
  2330. assertex(grammar->getOperator() == no_pat_instance);
  2331. IAtom * name = grammar->queryChild(1)->queryName();
  2332. OwnedHqlExpr structure = LINK(grammar);//createValue(no_pat_instance, makeRuleType(NULL), LINK(grammar), LINK(grammar->queryChild(1)));
  2333. HqlRegexExpr * rootRegex = createStructure(structure, isCaseSensitive());
  2334. root.setown(new HqlNamedRegex(structure, internalAtom, structure, no_parse, isCaseSensitive(), false));
  2335. root->setRegexOwn(rootRegex);
  2336. named.append(*LINK(root));
  2337. DEBUG_TIMERX(timeReporter, "EclServer: Generate PARSE: Create Structure", msTick()-startTime);
  2338. }
  2339. void RegexContext::expandRecursion()
  2340. {
  2341. //First add all the symbols referenced by the use() attributes on the parse so they can be matched.
  2342. ForEachChild(idx, expr)
  2343. {
  2344. IHqlExpression * cur = expr->queryChild(idx);
  2345. if (cur->getOperator() == no_pat_use)
  2346. {
  2347. //NB: Does not return a linked item.
  2348. HqlNamedRegex * named = createNamed(cur->queryChild(0), NULL, no_pat_instance, true);
  2349. named->removeUse(); // Not actually used at the moment...
  2350. HqlNamedRegex * named2 = createNamed(cur->queryChild(0), NULL, no_pat_instance, false);
  2351. named2->removeUse(); // Not actually used at the moment...
  2352. }
  2353. }
  2354. root->expandRecursion(*this, NULL);
  2355. ForEachItemInRev(idx2, named)
  2356. if (!named.item(idx2).queryExpandedRecursion())
  2357. named.remove(idx2);
  2358. }
  2359. void RegexContext::insertSeparators()
  2360. {
  2361. if (info.separator)
  2362. {
  2363. ForEachItemIn(idx, named)
  2364. named.item(idx).insertSeparators(info.separator, this);
  2365. //add separator/first onto the front of root.
  2366. info.addedSeparators = true;
  2367. }
  2368. }
  2369. void RegexContext::optimizeSpotDFA()
  2370. {
  2371. if (info.dfaComplexity > 0)
  2372. {
  2373. ForEachItemIn(idx, named)
  2374. named.item(idx).calcDfaScore();
  2375. root->markDFAs(info.dfaComplexity);
  2376. ForEachItemIn(idx2, named)
  2377. named.item(idx2).createDFAs(*this);
  2378. }
  2379. }
  2380. void RegexContext::optimizePattern()
  2381. {
  2382. unsigned startTime = msTick();
  2383. ForEachItemIn(idx1, named)
  2384. named.item(idx1).mergeCreateSets();
  2385. root->expandNamedSymbols();
  2386. ForEachItemInRev(idx2, named)
  2387. {
  2388. if (!named.item(idx2).isUsed())
  2389. {
  2390. //Can't delete it otherwise everything gets cleaned up...
  2391. deadNamed.append(named.item(idx2));
  2392. named.remove(idx2, true);
  2393. }
  2394. }
  2395. optimizeSpotDFA();
  2396. DEBUG_TIMERX(timeReporter, "EclServer: Generate PARSE: Optimize", msTick()-startTime);
  2397. }
  2398. HqlNamedRegex * RegexContext::queryDefine(IHqlExpression * defineName, bool caseSensitive)
  2399. {
  2400. ForEachItemIn(idx, named)
  2401. {
  2402. HqlNamedRegex & cur = named.item(idx);
  2403. if (cur.matchesDefine(defineName,caseSensitive))
  2404. return &cur;
  2405. }
  2406. StringBuffer s;
  2407. defineName->toString(s);
  2408. throwError1(HQLERR_DefineUseStrNotFound, s.str());
  2409. return NULL;
  2410. }
  2411. void RegexContext::analysePattern()
  2412. {
  2413. unsigned startTime = msTick();
  2414. //This conversion is based around the description in the Dragon book:
  2415. //3.9 From a regular expression to a DFA
  2416. //even though we don't always convert it, the steps form a useful algorithm
  2417. ForEachItemIn(idx0, named)
  2418. named.item(idx0).addBeginEnd(info);
  2419. ForEachItemIn(idx1, named)
  2420. named.item(idx1).analyseNullLeadTrail();
  2421. ForEachItemIn(idx2, named)
  2422. named.item(idx2).calculateNext();
  2423. ForEachItemIn(idx3, named)
  2424. named.item(idx3).generateDFAs();
  2425. DEBUG_TIMERX(timeReporter, "EclServer: Generate PARSE: Analyse", msTick()-startTime);
  2426. }
  2427. void RegexContext::generateRegex()
  2428. {
  2429. unsigned startTime = msTick();
  2430. parser.addedSeparators = info.addedSeparators;
  2431. setParserOptions(parser);
  2432. GenerateRegexCtx ctx(*this, info, idAllocator);
  2433. ForEachItemIn(idx0, named)
  2434. {
  2435. HqlNamedRegex & cur = named.item(idx0);
  2436. cur.generateRegex(ctx);
  2437. }
  2438. parser.grammar.set(root->queryRootPattern());
  2439. parser.minPatternLength = root->getMinLength();
  2440. DEBUG_TIMERX(timeReporter, "EclServer: Generate PARSE: Generate", msTick()-startTime);
  2441. }
  2442. //void RegexContext::removeTrivialInstances()
  2443. //{
  2444. // ForEachItemIn(idx, named)
  2445. // named.item(idx).removeTrivialInstances();
  2446. //}
  2447. void RegexContext::compileSearchPattern()
  2448. {
  2449. checkValidMatches();
  2450. buildStructure();
  2451. expandRecursion();
  2452. // removeTrivialInstances();
  2453. insertSeparators();
  2454. optimizePattern();
  2455. analysePattern();
  2456. generateRegex();
  2457. compileMatched(parser);
  2458. }
  2459. void RegexContext::getDebugText(StringBuffer & s, unsigned detail)
  2460. {
  2461. NlpParseContext::getDebugText(s, detail);
  2462. regexToXml(s, parser.grammar, detail);
  2463. }
  2464. NlpParseContext * createRegexContext(IHqlExpression * expr, IWorkUnit * wu, const HqlCppOptions & options, ITimeReporter * timeReporter, byte algorithm)
  2465. {
  2466. return new RegexContext(expr, wu, options, timeReporter, algorithm);
  2467. }
  2468. //-- Lexer creation
  2469. void RegexContext::beginLexer()
  2470. {
  2471. createLexer = true;
  2472. info.expandRepeatAnyAsDfa = true;
  2473. OwnedHqlExpr searchExpr = createValue(no_pat_dfa, makePatternType(), createUniqueId());
  2474. lexerNamed.setown(new HqlNamedRegex(searchExpr, NULL, searchExpr, no_pat_dfa, isCaseSensitive(), false));
  2475. lexerOr.setown(createRegexExpr(info, no_pat_or, NULL, isCaseSensitive()));
  2476. lexerNamed->setRegexOwn(LINK(lexerOr));
  2477. lexerRoot.setown(createRegexExpr(info, no_pat_dfa, NULL, isCaseSensitive()));
  2478. lexerRoot->setNamed(lexerNamed);
  2479. root.setown(new HqlNamedRegex(expr, NULL, expr, no_parse, isCaseSensitive(), false));
  2480. root->setRegexOwn(LINK(lexerRoot));
  2481. named.append(*LINK(lexerNamed));
  2482. named.append(*LINK(root));
  2483. }
  2484. void RegexContext::addLexerToken(unsigned id, IHqlExpression * pattern)
  2485. {
  2486. HqlRegexExpr * token = createStructure(pattern, isCaseSensitive());
  2487. HqlRegexExpr * end = createRegexExpr(info, no_pat_endpattern, NULL, isCaseSensitive());
  2488. end->setAcceptId(id);
  2489. if (token->getOperator() == no_pat_follow)
  2490. token->addOperand(end);
  2491. else
  2492. {
  2493. HqlRegexExpr * follow = createRegexExpr(info, no_pat_follow, NULL, isCaseSensitive());
  2494. follow->addOperand(token);
  2495. follow->addOperand(end);
  2496. token = follow;
  2497. }
  2498. lexerOr->addOperand(token);
  2499. }
  2500. void RegexContext::generateLexer(IDfaPattern * builder)
  2501. {
  2502. //MORE: Need to call some elements of optimizePattern() to expand limited repeats.
  2503. analysePattern();
  2504. lexerRoot->generateDFA(builder);
  2505. }
  2506. /*
  2507. ToDo:
  2508. o Should move some of the logic from HqlregexExpr to complex + remove the queyNamed() etc. virtuals.
  2509. */