thorrparse.cpp 98 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763277327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565356635673568356935703571357235733574357535763577357835793580358135823583358435853586358735883589359035913592359335943595359635973598359936003601360236033604360536063607360836093610361136123613361436153616361736183619362036213622362336243625362636273628362936303631363236333634363536363637363836393640364136423643364436453646364736483649
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #ifndef U_OVERRIDE_CXX_ALLOCATION
  14. #define U_OVERRIDE_CXX_ALLOCATION 0 // Enabling this forces all allocation of ICU objects to ICU's heap, but is incompatible with jmemleak
  15. #endif
  16. #include "jliball.hpp"
  17. #include "junicode.hpp"
  18. #include "thorrparse.ipp"
  19. #include "thorregex.hpp"
  20. #include "eclrtl.hpp"
  21. #include "thorcommon.ipp"
  22. #ifdef _USE_ICU
  23. #include "unicode/utf.h"
  24. #include "unicode/uchar.h"
  25. #include "unicode/schriter.h"
  26. #include "unicode/coll.h"
  27. #include "unicode/ustring.h"
  28. using icu::UnicodeString;
  29. #endif
  30. #ifdef TRACE_REGEX
  31. #define MATCH traceMatch
  32. #else
  33. #define MATCH match
  34. #endif
  35. #define TRANSFORM_BEFORE_CREATE_LIMIT 4096 // Over this size, transform into a temp
  36. #define REGEX_VERSION 4
  37. #define REGEXERR_IncompatibleVersion 1000
  38. #define REGEXERR_MatchFailure 1001
  39. #define REGEXERR_IncompatibleVersion_Text "Serialized regex library is not compatible, please rebuild workunit"
  40. #define REGEXERR_MatchFailure_Text "Failed to match pattern: pattern too complicated (offset %d)"
  41. static const char * getPatternText(ThorRegexKind kind)
  42. {
  43. switch (kind)
  44. {
  45. case ThorRegexNull: return "Null";
  46. case ThorRegexAnyChar: return "AnyChar";
  47. case ThorRegexAsciiDFA: return "AsciiDFA";
  48. case ThorRegexUnicodeDFA: return "UnicodeDFA";
  49. case ThorRegexAscii: return "Ascii";
  50. case ThorRegexAsciiI: return "AsciiI";
  51. case ThorRegexAsciiSet: return "AsciiSet";
  52. case ThorRegexAsciiISet: return "AsciiISet";
  53. case ThorRegexUnicode: return "Unicode";
  54. case ThorRegexUnicodeI: return "UnicodeI";
  55. case ThorRegexUnicodeSet: return "UnicodeSet";
  56. case ThorRegexUnicodeISet: return "UnicodeISet";
  57. case ThorRegexUtf8: return "Utf8";
  58. case ThorRegexUtf8I: return "Utf8I";
  59. case ThorRegexStart: return "Start";
  60. case ThorRegexFinish: return "Finish";
  61. case ThorRegexBeginToken: return "BeginToken";
  62. case ThorRegexEndToken: return "EndToken";
  63. case ThorRegexBeginSeparator: return "BeginSeparator";
  64. case ThorRegexEndSeparator: return "EndSeparator";
  65. case ThorRegexRepeat: return "Repeat";
  66. case ThorRegexBeginCheck: return "BeginCheck";
  67. case ThorRegexCheck: return "Check";
  68. case ThorRegexCheckLength: return "CheckLength";
  69. case ThorRegexAssertNext: return "AssertNext";
  70. case ThorRegexAssertPrev: return "AssertPrev";
  71. case ThorRegexNamed: return "Named";
  72. case ThorRegexEndNamed: return "EndNamed";
  73. case ThorRegexEndNested: return "EndNested";
  74. case ThorRegexDone: return "Done";
  75. case ThorRegexRepeatInstance: return "RepeatInstance";
  76. case ThorRegexValidateAscAsAsc: return "ValidateAscAsAsc";
  77. case ThorRegexValidateUniAsAsc: return "ValidateUniAsAsc" ;
  78. case ThorRegexValidateUtf8AsAsc: return "ValidateUtf8AsAsc" ;
  79. case ThorRegexValidateAscAsUni: return "ValidateAscAsUni" ;
  80. case ThorRegexValidateUniAsUni: return "ValidateUniAsUni";
  81. case ThorRegexValidateUtf8AsUni: return "ValidateUtf8AsUni" ;
  82. case ThorRegexRecursive: return "Recursive";
  83. case ThorRegexEndRecursive: return "EndRecursive";
  84. case ThorRegexPenalty: return "Penalty";
  85. case ThorRegexRepeatAny: return "RepeatAny";
  86. }
  87. UNIMPLEMENTED;
  88. }
  89. static const char * resultText[] = { "Done", "Backtrack", "TokenBack", "Continue" };
  90. #ifdef TRACE_REGEX
  91. static unsigned patternDepth;
  92. #endif
  93. bool isAsciiMatch(unsigned code, unsigned next)
  94. {
  95. switch (code)
  96. {
  97. case RCCalnum: return isalnum(next) != 0;
  98. case RCCcntrl: return iscntrl(next) != 0;
  99. case RCClower: return islower(next) != 0;
  100. case RCCupper: return isupper(next) != 0;
  101. case RCCspace: return isspace(next) != 0;
  102. case RCCalpha: return isalpha(next) != 0;
  103. case RCCdigit: return isdigit(next) != 0;
  104. case RCCprint: return isprint(next) != 0;
  105. case RCCblank: return (next == ' ' || next == '\t'); // return isblank(next);
  106. case RCCgraph: return isgraph(next) != 0;
  107. case RCCpunct: return ispunct(next) != 0;
  108. case RCCxdigit: return isxdigit(next) != 0;
  109. case RCCany: return true;
  110. default:
  111. UNIMPLEMENTED;
  112. }
  113. }
  114. bool isUnicodeMatch(unsigned code, unsigned next)
  115. {
  116. #ifdef _USE_ICU
  117. switch (code)
  118. {
  119. case RCCalnum: return u_isalnum(next) != 0;
  120. case RCCcntrl: return u_iscntrl(next) != 0;
  121. case RCClower: return u_islower(next) != 0;
  122. case RCCupper: return u_isupper(next) != 0;
  123. case RCCspace: return u_isWhitespace(next) != 0;
  124. case RCCalpha: return u_isalpha(next) != 0;
  125. case RCCdigit: return u_isdigit(next) != 0;
  126. case RCCprint: return u_isprint(next) != 0;
  127. case RCCblank: return u_isspace(next) != 0 || (next == '\t');
  128. case RCCgraph: return u_isprint(next) && !u_isspace(next);
  129. case RCCpunct: return u_isprint(next) && !(u_isspace(next) || u_isalnum(next));
  130. case RCCxdigit: return (next < 128) && isxdigit(next); // should be good enough.
  131. case RCCany: return true;
  132. default:
  133. UNIMPLEMENTED;
  134. }
  135. #else
  136. rtlThrowNoUnicode();
  137. #endif
  138. }
  139. //Anyone got some better code - this works, but isn't wonderfully efficient.
  140. //works by accessing esp[-4096] and seeing if it dies.
  141. inline void checkStackOverflow(unsigned offset)
  142. {
  143. try
  144. {
  145. doStackProbe();
  146. }
  147. catch (...)
  148. {
  149. throwError1(REGEXERR_MatchFailure, offset);
  150. }
  151. }
  152. //---------------------------------------------------------------------------
  153. void serializeLink(MemoryBuffer & out, RegexPattern * pattern, RegexSerializeState & state)
  154. {
  155. unsigned match = state.patterns.find(*pattern);
  156. assertex(match != NotFound);
  157. out.append(match);
  158. }
  159. RegexPattern * deserializeLink(MemoryBuffer & in, RegexSerializeState & state)
  160. {
  161. unsigned match;
  162. in.read(match);
  163. return LINK(&state.patterns.item(match));
  164. }
  165. void serializeName(MemoryBuffer & out, RegexNamed * pattern, RegexSerializeState & state)
  166. {
  167. unsigned match = state.named.find(*pattern);
  168. assertex(match != NotFound);
  169. out.append(match);
  170. }
  171. RegexNamed * deserializeName(MemoryBuffer & in, RegexSerializeState & state)
  172. {
  173. unsigned match;
  174. in.read(match);
  175. return LINK(&state.named.item(match));
  176. }
  177. MemoryBuffer & serializeKind(MemoryBuffer & out, ThorRegexKind kind)
  178. {
  179. return out.append((byte)kind);
  180. }
  181. void deserialize(MemoryBuffer & in, IAtom * & name)
  182. {
  183. StringAttr x;
  184. in.read(x);
  185. name = createAtom(x);
  186. }
  187. //---------------------------------------------------------------------------
  188. static MatchState * findTrailingSeparator(MatchState & matched)
  189. {
  190. MatchState * child = matched.firstChild;
  191. while (child)
  192. {
  193. //find the last child.
  194. MatchState * next = child->next;
  195. while (next)
  196. {
  197. child = next;
  198. next = child->next;
  199. }
  200. //If the last child is a separator that matches the last offset of the matched token, adjust the end
  201. if (child->queryName() == separatorTagAtom)
  202. return child;
  203. child = child->firstChild;
  204. }
  205. return NULL;
  206. }
  207. static void removeTrailingSeparator(MatchState & matched)
  208. {
  209. MatchState * child = findTrailingSeparator(matched);
  210. if (child && child->end == matched.end)
  211. matched.end = child->start;
  212. }
  213. MatchState * RegexMatchSearchInstance::find(MatchState * top, const NlpMatchPath & path, unsigned depth)
  214. {
  215. regexid_t id = path.getId(depth);
  216. do
  217. {
  218. if (top->queryID() == id)
  219. {
  220. bool matchAny = path.matchAny(depth);
  221. if (matchAny || (nextIndex == 1))
  222. {
  223. if (depth+1 == path.numItems())
  224. {
  225. lastExactMatchDepth = depth+1;
  226. return top;
  227. }
  228. if (!matchAny)
  229. {
  230. lastExactMatchDepth = depth+1;
  231. nextIndex = path.nextExactMatchIndex(depth+1);
  232. }
  233. MatchState * ret = NULL;
  234. unsigned prevExactMatchDepth = lastExactMatchDepth;
  235. if (top->firstChild)
  236. ret = find(top->firstChild, path, depth+1);
  237. //If must match a child, or one of children had a required match then we have a result
  238. if (!matchAny || (prevExactMatchDepth != lastExactMatchDepth))
  239. return ret;
  240. }
  241. else
  242. nextIndex--;
  243. }
  244. else
  245. {
  246. if (top->firstChild)
  247. {
  248. unsigned prevExactMatchDepth = lastExactMatchDepth;
  249. MatchState * ret = find(top->firstChild, path, depth);
  250. //return if matched another level - may have failed to match, or matched completely
  251. if (prevExactMatchDepth != lastExactMatchDepth)
  252. return ret;
  253. }
  254. }
  255. top = top->next;
  256. } while (top);
  257. return NULL;
  258. }
  259. IMatchedElement * RegexMatchPath::getMatch(MatchState * top, bool removeTrailingSep) const
  260. {
  261. RegexMatchSearchInstance search;
  262. search.lastExactMatchDepth = 0;
  263. search.nextIndex = nextExactMatchIndex(0);
  264. MatchState * state = search.find(top, *this, 0);
  265. if (!state)
  266. return NULL;
  267. if (removeTrailingSep)
  268. removeTrailingSeparator(*state);
  269. return new CMatchedElement(state);
  270. }
  271. //---------------------------------------------------------------------------
  272. void CRegexMatchedResults::extractResults(MatchState & top, const byte * _in, bool removeTrailingSeparator)
  273. {
  274. in = _in;
  275. notMatched.ptr = in;
  276. ForEachItemIn(idx, def->matchResults)
  277. {
  278. ::Release(matched[idx]);
  279. matched[idx] = ((RegexMatchPath&)def->matchResults.item(idx)).getMatch(&top, removeTrailingSeparator);
  280. if (!matched[idx]) matched[idx] = LINK(&notMatched);
  281. }
  282. }
  283. //---------------------------------------------------------------------------
  284. static const char * stateText[] = { "Init", "Follow", "Retry", "Finished", "Repeat", "FollowOnly", "RepeatOnly", "DoneRepeat" };
  285. #ifdef TRACE_REGEX
  286. ActiveStage & ActiveStage::setState(unsigned _state)
  287. {
  288. DBGLOG("%*s[%p]Change state to %s", patternDepth, "", pattern, stateText[_state]);
  289. state = _state;
  290. return *this;
  291. }
  292. #endif
  293. //---------------------------------------------------------------------------
  294. static void disposeLink(Owned<RegexPattern> & pattern)
  295. {
  296. if (pattern)
  297. {
  298. RegexPattern * cur = pattern.getClear();
  299. cur->dispose();
  300. cur->Release();
  301. }
  302. }
  303. static void disposeName(Owned<RegexNamed> & named)
  304. {
  305. if (named)
  306. {
  307. RegexNamed * cur = named.getClear();
  308. cur->dispose();
  309. cur->Release();
  310. }
  311. }
  312. RegexPattern::RegexPattern()
  313. {
  314. gathered = false;
  315. }
  316. void RegexPattern::dispose()
  317. {
  318. while (next.ordinality())
  319. {
  320. RegexPattern & cur = next.popGet();
  321. cur.dispose();
  322. cur.Release();
  323. }
  324. }
  325. bool RegexPattern::gather(RegexSerializeState & state)
  326. {
  327. if (gathered)
  328. return false;
  329. gathered = true;
  330. state.patterns.append(OLINK(*this));
  331. gatherNext(state);
  332. return true;
  333. }
  334. void RegexPattern::gatherNext(RegexSerializeState & state)
  335. {
  336. ForEachItemIn(idx, next)
  337. next.item(idx).gather(state);
  338. }
  339. void RegexPattern::serializePattern(MemoryBuffer & out)
  340. {
  341. serializeKind(out, getKind());
  342. }
  343. void RegexPattern::serializeLinks(MemoryBuffer & out, RegexSerializeState & state)
  344. {
  345. out.append((unsigned)next.ordinality());
  346. ForEachItemIn(idx, next)
  347. serializeLink(out, &next.item(idx), state);
  348. }
  349. void RegexPattern::deserializePattern(MemoryBuffer & in)
  350. {
  351. }
  352. void RegexPattern::deserializeLinks(MemoryBuffer & in, RegexSerializeState & state)
  353. {
  354. unsigned nextCount;
  355. in.read(nextCount);
  356. for (unsigned idx=0; idx < nextCount; idx++)
  357. next.append(*deserializeLink(in, state));
  358. }
  359. RegexMatchAction RegexPattern::matchNext(RegexState & state)
  360. {
  361. unsigned num = next.ordinality();
  362. assertex(num);
  363. if (num == 1)
  364. return next.item(0).MATCH(state);
  365. const byte * saved = state.cur;
  366. for (unsigned idx = 0; idx < num; idx++)
  367. {
  368. RegexMatchAction ret = next.item(idx).MATCH(state);
  369. if (ret != RegexMatchBacktrack)
  370. return ret;
  371. state.cur = saved;
  372. }
  373. return RegexMatchBacktrack;
  374. }
  375. inline RegexMatchAction RegexPattern::pushMatched(RegexState & state)
  376. {
  377. pushStage(state).setMatched();
  378. return RegexMatchContinue;
  379. }
  380. inline RegexMatchAction RegexPattern::markFinishContinueMatch(RegexState & state)
  381. {
  382. MatchSaveState saved;
  383. state.markFinish(saved);
  384. RegexMatchAction ret = matchNext(state);
  385. state.unmarkFinish(saved);
  386. return ret;
  387. }
  388. ActiveStage & RegexPattern::pushStageBeginMatch(RegexState & state, RegexMatchStateSave * matched)
  389. {
  390. ActiveStage & stage = pushStage(state);
  391. stage.setMatched();
  392. stage.extra.matched = matched;
  393. state.pushMatch(*matched, matched->save);
  394. #ifdef TRACE_REGEX
  395. DBGLOG("%*s[%p]Push Begin Match", patternDepth, "", stage.pattern);
  396. #endif
  397. return stage;
  398. }
  399. void RegexPattern::cleanupBeginMatch(ActiveStage & stage, RegexState & state)
  400. {
  401. #ifdef TRACE_REGEX
  402. DBGLOG("%*s[%p]Pop Begin Match", patternDepth, "", stage.pattern);
  403. #endif
  404. state.popMatch(stage.extra.matched->save);
  405. state.cache.destroyStateSave(stage.extra.matched);
  406. stage.extra.matched = NULL;
  407. }
  408. RegexMatchAction RegexPattern::pushStageEndMatch(RegexState & state)
  409. {
  410. ActiveStage & stage = pushStage(state);
  411. stage.setMatched();
  412. state.markFinish(stage.extra.saved);
  413. #ifdef TRACE_REGEX
  414. DBGLOG("%*s[%p]Push End Match", patternDepth, "", stage.pattern);
  415. #endif
  416. return RegexMatchContinue;
  417. }
  418. void RegexPattern::cleanupEndMatch(ActiveStage & stage, RegexState & state)
  419. {
  420. #ifdef TRACE_REGEX
  421. DBGLOG("%*s[%p]Pop End Match", patternDepth, "", stage.pattern);
  422. #endif
  423. state.unmarkFinish(stage.extra.saved);
  424. }
  425. RegexMatchAction RegexPattern::traceMatch(RegexState & state)
  426. {
  427. static unsigned i;
  428. StringBuffer s,t;
  429. unsigned len = (size32_t)(state.end-state.cur);
  430. if (len > 10) len = 10;
  431. getTraceText(t);
  432. s.appendN(i++, ' ').appendf("Begin %s [%p] >%.*s<", t.str(), this, len, state.cur);
  433. DBGLOG("%s", s.str());
  434. RegexMatchAction ret = match(state);
  435. s.clear().appendN(--i, ' ').appendf("End %s [%p] = %s", t.str(), this, resultText[ret]);
  436. DBGLOG("%s", s.str());
  437. return ret;
  438. }
  439. void RegexPattern::getTraceText(StringBuffer & s)
  440. {
  441. s.append(getPatternText(getKind()));
  442. }
  443. void RegexPattern::toXML(StringBuffer & out, RegexXmlState & state)
  444. {
  445. out.appendf("\t<expr id=\"%d\"", (unsigned)state.patterns.find(*this));
  446. toXMLattr(out,state);
  447. if (next.ordinality())
  448. {
  449. out.append(">").newline();
  450. ForEachItemIn(idx, next)
  451. {
  452. out.appendf("\t\t<next id=\"%d\"/>", (unsigned)state.patterns.find(next.item(idx))).newline();
  453. }
  454. out.append("\t</expr>").newline();
  455. ForEachItemInRev(idx2, next)
  456. state.addLink(next.item(idx2));
  457. }
  458. else
  459. out.append("/>").newline();
  460. }
  461. void RegexPattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  462. {
  463. out.appendf(" kind=\"%s\"", getPatternText(getKind()));
  464. }
  465. void RegexPattern::killStage(ActiveStage & stage, RegexState & state)
  466. {
  467. }
  468. RegexMatchAction RegexPattern::nextChild(ActiveStage & stage, RegexState & state)
  469. {
  470. if (next.isItem(stage.nextFollow))
  471. {
  472. RegexPattern & child = next.item(stage.nextFollow);
  473. stage.nextFollow++;
  474. state.cur = stage.followPosition;
  475. return child.beginMatch(state);
  476. }
  477. stage.setState(RSretry);
  478. return RegexMatchContinue;
  479. }
  480. RegexMatchAction RegexPattern::nextAction(ActiveStage & stage, RegexState & state)
  481. {
  482. switch (stage.getState())
  483. {
  484. case RSinit:
  485. throwUnexpected();
  486. case RSnextfollow:
  487. return nextChild(stage, state);
  488. case RSfollowonly:
  489. {
  490. RegexMatchAction action = nextChild(stage, state);
  491. if (stage.getState() == RSretry)
  492. stage.setState(RSfinished);
  493. return action;
  494. }
  495. case RSretry:
  496. case RSfinished:
  497. state.popStage();
  498. return RegexMatchBacktrack;
  499. }
  500. UNIMPLEMENTED;
  501. }
  502. ActiveStage & RegexPattern::pushStage(RegexState & state)
  503. {
  504. #ifdef TRACE_REGEX
  505. StringBuffer t;
  506. unsigned len = state.end-state.cur;
  507. if (len > 10) len = 10;
  508. getTraceText(t);
  509. DBGLOG("%*s[%p]Push %s >%.*s<", patternDepth++, "", this, t.str(), len, state.cur);
  510. #endif
  511. ActiveStage & cur = state.pushStage();
  512. cur.pattern = this;
  513. cur.followPosition = state.cur;
  514. cur.state = RSinit;
  515. cur.flags = 0;
  516. return cur;
  517. }
  518. //---------------------------------------------------------------------------
  519. RegexMatchAction RegexNullPattern::match(RegexState & state)
  520. {
  521. return matchNext(state);
  522. }
  523. RegexMatchAction RegexNullPattern::beginMatch(RegexState & state)
  524. {
  525. return pushMatched(state);
  526. }
  527. RegexMatchAction RegexBeginTokenPattern::match(RegexState & state)
  528. {
  529. RegexMatchAction ret = matchNext(state);
  530. if (ret == RegexMatchBacktrackToken)
  531. return RegexMatchBacktrack;
  532. return ret;
  533. }
  534. RegexMatchAction RegexBeginTokenPattern::beginMatch(RegexState & state)
  535. {
  536. ActiveStage & stage = pushStage(state).setMatched();
  537. stage.flags |= RFbeginToken;
  538. return RegexMatchContinue;
  539. }
  540. RegexMatchAction RegexEndTokenPattern::match(RegexState & state)
  541. {
  542. RegexMatchAction ret = matchNext(state);
  543. if (ret == RegexMatchBacktrack)
  544. return RegexMatchBacktrackToken;
  545. return ret;
  546. }
  547. RegexMatchAction RegexEndTokenPattern::beginMatch(RegexState & state)
  548. {
  549. return pushMatched(state);
  550. }
  551. RegexMatchAction RegexEndTokenPattern::nextAction(ActiveStage & stage, RegexState & state)
  552. {
  553. switch (stage.getState())
  554. {
  555. case RSretry:
  556. case RSfinished:
  557. state.popStage();
  558. return RegexMatchBacktrackToken;
  559. default:
  560. return RegexPattern::nextAction(stage, state);
  561. }
  562. }
  563. RegexMatchAction RegexStartPattern::match(RegexState & state)
  564. {
  565. if (state.cur == state.start)
  566. return matchNext(state);
  567. return RegexMatchBacktrack;
  568. }
  569. RegexMatchAction RegexStartPattern::beginMatch(RegexState & state)
  570. {
  571. if (state.cur == state.start)
  572. return pushMatched(state);
  573. return RegexMatchBacktrack;
  574. }
  575. RegexMatchAction RegexFinishPattern::match(RegexState & state)
  576. {
  577. if (state.cur == state.end)
  578. return matchNext(state);
  579. return RegexMatchBacktrack;
  580. }
  581. RegexMatchAction RegexFinishPattern::beginMatch(RegexState & state)
  582. {
  583. if (state.cur == state.end)
  584. return pushMatched(state);
  585. return RegexMatchBacktrack;
  586. }
  587. RegexMatchAction RegexAnyCharPattern::match(RegexState & state)
  588. {
  589. if (state.cur != state.end)
  590. {
  591. state.cur += state.charSize;
  592. return matchNext(state);
  593. }
  594. return RegexMatchBacktrack;
  595. }
  596. RegexMatchAction RegexAnyCharPattern::beginMatch(RegexState & state)
  597. {
  598. if (state.cur != state.end)
  599. {
  600. state.cur += state.charSize;
  601. return pushMatched(state);
  602. }
  603. return RegexMatchBacktrack;
  604. }
  605. //---------------------------------------------------------------------------
  606. RegexMatchAction RegexPenaltyPattern::match(RegexState & state)
  607. {
  608. state.score -= penalty;
  609. RegexMatchAction ret = matchNext(state);
  610. state.score += penalty;
  611. return ret;
  612. }
  613. RegexMatchAction RegexPenaltyPattern::beginMatch(RegexState & state)
  614. {
  615. state.score -= penalty;
  616. return pushMatched(state);
  617. }
  618. void RegexPenaltyPattern::killStage(ActiveStage & stage, RegexState & state)
  619. {
  620. state.score += penalty;
  621. }
  622. void RegexPenaltyPattern::serializePattern(MemoryBuffer & out)
  623. {
  624. RegexPattern::serializePattern(out);
  625. out.append(penalty);
  626. }
  627. void RegexPenaltyPattern::deserializePattern(MemoryBuffer & in)
  628. {
  629. RegexPattern::deserializePattern(in);
  630. in.read(penalty);
  631. }
  632. void RegexPenaltyPattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  633. {
  634. RegexPattern::toXMLattr(out, state);
  635. out.append(" penalty=\"").append(penalty).append("\"");
  636. }
  637. //---------------------------------------------------------------------------
  638. void RegexAsciiBasePattern::serializePattern(MemoryBuffer & out)
  639. {
  640. RegexPattern::serializePattern(out);
  641. out.append(text);
  642. }
  643. void RegexAsciiBasePattern::deserializePattern(MemoryBuffer & in)
  644. {
  645. RegexPattern::deserializePattern(in);
  646. in.read(text);
  647. len = (size32_t)strlen(text);
  648. }
  649. void RegexAsciiBasePattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  650. {
  651. RegexPattern::toXMLattr(out, state);
  652. out.append(" text=\"");
  653. encodeXML(text, out, ENCODE_WHITESPACE);
  654. out.append("\"");
  655. }
  656. inline bool RegexAsciiPattern::doMatch(RegexState & state)
  657. {
  658. const byte * cur = state.cur;
  659. const byte * end = state.end;
  660. if (cur + len <= end)
  661. {
  662. if (memcmp(cur, text.get(), len) == 0)
  663. {
  664. state.cur = cur + len;
  665. return true;
  666. }
  667. }
  668. return false;
  669. }
  670. RegexMatchAction RegexAsciiPattern::match(RegexState & state)
  671. {
  672. if (doMatch(state))
  673. return matchNext(state);
  674. return RegexMatchBacktrack;
  675. }
  676. RegexMatchAction RegexAsciiPattern::beginMatch(RegexState & state)
  677. {
  678. if (doMatch(state))
  679. return pushMatched(state);
  680. return RegexMatchBacktrack;
  681. }
  682. inline bool RegexAsciiIPattern::doMatch(RegexState & state)
  683. {
  684. const byte * cur = state.cur;
  685. const byte * end = state.end;
  686. if (cur + len <= end)
  687. {
  688. if (memicmp(cur, text.get(), len) == 0)
  689. {
  690. state.cur = cur + len;
  691. return true;
  692. }
  693. }
  694. return false;
  695. }
  696. RegexMatchAction RegexAsciiIPattern::match(RegexState & state)
  697. {
  698. if (doMatch(state))
  699. return matchNext(state);
  700. return RegexMatchBacktrack;
  701. }
  702. RegexMatchAction RegexAsciiIPattern::beginMatch(RegexState & state)
  703. {
  704. if (doMatch(state))
  705. return pushMatched(state);
  706. return RegexMatchBacktrack;
  707. }
  708. //---------------------------------------------------------------------------
  709. RegexMatchAction RegexRecursivePattern::match(RegexState & state)
  710. {
  711. if ((positionStack.ordinality() == 0) || (positionStack.tos() != state.cur))
  712. {
  713. positionStack.append(state.cur);
  714. state.stack.append(*this);
  715. RegexMatchAction ret = matchNext(state);
  716. state.stack.pop();
  717. positionStack.pop();
  718. return ret;
  719. }
  720. else
  721. throw MakeStringException(0, "Grammar is left recursive - cannot yet process!");
  722. return RegexMatchBacktrack;
  723. }
  724. RegexMatchAction RegexRecursivePattern::beginMatch(RegexState & state)
  725. {
  726. //MORE!
  727. return pushMatched(state);
  728. }
  729. RegexMatchAction RegexEndRecursivePattern::match(RegexState & state)
  730. {
  731. RegexRecursivePattern & top = (RegexRecursivePattern &)state.stack.popGet();
  732. const void * saved = top.positionStack.popGet();
  733. RegexMatchAction ret = matchNext(state);
  734. top.positionStack.append(saved);
  735. state.stack.append(top);
  736. return ret;
  737. }
  738. RegexMatchAction RegexEndRecursivePattern::beginMatch(RegexState & state)
  739. {
  740. //MORE!
  741. return pushMatched(state);
  742. }
  743. void encodeUnicode(StringBuffer & out, size32_t len, const UChar * text)
  744. {
  745. #ifdef _USE_ICU
  746. UnicodeString unicode(text, len);
  747. unsigned len8 = unicode.extract(0, unicode.length(), 0, 0, "UTF-8");
  748. char * text8 = (char *)malloc(len8);
  749. unicode.extract(0, unicode.length(), text8, len8, "UTF-8");
  750. encodeXML(text8, out, ENCODE_WHITESPACE, len8, true);
  751. free(text8);
  752. #else
  753. rtlThrowNoUnicode();
  754. #endif
  755. }
  756. //---------------------------------------------------------------------------
  757. #ifdef _USE_ICU
  758. void RegexUnicodePattern::serializePattern(MemoryBuffer & out)
  759. {
  760. RegexPattern::serializePattern(out);
  761. ::serialize(out, text);
  762. }
  763. void RegexUnicodePattern::deserializePattern(MemoryBuffer & in)
  764. {
  765. RegexPattern::deserializePattern(in);
  766. ::deserialize(in, text);
  767. }
  768. void RegexUnicodePattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  769. {
  770. RegexPattern::toXMLattr(out, state);
  771. out.appendf(" text=\"");
  772. encodeUnicode(out, (size32_t)(text.length()/2), (const UChar *)text.get());
  773. out.append("\"");
  774. }
  775. inline bool RegexUnicodePattern::doMatch(RegexState & state)
  776. {
  777. const byte * cur = state.cur;
  778. const byte * end = state.end;
  779. size32_t len = (size32_t)text.length();
  780. if (cur + len <= end)
  781. {
  782. if (memcmp(cur, text.get(), len) == 0)
  783. {
  784. state.cur = cur + len;
  785. return true;
  786. }
  787. }
  788. return false;
  789. }
  790. RegexMatchAction RegexUnicodePattern::match(RegexState & state)
  791. {
  792. if (doMatch(state))
  793. return matchNext(state);
  794. return RegexMatchBacktrack;
  795. }
  796. RegexMatchAction RegexUnicodePattern::beginMatch(RegexState & state)
  797. {
  798. if (doMatch(state))
  799. return pushMatched(state);
  800. return RegexMatchBacktrack;
  801. }
  802. RegexUnicodeIPattern::RegexUnicodeIPattern(unsigned _len, const UChar * _text)
  803. {
  804. UChar * curLower = (UChar *)lower.allocate(_len*2);
  805. UChar * curUpper = (UChar *)upper.allocate(_len*2);
  806. for (unsigned i = 0; i< _len; i++)
  807. {
  808. curLower[i] = (UChar)u_tolower(_text[i]);
  809. curUpper[i] = (UChar)u_toupper(_text[i]);
  810. }
  811. }
  812. void RegexUnicodeIPattern::serializePattern(MemoryBuffer & out)
  813. {
  814. RegexPattern::serializePattern(out);
  815. ::serialize(out, lower);
  816. ::serialize(out, upper);
  817. }
  818. void RegexUnicodeIPattern::deserializePattern(MemoryBuffer & in)
  819. {
  820. RegexPattern::deserializePattern(in);
  821. ::deserialize(in, lower);
  822. ::deserialize(in, upper);
  823. }
  824. void RegexUnicodeIPattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  825. {
  826. RegexPattern::toXMLattr(out, state);
  827. out.appendf(" text=\"");
  828. encodeUnicode(out, (size32_t)(lower.length()/2), (const UChar *)lower.get());
  829. out.append("\"");
  830. }
  831. inline bool RegexUnicodeIPattern::doMatch(RegexState & state)
  832. {
  833. const byte * start = state.cur;
  834. const byte * end = state.end;
  835. size32_t size = (size32_t)lower.length();
  836. if (start + size <= end)
  837. {
  838. unsigned i;
  839. unsigned len = size/sizeof(UChar);
  840. const UChar * cur = (const UChar *)state.cur;
  841. const UChar * curLower = (const UChar *)lower.get();
  842. const UChar * curUpper = (const UChar *)upper.get();
  843. for (i = 0; i < len; i++)
  844. {
  845. UChar next = cur[i];
  846. if ((next != curLower[i]) && (next != curUpper[i]))
  847. return false;
  848. }
  849. state.cur += size;
  850. return true;
  851. }
  852. return false;
  853. }
  854. RegexMatchAction RegexUnicodeIPattern::match(RegexState & state)
  855. {
  856. if (doMatch(state))
  857. return matchNext(state);
  858. return RegexMatchBacktrack;
  859. }
  860. RegexMatchAction RegexUnicodeIPattern::beginMatch(RegexState & state)
  861. {
  862. if (doMatch(state))
  863. return pushMatched(state);
  864. return RegexMatchBacktrack;
  865. }
  866. RegexUnicodeFIPattern::RegexUnicodeFIPattern(unsigned _len, const UChar * _text)
  867. {
  868. MemoryBuffer foldedBuffer;
  869. UChar temp[2];
  870. UErrorCode error = U_ZERO_ERROR;
  871. for (unsigned i = 0; i< _len; i++)
  872. {
  873. unsigned foldLen = u_strFoldCase(temp, _elements_in(temp), _text+i, 1, U_FOLD_CASE_DEFAULT, &error);
  874. foldedBuffer.append(foldLen, &temp);
  875. }
  876. folded.set(foldedBuffer.length(), foldedBuffer.toByteArray());
  877. }
  878. bool RegexUnicodeFIPattern::doMatch(RegexState & state)
  879. {
  880. const UChar * cur = (const UChar *)state.cur;
  881. const UChar * end = (const UChar *)state.end;
  882. UChar temp[2];
  883. UErrorCode error = U_ZERO_ERROR;
  884. const UChar * curMatchText = (const UChar *)folded.get();
  885. const UChar * endMatchText = (const UChar *)((const byte *)folded.get() + folded.length());
  886. while (cur != end)
  887. {
  888. unsigned foldLen = u_strFoldCase(temp, _elements_in(temp), cur, 1, U_FOLD_CASE_DEFAULT, &error);
  889. if (temp[0] != *curMatchText)
  890. return false;
  891. assertex(foldLen == 1 || foldLen == 2);
  892. if (foldLen == 2)
  893. {
  894. if (curMatchText + foldLen > endMatchText)
  895. return false;
  896. if (temp[1] != curMatchText[1])
  897. return false;
  898. }
  899. cur += 1;
  900. curMatchText += foldLen;
  901. if (curMatchText == endMatchText)
  902. {
  903. state.cur = (const byte *)cur;
  904. return true;
  905. }
  906. }
  907. return false;
  908. }
  909. RegexMatchAction RegexUnicodeFIPattern::match(RegexState & state)
  910. {
  911. if (doMatch(state))
  912. return matchNext(state);
  913. return RegexMatchBacktrack;
  914. }
  915. RegexMatchAction RegexUnicodeFIPattern::beginMatch(RegexState & state)
  916. {
  917. if (doMatch(state))
  918. return pushMatched(state);
  919. return RegexMatchBacktrack;
  920. }
  921. //---------------------------------------------------------------------------
  922. void RegexUtf8Pattern::serializePattern(MemoryBuffer & out)
  923. {
  924. RegexPattern::serializePattern(out);
  925. ::serialize(out, text);
  926. }
  927. void RegexUtf8Pattern::deserializePattern(MemoryBuffer & in)
  928. {
  929. RegexPattern::deserializePattern(in);
  930. ::deserialize(in, text);
  931. }
  932. void RegexUtf8Pattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  933. {
  934. RegexPattern::toXMLattr(out, state);
  935. out.appendf(" text=\"");
  936. encodeXML((const char *)text.get(), out, ENCODE_WHITESPACE, (size32_t)text.length(), true);
  937. out.append("\"");
  938. }
  939. inline bool RegexUtf8Pattern::doMatch(RegexState & state)
  940. {
  941. const byte * cur = state.cur;
  942. const byte * end = state.end;
  943. size32_t size = (size32_t)text.length();
  944. if (cur + size <= end)
  945. {
  946. if (memcmp(cur, text.get(), size) == 0)
  947. {
  948. state.cur = cur + size;
  949. return true;
  950. }
  951. }
  952. return false;
  953. }
  954. RegexMatchAction RegexUtf8Pattern::match(RegexState & state)
  955. {
  956. if (doMatch(state))
  957. return matchNext(state);
  958. return RegexMatchBacktrack;
  959. }
  960. RegexMatchAction RegexUtf8Pattern::beginMatch(RegexState & state)
  961. {
  962. if (doMatch(state))
  963. return pushMatched(state);
  964. return RegexMatchBacktrack;
  965. }
  966. //---------------------------------------------------------------------------
  967. RegexUtf8IPattern::RegexUtf8IPattern(unsigned _len, const char * _text)
  968. {
  969. //Store unicode lowercase and uppercase versions, and compare the incoming utf a character at a time.
  970. UChar * curLower = (UChar *)lower.allocate(_len*sizeof(UChar));
  971. UChar * curUpper = (UChar *)upper.allocate(_len*sizeof(UChar));
  972. const byte * cur = (const byte *)_text;
  973. const byte * end = cur + rtlUtf8Size(_len, cur);
  974. for (unsigned i = 0; i< _len; i++)
  975. {
  976. UChar next = readUtf8Character((size32_t)(end-cur), cur);
  977. curLower[i] = (UChar)u_tolower(next);
  978. curUpper[i] = (UChar)u_toupper(next);
  979. }
  980. }
  981. void RegexUtf8IPattern::serializePattern(MemoryBuffer & out)
  982. {
  983. RegexPattern::serializePattern(out);
  984. ::serialize(out, lower);
  985. ::serialize(out, upper);
  986. }
  987. void RegexUtf8IPattern::deserializePattern(MemoryBuffer & in)
  988. {
  989. RegexPattern::deserializePattern(in);
  990. ::deserialize(in, lower);
  991. ::deserialize(in, upper);
  992. }
  993. void RegexUtf8IPattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  994. {
  995. RegexPattern::toXMLattr(out, state);
  996. out.appendf(" text=\"");
  997. encodeUnicode(out, (size32_t)(lower.length()/2), (const UChar *)lower.get());
  998. out.append("\"");
  999. }
  1000. inline bool RegexUtf8IPattern::doMatch(RegexState & state)
  1001. {
  1002. const byte * end = state.end;
  1003. size32_t size = (size32_t)lower.length();
  1004. size32_t len = size/(size32_t)sizeof(UChar);
  1005. const byte * cur = state.cur;
  1006. const UChar * curLower = (const UChar *)lower.get();
  1007. const UChar * curUpper = (const UChar *)upper.get();
  1008. for (unsigned i = 0; i < len; i++)
  1009. {
  1010. if (cur == end)
  1011. return false;
  1012. UChar next = readUtf8Character((size32_t)(end-cur), cur);
  1013. if ((next != curLower[i]) && (next != curUpper[i]))
  1014. return false;
  1015. }
  1016. state.cur = cur;
  1017. return true;
  1018. }
  1019. RegexMatchAction RegexUtf8IPattern::match(RegexState & state)
  1020. {
  1021. if (doMatch(state))
  1022. return matchNext(state);
  1023. return RegexMatchBacktrack;
  1024. }
  1025. RegexMatchAction RegexUtf8IPattern::beginMatch(RegexState & state)
  1026. {
  1027. if (doMatch(state))
  1028. return pushMatched(state);
  1029. return RegexMatchBacktrack;
  1030. }
  1031. #endif
  1032. //---------------------------------------------------------------------------
  1033. RegexAsciiSetBasePattern::RegexAsciiSetBasePattern()
  1034. {
  1035. inverted = false;
  1036. for (unsigned i = 0; i < 256; i++)
  1037. include[i] = inverted;
  1038. }
  1039. void RegexAsciiSetBasePattern::setInvert(bool newValue)
  1040. {
  1041. //can call before or after processing the list.
  1042. if (inverted != newValue)
  1043. {
  1044. for (unsigned i = 0; i < 256; i++)
  1045. include[i] ^= true;
  1046. inverted = newValue;
  1047. }
  1048. }
  1049. void RegexAsciiSetBasePattern::serializePattern(MemoryBuffer & out)
  1050. {
  1051. RegexPattern::serializePattern(out);
  1052. serializeBoolArray(out, 256, include);
  1053. out.append(inverted);
  1054. }
  1055. void RegexAsciiSetBasePattern::deserializePattern(MemoryBuffer & in)
  1056. {
  1057. RegexPattern::deserializePattern(in);
  1058. deserializeBoolArray(256, include, in);
  1059. in.read(inverted);
  1060. }
  1061. inline bool RegexAsciiSetBasePattern::doMatch(RegexState & state)
  1062. {
  1063. const byte * cur = state.cur;
  1064. const byte * end = state.end;
  1065. if (cur < end)
  1066. {
  1067. if (include[*cur])
  1068. {
  1069. state.cur = cur + 1;
  1070. return true;
  1071. }
  1072. }
  1073. return false;
  1074. }
  1075. RegexMatchAction RegexAsciiSetBasePattern::match(RegexState & state)
  1076. {
  1077. if (doMatch(state))
  1078. return matchNext(state);
  1079. return RegexMatchBacktrack;
  1080. }
  1081. RegexMatchAction RegexAsciiSetBasePattern::beginMatch(RegexState & state)
  1082. {
  1083. if (doMatch(state))
  1084. return pushMatched(state);
  1085. return RegexMatchBacktrack;
  1086. }
  1087. void RegexAsciiSetBasePattern::addSpecialRange(unsigned value)
  1088. {
  1089. for (unsigned i = 0; i < 256; i++)
  1090. {
  1091. if (isAsciiMatch(value, i))
  1092. include[i] = !inverted;
  1093. }
  1094. }
  1095. void RegexAsciiSetBasePattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  1096. {
  1097. RegexSetBasePattern::toXMLattr(out, state);
  1098. out.append(" set=\"");
  1099. if (inverted)
  1100. out.append("^");
  1101. for (unsigned i = 0; i < 256; i++)
  1102. {
  1103. if (include[i] != inverted)
  1104. {
  1105. char c = i;
  1106. encodeXML(&c, out, ENCODE_WHITESPACE, 1);
  1107. }
  1108. }
  1109. out.append("\"");
  1110. }
  1111. void RegexAsciiSetPattern::addRange(unsigned low, unsigned high)
  1112. {
  1113. if (low & RegexSpecialMask)
  1114. {
  1115. assertex(low == high);
  1116. addSpecialRange(low);
  1117. }
  1118. else
  1119. {
  1120. assertex(low < 256 && high < 256);
  1121. for (unsigned i = low; i <= high; i++)
  1122. include[i] = !inverted;
  1123. }
  1124. }
  1125. void RegexAsciiISetPattern::addRange(unsigned low, unsigned high)
  1126. {
  1127. if (low & RegexSpecialMask)
  1128. {
  1129. assertex(low == high);
  1130. addSpecialRange(low);
  1131. }
  1132. else
  1133. {
  1134. assertex(low < 256 && high < 256);
  1135. for (unsigned i = low; i <= high; i++)
  1136. {
  1137. include[tolower(i)] = !inverted;
  1138. include[toupper(i)] = !inverted;
  1139. }
  1140. }
  1141. }
  1142. //---------------------------------------------------------------------------
  1143. #ifdef _USE_ICU
  1144. RegexUnicodeSetPattern::RegexUnicodeSetPattern(bool _caseSensitive)
  1145. {
  1146. inverted = false;
  1147. caseSensitive = _caseSensitive;
  1148. }
  1149. void RegexUnicodeSetPattern::addRange(unsigned low, unsigned high)
  1150. {
  1151. if (!caseSensitive)
  1152. {
  1153. //This isn't really good enough - probably need to use string for the high and low values to allow collation
  1154. //comparison, and to allow SS to match correctly.
  1155. if (low & RegexSpecialMask)
  1156. {
  1157. if ((low == RCClower) || (low == RCCupper))
  1158. low = RCCalpha;
  1159. }
  1160. else
  1161. {
  1162. low = u_foldCase(low, U_FOLD_CASE_DEFAULT);
  1163. high = u_foldCase(high, U_FOLD_CASE_DEFAULT);
  1164. }
  1165. }
  1166. from.append(low);
  1167. to.append(high);
  1168. }
  1169. void RegexUnicodeSetPattern::setInvert(bool value)
  1170. {
  1171. inverted = value;
  1172. }
  1173. bool RegexUnicodeSetPattern::doMatch(RegexState & state)
  1174. {
  1175. const byte * cur = (const byte *)state.cur;
  1176. const byte * end = (const byte *)state.end;
  1177. if (cur < end)
  1178. {
  1179. UChar next;
  1180. if (state.inputFormat == NlpUnicode)
  1181. {
  1182. next = *(const UChar *)cur;
  1183. cur += sizeof(UChar);
  1184. }
  1185. else
  1186. {
  1187. next = readUtf8Character((size32_t)(end-cur), cur);
  1188. }
  1189. if (!caseSensitive)
  1190. next = (UChar)u_foldCase(next, U_FOLD_CASE_DEFAULT); // MORE: Doesn't cope with full folding of SS
  1191. ForEachItemIn(idx, from)
  1192. {
  1193. unsigned low = from.item(idx);
  1194. if (low & RegexSpecialMask)
  1195. {
  1196. if (isUnicodeMatch(low, next))
  1197. {
  1198. if (inverted)
  1199. return false;
  1200. state.cur = cur;
  1201. return true;
  1202. }
  1203. }
  1204. else if ((next >= low) && (next <= to.item(idx)))
  1205. {
  1206. if (inverted)
  1207. return false;
  1208. state.cur = cur;
  1209. return true;
  1210. }
  1211. }
  1212. if (inverted)
  1213. {
  1214. state.cur = cur;
  1215. return true;
  1216. }
  1217. }
  1218. return false;
  1219. }
  1220. RegexMatchAction RegexUnicodeSetPattern::match(RegexState & state)
  1221. {
  1222. if (doMatch(state))
  1223. return matchNext(state);
  1224. return RegexMatchBacktrack;
  1225. }
  1226. RegexMatchAction RegexUnicodeSetPattern::beginMatch(RegexState & state)
  1227. {
  1228. if (doMatch(state))
  1229. return pushMatched(state);
  1230. return RegexMatchBacktrack;
  1231. }
  1232. void RegexUnicodeSetPattern::serializePattern(MemoryBuffer & out)
  1233. {
  1234. RegexPattern::serializePattern(out);
  1235. out.append(caseSensitive);
  1236. out.append(inverted);
  1237. out.append(from.ordinality());
  1238. ForEachItemIn(idx, from)
  1239. {
  1240. out.append(from.item(idx));
  1241. out.append(to.item(idx));
  1242. }
  1243. }
  1244. void RegexUnicodeSetPattern::deserializePattern(MemoryBuffer & in)
  1245. {
  1246. unsigned count;
  1247. RegexPattern::deserializePattern(in);
  1248. in.read(caseSensitive);
  1249. in.read(inverted);
  1250. in.read(count);
  1251. for (unsigned idx=0; idx < count; idx++)
  1252. {
  1253. unsigned value;
  1254. in.read(value);
  1255. from.append(value);
  1256. in.read(value);
  1257. to.append(value);
  1258. }
  1259. }
  1260. void RegexUnicodeSetPattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  1261. {
  1262. RegexSetBasePattern::toXMLattr(out, state);
  1263. if (state.detail >= 100)
  1264. {
  1265. if (inverted)
  1266. out.append(" inverted");
  1267. out.append(" set=\"");
  1268. ForEachItemIn(idx, from)
  1269. {
  1270. if (idx)
  1271. out.append(",");
  1272. out.append(from.item(idx));
  1273. if (from.item(idx) != to.item(idx))
  1274. out.append("..").append(to.item(idx));
  1275. }
  1276. out.append("\"");
  1277. }
  1278. }
  1279. #endif
  1280. //---------------------------------------------------------------------------
  1281. RegexMatchAction RegexDonePattern::match(RegexState & state)
  1282. {
  1283. state.top.end = state.cur;
  1284. if (state.matchAction->onMatch(state))
  1285. return RegexMatchDone;
  1286. return RegexMatchBacktrack;
  1287. }
  1288. RegexMatchAction RegexDonePattern::beginMatch(RegexState & state)
  1289. {
  1290. state.top.end = state.cur;
  1291. if (state.matchAction->onMatch(state))
  1292. return RegexMatchDone;
  1293. return RegexMatchBacktrack;
  1294. }
  1295. //---------------------------------------------------------------------------
  1296. RegexMatchAction RegexEndNestedPattern::match(RegexState & state)
  1297. {
  1298. RegexPattern & top = state.stack.popGet();
  1299. RegexMatchAction ret = top.MATCH(state);
  1300. state.stack.append(top);
  1301. return ret;
  1302. }
  1303. RegexMatchAction RegexEndNestedPattern::beginMatch(RegexState & state)
  1304. {
  1305. ActiveStage & stage = pushStage(state);
  1306. stage.setState(RSfinished);
  1307. stage.extra.nextPattern = &state.namedStack.popGet();
  1308. return stage.extra.nextPattern->beginMatch(state);
  1309. }
  1310. void RegexEndNestedPattern::killStage(ActiveStage & stage, RegexState & state)
  1311. {
  1312. state.namedStack.append(*stage.extra.nextPattern);
  1313. }
  1314. //---------------------------------------------------------------------------
  1315. void RegexAssertionPattern::dispose()
  1316. {
  1317. RegexPattern::dispose();
  1318. disposeLink(pattern);
  1319. }
  1320. bool RegexAssertionPattern::gather(RegexSerializeState & state)
  1321. {
  1322. bool ok = RegexPattern::gather(state);
  1323. if (ok)
  1324. pattern->gather(state);
  1325. return ok;
  1326. }
  1327. void RegexAssertionPattern::serializeLinks(MemoryBuffer & out, RegexSerializeState & state)
  1328. {
  1329. RegexPattern::serializeLinks(out, state);
  1330. serializeLink(out, pattern, state);
  1331. }
  1332. void RegexAssertionPattern::serializePattern(MemoryBuffer & out)
  1333. {
  1334. RegexPattern::serializePattern(out);
  1335. out.append(invert);
  1336. }
  1337. void RegexAssertionPattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  1338. {
  1339. RegexPattern::toXMLattr(out, state);
  1340. out.appendf(" sub=\"%d\" invert=\"%d\"", (unsigned)state.patterns.find(*pattern), invert);
  1341. state.addLink(*pattern);
  1342. }
  1343. void RegexAssertionPattern::deserializeLinks(MemoryBuffer & in, RegexSerializeState & state)
  1344. {
  1345. RegexPattern::deserializeLinks(in, state);
  1346. pattern.setown(deserializeLink(in, state));
  1347. }
  1348. void RegexAssertionPattern::deserializePattern(MemoryBuffer & in)
  1349. {
  1350. RegexPattern::deserializePattern(in);
  1351. in.read(invert);
  1352. }
  1353. bool RegexAssertNextPattern::nextMatches(RegexState & state)
  1354. {
  1355. //Sub function to reduce stack usage
  1356. MatchedAssertionAction matchedFlag;
  1357. RegexState localState(state, &matchedFlag, (size32_t)(state.end - state.cur), state.cur);
  1358. localState.processPattern(pattern);
  1359. return matchedFlag.isMatched();
  1360. }
  1361. RegexMatchAction RegexAssertNextPattern::match(RegexState & state)
  1362. {
  1363. if (nextMatches(state) != invert)
  1364. return matchNext(state);
  1365. return RegexMatchBacktrack;
  1366. }
  1367. RegexMatchAction RegexAssertNextPattern::beginMatch(RegexState & state)
  1368. {
  1369. if (nextMatches(state) != invert)
  1370. return pushMatched(state);
  1371. return RegexMatchBacktrack;
  1372. }
  1373. bool RegexAssertPrevPattern::prevMatches(RegexState & state)
  1374. {
  1375. //Sub function to reduce stack usage
  1376. ExactMatchedAssertionAction matchedFlag;
  1377. //This will work efficiently on fixed size patterns, but its painful for variable length
  1378. //It would work much better if we reversed the pattern, reversed the string, and then matched.
  1379. //I'll implement it when someone complains!
  1380. unsigned sizeBefore = (size32_t)(state.cur - state.start);
  1381. if (sizeBefore >= minSize)
  1382. {
  1383. unsigned maxToCheck = sizeBefore < maxSize ? sizeBefore : maxSize;
  1384. for (unsigned sizeToCheck = minSize; sizeToCheck <= maxToCheck; sizeToCheck += state.charSize)
  1385. {
  1386. //MORE: A test of finish will incorrectly succeed
  1387. RegexState localState(state, &matchedFlag, sizeBefore, state.start);
  1388. localState.cur = state.cur - sizeToCheck;
  1389. localState.processPattern(pattern);
  1390. if (matchedFlag.isMatched())
  1391. return true;
  1392. }
  1393. }
  1394. return false;
  1395. }
  1396. RegexMatchAction RegexAssertPrevPattern::match(RegexState & state)
  1397. {
  1398. if (prevMatches(state) != invert)
  1399. return matchNext(state);
  1400. return RegexMatchBacktrack;
  1401. }
  1402. RegexMatchAction RegexAssertPrevPattern::beginMatch(RegexState & state)
  1403. {
  1404. if (prevMatches(state) != invert)
  1405. return pushMatched(state);
  1406. return RegexMatchBacktrack;
  1407. }
  1408. void RegexAssertPrevPattern::serializePattern(MemoryBuffer & out)
  1409. {
  1410. RegexAssertionPattern::serializePattern(out);
  1411. out.append(minSize);
  1412. out.append(maxSize);
  1413. }
  1414. void RegexAssertPrevPattern::deserializePattern(MemoryBuffer & in)
  1415. {
  1416. RegexAssertionPattern::deserializePattern(in);
  1417. in.read(minSize);
  1418. in.read(maxSize);
  1419. }
  1420. void RegexAssertPrevPattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  1421. {
  1422. RegexAssertionPattern::toXMLattr(out, state);
  1423. out.appendf(" min=\"%d\" max=\"%d\"", minSize, maxSize);
  1424. }
  1425. //---------------------------------------------------------------------------
  1426. RegexMatchAction RegexBeginCheckPattern::match(RegexState & state)
  1427. {
  1428. MatchState matched;
  1429. MatchSaveState saved;
  1430. state.pushMatch(matched, saved);
  1431. RegexMatchAction ret = matchNext(state);
  1432. state.popMatch(saved);
  1433. return ret;
  1434. }
  1435. RegexMatchAction RegexBeginCheckPattern::beginMatch(RegexState & state)
  1436. {
  1437. RegexMatchStateSave * matched = state.cache.createStateSave(NULL, 0);
  1438. pushStageBeginMatch(state, matched);
  1439. return RegexMatchContinue;
  1440. }
  1441. void RegexBeginCheckPattern::killStage(ActiveStage & stage, RegexState & state)
  1442. {
  1443. cleanupBeginMatch(stage, state);
  1444. }
  1445. void RegexCheckPattern::serializePattern(MemoryBuffer & out)
  1446. {
  1447. RegexPattern::serializePattern(out);
  1448. out.append(invert);
  1449. out.append(stripSeparator);
  1450. }
  1451. void RegexCheckPattern::deserializePattern(MemoryBuffer & in)
  1452. {
  1453. RegexPattern::deserializePattern(in);
  1454. in.read(invert);
  1455. in.read(stripSeparator);
  1456. }
  1457. inline const byte * RegexCheckPattern::getEndMatch(RegexState & state)
  1458. {
  1459. const byte * end = state.cur;
  1460. if (stripSeparator)
  1461. {
  1462. MatchState * child = findTrailingSeparator(*state.curMatch);
  1463. if (child && child->end == end)
  1464. end = child->start;
  1465. }
  1466. return end;
  1467. }
  1468. RegexMatchAction RegexCheckInPattern::match(RegexState & state)
  1469. {
  1470. const byte * start = state.curMatch->start;
  1471. const byte * end = getEndMatch(state);
  1472. MatchedCheckAction matchedFlag;
  1473. RegexState localState(state, &matchedFlag, (size32_t)(end - start), start);
  1474. pattern->MATCH(localState);
  1475. if (matchedFlag.isMatched() ^ invert)
  1476. return markFinishContinueMatch(state);
  1477. return RegexMatchBacktrack;
  1478. }
  1479. RegexMatchAction RegexCheckInPattern::beginMatch(RegexState & state)
  1480. {
  1481. const byte * start = state.curMatch->start;
  1482. const byte * end = getEndMatch(state);
  1483. MatchedCheckAction matchedFlag;
  1484. RegexState localState(state, &matchedFlag, (size32_t)(end - start), start);
  1485. localState.helper = state.helper;
  1486. localState.processPattern(pattern);
  1487. if (matchedFlag.isMatched() ^ invert)
  1488. return pushStageEndMatch(state);
  1489. return RegexMatchBacktrack;
  1490. }
  1491. void RegexCheckInPattern::killStage(ActiveStage & stage, RegexState & state)
  1492. {
  1493. cleanupEndMatch(stage, state);
  1494. }
  1495. void RegexCheckInPattern::dispose()
  1496. {
  1497. RegexCheckPattern::dispose();
  1498. disposeLink(pattern);
  1499. }
  1500. bool RegexCheckInPattern::gather(RegexSerializeState & state)
  1501. {
  1502. bool ok = RegexCheckPattern::gather(state);
  1503. if (ok)
  1504. pattern->gather(state);
  1505. return ok;
  1506. }
  1507. void RegexCheckInPattern::serializeLinks(MemoryBuffer & out, RegexSerializeState & state)
  1508. {
  1509. RegexCheckPattern::serializeLinks(out, state);
  1510. serializeLink(out, pattern, state);
  1511. }
  1512. void RegexCheckInPattern::deserializeLinks(MemoryBuffer & in, RegexSerializeState & state)
  1513. {
  1514. RegexCheckPattern::deserializeLinks(in, state);
  1515. pattern.setown(deserializeLink(in, state));
  1516. }
  1517. void RegexCheckInPattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  1518. {
  1519. RegexCheckPattern::toXMLattr(out, state);
  1520. out.appendf(" sub=\"%d\"", (unsigned)state.patterns.find(*pattern));
  1521. state.addLink(*pattern);
  1522. }
  1523. inline bool RegexCheckLengthPattern::doMatch(RegexState & state)
  1524. {
  1525. const byte * start = state.curMatch->start;
  1526. const byte * end = getEndMatch(state);
  1527. unsigned matchSize = (size32_t)(end - start);
  1528. unsigned matchLength;
  1529. if (state.inputFormat == NlpUtf8)
  1530. matchLength = rtlUtf8Length(matchSize, start);
  1531. else
  1532. matchLength = matchSize;
  1533. bool inRange = ((matchLength >= minExpectedBytes) && (matchLength <= maxExpectedBytes));
  1534. return (inRange != invert);
  1535. }
  1536. RegexMatchAction RegexCheckLengthPattern::match(RegexState & state)
  1537. {
  1538. if (doMatch(state))
  1539. return markFinishContinueMatch(state);
  1540. return RegexMatchBacktrack;
  1541. }
  1542. RegexMatchAction RegexCheckLengthPattern::beginMatch(RegexState & state)
  1543. {
  1544. if (doMatch(state))
  1545. return pushStageEndMatch(state);
  1546. return RegexMatchBacktrack;
  1547. }
  1548. void RegexCheckLengthPattern::killStage(ActiveStage & stage, RegexState & state)
  1549. {
  1550. cleanupEndMatch(stage, state);
  1551. }
  1552. void RegexCheckLengthPattern::serializePattern(MemoryBuffer & out)
  1553. {
  1554. RegexCheckPattern::serializePattern(out);
  1555. out.append(minExpectedBytes);
  1556. out.append(maxExpectedBytes);
  1557. }
  1558. void RegexCheckLengthPattern::deserializePattern(MemoryBuffer & in)
  1559. {
  1560. RegexCheckPattern::deserializePattern(in);
  1561. in.read(minExpectedBytes);
  1562. in.read(maxExpectedBytes);
  1563. }
  1564. void RegexCheckLengthPattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  1565. {
  1566. RegexCheckPattern::toXMLattr(out, state);
  1567. out.appendf(" expected=\"%d..%d\"", minExpectedBytes, maxExpectedBytes);
  1568. }
  1569. //---------------------------------------------------------------------------
  1570. void RegexValidatePattern::serializePattern(MemoryBuffer & out)
  1571. {
  1572. RegexCheckPattern::serializePattern(out);
  1573. out.append(validatorIndex);
  1574. }
  1575. void RegexValidatePattern::deserializePattern(MemoryBuffer & in)
  1576. {
  1577. RegexCheckPattern::deserializePattern(in);
  1578. in.read(validatorIndex);
  1579. }
  1580. void RegexValidatePattern::deserializeLinks(MemoryBuffer & in, RegexSerializeState & state)
  1581. {
  1582. RegexCheckPattern::deserializeLinks(in, state);
  1583. }
  1584. void RegexValidatePattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  1585. {
  1586. RegexCheckPattern::toXMLattr(out, state);
  1587. out.appendf(" validator=\"%d\"", validatorIndex);
  1588. }
  1589. RegexMatchAction RegexValidatePattern::match(RegexState & state)
  1590. {
  1591. if (doMatch(state))
  1592. return markFinishContinueMatch(state);
  1593. return RegexMatchBacktrack;
  1594. }
  1595. RegexMatchAction RegexValidatePattern::beginMatch(RegexState & state)
  1596. {
  1597. if (doMatch(state))
  1598. return pushStageEndMatch(state);
  1599. return RegexMatchBacktrack;
  1600. }
  1601. void RegexValidatePattern::killStage(ActiveStage & stage, RegexState & state)
  1602. {
  1603. cleanupEndMatch(stage, state);
  1604. }
  1605. bool RegexValidateAscAsAscPattern::doMatch(RegexState & state)
  1606. {
  1607. IStringValidator * validator = (IStringValidator *)state.helper->queryValidator(validatorIndex);
  1608. const byte * start = state.curMatch->start;
  1609. const byte * end = getEndMatch(state);
  1610. unsigned matchSize = (size32_t)(end - start);
  1611. return validator->isValid(matchSize, (const char *)start);
  1612. }
  1613. bool RegexValidateUniAsAscPattern::doMatch(RegexState & state)
  1614. {
  1615. IStringValidator * validator = (IStringValidator *)state.helper->queryValidator(validatorIndex);
  1616. const byte * start = state.curMatch->start;
  1617. const byte * end = getEndMatch(state);
  1618. size32_t matchSize = (size32_t)(end - start);
  1619. unsigned asciiLen;
  1620. char * asciiText;
  1621. rtlUnicodeToStrX(asciiLen, asciiText, matchSize/2, (const UChar *)start);
  1622. bool ok = validator->isValid(asciiLen, asciiText);
  1623. rtlFree(asciiText);
  1624. return ok;
  1625. }
  1626. bool RegexValidateUtf8AsAscPattern::doMatch(RegexState & state)
  1627. {
  1628. IStringValidator * validator = (IStringValidator *)state.helper->queryValidator(validatorIndex);
  1629. const byte * start = state.curMatch->start;
  1630. const byte * end = getEndMatch(state);
  1631. unsigned matchSize = (size32_t)(end - start);
  1632. unsigned asciiLen;
  1633. char * asciiText;
  1634. rtlUnicodeToStrX(asciiLen, asciiText, rtlUtf8Length(matchSize, start), (const UChar *)start);
  1635. bool ok = validator->isValid(asciiLen, asciiText);
  1636. rtlFree(asciiText);
  1637. return ok;
  1638. }
  1639. bool RegexValidateAscAsUniPattern::doMatch(RegexState & state)
  1640. {
  1641. IUnicodeValidator * validator = (IUnicodeValidator *)state.helper->queryValidator(validatorIndex);
  1642. const byte * start = state.curMatch->start;
  1643. const byte * end = getEndMatch(state);
  1644. unsigned matchSize = (size32_t)(end - start);
  1645. unsigned unicodeLen;
  1646. UChar * unicodeText;
  1647. rtlStrToUnicodeX(unicodeLen, unicodeText, matchSize, (const char *)start);
  1648. bool ok = validator->isValid(unicodeLen, unicodeText);
  1649. rtlFree(unicodeText);
  1650. return ok;
  1651. }
  1652. bool RegexValidateUniAsUniPattern::doMatch(RegexState & state)
  1653. {
  1654. IUnicodeValidator * validator = (IUnicodeValidator *)state.helper->queryValidator(validatorIndex);
  1655. const byte * start = state.curMatch->start;
  1656. const byte * end = getEndMatch(state);
  1657. unsigned matchSize = (size32_t)(end - start);
  1658. return (validator->isValid(matchSize/2, (const UChar *)start));
  1659. }
  1660. bool RegexValidateUtf8AsUniPattern::doMatch(RegexState & state)
  1661. {
  1662. IUnicodeValidator * validator = (IUnicodeValidator *)state.helper->queryValidator(validatorIndex);
  1663. const byte * start = state.curMatch->start;
  1664. const byte * end = getEndMatch(state);
  1665. unsigned matchSize = (size32_t)(end - start);
  1666. unsigned unicodeLen;
  1667. UChar * unicodeText;
  1668. rtlStrToUnicodeX(unicodeLen, unicodeText, rtlUtf8Length(matchSize, start), (const char *)start);
  1669. bool ok = validator->isValid(unicodeLen, unicodeText);
  1670. rtlFree(unicodeText);
  1671. return ok;
  1672. }
  1673. //---------------------------------------------------------------------------
  1674. void RegexNamedPattern::getTraceText(StringBuffer & s)
  1675. {
  1676. s.append(getPatternText(getKind()));
  1677. if (def->queryName())
  1678. s.append(" ").append(def->queryName());
  1679. }
  1680. RegexMatchAction RegexNamedPattern::match(RegexState & state)
  1681. {
  1682. #ifdef __64BIT__
  1683. // 64-bit systems have more space for stack
  1684. RegexMatchState matched(def);
  1685. return def->match(state, &end, matched);
  1686. #else
  1687. //Allocate on the heap to make a stack fault less likely
  1688. RegexMatchState * matched = state.cache.createState(def);
  1689. RegexMatchAction ret = def->match(state, &end, *matched);
  1690. state.cache.destroyState(matched);
  1691. return ret;
  1692. #endif
  1693. }
  1694. void RegexNamedPattern::dispose()
  1695. {
  1696. RegexPattern::dispose();
  1697. disposeName(def);
  1698. }
  1699. bool RegexNamedPattern::gather(RegexSerializeState & state)
  1700. {
  1701. bool ok = RegexPattern::gather(state);
  1702. if (ok)
  1703. def->gather(state);
  1704. return ok;
  1705. }
  1706. void RegexNamedPattern::serializeLinks(MemoryBuffer & out, RegexSerializeState & state)
  1707. {
  1708. RegexPattern::serializeLinks(out, state);
  1709. serializeName(out, def, state);
  1710. }
  1711. void RegexNamedPattern::deserializeLinks(MemoryBuffer & in, RegexSerializeState & state)
  1712. {
  1713. RegexPattern::deserializeLinks(in, state);
  1714. def.setown(deserializeName(in, state));
  1715. }
  1716. void RegexNamedPattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  1717. {
  1718. RegexPattern::toXMLattr(out, state);
  1719. if (def->queryName())
  1720. out.appendf(" name=\"%s\"", def->queryName()->queryStr());
  1721. out.appendf(" body=\"%d\"", (unsigned)state.named.find(*def));
  1722. }
  1723. RegexMatchAction RegexNamedPattern::RegexEndNamedPattern::match(RegexState & state)
  1724. {
  1725. MatchSaveState saved;
  1726. state.markFinish(saved);
  1727. RegexMatchAction ret = named->matchNext(state);
  1728. state.unmarkFinish(saved);
  1729. return ret;
  1730. }
  1731. RegexMatchAction RegexNamedPattern::RegexEndNamedPattern::beginMatch(RegexState & state)
  1732. {
  1733. return pushStageEndMatch(state);
  1734. }
  1735. RegexMatchAction RegexNamedPattern::RegexEndNamedPattern::nextAction(ActiveStage & stage, RegexState & state)
  1736. {
  1737. switch (stage.getState())
  1738. {
  1739. case RSnextfollow:
  1740. return named->nextChild(stage, state);
  1741. default:
  1742. return RegexPattern::nextAction(stage, state);
  1743. }
  1744. }
  1745. void RegexNamedPattern::RegexEndNamedPattern::killStage(ActiveStage & stage, RegexState & state)
  1746. {
  1747. cleanupEndMatch(stage, state);
  1748. }
  1749. RegexMatchAction RegexNamedPattern::beginMatch(RegexState & state)
  1750. {
  1751. RegexMatchStateSave * matched = state.cache.createStateSave(def);
  1752. ActiveStage & stage = pushStageBeginMatch(state, matched);
  1753. stage.setState(RSfinished); // so children don't get processed.
  1754. state.namedStack.append(end);
  1755. return def->beginMatch(state);
  1756. }
  1757. void RegexNamedPattern::killStage(ActiveStage & stage, RegexState & state)
  1758. {
  1759. cleanupBeginMatch(stage, state);
  1760. state.namedStack.pop();
  1761. }
  1762. //---------------------------------------------------------------------------
  1763. RegexMatchAction RegexBeginSeparatorPattern::match(RegexState & state)
  1764. {
  1765. MatchState matched(separatorTagAtom, 0);
  1766. MatchSaveState saved;
  1767. state.pushMatch(matched, saved);
  1768. RegexMatchAction ret = matchNext(state);
  1769. state.popMatch(saved);
  1770. return ret;
  1771. }
  1772. RegexMatchAction RegexBeginSeparatorPattern::beginMatch(RegexState & state)
  1773. {
  1774. RegexMatchStateSave * matched = state.cache.createStateSave(separatorTagAtom, 0);
  1775. pushStageBeginMatch(state, matched);
  1776. return RegexMatchContinue;
  1777. }
  1778. void RegexBeginSeparatorPattern::killStage(ActiveStage & stage, RegexState & state)
  1779. {
  1780. cleanupBeginMatch(stage, state);
  1781. }
  1782. RegexMatchAction RegexEndSeparatorPattern::match(RegexState & state)
  1783. {
  1784. return mapAction(markFinishContinueMatch(state));
  1785. }
  1786. RegexMatchAction RegexEndSeparatorPattern::beginMatch(RegexState & state)
  1787. {
  1788. return mapAction(pushStageEndMatch(state));
  1789. }
  1790. RegexMatchAction RegexEndSeparatorPattern::nextAction(ActiveStage & stage, RegexState & state)
  1791. {
  1792. return mapAction(RegexPattern::nextAction(stage, state));
  1793. }
  1794. void RegexEndSeparatorPattern::killStage(ActiveStage & stage, RegexState & state)
  1795. {
  1796. cleanupEndMatch(stage, state);
  1797. }
  1798. //---------------------------------------------------------------------------
  1799. RegexMatchAction RegexRepeatPattern::match(RegexState & state)
  1800. {
  1801. return match(state, minLimit, maxLimit);
  1802. }
  1803. RegexMatchAction RegexRepeatPattern::matchNextInstance(RegexState & state, unsigned curMin, unsigned curMax)
  1804. {
  1805. Owned<RegexRepeatInstance> instance = new RegexRepeatInstance(this, curMin ? curMin-1 : 0, curMax-1);
  1806. return def->match(state, instance);
  1807. }
  1808. RegexMatchAction RegexRepeatPattern::match(RegexState & state, unsigned curMin, unsigned curMax)
  1809. {
  1810. if (curMax == 0)
  1811. return matchNext(state);
  1812. checkStackOverflow((size32_t)(state.cur-state.start));
  1813. Owned<RegexRepeatInstance> instance = new RegexRepeatInstance(this, curMin ? curMin-1 : 0, curMax-1);
  1814. if (curMin != 0)
  1815. return def->match(state, instance);
  1816. const byte * saved = state.cur;
  1817. if (greedy)
  1818. {
  1819. RegexMatchAction ret = def->match(state, instance);
  1820. if (ret != RegexMatchBacktrack)
  1821. return ret;
  1822. state.cur = saved;
  1823. return matchNext(state);
  1824. }
  1825. else
  1826. {
  1827. RegexMatchAction ret = matchNext(state);
  1828. if (ret != RegexMatchBacktrack)
  1829. return ret;
  1830. state.cur = saved;
  1831. return def->match(state, instance);
  1832. }
  1833. }
  1834. void RegexRepeatPattern::dispose()
  1835. {
  1836. RegexPattern::dispose();
  1837. disposeName(def);
  1838. }
  1839. bool RegexRepeatPattern::gather(RegexSerializeState & state)
  1840. {
  1841. bool ok = RegexPattern::gather(state);
  1842. if (ok)
  1843. def->gather(state);
  1844. return ok;
  1845. }
  1846. void RegexRepeatPattern::serializePattern(MemoryBuffer & out)
  1847. {
  1848. RegexPattern::serializePattern(out);
  1849. out.append(minLimit);
  1850. out.append(maxLimit);
  1851. out.append(greedy);
  1852. }
  1853. void RegexRepeatPattern::serializeLinks(MemoryBuffer & out, RegexSerializeState & state)
  1854. {
  1855. RegexPattern::serializeLinks(out, state);
  1856. serializeName(out, def, state);
  1857. }
  1858. void RegexRepeatPattern::deserializePattern(MemoryBuffer & in)
  1859. {
  1860. RegexPattern::deserializePattern(in);
  1861. in.read(minLimit);
  1862. in.read(maxLimit);
  1863. in.read(greedy);
  1864. }
  1865. void RegexRepeatPattern::deserializeLinks(MemoryBuffer & in, RegexSerializeState & state)
  1866. {
  1867. RegexPattern::deserializeLinks(in, state);
  1868. def.setown(deserializeName(in, state));
  1869. }
  1870. void RegexRepeatPattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  1871. {
  1872. RegexPattern::toXMLattr(out, state);
  1873. out.appendf(" min=\"%d\" max=\"%d\" greedy=\"%d\" body=\"%d\"", minLimit, maxLimit, greedy, (unsigned)state.named.find(*def));
  1874. }
  1875. RegexMatchAction RegexRepeatInstance::match(RegexState & state)
  1876. {
  1877. return def->match(state, minLimit, maxLimit);
  1878. }
  1879. //--- stack friedly implementation
  1880. RegexMatchAction RegexRepeatPattern::beginMatch(RegexState & state)
  1881. {
  1882. return beginMatch(state, minLimit, maxLimit);
  1883. }
  1884. RegexMatchAction RegexRepeatPattern::beginMatchNextInstance(RegexState & state, unsigned curMin, unsigned curMax)
  1885. {
  1886. Owned<RegexRepeatInstance> instance = new RegexRepeatInstance(this, curMin ? curMin-1 : 0, curMax-1);
  1887. return def->match(state, instance);
  1888. }
  1889. RegexMatchAction RegexRepeatPattern::beginMatch(RegexState & state, unsigned curMin, unsigned curMax)
  1890. {
  1891. ActiveStage & stage = pushStage(state);
  1892. stage.nextFollow = 0;
  1893. if (curMax == 0)
  1894. {
  1895. stage.extra.repeatInstance = NULL;
  1896. stage.setState(RSfollowonly);
  1897. return RegexMatchContinue;
  1898. }
  1899. RegexRepeatInstance * instance = new RegexRepeatInstance(this, curMin ? curMin-1 : 0, curMax-1);
  1900. stage.extra.repeatInstance = instance;
  1901. if (curMin != 0)
  1902. stage.setState(RSrepeatonly);
  1903. else if (greedy)
  1904. stage.setState(RSrepeat);
  1905. else
  1906. stage.setState(RSnextfollow);
  1907. return RegexMatchContinue;
  1908. }
  1909. void RegexRepeatPattern::killStage(ActiveStage & stage, RegexState & state)
  1910. {
  1911. if (state.namedStack.ordinality() && stage.extra.repeatInstance)
  1912. {
  1913. if (&state.namedStack.tos() == stage.extra.repeatInstance)
  1914. state.namedStack.pop();
  1915. }
  1916. delete stage.extra.repeatInstance;
  1917. }
  1918. inline RegexMatchAction RegexRepeatPattern::nextRepeatMatch(ActiveStage & stage, RegexState & state)
  1919. {
  1920. state.namedStack.append(*stage.extra.repeatInstance);
  1921. return def->beginMatch(state);
  1922. }
  1923. RegexMatchAction RegexRepeatPattern::nextAction(ActiveStage & stage, RegexState & state)
  1924. {
  1925. switch (stage.getState())
  1926. {
  1927. case RSretry:
  1928. assertex(!greedy);
  1929. state.cur = stage.followPosition;
  1930. //fall through to repeat-only code.
  1931. case RSrepeatonly:
  1932. stage.setState(RSfinished);
  1933. return nextRepeatMatch(stage, state);
  1934. case RSrepeat:
  1935. stage.setState(RSdonerepeat); // ensure namedStack is popped before processing children
  1936. stage.nextFollow = 0;
  1937. return nextRepeatMatch(stage, state);
  1938. case RSdonerepeat:
  1939. state.namedStack.pop();
  1940. stage.setState(RSfollowonly);
  1941. stage.nextFollow = 0;
  1942. return RegexPattern::nextAction(stage, state);
  1943. default:
  1944. return RegexPattern::nextAction(stage, state);
  1945. }
  1946. }
  1947. RegexMatchAction RegexRepeatInstance::beginMatch(RegexState & state)
  1948. {
  1949. return def->beginMatch(state, minLimit, maxLimit);
  1950. }
  1951. //---------------------------------------------------------------------------
  1952. static const int MultiAcceptMask = 0x80000000;
  1953. AsciiDfa::AsciiDfa()
  1954. {
  1955. numStates = 0;
  1956. numTransitions = 0;
  1957. numAccepts = 0;
  1958. states = NULL;
  1959. transitions = NULL;
  1960. accepts = NULL;
  1961. }
  1962. AsciiDfa::~AsciiDfa()
  1963. {
  1964. delete [] states;
  1965. free(transitions);
  1966. free(accepts);
  1967. }
  1968. void AsciiDfa::init(unsigned _numStates)
  1969. {
  1970. numStates = _numStates;
  1971. states = new AsciiDfaState[numStates];
  1972. memset(states, 0, sizeof(*states)*numStates);
  1973. }
  1974. unsigned AsciiDfa::getAccepts(const AsciiDfaState & state, unsigned idx) const
  1975. {
  1976. unsigned acceptId = state.acceptID;
  1977. if (acceptId == NotFound) return NotFound;
  1978. if (acceptId & MultiAcceptMask)
  1979. {
  1980. unsigned * values = accepts + (acceptId & ~MultiAcceptMask);
  1981. return values[idx];
  1982. }
  1983. else
  1984. return (idx == 0) ? acceptId : NotFound;
  1985. }
  1986. void AsciiDfa::serialize(MemoryBuffer & out)
  1987. {
  1988. out.append(numStates).append(numTransitions).append(numAccepts);
  1989. out.append(numStates * sizeof(*states), states);
  1990. out.append(numTransitions * sizeof(*transitions), transitions);
  1991. out.append(numAccepts * sizeof(*accepts), accepts);
  1992. }
  1993. void AsciiDfa::deserialize(MemoryBuffer & in)
  1994. {
  1995. in.read(numStates).read(numTransitions).read(numAccepts);
  1996. states = new AsciiDfaState[numStates];
  1997. transitions = (unsigned *)malloc(numTransitions * sizeof(*transitions));
  1998. accepts = (unsigned *)malloc(numAccepts * sizeof(*accepts));
  1999. in.read(numStates * sizeof(*states), states);
  2000. in.read(numTransitions * sizeof(*transitions), transitions);
  2001. in.read(numAccepts * sizeof(*accepts), accepts);
  2002. }
  2003. void AsciiDfa::setEmpty()
  2004. {
  2005. init(1);
  2006. states[0].delta = 0;
  2007. states[0].min = 255;
  2008. states[0].max = 0;
  2009. states[0].acceptID = NotFound;
  2010. }
  2011. void AsciiDfa::toXML(StringBuffer & out, unsigned detail)
  2012. {
  2013. out.appendf(" numStates=\"%d\" numTransitions=\"%d\"", numStates, numTransitions);
  2014. if (detail > 100)
  2015. {
  2016. out.newline();
  2017. for (unsigned i=0; i < numStates; i++)
  2018. {
  2019. unsigned min = states[i].min;
  2020. unsigned max = states[i].max;
  2021. out.append("\t\t").append(i).append(": ").append(min).append("..").append(max);
  2022. if (states[i].accepts())
  2023. {
  2024. out.append(" Accepts:");
  2025. for (unsigned k = 0;;k++)
  2026. {
  2027. unsigned id = getAccepts(states[i], k);
  2028. if (id == NotFound) break;
  2029. out.append(" ").append(id);
  2030. }
  2031. }
  2032. for (unsigned j = min; j<= max; j++)
  2033. {
  2034. if (((j-min) & 15)==0)
  2035. out.newline().append("\t\t\t");
  2036. out.append(j).append("->").append(transitions[states[i].delta + j]).append(' ');
  2037. }
  2038. out.newline();
  2039. }
  2040. }
  2041. }
  2042. AsciiDfaBuilder::AsciiDfaBuilder(AsciiDfa & _dfa) : dfa(_dfa)
  2043. {
  2044. maxTransitions = 0;
  2045. }
  2046. void AsciiDfaBuilder::addTransition(unsigned next, unsigned nextState)
  2047. {
  2048. if (dfa.states[curState].min > next)
  2049. dfa.states[curState].min = next;
  2050. if (dfa.states[curState].max < next)
  2051. dfa.states[curState].max = next;
  2052. dfa.transitions[firstTransition+next] = nextState;
  2053. }
  2054. void AsciiDfaBuilder::init(unsigned _numStates)
  2055. {
  2056. dfa.init(_numStates);
  2057. }
  2058. void AsciiDfaBuilder::init(unsigned _numStates, unsigned approxTransitions)
  2059. {
  2060. dfa.init(_numStates);
  2061. reallocTransitions(approxTransitions+256*2);
  2062. }
  2063. void AsciiDfaBuilder::reallocTransitions(unsigned required)
  2064. {
  2065. if (required > maxTransitions)
  2066. {
  2067. if (maxTransitions < 0x1000)
  2068. maxTransitions = 0x1000;
  2069. while (maxTransitions < required)
  2070. maxTransitions += 0x1000;
  2071. dfa.transitions = (unsigned *)realloc(dfa.transitions, maxTransitions * sizeof(*dfa.transitions));
  2072. }
  2073. }
  2074. void AsciiDfaBuilder::beginState(unsigned id)
  2075. {
  2076. curState = id;
  2077. dfa.states[curState].min = 255;
  2078. dfa.states[curState].max = 0;
  2079. dfa.states[curState].acceptID = NotFound;
  2080. firstTransition = dfa.numTransitions;
  2081. //ensure there are enough transitions...
  2082. if (firstTransition + 256 > maxTransitions)
  2083. {
  2084. reallocTransitions(firstTransition + 256 );
  2085. assertex(dfa.transitions);
  2086. }
  2087. for (unsigned i =0; i < 256; i++)
  2088. dfa.transitions[firstTransition+i] = NotFound;
  2089. }
  2090. void AsciiDfaBuilder::setStateAccept(unsigned id)
  2091. {
  2092. AsciiDfaState & cur = dfa.states[curState];
  2093. assertex((id & MultiAcceptMask) == 0);
  2094. if (cur.acceptID != NotFound)
  2095. {
  2096. if (!(cur.acceptID & MultiAcceptMask))
  2097. {
  2098. accepts.append(cur.acceptID);
  2099. cur.acceptID = (accepts.ordinality()-1)|MultiAcceptMask;
  2100. }
  2101. accepts.append(id);
  2102. }
  2103. else
  2104. cur.acceptID = id;
  2105. }
  2106. void AsciiDfaBuilder::endState()
  2107. {
  2108. AsciiDfaState & cur = dfa.states[curState];
  2109. if ((cur.acceptID != NotFound) && (cur.acceptID & MultiAcceptMask))
  2110. accepts.append(NotFound);
  2111. unsigned minEntry = cur.min;
  2112. unsigned maxEntry = cur.max;
  2113. if (minEntry <= maxEntry)
  2114. {
  2115. for (unsigned i = minEntry; i <= maxEntry; i++)
  2116. dfa.transitions[firstTransition+i-minEntry] = dfa.transitions[firstTransition+i];
  2117. cur.delta = firstTransition-minEntry;
  2118. dfa.numTransitions += (maxEntry - minEntry + 1);
  2119. }
  2120. else
  2121. cur.delta = 0; // no transitions from this state.
  2122. }
  2123. void AsciiDfaBuilder::finished()
  2124. {
  2125. unsigned numAccepts = accepts.ordinality();
  2126. dfa.numAccepts = numAccepts;
  2127. dfa.accepts = (unsigned *)malloc(sizeof(unsigned)*numAccepts);
  2128. memcpy_iflen(dfa.accepts, accepts.getArray(), numAccepts*sizeof(unsigned));
  2129. }
  2130. //---------------------------------------------------------------------------
  2131. RegexAsciiDfaPattern::RegexAsciiDfaPattern(bool _matchesToken)
  2132. {
  2133. matchesToken = _matchesToken;
  2134. }
  2135. RegexMatchAction RegexAsciiDfaPattern::match(RegexState & state)
  2136. {
  2137. const byte * cur = state.cur;
  2138. const byte * end = state.end;
  2139. unsigned activeState = 0;
  2140. const AsciiDfaState * states = dfa.queryStates();
  2141. unsigned * transitions = dfa.queryTransitions();
  2142. if (matchesToken)
  2143. {
  2144. //MORE: Store only one because we know we could never be backtracked - e.g., because this is always a token.
  2145. const byte * best = NULL;
  2146. for (;;)
  2147. {
  2148. if (states[activeState].accepts())
  2149. best = cur;
  2150. if (cur == end)
  2151. break;
  2152. byte next = *cur++;
  2153. if (next < states[activeState].min)
  2154. break;
  2155. if (next > states[activeState].max)
  2156. break;
  2157. activeState = transitions[states[activeState].delta + next];
  2158. if (activeState == NotFound)
  2159. break;
  2160. }
  2161. if (best)
  2162. {
  2163. state.cur = best;
  2164. RegexMatchAction ret = matchNext(state);
  2165. if (ret == RegexMatchBacktrackToken)
  2166. ret = RegexMatchBacktrack;
  2167. return ret;
  2168. }
  2169. return RegexMatchBacktrack;
  2170. }
  2171. else
  2172. {
  2173. ConstPointerArray & potentialMatches = state.cache.potentialMatches;
  2174. unsigned prevPotentialMatches = potentialMatches.ordinality();
  2175. for (;;)
  2176. {
  2177. if (states[activeState].accepts())
  2178. potentialMatches.append(cur);
  2179. if (cur == end)
  2180. break;
  2181. byte next = *cur++;
  2182. if (next < states[activeState].min)
  2183. break;
  2184. if (next > states[activeState].max)
  2185. break;
  2186. activeState = transitions[states[activeState].delta + next];
  2187. if (activeState == NotFound)
  2188. break;
  2189. }
  2190. while (potentialMatches.ordinality() > prevPotentialMatches)
  2191. {
  2192. state.cur = (const byte *)potentialMatches.popGet();
  2193. RegexMatchAction ret = matchNext(state);
  2194. if (ret != RegexMatchBacktrack)
  2195. {
  2196. potentialMatches.trunc(prevPotentialMatches);
  2197. return ret;
  2198. }
  2199. }
  2200. return RegexMatchBacktrack;
  2201. }
  2202. }
  2203. void RegexAsciiDfaPattern::serializePattern(MemoryBuffer & out)
  2204. {
  2205. RegexDfaPattern::serializePattern(out);
  2206. dfa.serialize(out);
  2207. out.append(matchesToken);
  2208. }
  2209. void RegexAsciiDfaPattern::deserializePattern(MemoryBuffer & in)
  2210. {
  2211. RegexDfaPattern::deserializePattern(in);
  2212. dfa.deserialize(in);
  2213. in.read(matchesToken);
  2214. }
  2215. void RegexAsciiDfaPattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  2216. {
  2217. RegexDfaPattern::toXMLattr(out, state);
  2218. dfa.toXML(out, state.detail);
  2219. if (matchesToken)
  2220. out.append(" token");
  2221. }
  2222. void RegexAsciiDfaPattern::killStage(ActiveStage & stage, RegexState & state)
  2223. {
  2224. ConstPointerArray & potentialMatches = state.cache.potentialMatches;
  2225. unsigned prevPotentialMatches = stage.extra.prevPotentialMatches;
  2226. potentialMatches.trunc(prevPotentialMatches);
  2227. }
  2228. RegexMatchAction RegexAsciiDfaPattern::beginMatch(RegexState & state)
  2229. {
  2230. const byte * cur = state.cur;
  2231. const byte * end = state.end;
  2232. unsigned activeState = 0;
  2233. const AsciiDfaState * states = dfa.queryStates();
  2234. unsigned * transitions = dfa.queryTransitions();
  2235. const byte * best = NULL;
  2236. ConstPointerArray & potentialMatches = state.cache.potentialMatches;
  2237. const unsigned prevPotentialMatches = potentialMatches.ordinality();
  2238. for (;;)
  2239. {
  2240. if (states[activeState].accepts())
  2241. {
  2242. if (!best || matchesToken)
  2243. best = cur;
  2244. else
  2245. {
  2246. if (prevPotentialMatches == potentialMatches.ordinality())
  2247. potentialMatches.append(best);
  2248. potentialMatches.append(cur);
  2249. }
  2250. }
  2251. if (cur == end)
  2252. break;
  2253. byte next = *cur++;
  2254. if (next < states[activeState].min)
  2255. break;
  2256. if (next > states[activeState].max)
  2257. break;
  2258. activeState = transitions[states[activeState].delta + next];
  2259. if (activeState == NotFound)
  2260. break;
  2261. }
  2262. if (!best)
  2263. return RegexMatchBacktrack;
  2264. ActiveStage & stage = pushStage(state);
  2265. stage.extra.prevPotentialMatches = prevPotentialMatches;
  2266. if (matchesToken)
  2267. stage.flags |= RFbeginToken;
  2268. //Only a single match, therefore no need to backtrack.
  2269. if (prevPotentialMatches == potentialMatches.ordinality())
  2270. {
  2271. stage.followPosition = best;
  2272. stage.setMatched();
  2273. }
  2274. else
  2275. {
  2276. stage.setState(RSretry);
  2277. }
  2278. return RegexMatchContinue;
  2279. }
  2280. RegexMatchAction RegexAsciiDfaPattern::nextAction(ActiveStage & stage, RegexState & state)
  2281. {
  2282. ConstPointerArray & potentialMatches = state.cache.potentialMatches;
  2283. unsigned prevPotentialMatches = stage.extra.prevPotentialMatches;
  2284. assertex(prevPotentialMatches <= potentialMatches.ordinality());
  2285. switch (stage.getState())
  2286. {
  2287. case RSretry:
  2288. {
  2289. if (prevPotentialMatches < potentialMatches.ordinality())
  2290. {
  2291. stage.followPosition = (const byte *)potentialMatches.popGet();
  2292. stage.setMatched();
  2293. return RegexMatchContinue;
  2294. }
  2295. return RegexPattern::nextAction(stage, state);
  2296. }
  2297. default:
  2298. return RegexPattern::nextAction(stage, state);
  2299. }
  2300. }
  2301. //---------------------------------------------------------------------------
  2302. RegexUnicodeDfaPattern::RegexUnicodeDfaPattern()
  2303. {
  2304. }
  2305. RegexUnicodeDfaPattern::~RegexUnicodeDfaPattern()
  2306. {
  2307. }
  2308. RegexMatchAction RegexUnicodeDfaPattern::match(RegexState & state)
  2309. {
  2310. /*
  2311. const byte * cur = state.cur;
  2312. const byte * end = state.end;
  2313. unsigned activeState = 0;
  2314. ConstPointerArray matches;
  2315. for (;;)
  2316. {
  2317. //MORE: It would be better to store only one if we knew we could never be backtracked - e.g., if this was always a token.
  2318. if (states[activeState].accepts)
  2319. matches.append(cur);
  2320. if (cur == end)
  2321. break;
  2322. byte next = *cur++;
  2323. if (next < states[activeState].min)
  2324. break;
  2325. if (next > states[activeState].max)
  2326. break;
  2327. activeState = transitions[states[activeState].delta + next];
  2328. }
  2329. while (matches.ordinality())
  2330. {
  2331. state.cur = (const byte *)matches.tos();
  2332. matches.pop();
  2333. RegexMatchAction ret = matchNext(state);
  2334. if (ret != RegexMatchBacktrack)
  2335. return ret;
  2336. }
  2337. */
  2338. return RegexMatchBacktrack;
  2339. }
  2340. RegexMatchAction RegexUnicodeDfaPattern::beginMatch(RegexState & state)
  2341. {
  2342. UNIMPLEMENTED;
  2343. return RegexMatchBacktrack;
  2344. }
  2345. void RegexUnicodeDfaPattern::serializePattern(MemoryBuffer & out)
  2346. {
  2347. }
  2348. void RegexUnicodeDfaPattern::deserializePattern(MemoryBuffer & in)
  2349. {
  2350. }
  2351. //---------------------------------------------------------------------------
  2352. RegexRepeatAnyPattern::RegexRepeatAnyPattern(unsigned _low, unsigned _high, bool _minimal)
  2353. {
  2354. low = _low;
  2355. high = _high;
  2356. minimal = _minimal;
  2357. }
  2358. RegexMatchAction RegexRepeatAnyPattern::match(RegexState & state)
  2359. {
  2360. unsigned max = (size32_t)(state.end - state.cur);
  2361. if (max > high)
  2362. max = high;
  2363. const byte * start = state.cur;
  2364. if (low > max)
  2365. return RegexMatchBacktrack;
  2366. if (minimal)
  2367. {
  2368. for (unsigned i = low; i <= max; i++)
  2369. {
  2370. state.cur = start+i;
  2371. RegexMatchAction ret = matchNext(state);
  2372. if (ret != RegexMatchBacktrack)
  2373. return ret;
  2374. }
  2375. return RegexMatchBacktrack;
  2376. }
  2377. else
  2378. {
  2379. unsigned i = max;
  2380. for (;;)
  2381. {
  2382. state.cur = start+i;
  2383. RegexMatchAction ret = matchNext(state);
  2384. if (ret != RegexMatchBacktrack)
  2385. return ret;
  2386. if (i == low)
  2387. return RegexMatchBacktrack;
  2388. i--;
  2389. }
  2390. }
  2391. }
  2392. RegexMatchAction RegexRepeatAnyPattern::beginMatch(RegexState & state)
  2393. {
  2394. unsigned max = (size32_t)(state.end - state.cur);
  2395. if (max > high)
  2396. max = high;
  2397. const byte * start = state.cur;
  2398. if (low > max)
  2399. return RegexMatchBacktrack;
  2400. ActiveStage & stage = pushStage(state).setMatched();
  2401. if (minimal)
  2402. {
  2403. stage.followPosition = start+low;
  2404. stage.extra.limit = start+max;
  2405. }
  2406. else
  2407. {
  2408. stage.followPosition = start+max;
  2409. stage.extra.limit = start+low;
  2410. }
  2411. return RegexMatchContinue;
  2412. }
  2413. RegexMatchAction RegexRepeatAnyPattern::nextAction(ActiveStage & stage, RegexState & state)
  2414. {
  2415. switch (stage.getState())
  2416. {
  2417. case RSretry:
  2418. if (minimal)
  2419. {
  2420. if (stage.followPosition < stage.extra.limit)
  2421. {
  2422. stage.followPosition++;
  2423. stage.setMatched();
  2424. return RegexMatchContinue;
  2425. }
  2426. }
  2427. else
  2428. {
  2429. if (stage.followPosition > stage.extra.limit)
  2430. {
  2431. stage.followPosition++;
  2432. stage.setMatched();
  2433. return RegexMatchContinue;
  2434. }
  2435. }
  2436. return RegexPattern::nextAction(stage, state);
  2437. default:
  2438. return RegexPattern::nextAction(stage, state);
  2439. }
  2440. }
  2441. void RegexRepeatAnyPattern::serializePattern(MemoryBuffer & out)
  2442. {
  2443. RegexPattern::serializePattern(out);
  2444. out.append(low).append(high).append(minimal);
  2445. }
  2446. void RegexRepeatAnyPattern::deserializePattern(MemoryBuffer & in)
  2447. {
  2448. RegexPattern::deserializePattern(in);
  2449. in.read(low).read(high).read(minimal);
  2450. }
  2451. void RegexRepeatAnyPattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
  2452. {
  2453. RegexPattern::toXMLattr(out, state);
  2454. out.appendf(" low=\"%u\" high=\"%u\" greedy=\"%d\"", low, high, !minimal);
  2455. }
  2456. //---------------------------------------------------------------------------
  2457. RegexMatchAction RegexNamed::match(RegexState & state, RegexPattern * instance, MatchState & matched)
  2458. {
  2459. MatchSaveState saved;
  2460. checkStackOverflow((size32_t)(state.cur-state.start));
  2461. state.pushMatch(matched, saved);
  2462. RegexMatchAction ret = match(state, instance);
  2463. state.popMatch(saved);
  2464. return ret;
  2465. }
  2466. RegexMatchAction RegexNamed::match(RegexState & state, RegexPattern * instance)
  2467. {
  2468. state.stack.append(*instance);
  2469. RegexMatchAction ret = first->MATCH(state);
  2470. state.stack.pop();
  2471. return ret;
  2472. }
  2473. void RegexNamed::serializeLinks(MemoryBuffer & out, RegexSerializeState & state)
  2474. {
  2475. serializeLink(out, first, state);
  2476. }
  2477. void RegexNamed::dispose()
  2478. {
  2479. disposeLink(first);
  2480. }
  2481. void RegexNamed::gather(RegexSerializeState & state)
  2482. {
  2483. if (state.named.find(*this) == NotFound)
  2484. {
  2485. state.named.append(OLINK(*this));
  2486. first->gather(state);
  2487. }
  2488. }
  2489. void RegexNamed::deserializeLinks(MemoryBuffer & in, RegexSerializeState & state)
  2490. {
  2491. first.setown(deserializeLink(in, state));
  2492. }
  2493. void RegexNamed::serializePattern(MemoryBuffer & out)
  2494. {
  2495. if (name)
  2496. {
  2497. StringBuffer lowerName;
  2498. lowerName.append(str(name)).toLowerCase();
  2499. out.append(lowerName.str());
  2500. }
  2501. else
  2502. out.append(str(name));
  2503. out.append(id);
  2504. }
  2505. void RegexNamed::deserializePattern(MemoryBuffer & in)
  2506. {
  2507. ::deserialize(in, name);
  2508. in.read(id);
  2509. }
  2510. void graphToXML(StringBuffer & out, RegexXmlState & state, RegexPattern * first)
  2511. {
  2512. state.reset();
  2513. state.addLink(*first);
  2514. while (state.toVisit.ordinality())
  2515. {
  2516. RegexPattern & next = state.toVisit.item(0);
  2517. state.toVisit.remove(0, true);
  2518. state.visited.append(next);
  2519. next.toXML(out, state);
  2520. }
  2521. }
  2522. void RegexNamed::toXML(StringBuffer & out, RegexXmlState & state)
  2523. {
  2524. StringBuffer lowerName;
  2525. lowerName.append(str(name)).toLowerCase();
  2526. out.appendf("<named id=\"%d\" name=\"%s\" matchid=\"%d\" first=\"%d\">", (unsigned)state.named.find(*this), lowerName.str(), id, (unsigned)state.patterns.find(*first)).newline();
  2527. graphToXML(out, state, first);
  2528. //first->toXML(out, state);
  2529. out.append("</named>").newline();
  2530. }
  2531. RegexMatchAction RegexNamed::beginMatch(RegexState & state)
  2532. {
  2533. return first->beginMatch(state);
  2534. }
  2535. //-----------------------------------------------------------------------
  2536. void regexToXml(StringBuffer & out, RegexPattern * root, unsigned detail)
  2537. {
  2538. RegexXmlState state;
  2539. state.detail = detail;
  2540. root->gather(state);
  2541. out.appendf("<regex root=\"%d\">", (unsigned)state.patterns.find(*root)).newline();
  2542. graphToXML(out, state, root);
  2543. ForEachItemIn(in, state.named)
  2544. state.named.item(in).toXML(out, state);
  2545. out.append("</regex>").newline();
  2546. }
  2547. void serializeRegex(MemoryBuffer & out, RegexPattern * root)
  2548. {
  2549. RegexSerializeState state;
  2550. root->gather(state);
  2551. unsigned version = REGEX_VERSION;
  2552. out.append(version);
  2553. out.append((unsigned)state.patterns.ordinality());
  2554. out.append((unsigned)state.named.ordinality());
  2555. ForEachItemIn(ip, state.patterns)
  2556. state.patterns.item(ip).serializePattern(out);
  2557. ForEachItemIn(in, state.named)
  2558. state.named.item(in).serializePattern(out);
  2559. ForEachItemIn(ip2, state.patterns)
  2560. state.patterns.item(ip2).serializeLinks(out, state);
  2561. ForEachItemIn(in2, state.named)
  2562. state.named.item(in2).serializeLinks(out, state);
  2563. ForEachItemIn(i, state.patterns)
  2564. state.patterns.item(i).clearGathered();
  2565. }
  2566. RegexPattern * deserializeRegex(MemoryBuffer & in)
  2567. {
  2568. RegexSerializeState state;
  2569. unsigned idx;
  2570. unsigned version;
  2571. unsigned numPatterns, numNamed;
  2572. in.read(version);
  2573. if (version != REGEX_VERSION)
  2574. throwError(REGEXERR_IncompatibleVersion);
  2575. in.read(numPatterns);
  2576. in.read(numNamed);
  2577. for (idx=0; idx < numPatterns; idx++)
  2578. {
  2579. byte _kind;
  2580. ThorRegexKind kind;
  2581. in.read(_kind);
  2582. kind = (ThorRegexKind)_kind;
  2583. RegexPattern * next;
  2584. switch (kind)
  2585. {
  2586. case ThorRegexNull: next = new RegexNullPattern(); break;
  2587. case ThorRegexAnyChar: next = new RegexAnyCharPattern(); break;
  2588. case ThorRegexAsciiDFA: next = new RegexAsciiDfaPattern(); break;
  2589. // case ThorRegexIDFA: next = new RegexIDFAPattern(); break;
  2590. case ThorRegexAscii: next = new RegexAsciiPattern(); break;
  2591. case ThorRegexAsciiI: next = new RegexAsciiIPattern(); break;
  2592. case ThorRegexAsciiSet: next = new RegexAsciiSetPattern(); break;
  2593. case ThorRegexAsciiISet: next = new RegexAsciiISetPattern(); break;
  2594. #ifdef _USE_ICU
  2595. case ThorRegexUnicode: next = new RegexUnicodePattern(); break;
  2596. case ThorRegexUnicodeI: next = new RegexUnicodeIPattern(); break;
  2597. case ThorRegexUnicodeSet: next = new RegexUnicodeSetPattern(); break;
  2598. case ThorRegexUtf8: next = new RegexUtf8Pattern(); break;
  2599. case ThorRegexUtf8I: next = new RegexUtf8IPattern(); break;
  2600. // case ThorRegexUnicodeISet: next = new RegexUnicodeISetPattern(); break;
  2601. #else
  2602. case ThorRegexUnicode:
  2603. case ThorRegexUnicodeI:
  2604. case ThorRegexUnicodeSet:
  2605. case ThorRegexUtf8:
  2606. case ThorRegexUtf8I:
  2607. rtlThrowNoUnicode();
  2608. #endif
  2609. case ThorRegexStart: next = new RegexStartPattern(); break;
  2610. case ThorRegexFinish: next = new RegexFinishPattern(); break;
  2611. case ThorRegexBeginToken: next = new RegexBeginTokenPattern(); break;
  2612. case ThorRegexEndToken: next = new RegexEndTokenPattern(); break;
  2613. case ThorRegexBeginSeparator:next = new RegexBeginSeparatorPattern(); break;
  2614. case ThorRegexEndSeparator: next = new RegexEndSeparatorPattern(); break;
  2615. case ThorRegexRepeat: next = new RegexRepeatPattern(); break;
  2616. case ThorRegexCheck: next = new RegexCheckInPattern(); break;
  2617. case ThorRegexCheckLength: next = new RegexCheckLengthPattern(); break;
  2618. case ThorRegexBeginCheck: next = new RegexBeginCheckPattern(); break;
  2619. case ThorRegexAssertNext: next = new RegexAssertNextPattern(); break;
  2620. case ThorRegexAssertPrev: next = new RegexAssertPrevPattern(); break;
  2621. case ThorRegexNamed: next = new RegexNamedPattern(); break;
  2622. case ThorRegexEndNested: next = new RegexEndNestedPattern(); break;
  2623. case ThorRegexDone: next = new RegexDonePattern(); break;
  2624. case ThorRegexValidateAscAsAsc: next = new RegexValidateAscAsAscPattern(); break;
  2625. case ThorRegexValidateUniAsAsc: next = new RegexValidateUniAsAscPattern(); break;
  2626. case ThorRegexValidateUtf8AsAsc: next = new RegexValidateUtf8AsAscPattern(); break;
  2627. case ThorRegexValidateAscAsUni: next = new RegexValidateAscAsUniPattern(); break;
  2628. case ThorRegexValidateUniAsUni: next = new RegexValidateUniAsUniPattern(); break;
  2629. case ThorRegexValidateUtf8AsUni: next = new RegexValidateUtf8AsUniPattern(); break;
  2630. case ThorRegexRecursive: next = new RegexRecursivePattern(); break;
  2631. case ThorRegexEndRecursive: next = new RegexEndRecursivePattern(); break;
  2632. case ThorRegexPenalty: next = new RegexPenaltyPattern(); break;
  2633. case ThorRegexRepeatAny: next = new RegexRepeatAnyPattern(); break;
  2634. default:
  2635. UNIMPLEMENTED;
  2636. }
  2637. next->deserializePattern(in);
  2638. state.patterns.append(*next);
  2639. }
  2640. for (idx=0; idx < numNamed; idx++)
  2641. {
  2642. RegexNamed * next = new RegexNamed();
  2643. next->deserializePattern(in);
  2644. state.named.append(*next);
  2645. }
  2646. for (idx=0; idx < numPatterns; idx++)
  2647. state.patterns.item(idx).deserializeLinks(in, state);
  2648. for (idx=0; idx < numNamed; idx++)
  2649. state.named.item(idx).deserializeLinks(in, state);
  2650. return LINK(&state.patterns.item(0));
  2651. }
  2652. //---------------------------------------------------------------------------
  2653. RegexMatchState * RegexStateCache::createState(RegexNamed * def)
  2654. {
  2655. if (matchStates.ordinality())
  2656. {
  2657. RegexMatchState * ret = &matchStates.popGet();
  2658. ret->reset(def);
  2659. return ret;
  2660. }
  2661. return new RegexMatchState(def);
  2662. }
  2663. void RegexStateCache::destroyState(RegexMatchState * state)
  2664. {
  2665. matchStates.append(*state);
  2666. }
  2667. RegexMatchStateSave * RegexStateCache::createStateSave(RegexNamed * def)
  2668. {
  2669. if (matchStateSaves.ordinality())
  2670. {
  2671. RegexMatchStateSave * ret = &matchStateSaves.popGet();
  2672. ret->reset(def);
  2673. return ret;
  2674. }
  2675. return new RegexMatchStateSave(def);
  2676. }
  2677. RegexMatchStateSave * RegexStateCache::createStateSave(IAtom * _name, regexid_t _id)
  2678. {
  2679. if (matchStateSaves.ordinality())
  2680. {
  2681. RegexMatchStateSave * ret = &matchStateSaves.popGet();
  2682. ret->reset(_name, _id);
  2683. return ret;
  2684. }
  2685. return new RegexMatchStateSave(_name, _id);
  2686. }
  2687. void RegexStateCache::destroyStateSave(RegexMatchStateSave * state)
  2688. {
  2689. matchStateSaves.append(*state);
  2690. }
  2691. void RegexState::processPattern(RegexPattern * grammar)
  2692. {
  2693. if (implementation == NLPAregexStack)
  2694. {
  2695. grammar->MATCH(*this);
  2696. return;
  2697. }
  2698. RegexMatchAction action = grammar->beginMatch(*this);
  2699. switch (action)
  2700. {
  2701. case RegexMatchDone:
  2702. while (hasActiveStage())
  2703. popStage();
  2704. return;
  2705. case RegexMatchBacktrack:
  2706. case RegexMatchContinue:
  2707. break;
  2708. case RegexMatchBacktrackToken:
  2709. throwUnexpected();
  2710. break;
  2711. }
  2712. while (hasActiveStage())
  2713. {
  2714. #ifdef TRACE_REGEX
  2715. StringBuffer itemText;
  2716. itemText.appendf("[%p]", topStage().pattern);
  2717. #endif
  2718. action = topStage().nextAction(*this);
  2719. #ifdef TRACE_REGEX
  2720. DBGLOG("%*s%snextAction returned %s", patternDepth, "", itemText.str(), resultText[action]);
  2721. #endif
  2722. switch (action)
  2723. {
  2724. case RegexMatchDone:
  2725. {
  2726. while (hasActiveStage())
  2727. popStage();
  2728. return;
  2729. }
  2730. case RegexMatchContinue:
  2731. case RegexMatchBacktrack:
  2732. break;
  2733. case RegexMatchBacktrackToken:
  2734. {
  2735. bool finished = false;
  2736. do
  2737. {
  2738. finished = topStage().isBeginToken();
  2739. popStage();
  2740. } while (!finished);
  2741. break;
  2742. }
  2743. }
  2744. }
  2745. assertex(namedStack.ordinality() == 0);
  2746. assertex(curMatch == &top);
  2747. }
  2748. ActiveStage & RegexState::pushStage()
  2749. {
  2750. curActiveStage++;
  2751. if (!stages.isItem(curActiveStage))
  2752. {
  2753. ActiveStage temp;
  2754. //initialise not needed, but it stops a warning from the compiler
  2755. temp.pattern = NULL;
  2756. temp.state = RSinit;
  2757. stages.append(temp);
  2758. }
  2759. return stages.element(curActiveStage);
  2760. }
  2761. void RegexState::popStage()
  2762. {
  2763. #ifdef TRACE_REGEX
  2764. ActiveStage & tos = topStage();
  2765. StringBuffer t;
  2766. tos.pattern->getTraceText(t);
  2767. DBGLOG("%*s[%p]Pop %s", --patternDepth, "", tos.pattern, t.str());
  2768. #endif
  2769. stages.element(curActiveStage).cleanup(*this);
  2770. curActiveStage--;
  2771. }
  2772. /*
  2773. RegexMatchAction RegexDfa::beginMatch(RegexState & state)
  2774. {
  2775. //walk to find matches
  2776. if (only one then save it).
  2777. if (>1 create a list and add all items onto it)
  2778. if (matched)
  2779. {
  2780. if (single)
  2781. {
  2782. }
  2783. else
  2784. {
  2785. state.pushState(this).setStage(RSretry);
  2786. newState->extra.matches = matches;
  2787. }
  2788. state.namedAction.append(*end);
  2789. return firstElement->beginMatch(state);
  2790. }
  2791. RegexMatchAction RegexDfa::nextAction(ActiveStage & stage, RegexState & state)
  2792. {
  2793. if (stage.extra.matches->ordinality())
  2794. {
  2795. state.cur = stage.extra.matches.pop();
  2796. stage.setMatched();
  2797. return RegexMatchContinue;
  2798. }
  2799. return RegexMatchBacktrack;
  2800. }
  2801. */
  2802. //---------------------------------------------------------------------------
  2803. RegexMatchInfo * RegexMatches::createMatch()
  2804. {
  2805. RegexMatchInfo * row = new RegexMatchInfo(NULL);
  2806. results.append(*row);
  2807. return row;
  2808. }
  2809. void RegexMatches::reset()
  2810. {
  2811. results.kill();
  2812. }
  2813. //---------------------------------------------------------------------------
  2814. RegexAlgorithm::RegexAlgorithm(IOutputMetaData * _outRecordSize, byte _kind) : NlpAlgorithm(new CRegexMatchedResultInfo)
  2815. {
  2816. outRecordSize.set(_outRecordSize);
  2817. kind = _kind;
  2818. }
  2819. RegexAlgorithm::~RegexAlgorithm()
  2820. {
  2821. if (grammar)
  2822. grammar->dispose();
  2823. }
  2824. INlpParser * RegexAlgorithm::createParser(ICodeContext * ctx, unsigned activityId, INlpHelper * helper, IHThorParseArg * arg)
  2825. {
  2826. return new RegexParser(ctx, this, helper, activityId);
  2827. }
  2828. void RegexAlgorithm::init(IHThorParseArg & arg)
  2829. {
  2830. }
  2831. void RegexAlgorithm::serialize(MemoryBuffer & out)
  2832. {
  2833. out.append(kind);
  2834. NlpAlgorithm::serialize(out);
  2835. out.append(minPatternLength);
  2836. serializeRegex(out, grammar);
  2837. }
  2838. void RegexAlgorithm::deserialize(MemoryBuffer & in)
  2839. {
  2840. NlpAlgorithm::deserialize(in);
  2841. in.read(minPatternLength);
  2842. grammar.setown(deserializeRegex(in));
  2843. }
  2844. INlpParseAlgorithm * createRegexParser(MemoryBuffer & buffer, IOutputMetaData * outRecordSize, byte kind)
  2845. {
  2846. RegexAlgorithm * ret = new RegexAlgorithm(outRecordSize, kind);
  2847. ret->deserialize(buffer);
  2848. return ret;
  2849. }
  2850. void RegexAlgorithm::match(RegexState & state)
  2851. {
  2852. state.processPattern(grammar);
  2853. }
  2854. //---------------------------------------------------------------------------
  2855. RegexParser::RegexParser(ICodeContext * ctx, RegexAlgorithm * _algo, INlpHelper * _helper, unsigned _activityId) : matched(_algo->matchInfo)
  2856. {
  2857. algo = LINK(_algo);
  2858. IOutputMetaData * outputMeta = algo->outRecordSize;
  2859. outputAllocator.setown(ctx->getRowAllocator(outputMeta, _activityId));
  2860. helper = _helper;
  2861. results.first();
  2862. charWidth = algo->charWidth;
  2863. }
  2864. RegexParser::~RegexParser()
  2865. {
  2866. matched.kill();
  2867. algo->Release();
  2868. }
  2869. bool RegexParser::performMatch(IMatchedAction & action, const void * row, unsigned len, const void * data)
  2870. {
  2871. try
  2872. {
  2873. results.reset();
  2874. len *= charWidth;
  2875. size32_t maxSize = algo->maxLength*charWidth;
  2876. const byte * start = (const byte *)data;
  2877. RegexState state(cache, algo->kind, helper, this, algo->inputFormat, len, start);
  2878. state.row = row;
  2879. state.processor = &action;
  2880. state.best = NULL;
  2881. if (len >= algo->minPatternLength)
  2882. {
  2883. const byte * endData = start + len;
  2884. const byte * end = endData - algo->minPatternLength;
  2885. for (const byte * curScan = start; curScan <= end;)
  2886. {
  2887. state.cur = curScan;
  2888. state.top.start = curScan;
  2889. state.nextScanPosition = NULL;
  2890. state.score = 0;
  2891. if (!algo->singleChoicePerLine)
  2892. state.best = NULL;
  2893. if ((size32_t)(endData - curScan) > maxSize)
  2894. {
  2895. state.end = curScan + (maxSize + charWidth);
  2896. state.lengthIsLimited = true;
  2897. }
  2898. else
  2899. {
  2900. state.end = endData;
  2901. state.lengthIsLimited = false;
  2902. }
  2903. algo->match(state);
  2904. if (state.numMatched >= algo->keepLimit)
  2905. break;
  2906. if (state.numMatched > algo->atMostLimit)
  2907. {
  2908. results.reset();
  2909. return false;
  2910. }
  2911. if (algo->scanAction == INlpParseAlgorithm::NlpScanWhole)
  2912. break;
  2913. if (state.numMatched && (algo->scanAction == INlpParseAlgorithm::NlpScanNone))
  2914. break;
  2915. if (state.nextScanPosition && (algo->scanAction == INlpParseAlgorithm::NlpScanNext) && (curScan != state.nextScanPosition))
  2916. curScan = state.nextScanPosition;
  2917. else
  2918. curScan += charWidth;
  2919. }
  2920. }
  2921. if (state.numMatched == 0)
  2922. {
  2923. if (algo->notMatchedOnly || algo->notMatched)
  2924. {
  2925. //Create a row where nothing matches.
  2926. MatchState & top = state.top;
  2927. top.end = top.start;
  2928. matched.extractResults(top, state.start, algo->addedSeparators);
  2929. NlpMatchWalker walker(&top);
  2930. const void * matchRow = createMatchRow(state, walker);
  2931. if (matchRow)
  2932. results.appendOwnResult(matchRow);
  2933. }
  2934. }
  2935. else if (algo->notMatchedOnly)
  2936. {
  2937. results.reset();
  2938. }
  2939. //Now reduce the number of matches according to min/max/best
  2940. return results.first();
  2941. }
  2942. catch (IException *)
  2943. {
  2944. throw;
  2945. }
  2946. catch (...)
  2947. {
  2948. throwError1(REGEXERR_MatchFailure, 0);
  2949. return false;
  2950. }
  2951. }
  2952. const void * RegexParser::createMatchRow(RegexState & state, NlpMatchWalker & walker)
  2953. {
  2954. RtlDynamicRowBuilder row(outputAllocator);
  2955. unsigned rowSize = state.processor->onMatch(row, state.row, &matched, &walker);
  2956. if (rowSize)
  2957. return row.finalizeRowClear(rowSize);
  2958. return NULL;
  2959. }
  2960. bool RegexParser::onMatch(NlpState & _state)
  2961. {
  2962. if ((algo->scanAction == INlpParseAlgorithm::NlpScanWhole) && (_state.cur != _state.end))
  2963. return false;
  2964. RegexState & state = (RegexState &)_state;
  2965. if (state.lengthIsLimited && state.cur == state.end)
  2966. return false;
  2967. MatchState & top = state.top;
  2968. const byte * end = top.end;
  2969. unsigned length = (size32_t)(end - top.start);
  2970. RegexMatchInfo * result = NULL;
  2971. if (algo->chooseMin || algo->chooseMax || algo->chooseBest)
  2972. {
  2973. RegexMatchInfo * best = state.best;
  2974. if (best)
  2975. {
  2976. int comp = 0;
  2977. if (algo->chooseMin)
  2978. comp = (int)(length - best->length);
  2979. else if (algo->chooseMax)
  2980. comp = (int)(best->length - length);
  2981. if (algo->chooseBest && (comp == 0))
  2982. comp = (int)(best->score - state.score);
  2983. if (comp >= 0)
  2984. return false;
  2985. }
  2986. else
  2987. {
  2988. state.best = results.createMatch();
  2989. state.numMatched++;
  2990. }
  2991. result = state.best;
  2992. state.nextScanPosition = end;
  2993. }
  2994. else
  2995. {
  2996. if (!state.nextScanPosition || state.nextScanPosition > end)
  2997. state.nextScanPosition = end;
  2998. }
  2999. matched.extractResults(top, state.start, algo->addedSeparators);
  3000. NlpMatchWalker walker(&top);
  3001. const void * newRow = createMatchRow(state, walker);
  3002. if (!newRow)
  3003. return false;
  3004. if (!result)
  3005. {
  3006. result = results.appendOwnResult(newRow);
  3007. state.numMatched++;
  3008. }
  3009. else
  3010. {
  3011. result->setown(newRow);
  3012. }
  3013. result->score = state.score;
  3014. result->length = length;
  3015. if (algo->matchAction == INlpParseAlgorithm::NlpMatchAll)
  3016. {
  3017. if (state.numMatched >= algo->keepLimit)
  3018. return true;
  3019. if (state.numMatched > algo->atMostLimit)
  3020. return true;
  3021. return false;
  3022. }
  3023. return true;
  3024. }
  3025. /*
  3026. #if 0
  3027. RegexMatchAction RegexDFAPattern::match(MatchState & state)
  3028. {
  3029. const byte * savedCur = state.cur;
  3030. while (!fail)
  3031. {
  3032. if (consume & in a match state), add it to a list else fail
  3033. else
  3034. }
  3035. ForEachItemInrev(matches)
  3036. {
  3037. state.cur =matches.item(idx);
  3038. if (matchNext(state) == RegexMatchDone)
  3039. return RegexMatchDone;
  3040. }
  3041. state.cur = savedCur;
  3042. return RegexMatchBacktrack;
  3043. }
  3044. Converting a NFA:
  3045. States, transition(symbol)->*states
  3046. to a DFA
  3047. States, transition(symbol)->state
  3048. Create a set of set-of-nstates Dstates that can be reached from the initial.
  3049. ForEachUnmarkedDState(T)
  3050. ForEachSymbol(x),
  3051. U = possible states that could be reach after that
  3052. if U not in DStates, add it.
  3053. Save DTrans[T,a] = U
  3054. End
  3055. End
  3056. All states that can be reached at all onto a stack
  3057. regular expressions:
  3058. *+? have highest
  3059. then concat
  3060. then
  3061. r|(s|t) == (r|s)|t
  3062. (rs)t == r(st)
  3063. r(s|t) == rs|rt
  3064. er == r == re
  3065. r* = (r|e)*
  3066. r** = r*
  3067. Regular expression to NFA:
  3068. Fairly simple - gives a list of Node(All transitions and actions)
  3069. Running a NFA:
  3070. Simple algorithmn maintains a sets of possible states. So all calculated at the same time.
  3071. - Solves problem of repeatadly searching, but I don't think it could be used for matching text,
  3072. because no positioning information is maintained.
  3073. - Use an ordered list of next-states which
  3074. Lazy
  3075. }}}
  3076. #endif
  3077. */