hqltomita.cpp 55 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088
  1. /*##############################################################################
  2. Copyright (C) 2011 HPCC Systems.
  3. All rights reserved. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU Affero General Public License as
  5. published by the Free Software Foundation, either version 3 of the
  6. License, or (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU Affero General Public License for more details.
  11. You should have received a copy of the GNU Affero General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. ############################################################################## */
  14. #include <algorithm>
  15. #include "jliball.hpp"
  16. #include "eclrtl.hpp"
  17. #include "hqlexpr.hpp"
  18. #include "hqlthql.hpp"
  19. #include "hqlcerrors.hpp"
  20. #include "hqlcatom.hpp"
  21. #include "hqlutil.hpp"
  22. #include "hqltomita.ipp"
  23. #include "hqlhtcpp.ipp"
  24. #include "thortparse.ipp"
  25. #include "hqlregex.ipp"
  26. //#define DEFAULT_DFA_COMPLEXITY 0
  27. #define DEFAULT_UNICODE_DFA_COMPLEXITY 0
  28. #define DEFAULT_DFA_COMPLEXITY 2000
  29. //===========================================================================
  30. template <class T>
  31. void cloneLinkedArray(T & target, const T & source)
  32. {
  33. ForEachItemIn(i, source)
  34. target.append(OLINK(source.item(i)));
  35. }
  36. //---------------------------------------------------------------------------
  37. TomFeature::TomFeature(IHqlExpression * expr, TomFeatureKind _kind)
  38. {
  39. def.set(expr);
  40. kind = _kind;
  41. mask = 0;
  42. }
  43. StringBuffer & TomFeature::getDebugText(StringBuffer & out)
  44. {
  45. out.append(" FEATURE ");
  46. getName(out);
  47. switch (kind)
  48. {
  49. case TomMaskFeature:
  50. {
  51. if (values.ordinality())
  52. {
  53. out.append(" := ");
  54. getValue(out);
  55. }
  56. out.append(";");
  57. break;
  58. }
  59. case TomStrFeature:
  60. out.append(" := STRING;");
  61. break;
  62. case TomIntFeature:
  63. out.append(" := INTEGER;");
  64. break;
  65. }
  66. return out.appendf(" // mask(%x)", mask).newline();
  67. }
  68. StringBuffer & TomFeature::getName(StringBuffer & out)
  69. {
  70. if (def && def->queryName())
  71. return out.append(def->queryName());
  72. else
  73. return out.appendf("anon:%p", this);
  74. }
  75. StringBuffer & TomFeature::getValue(StringBuffer & out)
  76. {
  77. ForEachItemIn(idx, values)
  78. {
  79. if (idx) out.append("|");
  80. values.item(idx).getName(out);
  81. }
  82. return out;
  83. }
  84. StringBuffer & TomFeature::getGuardValue(StringBuffer & out)
  85. {
  86. if (def)
  87. return getName(out);
  88. return getValue(out);
  89. }
  90. //---------------------------------------------------------------------------
  91. StringBuffer & TomGuard::getDebugText(StringBuffer & out)
  92. {
  93. if (feature)
  94. feature->getName(out).append("=");
  95. return out;
  96. }
  97. StringBuffer & TomMaskGuard::getDebugText(StringBuffer & out)
  98. {
  99. TomGuard::getDebugText(out);
  100. value->getGuardValue(out);
  101. return out;
  102. }
  103. StringBuffer & TomStrGuard::getDebugText(StringBuffer & out)
  104. {
  105. return TomGuard::getDebugText(out).append('\'').append(value).append('\'');
  106. }
  107. StringBuffer & TomIntGuard::getDebugText(StringBuffer & out)
  108. {
  109. return TomGuard::getDebugText(out).append(value);
  110. }
  111. //---------------------------------------------------------------------------
  112. TomToken::TomToken(IHqlExpression * expr, const TomGuardArray & _guards)
  113. {
  114. pattern.set(expr);
  115. cloneLinkedArray(guards, _guards);
  116. id = NotFound;
  117. }
  118. StringBuffer & TomToken::getDebugText(StringBuffer & out)
  119. {
  120. out.append(" TOKEN<").append(id).append("> ");
  121. getName(out).append(" ");
  122. if (pattern)
  123. {
  124. IHqlExpression * expr = pattern;
  125. if (expr->getOperator() == no_pat_instance)
  126. expr = expr->queryChild(0);
  127. getExprECL(expr->queryBody(), out.append(" := "));
  128. }
  129. return out.append(";").newline();
  130. }
  131. StringBuffer & TomToken::getName(StringBuffer & out)
  132. {
  133. IHqlExpression * expr = pattern;
  134. if (!expr)
  135. return out.append("EOF");
  136. _ATOM name = expr->queryName();
  137. if (expr->getOperator() == no_pat_instance)
  138. {
  139. name = expr->queryChild(1)->queryName();
  140. expr = expr->queryChild(0);
  141. }
  142. if (name)
  143. out.append("tok").append(name);
  144. else if (id != NotFound)
  145. out.appendf("tok%d", id);
  146. else
  147. out.appendf("tok%p", this);
  148. return out;
  149. }
  150. bool TomTokenSet::add(TomToken & other)
  151. {
  152. if (tokens.find(other) == NotFound)
  153. {
  154. tokens.append(other);
  155. return true;
  156. }
  157. other.Release();
  158. return false;
  159. }
  160. bool TomTokenSet::add(TomTokenSet & other)
  161. {
  162. bool changed = false;
  163. ForEachItemIn(idx, other)
  164. {
  165. TomToken & cur = other.item(idx);
  166. if (tokens.find(cur) == NotFound)
  167. {
  168. tokens.append(OLINK(cur));
  169. changed = true;
  170. }
  171. }
  172. return changed;
  173. }
  174. //---------------------------------------------------------------------------
  175. StringBuffer & outputGuards(StringBuffer & out, const TomGuardArray & guards, const char * prefix, const char * suffix)
  176. {
  177. if (guards.ordinality())
  178. {
  179. out.append(prefix);
  180. ForEachItemIn(idx, guards)
  181. {
  182. if (idx) out.append(",");
  183. guards.item(idx).getDebugText(out);
  184. }
  185. out.append(suffix);
  186. }
  187. return out;
  188. }
  189. bool TomStep::calcCanBeNull(bool isRoot, bool & result)
  190. {
  191. return true;
  192. }
  193. bool TomStep::calcFirst(bool isRoot, TomTokenSet & tokens)
  194. {
  195. return true;
  196. }
  197. StringBuffer & TomStep::getDebugText(StringBuffer & out)
  198. {
  199. return out;
  200. }
  201. StringBuffer & TomConditionStep::getDebugText(StringBuffer & out)
  202. {
  203. out.append("(?");
  204. if (rule)
  205. rule->getName(out).append(":");
  206. getExprECL(condition, out);
  207. return out.append("?)");
  208. }
  209. TomGuardStep::TomGuardStep(const TomGuardArray & _guards)
  210. {
  211. cloneLinkedArray(guards, _guards);
  212. }
  213. StringBuffer & TomGuardStep::getDebugText(StringBuffer & out)
  214. {
  215. return outputGuards(out, guards, "[", "]");
  216. }
  217. TomNonTerminalStep::TomNonTerminalStep(TomRule * _rule, const TomGuardArray & _guards)
  218. {
  219. rule = _rule;
  220. cloneLinkedArray(guards, _guards);
  221. }
  222. bool TomNonTerminalStep::calcCanBeNull(bool isRoot, bool & result)
  223. {
  224. return rule->calcCanBeNull(isRoot, result);
  225. }
  226. bool TomNonTerminalStep::calcFirst(bool isRoot, TomTokenSet & tokens)
  227. {
  228. return rule->calcFirst(isRoot, tokens);
  229. }
  230. //-- Non Terminal
  231. bool TomNonTerminalStep::canBeNull()
  232. {
  233. return rule->canBeNull();
  234. }
  235. StringBuffer & TomNonTerminalStep::getDebugText(StringBuffer & out)
  236. {
  237. rule->getName(out);
  238. return outputGuards(out, guards, "{", "}");
  239. }
  240. unsigned TomNonTerminalStep::getId()
  241. {
  242. return rule->getId();
  243. }
  244. TomProduction * TomNonTerminalStep::queryExpandRules()
  245. {
  246. return rule->queryExpand();
  247. }
  248. //-- Penalty
  249. StringBuffer & TomPenaltyStep::getDebugText(StringBuffer & out)
  250. {
  251. return out.append("PENALTY(").append(penalty).append(")");
  252. }
  253. //-- Terminal
  254. bool TomTerminalStep::calcFirst(bool isRoot, TomTokenSet & tokens)
  255. {
  256. tokens.add(*LINK(token));
  257. return true;
  258. }
  259. StringBuffer & TomTerminalStep::getDebugText(StringBuffer & out)
  260. {
  261. return token->getName(out);
  262. }
  263. //-- Use --
  264. TomUseStep::TomUseStep(IHqlExpression * _expr, const TomGuardArray & _guards)
  265. {
  266. expr.set(_expr);
  267. cloneLinkedArray(guards, _guards);
  268. }
  269. TomStep * TomUseStep::expandRecursion(IResolveContext & ctx)
  270. {
  271. TomRule * replacement;
  272. assertex(expr->queryChild(0)->queryValue());
  273. replacement = ctx.queryDefine(expr->queryChild(0));
  274. replacement->addUse();
  275. return new TomNonTerminalStep(replacement, guards);
  276. }
  277. //---------------------------------------------------------------------------
  278. void TomProduction::buildProduction(LRTableBuilder & builder, IHqlExpression * test, GenerateTomitaCtx & ctx)
  279. {
  280. builder.addProduction(id, rule->getId(), rule->queryName(), steps.ordinality(), penalty, transformClonesFirstSymbol());
  281. if (test)
  282. {
  283. switch (test->getOperator())
  284. {
  285. case no_pat_first:
  286. builder.addValidator(id, LRVfirst, 0, 0, NULL);
  287. break;
  288. case no_pat_last:
  289. builder.addValidator(id, LRVlast, 0, 0, NULL);
  290. break;
  291. case no_pat_validate:
  292. {
  293. ValidateKind validatorKind = getValidateKind(ctx.regex.queryValidateExpr(test));
  294. byte op;
  295. switch (ctx.info.type)
  296. {
  297. case type_unicode:
  298. case type_utf8:
  299. op = (validatorKind != ValidateIsString) ? LRVvalidateuni : LRVvalidateasc;
  300. break;
  301. default:
  302. op = (validatorKind != ValidateIsUnicode) ? LRVvalidateasc : LRVvalidateuni;
  303. }
  304. builder.addValidator(id, op, ctx.regex.getValidatorIndex(test), 0, NULL);
  305. break;
  306. }
  307. case no_pat_checklength:
  308. {
  309. unsigned minLength = 0;
  310. unsigned maxLength = PATTERN_UNLIMITED_LENGTH;
  311. getCheckRange(test->queryChild(1), minLength, maxLength, ctx.info.charSize);
  312. builder.addValidator(id, LRVchecklength, minLength, maxLength, NULL);
  313. break;
  314. }
  315. case no_pat_checkin:
  316. {
  317. UNIMPLEMENTED;
  318. AsciiDfa * dfa = NULL;
  319. builder.addValidator(id, LRVcheckin, 0, 0, dfa);
  320. break;
  321. }
  322. case no_pat_x_before_y:
  323. case no_pat_before_y:
  324. {
  325. UNIMPLEMENTED;
  326. AsciiDfa * dfa = NULL;
  327. builder.addValidator(id, LRVbefore, 0, 0, dfa);
  328. break;
  329. }
  330. case no_pat_x_after_y:
  331. case no_pat_after_y:
  332. {
  333. UNIMPLEMENTED;
  334. unsigned minLength = 0;
  335. unsigned maxLength = 0;
  336. AsciiDfa * dfa = NULL;
  337. builder.addValidator(id, LRVafter, minLength, maxLength, dfa);
  338. break;
  339. }
  340. default:
  341. UNIMPLEMENTED;
  342. }
  343. }
  344. }
  345. void TomProduction::buildReduce(LRTableBuilder & builder)
  346. {
  347. rule->buildReduce(builder, this);
  348. }
  349. bool TomProduction::calcCanBeNull(bool isRoot, bool & result)
  350. {
  351. ForEachItemIn(idx, steps)
  352. {
  353. if (!steps.item(idx).calcCanBeNull(isRoot, result))
  354. return false;
  355. if (!result)
  356. return false;
  357. }
  358. result = true;
  359. return true;
  360. }
  361. bool TomProduction::canBeNull()
  362. {
  363. bool result = false;
  364. calcCanBeNull(true, result);
  365. return result;
  366. }
  367. bool TomProduction::calcFirst(bool isRoot, TomTokenSet & result)
  368. {
  369. bool known = true;
  370. ForEachItemIn(idx, steps)
  371. {
  372. TomStep & cur = steps.item(idx);
  373. if (!steps.item(idx).calcFirst(isRoot, result))
  374. known = false;
  375. if (!cur.canBeNull())
  376. break;
  377. }
  378. return known;
  379. }
  380. bool TomProduction::calcFollow()
  381. {
  382. bool changed = false;
  383. unsigned max = steps.ordinality();
  384. ForEachItemIn(idx, steps)
  385. {
  386. TomStep & cur = steps.item(idx);
  387. TomRule * curRule = cur.queryRule();
  388. if (curRule)
  389. {
  390. unsigned nextIdx = idx+1;
  391. for (;nextIdx < max; nextIdx++)
  392. {
  393. TomStep & next = steps.item(nextIdx);
  394. if (curRule->gatherFollow(next))
  395. changed = true;
  396. if (!next.canBeNull())
  397. break;
  398. }
  399. if (nextIdx == max)
  400. if (curRule->gatherFollow(*rule))
  401. changed = true;
  402. }
  403. }
  404. return changed;
  405. }
  406. bool TomProduction::transformClonesFirstSymbol()
  407. {
  408. if (transform)
  409. return false;
  410. if (!rule->queryRecord())
  411. return false;
  412. return queryStep(0) != NULL;
  413. }
  414. void TomProduction::expandRecursion(IResolveContext & ctx)
  415. {
  416. ForEachItemIn(idx, steps)
  417. {
  418. TomStep * replacement = steps.item(idx).expandRecursion(ctx);
  419. if (replacement)
  420. steps.replace(*replacement, idx);
  421. }
  422. }
  423. StringBuffer & TomProduction::getDebugText(StringBuffer & out)
  424. {
  425. out.appendf(" Production<%d>: [", id);
  426. ForEachItemIn(idx, localFeatures)
  427. {
  428. if (idx) out.append(",");
  429. localFeatures.item(idx).getName(out);
  430. }
  431. out.append("]");
  432. if (penalty)
  433. out.append("PENALTY(").append(penalty).append(")");
  434. if (transformClonesFirstSymbol())
  435. out.append(" CloningTransform");
  436. out.append(" := ");
  437. ForEachItemIn(i, steps)
  438. {
  439. if (i) out.append(" ");
  440. steps.item(i).getDebugText(out);
  441. }
  442. return out.newline();
  443. }
  444. bool TomProduction::isSimple(bool singleUse)
  445. {
  446. if (localFeatures.ordinality() || transform)
  447. return false;
  448. if (singleUse)
  449. return true;
  450. return steps.ordinality()<=1;
  451. }
  452. void TomProduction::optimizeRules()
  453. {
  454. ForEachItemInRev(idx, steps)
  455. {
  456. Linked<TomProduction> other = steps.item(idx).queryExpandRules();
  457. if (other)
  458. {
  459. steps.remove(idx);
  460. penalty += other->penalty;
  461. ForEachItemIn(i,other->steps)
  462. steps.add(OLINK(other->steps.item(i)), idx+i);
  463. //MORE: Should inc usage count on rules that are cloned when expanding simple.
  464. //MORE: Should probably do in two passes i) expand if used once. ii) expand simple
  465. }
  466. }
  467. }
  468. //---------------------------------------------------------------------------
  469. TomRule::TomRule(IHqlExpression * expr, _ATOM _name, const TomFeatureArray & _features, bool _implicit, bool _isMatched)
  470. {
  471. def.set(expr);
  472. cloneLinkedArray(features, _features);
  473. implicit = _implicit;
  474. id = 0;
  475. isNullState = ValueUnknown;
  476. firstState = ValueUnknown;
  477. curExpandState = (unsigned)-1;
  478. numUses = 0;
  479. IHqlExpression * body = expr;
  480. cachedName = _name;
  481. if (body)
  482. {
  483. //skip attribute used to mark an implicit rule
  484. if (body->isAttribute())
  485. body = body->queryChild(0);
  486. if (body->getOperator() == no_pat_featureparam)
  487. body = body->queryChild(0);
  488. if (body->getOperator() == no_define)
  489. defineName.set(body->queryChild(1));
  490. }
  491. isMatched = _isMatched;
  492. }
  493. void TomRule::buildProductions(LRTableBuilder & builder, GenerateTomitaCtx & ctx)
  494. {
  495. ForEachItemIn(idx, productions)
  496. productions.item(idx).buildProduction(builder, test, ctx);
  497. }
  498. void TomRule::buildReduce(LRTableBuilder & builder, TomProduction * production)
  499. {
  500. ForEachItemIn(idx, follow)
  501. builder.addReduce(follow.item(idx).getId(), production->getId());
  502. }
  503. bool TomRule::canBeNull()
  504. {
  505. bool ret = false;
  506. calcCanBeNull(true, ret);
  507. return ret;
  508. }
  509. bool TomRule::calcCanBeNull(bool isRoot, bool & result)
  510. {
  511. if (isNullState == ValueKnown)
  512. {
  513. result = isNull;
  514. return true;
  515. }
  516. if (isNullState == ValueRecursive)
  517. return false;
  518. isNullState = ValueRecursive;
  519. isNull = false;
  520. bool known = true;
  521. ForEachItemIn(idx, productions)
  522. {
  523. if (productions.item(idx).calcCanBeNull(false, isNull))
  524. {
  525. // Short circuit....
  526. if (isNull)
  527. {
  528. result = isNull;
  529. isNullState = ValueKnown;
  530. return true;
  531. }
  532. }
  533. else
  534. known = false;
  535. }
  536. if (isNull)
  537. result = true;
  538. if (!known && !isRoot)
  539. {
  540. isNullState = ValueUnknown;
  541. return false;
  542. }
  543. isNullState = ValueKnown;
  544. return true;
  545. }
  546. bool TomRule::calcFirst(bool isRoot, TomTokenSet & result)
  547. {
  548. if (firstState == ValueKnown)
  549. {
  550. result.add(first);
  551. return true;
  552. }
  553. if (firstState == ValueRecursive)
  554. return false;
  555. firstState = ValueRecursive;
  556. first.kill();
  557. bool known = true;
  558. ForEachItemIn(idx, productions)
  559. {
  560. if (!productions.item(idx).calcFirst(false, first))
  561. known = false;
  562. }
  563. result.add(first);
  564. if (!known && !isRoot)
  565. {
  566. firstState = ValueUnknown;
  567. return false;
  568. }
  569. firstState = ValueKnown;
  570. return true;
  571. }
  572. bool TomRule::calcFollow()
  573. {
  574. bool changed = false;
  575. ForEachItemIn(idx, productions)
  576. if (productions.item(idx).calcFollow())
  577. changed = true;
  578. return changed;
  579. }
  580. void TomRule::expandRecursion(IResolveContext & ctx)
  581. {
  582. ForEachItemIn(idx, productions)
  583. productions.item(idx).expandRecursion(ctx);
  584. }
  585. bool TomRule::gatherFollow(TomRule & next)
  586. {
  587. unsigned num = follow.ordinality();
  588. follow.add(next.follow);
  589. return (num != follow.ordinality());
  590. }
  591. bool TomRule::gatherFollow(TomStep & next)
  592. {
  593. unsigned num = follow.ordinality();
  594. next.calcFirst(true, follow);
  595. return (num != follow.ordinality());
  596. }
  597. StringBuffer & TomRule::getDebugText(StringBuffer & out)
  598. {
  599. out.append(" Rule<").append(id).append("> ");
  600. getName(out);
  601. if (features.ordinality())
  602. {
  603. out.append("{");
  604. ForEachItemIn(idx, features)
  605. {
  606. if (idx) out.append(",");
  607. features.item(idx).getName(out);
  608. }
  609. out.append("}");
  610. }
  611. if (test)
  612. {
  613. out.append(" Test: (");
  614. getExprECL(test, out);
  615. out.append(")");
  616. }
  617. out.newline();
  618. out.appendf(" CanBeNull(%d) First[", canBeNull());
  619. if (firstState == ValueUnknown)
  620. out.append("?");
  621. ForEachItemIn(i1, first)
  622. {
  623. if (i1) out.append(" ");
  624. out.append(first.item(i1).getId());
  625. }
  626. out.append("] Follow[");
  627. ForEachItemIn(i2, follow)
  628. {
  629. if (i2) out.append(" ");
  630. out.append(follow.item(i2).getId());
  631. }
  632. out.append("]").newline();
  633. ForEachItemIn(idx, productions)
  634. productions.item(idx).getDebugText(out);
  635. return out;
  636. }
  637. void TomRule::getFeatures(TomFeatureArray & target)
  638. {
  639. cloneLinkedArray(target, features);
  640. }
  641. StringBuffer & TomRule::getName(StringBuffer & out)
  642. {
  643. IHqlExpression * expr = def;
  644. if (expr && expr->isAttribute())
  645. expr = expr->queryChild(0);
  646. if (cachedName)
  647. out.append(cachedName);
  648. else if (id)
  649. out.appendf("rule%d", id);
  650. else
  651. out.appendf("rule%p", this);
  652. return out;
  653. }
  654. bool TomRule::matchesDefine(IHqlExpression * name)
  655. {
  656. return (name == defineName);
  657. }
  658. void TomRule::optimizeRules()
  659. {
  660. ForEachItemIn(idx, productions)
  661. productions.item(idx).optimizeRules();
  662. }
  663. TomProduction * TomRule::queryExpand()
  664. {
  665. if (productions.ordinality() != 1)
  666. return NULL;
  667. if (cachedName && isMatched)
  668. return NULL;
  669. if (test)
  670. return NULL;
  671. TomProduction * first = &productions.item(0);
  672. if (first->isSimple(numUses == 1))
  673. return first;
  674. return NULL;
  675. }
  676. IHqlExpression * TomRule::queryRecord()
  677. {
  678. if (def)
  679. {
  680. if (def->isAttribute() && def->queryName() == tomitaAtom)
  681. return def->queryChild(0)->queryRecord();
  682. return def->queryRecord();
  683. }
  684. return NULL;
  685. }
  686. void TomRule::setProductionIds(HqlExprArray & productionMappings, unsigned & id)
  687. {
  688. ForEachItemIn(idx, productions)
  689. {
  690. TomProduction & cur = productions.item(idx);
  691. cur.setId(id++);
  692. LinkedHqlExpr mapTo = cur.queryTransform();
  693. if (!mapTo && def)
  694. {
  695. IHqlExpression * record = queryRecord();
  696. if (record)
  697. {
  698. if (!cur.queryStep(0))
  699. {
  700. //create a default transform to clear the record.
  701. mapTo.setown(createNullTransform(record));
  702. }
  703. else
  704. {
  705. //Check there is only a single input symbol which has an associated record
  706. TomRule * defaultRule = NULL;
  707. for (unsigned i=0;;i++)
  708. {
  709. TomStep * curStep = cur.queryStep(i);
  710. if (!curStep)
  711. break;
  712. TomRule * curRule = curStep->queryRule();
  713. if (curRule && curRule->queryRecord())
  714. {
  715. if (defaultRule)
  716. throwError(HQLERR_ExpectedTransformManyInputs);
  717. defaultRule = curRule;
  718. }
  719. }
  720. if (!defaultRule || !recordTypesMatch(defaultRule->queryRecord(), record))
  721. throwError(HQLERR_ExpectedTransformManyInputs);
  722. //If so, then don't generate a transform since the input record can be cloned.
  723. mapTo.set(record);
  724. }
  725. }
  726. }
  727. if (mapTo)
  728. productionMappings.append(*createValue(no_mapto, makeVoidType(), createConstant((__int64)cur.getId()), LINK(mapTo)));
  729. }
  730. }
  731. //---------------------------------------------------------------------------
  732. TomitaContext::TomitaContext(IHqlExpression * _expr, IWorkUnit * _wu, const HqlCppOptions & _options, ITimeReporter * _timeReporter)
  733. : NlpParseContext(_expr, _wu, _options, _timeReporter), parser(NULL), translatorOptions(_options)
  734. {
  735. numTerminals = 0;
  736. numSymbols = 0;
  737. numProductions = 0;
  738. done = NULL;
  739. gotos = NULL;
  740. maximizeLexer = _options.maximizeLexer;
  741. grammarAmbiguous = false;
  742. }
  743. TomitaContext::~TomitaContext()
  744. {
  745. }
  746. void TomitaContext::addLexerTerminator(IHqlExpression * expr)
  747. {
  748. //NB: expr is know to be valid...
  749. switch (expr->getOperator())
  750. {
  751. case no_list:
  752. {
  753. ForEachChild(idx, expr)
  754. addLexerTerminator(expr->queryChild(idx));
  755. break;
  756. }
  757. case no_constant:
  758. {
  759. ITypeInfo * type = expr->queryType();
  760. const void * value = expr->queryValue()->queryValue();
  761. switch (type->getTypeCode())
  762. {
  763. case type_data:
  764. case type_string:
  765. parser.endTokenChars.append(*(const byte *)value);
  766. break;
  767. case type_utf8:
  768. assertex(rtlUtf8Size(1, value) == 1);
  769. parser.endTokenChars.append(*(const byte *)value);
  770. break;
  771. case type_unicode:
  772. UNIMPLEMENTED;
  773. }
  774. break;
  775. }
  776. }
  777. }
  778. unsigned TomitaContext::addMatchReference(IHqlExpression * expr)
  779. {
  780. if (expr && expr->queryType()->getTypeCode() == type_pattern)
  781. {
  782. StringBuffer s;
  783. s.append(expr->queryName());
  784. if (!s.length())
  785. getExprECL(expr, s);
  786. throwError1(HQLWRN_TomitaMatchPattern, s.str());
  787. }
  788. return NlpParseContext::addMatchReference(expr);
  789. }
  790. void TomitaContext::associateIds()
  791. {
  792. unsigned id=0;
  793. ForEachItemIn(idx1, tokens)
  794. tokens.item(idx1).setId(id++);
  795. numTerminals = id;
  796. ForEachItemIn(idx2, rules)
  797. rules.item(idx2).setId(id++);
  798. numSymbols = id;
  799. }
  800. void TomitaContext::calculateFeatureMasks()
  801. {
  802. }
  803. TomFeature * TomitaContext::createFeature(IHqlExpression * expr)
  804. {
  805. TomFeature * feature = queryFeature(expr);
  806. if (feature)
  807. return feature;
  808. assertex(expr->getOperator() == no_pat_featuredef);
  809. IHqlExpression * def = expr->queryChild(0);
  810. TomFeatureKind kind = TomMaskFeature;
  811. if (def->getOperator() == no_null)
  812. {
  813. switch (def->queryType()->getTypeCode())
  814. {
  815. case type_string:
  816. kind = TomStrFeature;
  817. break;
  818. case type_int:
  819. kind = TomIntFeature;
  820. break;
  821. }
  822. }
  823. feature = new TomFeature(expr, kind);
  824. if (def->getOperator() != no_null)
  825. {
  826. HqlExprArray args;
  827. def->unwindList(args, no_pat_or);
  828. ForEachItemIn(idx, args)
  829. feature->addValue(createFeature(&args.item(idx)));
  830. }
  831. features.append(*feature);
  832. return feature;
  833. }
  834. TomFeature * TomitaContext::createFeatureValue(IHqlExpression * expr)
  835. {
  836. TomFeature * feature = queryFeature(expr);
  837. if (feature)
  838. return feature;
  839. feature = new TomFeature(expr, TomMaskFeature);
  840. HqlExprArray args;
  841. expr->unwindList(args, no_pat_or);
  842. ForEachItemIn(idx, args)
  843. feature->addValue(createFeature(&args.item(idx)));
  844. features.append(*feature);
  845. return feature;
  846. }
  847. void TomitaContext::createFeatures(TomFeatureArray & target, IHqlExpression * expr)
  848. {
  849. if (!expr)
  850. return;
  851. HqlExprArray args;
  852. expr->unwindList(args, no_comma);
  853. ForEachItemIn(idx, args)
  854. target.append(OLINK(*createFeature(&args.item(idx))));
  855. }
  856. TomGuard * TomitaContext::createGuard(IHqlExpression * featureExpr, IHqlExpression * valueExpr)
  857. {
  858. TomFeature * feature = NULL;
  859. if (featureExpr)
  860. feature = createFeature(featureExpr);
  861. switch (valueExpr->queryType()->getTypeCode())
  862. {
  863. case type_feature:
  864. {
  865. TomFeature * value = createFeatureValue(valueExpr);
  866. return new TomMaskGuard(feature, value);
  867. }
  868. case type_string:
  869. {
  870. StringBuffer text;
  871. valueExpr->queryValue()->getStringValue(text);
  872. //MORE: case sensitivity/unicode
  873. _ATOM value = createAtom(text);
  874. return new TomStrGuard(feature, value);
  875. }
  876. case type_int:
  877. {
  878. unsigned value = (unsigned)getIntValue(valueExpr);
  879. return new TomIntGuard(feature, value);
  880. }
  881. default:
  882. UNIMPLEMENTED;
  883. }
  884. }
  885. void TomitaContext::createGuards(TomGuardArray & guards, IHqlExpression * expr)
  886. {
  887. if (!expr)
  888. return;
  889. HqlExprArray args;
  890. expr->unwindList(args, no_comma);
  891. ForEachItemIn(idx, args)
  892. {
  893. IHqlExpression & cur = args.item(idx);
  894. assertex(cur.getOperator() == no_eq);
  895. TomGuard * guard = createGuard(cur.queryChild(0), cur.queryChild(1));
  896. guards.append(*guard);
  897. }
  898. }
  899. void TomitaContext::createImplicitProduction(TomRule * self, TomRule * rule, IHqlExpression * expr, bool expandTokens)
  900. {
  901. switch (expr->getOperator())
  902. {
  903. case no_pat_repeat:
  904. if (!expandTokens)
  905. {
  906. IHqlExpression * action = expr->queryChild(0);
  907. unsigned low = getRepeatMin(expr);
  908. unsigned high = getRepeatMax(expr);
  909. TomProduction * first = new TomProduction(rule);
  910. TomProduction * last = NULL;
  911. if ((low == 0) && (high == 1))
  912. {
  913. last = new TomProduction(rule);
  914. createSteps(self, last, action, expandTokens);
  915. }
  916. else
  917. {
  918. TomRule * actionRule = createImplicitRule(self, action, expandTokens);
  919. TomGuardArray guards;
  920. for (unsigned i = 0; i < low; i++)
  921. first->addStepOwn(new TomNonTerminalStep(actionRule, guards));
  922. if (high != low)
  923. {
  924. if (high == (unsigned)-1)
  925. {
  926. last = new TomProduction(rule);
  927. last->addStepOwn(new TomNonTerminalStep(rule, guards));
  928. last->addStepOwn(new TomNonTerminalStep(actionRule, guards));
  929. }
  930. else
  931. {
  932. if (high - low > 5)
  933. throwError(HQLERR_ArbitaryRepeatUnimplemented);
  934. for (unsigned steps = low+1; steps <= high; steps++)
  935. {
  936. TomProduction * next = new TomProduction(rule);
  937. for (unsigned i = 0; i < steps; i++)
  938. next->addStepOwn(new TomNonTerminalStep(actionRule, guards));
  939. rule->addProductionOwn(next);
  940. }
  941. }
  942. }
  943. }
  944. rule->addProductionOwn(first);
  945. if (last) rule->addProductionOwn(last);
  946. }
  947. else
  948. createProduction(self, rule, expr, expandTokens);
  949. break;
  950. default:
  951. createProduction(self, rule, expr, expandTokens);
  952. break;
  953. }
  954. }
  955. TomRule * TomitaContext::createImplicitRule(TomRule * self, IHqlExpression * expr, bool expandTokens)
  956. {
  957. OwnedHqlExpr wrapped = createAttribute(tomitaAtom, LINK(expr));
  958. TomRule * rule = queryRule(wrapped, NULL);
  959. if (!rule)
  960. {
  961. TomFeatureArray features;
  962. activeRules.tos().getFeatures(features);
  963. rule = new TomRule(wrapped, NULL, features, true, false);
  964. rules.append(*rule);
  965. activeRules.append(*rule);
  966. LinkedHqlExpr body = expr;
  967. switch (expr->getOperator())
  968. {
  969. case no_pat_first:
  970. case no_pat_last:
  971. case no_pat_validate:
  972. case no_pat_checklength:
  973. case no_pat_checkin:
  974. case no_pat_x_before_y:
  975. case no_pat_before_y:
  976. case no_pat_x_after_y:
  977. case no_pat_after_y:
  978. {
  979. rule->setTest(expr);
  980. body.set(body->queryChild(0));
  981. if (!body)
  982. body.setown(createValue(no_null, makePatternType()));
  983. break;
  984. }
  985. }
  986. HqlExprArray args;
  987. body->unwindList(args, no_pat_or);
  988. ForEachItemIn(idx, args)
  989. createImplicitProduction(self, rule, &args.item(idx), expandTokens);
  990. activeRules.pop();
  991. }
  992. return rule;
  993. }
  994. void TomitaContext::createProduction(TomRule * self, TomRule * rule, IHqlExpression * expr, bool expandTokens)
  995. {
  996. TomProduction * production = new TomProduction(rule);
  997. createSteps(self, production, expr, expandTokens);
  998. rule->addProductionOwn(production);
  999. }
  1000. void TomitaContext::createProductions(TomRule * rule, IHqlExpression * expr, bool expandTokens)
  1001. {
  1002. HqlExprArray args;
  1003. expr->unwindList(args, no_pat_or);
  1004. ForEachItemIn(idx, args)
  1005. createProduction(rule, rule, &args.item(idx), expandTokens);
  1006. }
  1007. TomRule * TomitaContext::createRule(IHqlExpression * expr, _ATOM name, bool expandTokens)
  1008. {
  1009. TomRule * rule = queryRule(expr, name);
  1010. if (!rule)
  1011. {
  1012. TomFeatureArray features;
  1013. IHqlExpression * def = expr;
  1014. if (def->getOperator() == no_pat_featureparam)
  1015. {
  1016. createFeatures(features, def->queryChild(1));
  1017. def = def->queryChild(0);
  1018. }
  1019. if (def->getOperator() == no_define)
  1020. def = def->queryChild(0);
  1021. rule = new TomRule(expr, name, features, false, isMatched(expr, name));
  1022. rules.append(*rule);
  1023. activeRules.append(*rule);
  1024. createProductions(rule, def, expandTokens);
  1025. activeRules.pop();
  1026. }
  1027. rule->addUse();
  1028. return rule;
  1029. }
  1030. void TomitaContext::createStepAsImplicitRule(TomRule * self, TomProduction * production, IHqlExpression * expr, bool expandTokens)
  1031. {
  1032. TomRule * rule = createImplicitRule(self, expr, expandTokens);
  1033. TomGuardArray guards;
  1034. production->addStepOwn(new TomNonTerminalStep(rule, guards));
  1035. }
  1036. void TomitaContext::createSteps(TomRule * self, TomProduction * production, IHqlExpression * expr, bool expandTokens)
  1037. {
  1038. switch (expr->getOperator())
  1039. {
  1040. case no_pat_instance:
  1041. {
  1042. IHqlExpression * def = expr->queryChild(0);
  1043. bool canExpand;
  1044. if (maximizeLexer)
  1045. canExpand = canCreateLexerToken(def, false);
  1046. else
  1047. canExpand = isToken(def, expandTokens);
  1048. if (!canExpand)
  1049. {
  1050. if (expr->hasProperty(_function_Atom))
  1051. createSteps(self, production, def, expandTokens);
  1052. else
  1053. {
  1054. TomGuardArray guards;
  1055. if (def->getOperator() == no_pat_featureactual)
  1056. {
  1057. HqlExprArray actuals;
  1058. def->queryChild(1)->unwindList(actuals, no_comma);
  1059. ForEachItemIn(idx, actuals)
  1060. guards.append(*createGuard(NULL, &actuals.item(idx)));
  1061. def = def->queryChild(0);
  1062. }
  1063. TomRule * rule = createRule(def, expr->queryChild(1)->queryName(), expandTokens);
  1064. production->addStepOwn(new TomNonTerminalStep(rule, guards));
  1065. //MORE: Need to add guards to localFeatures
  1066. }
  1067. return;
  1068. }
  1069. break;
  1070. }
  1071. case no_pat_or:
  1072. //MORE: This changes when we want to extract guard information...
  1073. if (!expandTokens)
  1074. {
  1075. if (expr->numChildren() == 1)
  1076. createSteps(self, production, expr->queryChild(0), expandTokens);
  1077. else
  1078. createStepAsImplicitRule(self, production, expr, expandTokens);
  1079. return;
  1080. }
  1081. break;
  1082. case no_pat_guard:
  1083. {
  1084. TomGuardArray guards;
  1085. createGuards(guards, expr->queryChild(0));
  1086. production->addStepOwn(new TomGuardStep(guards));
  1087. return;
  1088. }
  1089. case no_penalty:
  1090. production->addPenalty((int)getIntValue(expr->queryChild(0)));
  1091. return;
  1092. case no_null:
  1093. //production->addStepOwn(new TomNullStep);
  1094. return;
  1095. case no_pat_case:
  1096. case no_pat_nocase:
  1097. expandTokens = true;
  1098. break;
  1099. case no_pat_first:
  1100. case no_pat_last:
  1101. createStepAsImplicitRule(self, production, expr, expandTokens);
  1102. return;
  1103. }
  1104. if (expandTokens)
  1105. {
  1106. TomToken * token = createToken(expr);
  1107. production->addStepOwn(new TomTerminalStep(token));
  1108. return;
  1109. }
  1110. switch (expr->getOperator())
  1111. {
  1112. case no_pat_imptoken:
  1113. case no_pat_token:
  1114. {
  1115. createSteps(self, production, expr->queryChild(0), true);
  1116. break;
  1117. }
  1118. case no_pat_follow:
  1119. {
  1120. ForEachChild(i, expr)
  1121. createSteps(self, production, expr->queryChild(i), expandTokens);
  1122. break;
  1123. }
  1124. case no_pat_repeat:
  1125. {
  1126. createStepAsImplicitRule(self, production, expr, expandTokens);
  1127. break;
  1128. }
  1129. case no_pat_use:
  1130. {
  1131. //MORE: Should guards be allowed?
  1132. TomGuardArray guards;
  1133. production->addStepOwn(new TomUseStep(expr, guards));
  1134. }
  1135. break;
  1136. case no_self:
  1137. {
  1138. //MORE: Should guards be allowed?
  1139. TomGuardArray guards;
  1140. production->addStepOwn(new TomNonTerminalStep(self, guards));
  1141. break;
  1142. }
  1143. case no_pat_instance:
  1144. createSteps(self, production, expr->queryChild(0), expandTokens);
  1145. break;
  1146. case no_pat_validate:
  1147. case no_pat_checklength:
  1148. case no_pat_checkin:
  1149. case no_pat_x_before_y:
  1150. case no_pat_before_y:
  1151. case no_pat_x_after_y:
  1152. case no_pat_after_y:
  1153. createStepAsImplicitRule(self, production, expr, expandTokens);
  1154. break;
  1155. case no_implicitcast:
  1156. //can sometimes occur when parameters are bound
  1157. createSteps(self, production, expr->queryChild(0), expandTokens);
  1158. break;
  1159. case no_pat_production:
  1160. createSteps(self, production, expr->queryChild(0), expandTokens);
  1161. assertex(!production->transform);
  1162. production->transform.set(expr->queryChild(1));
  1163. break;
  1164. case no_penalty:
  1165. //already handled
  1166. break;
  1167. default:
  1168. UNIMPLEMENTED;
  1169. }
  1170. }
  1171. TomToken * TomitaContext::createToken(IHqlExpression * expr)
  1172. {
  1173. ForEachItemIn(idx, tokens)
  1174. {
  1175. TomToken & cur = tokens.item(idx);
  1176. if (cur.matches(expr))
  1177. return &cur;
  1178. }
  1179. TomGuardArray guards;
  1180. if (expr->getOperator() == no_pat_follow)
  1181. {
  1182. ForEachChild(idx, expr)
  1183. {
  1184. IHqlExpression * cur = expr->queryChild(idx);
  1185. if (cur->getOperator() == no_pat_guard)
  1186. createGuards(guards, cur->queryChild(0));
  1187. }
  1188. }
  1189. TomToken * token = new TomToken(expr, guards);
  1190. tokens.append(*token);
  1191. return token;
  1192. }
  1193. void TomitaContext::compileSearchPattern()
  1194. {
  1195. if (searchType() == type_unicode)
  1196. throwError(HQLERR_TomitaNoUnicode);
  1197. checkValidMatches();
  1198. IHqlExpression * grammar = expr->queryChild(2);
  1199. IHqlExpression * rootSymbol = grammar->queryChild(0);
  1200. assertex(grammar->queryType()->getTypeCode() != type_pattern);
  1201. grammarRule.set(createRule(rootSymbol, grammar->queryChild(1)->queryName(), false));
  1202. //Create an augmented grammar G' with rule S' := S;
  1203. TomFeatureArray noFeatures;
  1204. TomGuardArray noGuards;
  1205. rootRule.setown(new TomRule(NULL, NULL, noFeatures, true, false));
  1206. rootProduction.setown(new TomProduction(rootRule));
  1207. rootProduction->addStepOwn(new TomNonTerminalStep(grammarRule, noGuards));
  1208. rootProduction->setRoot();
  1209. rootRule->addProductionOwn(LINK(rootProduction));
  1210. rules.append(*LINK(rootRule));
  1211. eofToken.setown(new TomToken(NULL, noGuards));
  1212. tokens.append(*LINK(eofToken));
  1213. OwnedHqlExpr nullExpr = createValue(no_null);
  1214. //MORE, create rules for used list...
  1215. expandRecursion();
  1216. optimizeRules();
  1217. associateIds();
  1218. calculateFeatureMasks();
  1219. generateLexer();
  1220. generateParser();
  1221. setParserOptions(parser);
  1222. compileMatched(parser);
  1223. }
  1224. void TomitaContext::generateLexer()
  1225. {
  1226. //nested scope
  1227. try
  1228. {
  1229. RegexContext regex(expr, wu(), translatorOptions, timeReporter, NLPAregexStack);
  1230. regex.beginLexer();
  1231. ForEachItemIn(idx, tokens)
  1232. {
  1233. TomToken & cur = tokens.item(idx);
  1234. if (&cur != eofToken)
  1235. regex.addLexerToken(cur.id, cur.pattern);
  1236. }
  1237. AsciiDfaBuilder builder(parser.tokenDfa);
  1238. regex.generateLexer(&builder);
  1239. }
  1240. catch (IException * e)
  1241. {
  1242. StringBuffer s;
  1243. e->errorMessage(s);
  1244. e->Release();
  1245. throwError1(HQLERR_TomitaPatternTooComplex, s.str());
  1246. }
  1247. IHqlExpression * separator = expr->queryProperty(separatorAtom);
  1248. if (separator)
  1249. {
  1250. RegexContext regex2(expr, wu(), translatorOptions, timeReporter, NLPAregexStack);
  1251. regex2.beginLexer();
  1252. regex2.addLexerToken(1, separator->queryChild(0));
  1253. AsciiDfaBuilder builder(parser.skipDfa);
  1254. regex2.generateLexer(&builder);
  1255. }
  1256. else
  1257. parser.skipDfa.setEmpty();
  1258. IHqlExpression * terminator = expr->queryProperty(terminatorAtom);
  1259. if (terminator)
  1260. addLexerTerminator(terminator->queryChild(0));
  1261. parser.eofId = eofToken->getId();
  1262. }
  1263. //-- Parser generation -------------------------------------------------
  1264. //---------------------------------------------------------------------------
  1265. inline int compareLRItem(HqlLRItem * left, HqlLRItem * right)
  1266. {
  1267. if (left->production->seq < right->production->seq)
  1268. return -1;
  1269. if (left->production->seq > right->production->seq)
  1270. return +1;
  1271. if (left->index < right->index)
  1272. return -1;
  1273. if (left->index > right->index)
  1274. return +1;
  1275. return 0;
  1276. }
  1277. int compareLRItem(CInterface * * left, CInterface * * right)
  1278. {
  1279. return compareLRItem((HqlLRItem *)*left, (HqlLRItem *)*right);
  1280. }
  1281. void HqlLRItemSet::add(HqlLRItem * value)
  1282. {
  1283. bool isNew;
  1284. CInterface * castValue = value;
  1285. values.bAdd(castValue, compareLRItem, isNew);
  1286. if (!isNew)
  1287. value->Release();
  1288. }
  1289. int HqlLRItemSet::compare(const HqlLRItemSet & other) const
  1290. {
  1291. unsigned numThis = values.ordinality();
  1292. unsigned numOther = other.values.ordinality();
  1293. unsigned numCommon = std::min(numThis, numOther);
  1294. for (unsigned i = 0; i < numCommon; i++)
  1295. {
  1296. HqlLRItem & left = values.item(i);
  1297. HqlLRItem & right = other.values.item(i);
  1298. int c = compareLRItem(&left, &right);
  1299. if (c)
  1300. return c;
  1301. }
  1302. if (numThis > numOther)
  1303. return +1;
  1304. else if (numThis < numOther)
  1305. return -1;
  1306. else
  1307. return 0;
  1308. }
  1309. bool HqlLRItemSet::equals(const HqlLRItemSet & other) const
  1310. {
  1311. unsigned numThis = values.ordinality();
  1312. unsigned numOther = other.values.ordinality();
  1313. if (numThis != numOther)
  1314. return false;
  1315. for (unsigned i = 0; i < numThis; i++)
  1316. {
  1317. if (compareLRItem(&values.item(i), &other.values.item(i)) == 0)
  1318. return false;
  1319. }
  1320. return true;
  1321. }
  1322. StringBuffer & HqlLRGoto::getDebugText(StringBuffer & out)
  1323. {
  1324. return out.append(symbol).append("->").append(state->id);
  1325. }
  1326. void HqlLRState::addItem(TomProduction * production, unsigned idx)
  1327. {
  1328. HqlLRItem * next = new HqlLRItem(production, idx);
  1329. items.add(next);
  1330. }
  1331. void HqlLRState::addGoto(unsigned id, HqlLRState * state)
  1332. {
  1333. gotos.append(*new HqlLRGoto(id, state));
  1334. }
  1335. void HqlLRState::buildNullReductions(LRTableBuilder & builder, TomRule * rule)
  1336. {
  1337. ForEachItemIn(i2, rule->productions)
  1338. {
  1339. TomProduction & curProd = rule->productions.item(i2);
  1340. if (curProd.canBeNull())
  1341. curProd.buildReduce(builder);
  1342. TomStep * first = curProd.queryStep(0);
  1343. if (first)
  1344. {
  1345. TomRule * stepRule = first->queryRule();
  1346. if (stepRule)
  1347. buildNullReductions(builder, stepRule);
  1348. }
  1349. }
  1350. }
  1351. void HqlLRState::buildStateItem(LRTableBuilder & builder, TomProduction * production, unsigned index)
  1352. {
  1353. TomStep * step = production->queryStep(index);
  1354. if (step)
  1355. {
  1356. if (step->isTerminal())
  1357. {
  1358. TomTokenSet tokens;
  1359. step->calcFirst(true, tokens);
  1360. ForEachItemIn(idx, tokens)
  1361. {
  1362. TomToken & cur = tokens.item(idx);
  1363. HqlLRState * gotoState = queryGoto(cur.getId());
  1364. assertex(gotoState);
  1365. builder.addShift(cur.getId(), gotoState->id);
  1366. }
  1367. }
  1368. else
  1369. {
  1370. //Need to be careful. If the a.Xb is in kernal, and X-> is a production then need to include reduce X
  1371. TomRule * rule = step->queryRule();
  1372. if (rule)
  1373. {
  1374. if (!rule->alreadyExpanded(id))
  1375. {
  1376. ForEachItemIn(i2, rule->productions)
  1377. buildStateItem(builder, &rule->productions.item(i2), 0);
  1378. }
  1379. }
  1380. }
  1381. }
  1382. else
  1383. production->buildReduce(builder);
  1384. }
  1385. void HqlLRState::buildState(LRTableBuilder & builder, TomToken * eofToken, unsigned numTerminals)
  1386. {
  1387. builder.beginState(id);
  1388. ForEachItemIn(idx, items)
  1389. {
  1390. HqlLRItem & cur = items.item(idx);
  1391. TomProduction * production = cur.production;
  1392. buildStateItem(builder, production, cur.index);
  1393. if (production->isRootReduction() && (cur.index == 1))
  1394. builder.addAccept(eofToken->getId());
  1395. }
  1396. ForEachItemIn(idx2, gotos)
  1397. {
  1398. HqlLRGoto & cur = gotos.item(idx2);
  1399. if (cur.symbol >= numTerminals)
  1400. builder.addGoto(cur.symbol, cur.state->id);
  1401. }
  1402. builder.endState();
  1403. }
  1404. StringBuffer & HqlLRState::getDebugText(StringBuffer & out)
  1405. {
  1406. out.appendf("\t[%d] I={", id);
  1407. ForEachItemIn(idx, items)
  1408. {
  1409. if (idx) out.append(",");
  1410. items.item(idx).getDebugText(out);
  1411. }
  1412. out.append("} [");
  1413. ForEachItemIn(idx2, gotos)
  1414. {
  1415. if (idx2) out.append(",");
  1416. gotos.item(idx2).getDebugText(out);
  1417. }
  1418. return out.append("]").newline();
  1419. }
  1420. HqlLRState * HqlLRState::queryGoto(token_id id)
  1421. {
  1422. ForEachItemIn(idx, gotos)
  1423. {
  1424. HqlLRGoto & cur = gotos.item(idx);
  1425. if (cur.symbol == id)
  1426. return cur.state;
  1427. }
  1428. return NULL;
  1429. }
  1430. static inline int compareState(HqlLRState * left, HqlLRState * right)
  1431. {
  1432. return left->items.compare(right->items);
  1433. }
  1434. static int compareState(CInterface * * left, CInterface * * right)
  1435. {
  1436. return compareState((HqlLRState *)*left, (HqlLRState *)*right);
  1437. }
  1438. void TomitaContext::addGotos(TomProduction * production, unsigned idx)
  1439. {
  1440. //This could be done via virtuals if the context was passed around.
  1441. //but the code is a bit messy.
  1442. unsigned numSteps = production->steps.ordinality();
  1443. if (numSteps == idx)
  1444. return;
  1445. TomStep & step = production->steps.item(idx);
  1446. unsigned id = step.getId();
  1447. if (!gotos[id]) gotos[id] = new HqlLRState;
  1448. gotos[id]->addItem(production, idx+1);
  1449. TomRule * rule = step.queryRule();
  1450. if (rule && !done[id])
  1451. {
  1452. done[id] = true;
  1453. ForEachItemIn(idx, rule->productions)
  1454. {
  1455. TomProduction & cur = rule->productions.item(idx);
  1456. addGotos(&cur, 0);
  1457. }
  1458. }
  1459. }
  1460. HqlLRState * TomitaContext::addState(HqlLRState * state)
  1461. {
  1462. bool isNew = false;
  1463. CInterface * castState = state;
  1464. unsigned matchIndex = orderedStates.bAdd(castState, compareState, isNew);
  1465. if (isNew)
  1466. states.append(*LINK(state));
  1467. else
  1468. state->Release();
  1469. return &orderedStates.item(matchIndex);
  1470. }
  1471. void TomitaContext::calculateStates()
  1472. {
  1473. gotos = new HqlLRState * [numSymbols];
  1474. memset(gotos, 0, sizeof(HqlLRState *) * numSymbols);
  1475. done = new bool[numSymbols];
  1476. try
  1477. {
  1478. HqlLRState * root = new HqlLRState;
  1479. root->addItem(rootProduction, 0);
  1480. rootState.set(addState(root));
  1481. for (unsigned idxState = 0; idxState < states.ordinality(); idxState++)
  1482. {
  1483. memset(done, 0, sizeof(bool) * numSymbols);
  1484. HqlLRState & curState = states.item(idxState);
  1485. ForEachItemIn(idx, curState.items)
  1486. {
  1487. HqlLRItem & curItem = curState.items.item(idx);
  1488. addGotos(curItem.production, curItem.index);
  1489. }
  1490. for (unsigned i= 0; i < numSymbols; i++)
  1491. {
  1492. if (gotos[i])
  1493. {
  1494. HqlLRState * state = addState(gotos[i]);
  1495. curState.addGoto(i, state);
  1496. gotos[i] = NULL;
  1497. }
  1498. }
  1499. }
  1500. }
  1501. catch (...)
  1502. {
  1503. for (unsigned i= 0; i < numSymbols; i++)
  1504. delete gotos[i];
  1505. delete [] gotos;
  1506. delete [] done;
  1507. throw;
  1508. }
  1509. delete [] gotos;
  1510. delete [] done;
  1511. ForEachItemIn(i1, states)
  1512. states.item(i1).id = i1;
  1513. unsigned id = 0;
  1514. ForEachItemIn(i2, rules)
  1515. rules.item(i2).setProductionIds(productions, id);
  1516. numProductions = id;
  1517. }
  1518. void TomitaContext::calculateFollowing()
  1519. {
  1520. //first(x) and canBeNull(x) are calculated on request
  1521. //Need to be careful about recursion: keep repeating until no more items added.
  1522. grammarRule->addFollowOwn(*LINK(eofToken));
  1523. loop
  1524. {
  1525. bool changed = false;
  1526. ForEachItemIn(idx, rules)
  1527. {
  1528. if (rules.item(idx).calcFollow())
  1529. changed = true;
  1530. }
  1531. if (!changed)
  1532. break;
  1533. }
  1534. }
  1535. void TomitaContext::expandRecursion()
  1536. {
  1537. ForEachChild(i1, expr)
  1538. {
  1539. IHqlExpression * cur = expr->queryChild(i1);
  1540. if (cur->getOperator() == no_pat_use)
  1541. createRule(cur->queryChild(0), NULL, false);
  1542. }
  1543. ForEachItemIn(idx, rules)
  1544. rules.item(idx).expandRecursion(*this);
  1545. }
  1546. void TomitaContext::optimizeRules()
  1547. {
  1548. ForEachItemIn(idx, rules)
  1549. {
  1550. TomRule & cur = rules.item(idx);
  1551. if (&cur != rootRule)
  1552. cur.optimizeRules();
  1553. }
  1554. }
  1555. void TomitaContext::generateParser()
  1556. {
  1557. //Following the method described in the dragon book. pp....
  1558. calculateStates();
  1559. calculateFollowing();
  1560. generateTables();
  1561. ForEachItemIn(idx, rules)
  1562. {
  1563. TomRule & cur = rules.item(idx);
  1564. idAllocator.setID(cur.def, cur.queryName(), cur.id);
  1565. }
  1566. }
  1567. void TomitaContext::generateTables()
  1568. {
  1569. GenerateTomitaCtx ctx(*this, info, idAllocator);
  1570. LRTableBuilder builder(parser.table);
  1571. //MORE: Walk and call addGoto etc.
  1572. builder.init(states.ordinality(), numTerminals, numSymbols, numProductions);
  1573. ForEachItemIn(idx, states)
  1574. states.item(idx).buildState(builder, eofToken, numTerminals);
  1575. ForEachItemIn(i1, rules)
  1576. rules.item(i1).buildProductions(builder, ctx);
  1577. builder.finished(rootState->id);
  1578. grammarAmbiguous = builder.isAmbiguous();
  1579. }
  1580. void TomitaContext::getDebugText(StringBuffer & s, unsigned detail)
  1581. {
  1582. NlpParseContext::getDebugText(s, detail);
  1583. s.append("\nFeatures:\n");
  1584. ForEachItemIn(i1, features)
  1585. features.item(i1).getDebugText(s);
  1586. s.append("Tokens:\n");
  1587. ForEachItemIn(i2, tokens)
  1588. tokens.item(i2).getDebugText(s);
  1589. s.append("Rules:\n");
  1590. ForEachItemIn(i3, rules)
  1591. rules.item(i3).getDebugText(s);
  1592. s.append("Lexer:\n");
  1593. s.append("\tEndOfToken: [");
  1594. ForEachItemIn(i, parser.endTokenChars)
  1595. {
  1596. char c = parser.endTokenChars.item(i);
  1597. encodeXML(&c, s, ENCODE_WHITESPACE, 1);
  1598. }
  1599. s.append("]").newline();
  1600. s.append("\tToken DFA");
  1601. parser.tokenDfa.toXML(s, detail);
  1602. s.append("\tSkip DFA");
  1603. parser.skipDfa.toXML(s, detail);
  1604. s.append("\nStates:\n");
  1605. ForEachItemIn(i4, states)
  1606. states.item(i4).getDebugText(s);
  1607. s.append("Parser:\n");
  1608. parser.table.trace(s);
  1609. }
  1610. bool TomitaContext::canCreateLexerToken(IHqlExpression * expr, bool insideOr)
  1611. {
  1612. switch (expr->getOperator())
  1613. {
  1614. case no_null:
  1615. case no_pat_const:
  1616. case no_pat_anychar:
  1617. case no_pat_set:
  1618. return true;
  1619. case no_pat_repeat:
  1620. return isStandardRepeat(expr);
  1621. case no_pat_instance:
  1622. return canCreateLexerToken(expr->queryChild(0), insideOr);
  1623. //depends if used for match...
  1624. return false;
  1625. case no_pat_or:
  1626. {
  1627. ForEachChild(idx, expr)
  1628. {
  1629. if (!canCreateLexerToken(expr->queryChild(idx), true))
  1630. return false;
  1631. }
  1632. return true;
  1633. }
  1634. case no_pat_follow:
  1635. {
  1636. ForEachChild(idx, expr)
  1637. {
  1638. if (!canCreateLexerToken(expr->queryChild(idx), insideOr))
  1639. return false;
  1640. }
  1641. return true;
  1642. }
  1643. default:
  1644. return false;
  1645. }
  1646. }
  1647. bool TomitaContext::isToken(IHqlExpression * expr, bool expandTokens)
  1648. {
  1649. if (!expandTokens)
  1650. {
  1651. /*
  1652. switch (expr->getOperator())
  1653. {
  1654. case no_pat_token:
  1655. case no_pat_imptoken:
  1656. return isToken(expr->queryChild(0), true);
  1657. }
  1658. */
  1659. return false;
  1660. }
  1661. switch (expr->getOperator())
  1662. {
  1663. case no_pat_instance:
  1664. case no_pat_or:
  1665. case no_pat_guard:
  1666. case no_penalty:
  1667. case no_null:
  1668. case no_pat_first:
  1669. case no_pat_last:
  1670. return false;
  1671. default:
  1672. return true;
  1673. }
  1674. }
  1675. TomRule * TomitaContext::queryDefine(IHqlExpression * defineName)
  1676. {
  1677. ForEachItemIn(idx, rules)
  1678. {
  1679. TomRule & cur = rules.item(idx);
  1680. if (cur.matchesDefine(defineName))
  1681. return &cur;
  1682. }
  1683. StringBuffer s;
  1684. defineName->toString(s);
  1685. throwError1(HQLERR_DefineUseStrNotFound, s.str());
  1686. return NULL;
  1687. }
  1688. TomFeature * TomitaContext::queryFeature(IHqlExpression * expr)
  1689. {
  1690. //MORE: May need a hash table!
  1691. ForEachItemIn(idx, features)
  1692. {
  1693. TomFeature & cur = features.item(idx);
  1694. if (cur.matches(expr))
  1695. return &cur;
  1696. }
  1697. return NULL;
  1698. }
  1699. TomRule * TomitaContext::queryRule(IHqlExpression * expr, _ATOM name)
  1700. {
  1701. //MORE: May need a hash table!
  1702. ForEachItemIn(idx, rules)
  1703. {
  1704. TomRule & cur = rules.item(idx);
  1705. if (cur.matches(expr, name))
  1706. return &cur;
  1707. }
  1708. return NULL;
  1709. }
  1710. //---------------------------------------------------------------------------
  1711. NlpParseContext * createTomitaContext(IHqlExpression * expr, IWorkUnit * wu, const HqlCppOptions & options, ITimeReporter * timeReporter)
  1712. {
  1713. return new TomitaContext(expr, wu, options, timeReporter);
  1714. }
  1715. /*
  1716. ToDo:
  1717. * Features
  1718. o Need to process implicit guards and other conditions.
  1719. o Design how feature actuals are represented and bound.
  1720. o May need guards based on other guards - need to review grammars to see, so can guard a production and also match with others.
  1721. It can be worked around with an intermediate production.
  1722. * Give an error if unicode is used with ,parse
  1723. * Give an error if a token cannot create a DFA
  1724. * Tokens: [+see notes below]
  1725. o Allow before to be included in a token, and use it as the default mechanism for generating tokens.
  1726. o UNKNOWN - pattern matching at different priorities.
  1727. * Validation
  1728. o BEFORE, AFTER [+inverse]
  1729. o pattern IN pattern [+inverse]
  1730. o Non-DFA patterns?
  1731. o Inverse of check length
  1732. * Revisit node packing, and test with the "boy saw the girl" example
  1733. * Reduce number of empty productions e.g., ConnectWord := rule42. Use a flag to indicate top
  1734. * Productions: Allow the order of the symbols to be changed in the graph that is generated.
  1735. * Optimizations:
  1736. o Make position a rolling window so no need to copy
  1737. o best could be processed when packed nodes being built
  1738. * LEFT,RIGHT,NONE to specify precedence of operators + remove s/r conflicts
  1739. Some notes on the algorithms:
  1740. o S->S' in augmented grammar G'
  1741. closure(I) :=
  1742. 1. All items in I.
  1743. 2. If A->a.Bb in closure(I) and B->c is a production then add B->.c
  1744. closure(I)
  1745. {
  1746. J := I
  1747. added = [];
  1748. loop
  1749. for each item A->a.Bb in J
  1750. if !added[B]
  1751. added[B] = true;
  1752. foreach production B->c
  1753. if (B->.c) not in J
  1754. add (B->.c) to J.
  1755. until no mode items added;
  1756. }
  1757. Kernal: S'->.S and all items without dots on lhs.
  1758. Non: items with dots on lhs
  1759. No need to store non-kernal items since adding them can never add any kernals + can always regenerate.
  1760. goto(I, X) := closure of all items [A->aX.b] such that [A->a.Xb] is in I.
  1761. items(G')
  1762. C := closure({S'->.S})
  1763. loop
  1764. for each item I in C
  1765. for each grammar symbol X
  1766. q := goto(I, X);
  1767. if (q is non empty) and q not in C, add q to C.
  1768. until no more items added to C.
  1769. Generating the parsing tables:
  1770. 1. Calculate C := { I... } the items of G'
  1771. 2. If [A->x.ay] is in Ii and goto(Ii,a) = Ij action[i, a] = shift j;
  1772. 3. If [A->x.] is in Ii then action[i, a] := reduce by production for all a in FOLLOW(A) [A may not be S']
  1773. 4. If [S'->S.] in Ii then action[i, $] = accept;
  1774. 5. goto transitions: if gotot(Ii, A) = Ij goto[i, A} := j
  1775. 6. The initial entry is one containing S'->S
  1776. FIRST(x) := set of terminals that begin the strings derived from x
  1777. if (x=>e) then e is also in FIRST(x)
  1778. first(x)
  1779. {
  1780. if (x is a terminal) result = {x}
  1781. if (x->e) is a production, add e to first(x)
  1782. if (x non terminal) and x->y1y2y3, then first(x) = first(y1) + if(first(y1) includes e, first(y2).....)
  1783. - only add e if ALL y1y2y3 include e.
  1784. }
  1785. follow(X) := set of terminals x that can occur to the right of non terminal X in some sentinel form.
  1786. {
  1787. follow(S) = {$}
  1788. repeat
  1789. if a production A->aBb then all in FIRST(b) (except e) is added to follow B.
  1790. if a production A->aB or A->aBb where FIRST(b) includes e. then add everything in follow(A) to follow(B)
  1791. until nothing added to any follow set.
  1792. }
  1793. Tokens:
  1794. * Each token should have an optional set of characters that can or can not follow it.
  1795. * When a token is potentially accepted it should check the set
  1796. i) If specified, and matches then insert a token.
  1797. ii) at end if not-specified then insert a token.
  1798. * repeats aren't specified.
  1799. Reducing the tables:
  1800. i) if rows of the action table are identical, then they can be commoned up
  1801. ii) errors can be converted to reductions without problem.
  1802. iii) can have a list of (symbol, action) pairs instead of an array.
  1803. Ambiguous grammars:
  1804. i) on shift/reduce. Compare precedence of operator being shifted with right most operator in the reduction (or user defined)
  1805. ii) associativity. If the same precedence then if left associative reduce else shift [ should check associativity is the same]
  1806. iii) special rules. Possibly add a commit syntax to imply if this matches don't try and reduce on others?
  1807. */