thortparse.cpp 39 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371
  1. /*##############################################################################
  2. Copyright (C) 2011 HPCC Systems.
  3. All rights reserved. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU Affero General Public License as
  5. published by the Free Software Foundation, either version 3 of the
  6. License, or (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU Affero General Public License for more details.
  11. You should have received a copy of the GNU Affero General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. ############################################################################## */
  14. #include "jliball.hpp"
  15. #include "eclrtl.hpp"
  16. #include "rtlds_imp.hpp"
  17. #include "thortparse.ipp"
  18. //#define TRACING
  19. #ifndef TRACING
  20. #undef LOG
  21. #define LOG (void)
  22. #endif
  23. //---------------------------------------------------------------------------
  24. void doUnwindRelease(GrammarSymbol * symbol, CIArrayOf<GrammarSymbol> & pending)
  25. {
  26. if (!symbol->isPacked())
  27. {
  28. unsigned level = pending.ordinality();
  29. unsigned num = symbol->numChildren();
  30. for (unsigned i = 0; i < num; i++)
  31. pending.append(*LINK(symbol->queryChild(i)));
  32. if (!symbol->Release())
  33. pending.trunc(level);
  34. }
  35. else
  36. symbol->Release();
  37. }
  38. bool GrammarSymbol::canMerge(const GrammarSymbol * other)
  39. {
  40. //MORE: Shouldn't this be enabled at some point?
  41. return false;
  42. byte * leftRow = queryResultRow();
  43. byte * rightRow = other->queryResultRow();
  44. if (leftRow || rightRow)
  45. {
  46. assertex(leftRow && rightRow); // if in same state they really should have consistency here
  47. assertex(queryResultSize() == other->queryResultSize());
  48. if (memcmp(leftRow, rightRow, queryResultSize()) != 0)
  49. return false;
  50. }
  51. assertex(features.info == other->features.info);
  52. unsigned len = features.values.length();
  53. return memcmp(features.values.get(), other->features.values.get(), len) == 0;
  54. }
  55. GrammarSymbol * GrammarSymbol::createMerged(GrammarSymbol * other)
  56. {
  57. Owned<GrammarSymbol> merged = new PackedSymbol(this);
  58. return merged->createMerged(other);
  59. }
  60. //---------------------------------------------------------------------------
  61. Terminal::Terminal(symbol_id _id, const FeatureInfo * featureInfo, unsigned _len, const byte * _start) : GrammarSymbol(_id)
  62. {
  63. if (featureInfo && featureInfo->getNum() != 0)
  64. {
  65. features.info = featureInfo;
  66. unsigned size = featureInfo->getSize();
  67. void * resultFeatures = features.values.allocate(size);
  68. memcpy(resultFeatures, featureInfo->defaults, size);
  69. }
  70. len = _len;
  71. start = _start;
  72. }
  73. //---------------------------------------------------------------------------
  74. NonTerminal::NonTerminal(symbol_id _id, _ATOM _name, FeatureValue & _features, unsigned numSymbols, GrammarSymbol * * symbols, const byte * _reducePtr, size32_t _resultSize, byte * _resultRow) : GrammarSymbol(_id), resultSize(_resultSize), resultRow(_resultRow)
  75. {
  76. unsigned nullCount = 0;
  77. unsigned nonNullIndex;
  78. reduced.ensure(numSymbols);
  79. for (unsigned i= 0; i < numSymbols; i++)
  80. {
  81. GrammarSymbol * cur = symbols[i];
  82. if (cur->isNull())
  83. nullCount++;
  84. else
  85. nonNullIndex = i;
  86. reduced.append(*LINK(cur));
  87. addPenalty(cur->getPenalty());
  88. }
  89. cachedIsNull = false;
  90. if (nullCount == numSymbols)
  91. cachedIsNull = true;
  92. else if (nullCount != 0)
  93. {
  94. if (nonNullIndex+1 != numSymbols)
  95. {
  96. const byte * pos = symbols[nonNullIndex]->queryEndPtr();
  97. for (unsigned i1 = nonNullIndex+1; i1 < numSymbols; i1++)
  98. symbols[i1]->resetPosition(pos);
  99. }
  100. for (unsigned i2 = nonNullIndex; i2-- > 0;)
  101. {
  102. if (symbols[i2]->isNull())
  103. symbols[i2]->resetPosition(symbols[i2+1]->queryStartPtr());
  104. }
  105. }
  106. name = _name;
  107. unsigned len = _features.values.length();
  108. features.info = _features.info;
  109. if (len)
  110. features.values.setOwn(len, _features.values.detach());
  111. if (numSymbols == 0)
  112. reducePtr = _reducePtr;
  113. else
  114. reducePtr = NULL;
  115. }
  116. NonTerminal::~NonTerminal()
  117. {
  118. if (resultRow)
  119. rtlReleaseRow(resultRow);
  120. //Try and kill tail recursion when freeing a very large match structure
  121. while (reduced.ordinality())
  122. doUnwindRelease(&reduced.popGet(), reduced);
  123. }
  124. void NonTerminal::resetPosition(const byte * pos)
  125. {
  126. assertex(cachedIsNull);
  127. reducePtr = pos;
  128. ForEachItemInRev(idx, reduced)
  129. reduced.item(idx).resetPosition(pos);
  130. }
  131. const byte * NonTerminal::queryStartPtr() const
  132. {
  133. if (reduced.ordinality())
  134. return reduced.item(0).queryStartPtr();
  135. return reducePtr;
  136. }
  137. const byte * NonTerminal::queryEndPtr() const
  138. {
  139. if (reduced.ordinality())
  140. return reduced.tos().queryEndPtr();
  141. return reducePtr;
  142. }
  143. GrammarSymbol * NonTerminal::queryChild(unsigned i)
  144. {
  145. if (reduced.isItem(i))
  146. return &reduced.item(i);
  147. return NULL;
  148. }
  149. //---------------------------------------------------------------------------
  150. PackedSymbol::PackedSymbol(GrammarSymbol * symbol) : GrammarSymbol(symbol->id)
  151. {
  152. //copy the features...
  153. assertex(!symbol->isNull());
  154. equivalents.append(*LINK(symbol));
  155. penalty = symbol->getPenalty();
  156. }
  157. void PackedSymbol::resetPosition(const byte * pos)
  158. {
  159. throwUnexpected();
  160. }
  161. GrammarSymbol * PackedSymbol::createMerged(GrammarSymbol * other)
  162. {
  163. assertex(!other->isNull());
  164. assertex(other->queryStartPtr() == queryStartPtr());
  165. assertex(other->queryEndPtr() == queryEndPtr());
  166. if (other->getPenalty() < penalty)
  167. penalty = other->getPenalty();
  168. if (other->isPacked())
  169. {
  170. //Can happen when two activity states are merged.
  171. for (unsigned i = 0; ; i++)
  172. {
  173. GrammarSymbol * child = queryPacked(i);
  174. if (!child)
  175. break;
  176. equivalents.append(*LINK(child));
  177. }
  178. }
  179. else
  180. equivalents.append(*LINK(other));
  181. return LINK(this);
  182. }
  183. GrammarSymbol * PackedSymbol::queryPacked(unsigned i)
  184. {
  185. if (equivalents.isItem(i))
  186. return &equivalents.item(i);
  187. return NULL;
  188. }
  189. const byte * PackedSymbol::queryStartPtr() const
  190. {
  191. return equivalents.item(0).queryStartPtr();
  192. }
  193. const byte * PackedSymbol::queryEndPtr() const
  194. {
  195. return equivalents.item(0).queryEndPtr();
  196. }
  197. //---------------------------------------------------------------------------
  198. TomitaMatchWalker::TomitaMatchWalker(const PackedSymbolChoice & _choice, GrammarSymbol * _symbol) : choice(_choice)
  199. {
  200. symbol = _symbol;
  201. while (symbol->isPacked())
  202. {
  203. symbol = symbol->queryPacked(choice.getInstance(symbol));
  204. }
  205. }
  206. _ATOM TomitaMatchWalker::queryName()
  207. {
  208. return symbol->queryName();
  209. }
  210. unsigned TomitaMatchWalker::queryID()
  211. {
  212. return symbol->getId();
  213. }
  214. size32_t TomitaMatchWalker::queryMatchSize()
  215. {
  216. return symbol->getLength();
  217. }
  218. const void * TomitaMatchWalker::queryMatchStart()
  219. {
  220. return symbol->queryStartPtr();
  221. }
  222. unsigned TomitaMatchWalker::numChildren()
  223. {
  224. return symbol->numChildren();
  225. }
  226. IMatchWalker * TomitaMatchWalker::getChild(unsigned idx)
  227. {
  228. GrammarSymbol * child = symbol->queryChild(idx);
  229. if (child)
  230. return new TomitaMatchWalker(choice, child);
  231. return NULL;
  232. }
  233. //---------------------------------------------------------------------------
  234. void PackedSymbolChoice::expandFirst(GrammarSymbol * symbol)
  235. {
  236. if (symbol->isPacked())
  237. {
  238. symbols.append(*LINK(symbol));
  239. branches.append(0);
  240. expandFirst(symbol->queryPacked(0));
  241. }
  242. else
  243. {
  244. unsigned max = symbol->numChildren();
  245. for (unsigned i = 0; i < max; i++)
  246. expandFirst(symbol->queryChild(i));
  247. }
  248. }
  249. void PackedSymbolChoice::first(GrammarSymbol * symbol)
  250. {
  251. branches.kill();
  252. symbols.kill();
  253. expandFirst(symbol);
  254. }
  255. void PackedSymbolChoice::expandBest(GrammarSymbol * symbol)
  256. {
  257. if (symbol->isPacked())
  258. {
  259. symbols.append(*LINK(symbol));
  260. unsigned bestIndex = 0;
  261. int bestPenalty = symbol->queryPacked(bestIndex)->getPenalty();
  262. for (unsigned j=1;;j++)
  263. {
  264. GrammarSymbol * cur = symbol->queryPacked(j);
  265. if (!cur) break;
  266. if (bestPenalty > cur->getPenalty())
  267. {
  268. bestIndex = j;
  269. bestPenalty = cur->getPenalty();
  270. }
  271. }
  272. branches.append(bestIndex);
  273. expandBest(symbol->queryPacked(bestIndex));
  274. }
  275. else
  276. {
  277. unsigned max = symbol->numChildren();
  278. for (unsigned i = 0; i < max; i++)
  279. expandBest(symbol->queryChild(i));
  280. }
  281. }
  282. void PackedSymbolChoice::selectBest(GrammarSymbol * symbol)
  283. {
  284. branches.kill();
  285. symbols.kill();
  286. expandBest(symbol);
  287. }
  288. bool PackedSymbolChoice::expandNext(unsigned & level, GrammarSymbol * symbol)
  289. {
  290. unsigned thisLevel = level;
  291. if (symbol->isPacked())
  292. {
  293. assertex(&symbols.item(thisLevel) == symbol);
  294. unsigned curBranch = branches.item(thisLevel);
  295. bool isLastLevel = (thisLevel == symbols.ordinality()-1);
  296. level++;
  297. if (!isLastLevel && expandNext(level, symbol->queryPacked(curBranch)))
  298. return true;
  299. curBranch++;
  300. if (symbol->queryPacked(curBranch))
  301. {
  302. branches.replace(curBranch, thisLevel);
  303. expandFirst(symbol->queryPacked(curBranch));
  304. return true;
  305. }
  306. branches.pop();
  307. symbols.pop();
  308. return false;
  309. }
  310. else
  311. {
  312. unsigned max = symbol->numChildren();
  313. for (unsigned i = 0; i < max; i++)
  314. if (!expandNext(level, symbol->queryChild(i)))
  315. return false;
  316. return true;
  317. }
  318. }
  319. bool PackedSymbolChoice::next(GrammarSymbol * symbol)
  320. {
  321. if (symbols.ordinality() == 0)
  322. return false;
  323. unsigned level = 0;
  324. return expandNext(level, symbol);
  325. }
  326. unsigned PackedSymbolChoice::getInstance(GrammarSymbol * symbol) const
  327. {
  328. unsigned match = symbols.find(*symbol);
  329. assertex(match != NotFound);
  330. return branches.item(match);
  331. }
  332. //---------------------------------------------------------------------------
  333. StackElement::StackElement(GrammarSymbol * _shifted, state_id _state, StackElement * _prev, LRParser * _pool)
  334. {
  335. shifted.set(_shifted);
  336. state = _state;
  337. prev.set(_prev);
  338. pool = _pool;
  339. }
  340. void StackElement::addSibling(StackElement * sib)
  341. {
  342. sib->sibling.setown(sibling.getClear());
  343. sibling.set(sib);
  344. }
  345. void StackElement::addToPool()
  346. {
  347. StackElementArray pending;
  348. gatherPoolPending(pending);
  349. doAddToPool();
  350. while (pending.ordinality())
  351. {
  352. Owned<StackElement> cur = &pending.popGet();
  353. cur->gatherPoolPending(pending);
  354. cur->doAddToPool();
  355. }
  356. }
  357. void StackElement::doAddToPool()
  358. {
  359. shifted.clear();
  360. prev.clear();
  361. sibling.clear();
  362. state = 0;
  363. pool->addFreeElement(this);
  364. pool = NULL;
  365. }
  366. void StackElement::Release()
  367. {
  368. if (pool && !IsShared())
  369. addToPool();
  370. CInterface::Release();
  371. }
  372. void StackElement::getDebugText(StringBuffer & s)
  373. {
  374. if (prev)
  375. prev->getDebugText(s);
  376. s.appendf("%p", this);
  377. s.append("[");
  378. if(shifted)
  379. {
  380. s.append(shifted->id).append(" ");
  381. s.appendf("{%p}", shifted->queryStartPtr());
  382. }
  383. s.append(state).append("] ");
  384. }
  385. bool StackElement::potentialPackNode(StackElement & other) const
  386. {
  387. GrammarSymbol * curSymbol = shifted;
  388. GrammarSymbol * nextSymbol = other.shifted;
  389. if (curSymbol && !curSymbol->isNull() && curSymbol->id == nextSymbol->id)
  390. {
  391. if (other.prev->state == prev->state)
  392. {
  393. if (curSymbol->queryStartPtr() == nextSymbol->queryStartPtr())
  394. {
  395. assertex(curSymbol->queryEndPtr() == nextSymbol->queryEndPtr());
  396. assertex(state == other.state);
  397. return true;
  398. }
  399. }
  400. }
  401. return false;
  402. }
  403. void StackElement::gatherPoolPending(CIArrayOf<StackElement> & pending)
  404. {
  405. if (prev && !prev->IsShared())
  406. pending.append(*prev.getClear());
  407. if (sibling && !sibling->IsShared())
  408. pending.append(*sibling.getClear());
  409. }
  410. //---------------------------------------------------------------------------
  411. LRActiveState::LRActiveState(unsigned _maxStates)
  412. {
  413. maxStates = _maxStates;
  414. cacheBytes = maxStates * sizeof(StackElement *);
  415. cache = new StackElement * [maxStates];
  416. beenReduced = new bool [maxStates];
  417. memset(beenReduced, 0, maxStates);
  418. memset(cache, 0, cacheBytes);
  419. }
  420. LRActiveState::~LRActiveState()
  421. {
  422. delete [] cache;
  423. delete [] beenReduced;
  424. }
  425. void LRActiveState::addElementOwn(StackElement * next, bool keepBest)
  426. {
  427. unsigned stateId = next->state;
  428. if (cache[stateId])
  429. {
  430. if (!mergePackedNode(stateId, next, keepBest))
  431. cache[stateId]->addSibling(next);
  432. next->Release();
  433. }
  434. else
  435. {
  436. cache[stateId] = next;
  437. elements.append(*next);
  438. }
  439. }
  440. void LRActiveState::clearReduced()
  441. {
  442. ForEachItemIn(idx, elements)
  443. beenReduced[elements.item(idx).state] = false;
  444. }
  445. bool LRActiveState::mergePackedNode(unsigned stateId, StackElement * next, bool keepBest)
  446. {
  447. StackElement * cur = cache[stateId];
  448. assertex(cur->state == next->state);
  449. assertex(!next->sibling);
  450. // If two symbols have the same id/attributes, and lie between the same states, then it is a locally
  451. // ambiguous tree, so common the symbol values up.
  452. GrammarSymbol * nextSymbol = next->shifted;
  453. if (!nextSymbol)
  454. return false; // MORE: Should this common???
  455. if (nextSymbol->isNull()) // Hard to know the position of these things...
  456. return false;
  457. StackElement * prev = NULL;
  458. do
  459. {
  460. GrammarSymbol * curSymbol = cur->shifted;
  461. if (curSymbol && !curSymbol->isNull() && (curSymbol->id == nextSymbol->id))
  462. {
  463. if (next->prev->state == cur->prev->state)
  464. {
  465. if (curSymbol->queryStartPtr() == nextSymbol->queryStartPtr())
  466. {
  467. assertex(curSymbol->queryEndPtr() == nextSymbol->queryEndPtr());
  468. assertex(cur->state == next->state);
  469. if (keepBest)
  470. {
  471. int curPenalty = curSymbol->getPenalty();
  472. int nextPenalty = nextSymbol->getPenalty();
  473. if (curPenalty != nextPenalty)
  474. {
  475. //Existing element is better=>throw away the new one.
  476. if (nextPenalty > curPenalty)
  477. return true;
  478. //The new element is better, throw away all the mismatches, by removing this element, and going around again
  479. if (prev)
  480. {
  481. cur = prev;
  482. prev->sibling.set(prev->sibling->sibling);
  483. }
  484. else
  485. {
  486. StackElement * sibling = cur->sibling.getClear();
  487. unsigned replacePos = elements.find(*cur);
  488. if (sibling)
  489. {
  490. //More than one element => replace the head with the (linked) sibling
  491. elements.replace(*sibling, replacePos);
  492. cache[stateId] = sibling;
  493. cur = sibling;
  494. continue;
  495. }
  496. else
  497. {
  498. elements.replace(*LINK(next), replacePos);
  499. cache[stateId] = next;
  500. return true;
  501. }
  502. }
  503. }
  504. }
  505. if (curSymbol->canMerge(nextSymbol))
  506. {
  507. cur->shifted.setown(curSymbol->createMerged(nextSymbol));
  508. LOG(MCdebugProgress, unknownJob, "Nodes Merged: %p = %p, %p", cur->shifted.get(), curSymbol, nextSymbol);
  509. return true;
  510. }
  511. }
  512. }
  513. }
  514. prev = cur;
  515. cur = cur->sibling;
  516. } while (cur);
  517. return false;
  518. }
  519. void LRActiveState::reinit()
  520. {
  521. memset(cache, 0, cacheBytes);
  522. elements.kill();
  523. }
  524. void LRActiveState::mergeIn(LRActiveState & other, bool keepBest)
  525. {
  526. ForEachItemIn(idx, other.elements)
  527. {
  528. Linked<StackElement> cur = &other.elements.item(idx);
  529. while (cur)
  530. {
  531. Linked<StackElement> next = cur->sibling;
  532. cur->sibling.clear();
  533. addElementOwn(cur.getClear(), keepBest);
  534. cur.setown(next.getClear());
  535. }
  536. }
  537. }
  538. //---------------------------------------------------------------------------
  539. LRParser::LRParser(const LRTable & _table, const TomitaParserCallback & _rowState) : table(_table), rowState(_rowState)
  540. {
  541. reduced = new LRActiveState(table.numStates);
  542. reducedOverflow[0] = new LRActiveState(table.numStates);
  543. reducedOverflow[1] = new LRActiveState(table.numStates);
  544. _clear(positions);
  545. init();
  546. }
  547. LRParser::~LRParser()
  548. {
  549. //Safely kill the free list.
  550. StackElement * head = nextFreeElement.getClear();
  551. while (head)
  552. {
  553. StackElement * next = head->sibling.getClear();
  554. head->Release();
  555. head = next;
  556. }
  557. for (position_t pos = 0; pos < MAX_POSITIONS; pos++)
  558. delete positions[pos];
  559. delete reduced;
  560. delete reducedOverflow[0];
  561. delete reducedOverflow[1];
  562. }
  563. void LRParser::addFreeElement(StackElement * element)
  564. {
  565. element->sibling.setown(nextFreeElement.getClear());
  566. nextFreeElement.set(element);
  567. }
  568. void LRParser::beginParse(bool _chooseBest)
  569. {
  570. chooseBest = _chooseBest;
  571. selectEndPosition(0);
  572. addStartState();
  573. accepted.kill();
  574. }
  575. void LRParser::endParse()
  576. {
  577. }
  578. void LRParser::clear()
  579. {
  580. activeInput = NULL;
  581. activeOutput = NULL;
  582. firstPosition = NotFound;
  583. endPosition = NotFound;
  584. }
  585. void LRParser::cleanupPosition(position_t pos)
  586. {
  587. unsigned index = pos % MAX_POSITIONS;
  588. positions[index]->reinit();
  589. }
  590. void LRParser::init()
  591. {
  592. chooseBest = false;
  593. reduced->reinit();
  594. reducedOverflow[0]->reinit();
  595. reducedOverflow[1]->reinit();
  596. clear();
  597. }
  598. void LRParser::addStartState()
  599. {
  600. activeOutput->addElementOwn(createState(NULL, (state_id)table.rootState, NULL), false);
  601. }
  602. StackElement * LRParser::createState(StackElement * prev, state_id nextState, GrammarSymbol * shifted)
  603. {
  604. StackElement * newState;
  605. if (nextFreeElement)
  606. {
  607. newState = nextFreeElement.getClear();
  608. nextFreeElement.setown(newState->sibling.getClear());
  609. newState->shifted.set(shifted);
  610. newState->prev.set(prev);
  611. newState->state = nextState;
  612. newState->pool = this;
  613. }
  614. else
  615. {
  616. newState = new StackElement(shifted, nextState, prev, this);
  617. }
  618. #ifdef TRACING
  619. LOG(MCdebugProgress, unknownJob, "%p: Push state %d symbol %d[%p] previous: %d[%p]", newState, nextState, shifted ? shifted->id : -1, shifted, prev ? prev->state : -1, prev);
  620. StringBuffer s;
  621. newState->getDebugText(s);
  622. s.newline();
  623. LOG(MCdebugProgress, unknownJob, s.str());
  624. #endif
  625. return newState;
  626. }
  627. void LRParser::doReductions(LRActiveState & active, GrammarSymbol * next)
  628. {
  629. //At the moment reduce can not remove items.
  630. //Done as a double loop to reduce calls to active.ordinality()
  631. unsigned max = active.elements.ordinality();
  632. for (unsigned i=0; i < max; )
  633. {
  634. for (; i < max; i++)
  635. {
  636. StackElement & cur = active.elements.item(i);
  637. active.markReduced(cur.state);
  638. reduce(cur, next);
  639. }
  640. max = active.elements.ordinality();
  641. }
  642. active.clearReduced();
  643. }
  644. void LRParser::doReductions(GrammarSymbol * next, bool singleToken)
  645. {
  646. //The reduction need to go to a separate list - since may be different for each token
  647. //but want to optimize case where only a single token.
  648. LRActiveState * output = activeInput;
  649. if (!singleToken)
  650. {
  651. output = reduced;
  652. reduced->reinit();
  653. }
  654. activeOutput = output;
  655. curOverflow = reducedOverflow[0];
  656. doReductions(*activeInput, next);
  657. if (!singleToken)
  658. doReductions(*reduced, next);
  659. //You get weird situations where reductions add to states thathave already been reduced,
  660. //so any clashes like that need to be sent to other lists, processed and merged in.
  661. unsigned overflowIndex = 0;
  662. while (curOverflow->elements.ordinality())
  663. {
  664. LRActiveState * oldOverflow = curOverflow;
  665. overflowIndex = 1-overflowIndex;
  666. activeOutput = oldOverflow;
  667. curOverflow = reducedOverflow[overflowIndex];
  668. doReductions(*oldOverflow, next);
  669. output->mergeIn(*oldOverflow, chooseBest);
  670. oldOverflow->reinit();
  671. }
  672. curOverflow = NULL;
  673. activeOutput = NULL;
  674. }
  675. void LRParser::process(GrammarSymbol * next, bool singleToken)
  676. {
  677. #ifdef TRACING
  678. LOG(MCdebugProgress, unknownJob, "Process token '%.*s' %d at position %d", next->queryEndPtr()-next->queryStartPtr(), next->queryStartPtr(), next->id, next->queryStartPtr()-rowState.inputText);
  679. #endif
  680. doReductions(next, singleToken);
  681. selectEndPosition((size32_t)(next->queryEndPtr()-rowState.inputText));
  682. doShifts(next, singleToken);
  683. }
  684. //Build up an array of the symbols to expand - may involve uncommoning nodes
  685. void LRParser::expandReduction(StackElement & element, LRProduction * production, unsigned numSymbols)
  686. {
  687. if (numSymbols == 0)
  688. {
  689. const byte * reducePtr = firstPosition+rowState.inputText;
  690. Owned<GrammarSymbol> reduced = production->reduce(reducedArgs, reducePtr, rowState);
  691. if (reduced)
  692. {
  693. state_id nextState = table.states[element.state]->getGoto(reduced->id);
  694. #ifdef TRACING
  695. StringBuffer s;
  696. for (unsigned i = 0; i < production->getNumSymbols(); i++)
  697. s.appendf("%p ", reducedArgs[i]);
  698. LOG(MCdebugProgress, unknownJob, "Reduce by production %d new element %p[%s]", reduced->id, reduced.get(), s.str());
  699. #endif
  700. //MORE: Some kind of recursion checking needed?
  701. StackElement * cached = activeOutput->cache[nextState];
  702. if (activeOutput->okToAddReduction(nextState))
  703. activeOutput->addElementOwn(createState(&element, nextState, reduced), chooseBest);
  704. else
  705. curOverflow->addElementOwn(createState(&element, nextState, reduced), chooseBest);
  706. }
  707. return;
  708. }
  709. StackElement * curElement = &element;
  710. do
  711. {
  712. reducedArgs[numSymbols-1] = curElement->shifted;
  713. expandReduction(*curElement->prev, production, numSymbols-1);
  714. curElement = curElement->sibling;
  715. } while (curElement);
  716. }
  717. LRActiveState * LRParser::getPosition(position_t pos)
  718. {
  719. unsigned index = pos % MAX_POSITIONS;
  720. return positions[index];
  721. }
  722. void LRParser::setPositionOwn(position_t pos, LRActiveState * value)
  723. {
  724. unsigned index = pos % MAX_POSITIONS;
  725. positions[index] = value;
  726. }
  727. void LRParser::reduce(StackElement & element, GrammarSymbol * next)
  728. {
  729. LRState & curState = *table.states[element.state];
  730. symbol_id nextId = next->id;
  731. unsigned numReductions = curState.numReductions(nextId);
  732. for (unsigned i = 0; i < numReductions; i++)
  733. {
  734. unsigned productionIndex = curState.queryReduction(nextId, i);
  735. LRProduction * production = &table.productions[productionIndex];
  736. expandReduction(element, production, production->getNumSymbols());
  737. }
  738. }
  739. void LRParser::doShifts(LRActiveState * active, GrammarSymbol * next)
  740. {
  741. symbol_id nextid = next->id;
  742. ForEachItemIn(idx, active->elements)
  743. {
  744. StackElement & cur = active->elements.item(idx);
  745. LRState & curState = *table.states[cur.state];
  746. state_id nextState = curState.getShift(nextid);
  747. if (nextState != NO_STATE)
  748. {
  749. LOG(MCdebugProgress, unknownJob, "Shift to state %d", nextState);
  750. activeOutput->addElementOwn(createState(&cur, nextState, next), chooseBest);
  751. }
  752. if (curState.canAccept(nextid))
  753. {
  754. for (StackElement * accept = &cur; accept; accept = accept->sibling)
  755. {
  756. GrammarSymbol * sym = accept->shifted;
  757. accepted.append(*LINK(sym));
  758. LOG(MCdebugProgress, unknownJob, "Accepted %p[%p]", accept, sym);
  759. }
  760. }
  761. }
  762. }
  763. void LRParser::doShifts(GrammarSymbol * next, bool singleToken)
  764. {
  765. doShifts(activeInput, next);
  766. if (!singleToken)
  767. doShifts(reduced, next);
  768. }
  769. position_t LRParser::getFirstPosition()
  770. {
  771. return firstPosition;
  772. }
  773. void LRParser::removePosition(position_t pos)
  774. {
  775. position_t cur = pos+1;
  776. cleanupPosition(pos);
  777. while (cur < endPosition)
  778. {
  779. LRActiveState * active = getPosition(cur);
  780. if (active && active->elements.ordinality())
  781. break;
  782. cur++;
  783. }
  784. if (pos == firstPosition)
  785. {
  786. firstPosition = cur;
  787. if (firstPosition == endPosition)
  788. {
  789. firstPosition = NotFound;
  790. endPosition = NotFound;
  791. }
  792. }
  793. else
  794. {
  795. assertex(!"This should work, but why is it happening???");
  796. }
  797. }
  798. void LRParser::selectEndPosition(unsigned offset)
  799. {
  800. if (firstPosition == endPosition)
  801. {
  802. firstPosition = offset;
  803. endPosition = offset;
  804. }
  805. assertex(offset >= firstPosition && offset < firstPosition+MAX_POSITIONS);
  806. if (offset >= endPosition)
  807. endPosition = offset+1;
  808. if (!getPosition(offset))
  809. setPositionOwn(offset, new LRActiveState(table.numStates));
  810. activeOutput = getPosition(offset);
  811. }
  812. void LRParser::selectStartPosition(unsigned offset)
  813. {
  814. assertex(offset >= firstPosition && offset < endPosition);
  815. activeInput = getPosition(offset);
  816. }
  817. //---------------------------------------------------------------------------
  818. #if 0
  819. void LRMultiParser::parse(IMultiTokenLexer & lexer)
  820. {
  821. selectEndPosition(0);
  822. addStartState();
  823. unsigned eofId = lexer.getEofId();
  824. GrammarSymbolArray tokens;
  825. bool done = false;
  826. while (!done)
  827. {
  828. position_t nextPosition = getFirstPosition();
  829. if (nextPosition == NotFound)
  830. break;
  831. selectStartPosition(nextPosition);
  832. tokens.kill();
  833. if (lexer.next(nextPosition, tokens)) // doesn't return whitespace.
  834. {
  835. ForEachItemIn(idx, tokens)
  836. {
  837. GrammarSymbol & curToken = tokens.item(idx);
  838. process(&curToken, tokens.ordinality() == 1);
  839. }
  840. if (tokens.ordinality() == 1 && tokens.item(0).id == eofId)
  841. done = true;
  842. }
  843. removePosition(nextPosition);
  844. }
  845. }
  846. #endif
  847. //---------------------------------------------------------------------------
  848. TomitaResultIterator::TomitaResultIterator(const TomitaStateInformation & _rowState, TomitaAlgorithm * _def) : rowState(_rowState), results(_def->matchInfo)
  849. {
  850. def = LINK(_def);
  851. matchFirst = (def->matchAction == INlpParseAlgorithm::NlpMatchFirst);
  852. singleMatchPerSymbol = def->chooseMin || def->chooseMax || def->chooseBest || matchFirst;
  853. }
  854. TomitaResultIterator::~TomitaResultIterator()
  855. {
  856. results.kill();
  857. def->Release();
  858. }
  859. int compareStartPtr(CInterface * * pLeft, CInterface * * pRight)
  860. {
  861. //try and make it different to improve sort performance, and so results are reproducible between windows and linux, but it can't be guaranteed.
  862. GrammarSymbol * left = static_cast<GrammarSymbol *>(*pLeft);
  863. GrammarSymbol * right = static_cast<GrammarSymbol *>(*pRight);
  864. int delta = (int)(left->queryStartPtr() - right->queryStartPtr()); // ok unless records can be 2Gb.
  865. if (delta != 0)
  866. return delta;
  867. delta = left->getPenalty() - right->getPenalty();
  868. if (delta)
  869. return delta;
  870. delta = left->getLength() - right->getLength();
  871. if (delta)
  872. return delta;
  873. return 0;
  874. }
  875. bool TomitaResultIterator::isBetter(const GrammarSymbol * left, const GrammarSymbol * right)
  876. {
  877. if (def->chooseMin || def->chooseMax)
  878. {
  879. unsigned leftLength = left->getLength();
  880. unsigned rightLength = right->getLength();
  881. if (leftLength != rightLength)
  882. {
  883. if (def->chooseMin)
  884. return leftLength < rightLength;
  885. else if (def->chooseMax)
  886. return leftLength > rightLength;
  887. }
  888. }
  889. if (def->chooseBest)
  890. {
  891. int leftPenalty = left->getPenalty();
  892. int rightPenalty = right->getPenalty();
  893. return (leftPenalty < rightPenalty);
  894. }
  895. return false;
  896. }
  897. void TomitaResultIterator::reset(const GrammarSymbolArray & _values)
  898. {
  899. values.kill();
  900. curIndex = NotFound;
  901. unsigned numValues = _values.ordinality();
  902. if (numValues == 0)
  903. {
  904. if (def->notMatchedOnly || def->notMatched)
  905. {
  906. //Add a dummy matched entry...
  907. values.append(*new Terminal(def->eofId, NULL, 0, NULL));
  908. }
  909. return;
  910. }
  911. if (def->notMatchedOnly)
  912. return;
  913. values.ensure(numValues);
  914. for (unsigned i = 0; i < numValues; i++)
  915. values.append(OLINK(_values.item(i)));
  916. if (def->scanAction != INlpParseAlgorithm::NlpScanWhole)
  917. values.sort(compareStartPtr);
  918. if (def->chooseMin || def->chooseMax || def->chooseBest)
  919. {
  920. GrammarSymbol * best = &values.item(0);
  921. for (unsigned j=1; j < numValues;)
  922. {
  923. GrammarSymbol & cur = values.item(j);
  924. if (cur.queryStartPtr() != best->queryStartPtr())
  925. {
  926. //Scan none: remove all entries that are at a different offset
  927. if (def->scanAction == INlpParseAlgorithm::NlpScanNone)
  928. {
  929. values.popn(numValues-j);
  930. break;
  931. }
  932. if (!def->singleChoicePerLine)
  933. {
  934. if (def->scanAction == INlpParseAlgorithm::NlpScanNext)
  935. {
  936. if (cur.queryStartPtr() < best->queryEndPtr())
  937. {
  938. numValues--;
  939. values.remove(j);
  940. continue;
  941. }
  942. }
  943. best = &cur;
  944. }
  945. }
  946. else
  947. {
  948. if (matchFirst)
  949. {
  950. numValues--;
  951. values.remove(j);
  952. continue;
  953. }
  954. }
  955. if (best != &cur)
  956. {
  957. if (isBetter(&cur, best))
  958. {
  959. values.zap(*best);
  960. best = &cur;
  961. }
  962. else
  963. values.remove(j);
  964. numValues--;
  965. //NB: j is now pointing at the next entry
  966. }
  967. else
  968. j++;
  969. }
  970. }
  971. else
  972. {
  973. GrammarSymbol * last = &values.item(0);
  974. for (unsigned j=1; j < numValues;)
  975. {
  976. GrammarSymbol & cur = values.item(j);
  977. if (cur.queryStartPtr() != last->queryStartPtr())
  978. {
  979. if (def->scanAction == INlpParseAlgorithm::NlpScanNone)
  980. {
  981. //Scan none: remove all entries that are at a different offset
  982. values.popn(numValues-j);
  983. break;
  984. }
  985. else if ((def->scanAction == INlpParseAlgorithm::NlpScanNext) &&
  986. (cur.queryStartPtr() < last->queryEndPtr()))
  987. {
  988. numValues--;
  989. values.remove(j);
  990. }
  991. else
  992. {
  993. last = &cur;
  994. j++;
  995. }
  996. }
  997. else
  998. {
  999. if (matchFirst)
  1000. {
  1001. numValues--;
  1002. values.remove(j);
  1003. }
  1004. else
  1005. {
  1006. if (def->scanAction == INlpParseAlgorithm::NlpScanNext)
  1007. {
  1008. if (cur.queryEndPtr() < last->queryEndPtr())
  1009. last = &cur;
  1010. }
  1011. j++;
  1012. }
  1013. }
  1014. }
  1015. }
  1016. }
  1017. bool TomitaResultIterator::first()
  1018. {
  1019. curIndex = 0;
  1020. if (!isValid())
  1021. return false;
  1022. firstChoice();
  1023. return true;
  1024. }
  1025. void TomitaResultIterator::firstChoice()
  1026. {
  1027. if (def->chooseBest)
  1028. choice.selectBest(&values.item(curIndex));
  1029. else
  1030. choice.first(&values.item(curIndex));
  1031. }
  1032. bool TomitaResultIterator::next()
  1033. {
  1034. if (!isValid())
  1035. return false;
  1036. if (!singleMatchPerSymbol && choice.next(&values.item(curIndex)))
  1037. return true;
  1038. curIndex++;
  1039. if (!isValid())
  1040. return false;
  1041. firstChoice();
  1042. return true;
  1043. }
  1044. void TomitaResultIterator::invalidate()
  1045. {
  1046. curIndex = (unsigned)-1;
  1047. values.kill();
  1048. }
  1049. bool TomitaResultIterator::isValid()
  1050. {
  1051. return values.isItem(curIndex);
  1052. }
  1053. const void * TomitaResultIterator::getRow()
  1054. {
  1055. Owned<IMatchWalker> walker = getWalker();
  1056. results.extractResults(&values.item(curIndex), choice, rowState.inputText);
  1057. // results.extractResults(walker, rowState.inputText);
  1058. RtlDynamicRowBuilder rowBuilder(outputAllocator);
  1059. unsigned newSize = rowState.action->onMatch(rowBuilder, rowState.row, &results, walker);
  1060. assertex(newSize);
  1061. return rowBuilder.finalizeRowClear(newSize);
  1062. }
  1063. //---------------------------------------------------------------------------
  1064. TomitaParser::TomitaParser(ICodeContext * ctx, TomitaAlgorithm * _def, unsigned _activityId, INlpHelper * _helper, IHThorParseArg * arg)
  1065. : parser(_def->table, rowState), lexer(_def->tokenDfa, _def->skipDfa, _def->endTokenChars, _def->eofId), iter(rowState, _def)
  1066. {
  1067. assertex(ctx);
  1068. def = _def;
  1069. helper = _helper;
  1070. helperArg = arg;
  1071. eofId = def->eofId;
  1072. outputAllocator.setown(ctx->getRowAllocator(arg->queryOutputMeta(), _activityId));
  1073. iter.setAllocator(outputAllocator);
  1074. //Now ensure that allocators are created for each of the production call backs.
  1075. for (unsigned i=0; i < def->numProductions(); i++)
  1076. {
  1077. IOutputMetaData * rs = arg->queryProductionMeta(i);
  1078. if (rs && !rowState.queryAllocator(rs))
  1079. {
  1080. Owned<IEngineRowAllocator> allocator = ctx->getRowAllocator(rs, _activityId);
  1081. if (allocator)
  1082. rowState.addAllocator(*allocator);
  1083. }
  1084. }
  1085. }
  1086. bool TomitaParser::performMatch(IMatchedAction & action, const void * row, unsigned len, const void * data)
  1087. {
  1088. rowState.row = row;
  1089. rowState.lengthInputText = len;
  1090. rowState.inputText = (const byte *)data;
  1091. rowState.action = &action;
  1092. rowState.inputFormat = def->inputFormat;
  1093. rowState.helper = helper;
  1094. rowState.helperArg = helperArg;
  1095. lexer.setDocument(len, data);
  1096. parser.beginParse(def->chooseBest);
  1097. bool scanWhole = (def->scanAction == INlpParseAlgorithm::NlpScanWhole);
  1098. loop
  1099. {
  1100. position_t nextPosition = parser.getFirstPosition();
  1101. if (nextPosition == NotFound)
  1102. break;
  1103. tokens.kill();
  1104. unsigned numTokens = lexer.next(nextPosition, tokens);
  1105. parser.selectStartPosition(nextPosition);
  1106. if (numTokens)
  1107. {
  1108. bool insertEnd = !scanWhole;
  1109. bool isOnlyToken = false;
  1110. if (numTokens == 1)
  1111. {
  1112. if (tokens.item(0).getId() == eofId)
  1113. {
  1114. insertEnd = false;
  1115. isOnlyToken = true;
  1116. }
  1117. else
  1118. isOnlyToken = scanWhole;
  1119. }
  1120. for (unsigned idx=0; idx < numTokens;idx++)
  1121. {
  1122. GrammarSymbol & curToken = tokens.item(idx);
  1123. parser.process(&curToken, isOnlyToken);
  1124. if (!scanWhole)
  1125. // a bit nasty - end position is still set up after the process() call.
  1126. parser.addStartState();
  1127. }
  1128. if (insertEnd)
  1129. {
  1130. Owned<Terminal> eofToken = new Terminal(eofId, NULL, 0, (const byte *)data + nextPosition);
  1131. parser.process(eofToken, false);
  1132. }
  1133. }
  1134. else if (!scanWhole && (nextPosition != len))
  1135. {
  1136. Owned<Terminal> eofToken = new Terminal(eofId, NULL, 0, (const byte *)data + nextPosition);
  1137. parser.process(eofToken, true);
  1138. parser.selectEndPosition(nextPosition+1);
  1139. parser.addStartState();
  1140. }
  1141. parser.removePosition(nextPosition);
  1142. if (nextPosition == len)
  1143. break;
  1144. }
  1145. parser.endParse();
  1146. iter.reset(parser.queryAccepted());
  1147. //MORE: Post filter lost of options.. including whole etc...
  1148. return parser.numAccepted() != 0;
  1149. }
  1150. INlpResultIterator * TomitaParser::queryResultIter()
  1151. {
  1152. return &iter;
  1153. }
  1154. void TomitaParser::reset()
  1155. {
  1156. iter.invalidate();
  1157. }
  1158. #if 0
  1159. To be done to finish multi parser
  1160. a) Save reductions on separate stack if >1 token at same position
  1161. b) Use two states for the single token case, and swap between to avoid copy.
  1162. Implementing a multiple lex parser:
  1163. 1) Parser maintain an ordered list of current start positions.
  1164. active[position-firstPosition] - use pointers into an array of states; States are cached on a list.
  1165. 2) lex from the first position in the list
  1166. 3) pass end position to parser - selects where new nodes get placed.
  1167. 4) Reductions have to be placed on to a separate list for each token, since dependant.
  1168. 4) Parser has associative array on position[x].
  1169. Items are processed from position x to position y
  1170. Notes:
  1171. Caching:
  1172. Can only prune a branch when shifting, which loses all elements simultaneously, so a singlely linked list suffices,
  1173. and solves link counting problems that a doubly linked list would have,
  1174. Local ambiguity packing:
  1175. When shifting a value, if it gets commoned up, and one of the common branches shifts the same symbol, and the preceeding state for that branch is the same, then common up that node.
  1176. ToDo:
  1177. * Do some more planning re:
  1178. o Augmented grammars
  1179. o Generating the lexer. Especially what we do about unknown words/multiple possible matches. [Other implictations if tokens do not necessarily lie on the same boundaries].
  1180. o Representing penalties and probabilities.
  1181. o Translating the regex syntax into parser input.
  1182. o Conditional reductions - where how/do they occur? What arguments do they need?
  1183. o Returning multiple rows from a single match?
  1184. * Parameterised patterns - how do they related to augmented grammars[do not], and what is needed to implement them?
  1185. * Design in detail the table generator
  1186. o LR or LALR?
  1187. o Pathological grammars e.g., S := | S | ... -> reread and understand doc. Can we cope?
  1188. * Use cases:
  1189. o MAX and BEST()
  1190. * Misc
  1191. Error if ": define()" is applied to a pattern
  1192. MAX,MIN in regex implementation
  1193. Stack problems with regex
  1194. #endif