thortparse.cpp 39 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #include "jliball.hpp"
  14. #include "eclrtl.hpp"
  15. #include "rtlds_imp.hpp"
  16. #include "thortparse.ipp"
  17. //#define TRACING
  18. //---------------------------------------------------------------------------
  19. void doUnwindRelease(GrammarSymbol * symbol, CIArrayOf<GrammarSymbol> & pending)
  20. {
  21. if (!symbol->isPacked())
  22. {
  23. unsigned level = pending.ordinality();
  24. unsigned num = symbol->numChildren();
  25. for (unsigned i = 0; i < num; i++)
  26. pending.append(*LINK(symbol->queryChild(i)));
  27. if (!symbol->Release())
  28. pending.trunc(level);
  29. }
  30. else
  31. symbol->Release();
  32. }
  33. bool GrammarSymbol::canMerge(const GrammarSymbol * other)
  34. {
  35. //MORE: Shouldn't this be enabled at some point?
  36. return false;
  37. byte * leftRow = queryResultRow();
  38. byte * rightRow = other->queryResultRow();
  39. if (leftRow || rightRow)
  40. {
  41. assertex(leftRow && rightRow); // if in same state they really should have consistency here
  42. assertex(queryResultSize() == other->queryResultSize());
  43. if (memcmp(leftRow, rightRow, queryResultSize()) != 0)
  44. return false;
  45. }
  46. assertex(features.info == other->features.info);
  47. size_t len = features.values.length();
  48. return memcmp(features.values.get(), other->features.values.get(), len) == 0;
  49. }
  50. GrammarSymbol * GrammarSymbol::createMerged(GrammarSymbol * other)
  51. {
  52. Owned<GrammarSymbol> merged = new PackedSymbol(this);
  53. return merged->createMerged(other);
  54. }
  55. //---------------------------------------------------------------------------
  56. Terminal::Terminal(symbol_id _id, const FeatureInfo * featureInfo, unsigned _len, const byte * _start) : GrammarSymbol(_id)
  57. {
  58. if (featureInfo && featureInfo->getNum() != 0)
  59. {
  60. features.info = featureInfo;
  61. unsigned size = featureInfo->getSize();
  62. void * resultFeatures = features.values.allocate(size);
  63. memcpy(resultFeatures, featureInfo->defaults, size);
  64. }
  65. len = _len;
  66. start = _start;
  67. }
  68. //---------------------------------------------------------------------------
  69. NonTerminal::NonTerminal(symbol_id _id, IAtom * _name, FeatureValue & _features, unsigned numSymbols, GrammarSymbol * * symbols, const byte * _reducePtr, size32_t _resultSize, byte * _resultRow) : GrammarSymbol(_id), resultSize(_resultSize), resultRow(_resultRow)
  70. {
  71. unsigned nullCount = 0;
  72. unsigned nonNullIndex = 0;
  73. reduced.ensure(numSymbols);
  74. for (unsigned i= 0; i < numSymbols; i++)
  75. {
  76. GrammarSymbol * cur = symbols[i];
  77. if (cur->isNull())
  78. nullCount++;
  79. else
  80. nonNullIndex = i;
  81. reduced.append(*LINK(cur));
  82. addPenalty(cur->getPenalty());
  83. }
  84. cachedIsNull = false;
  85. if (nullCount == numSymbols)
  86. cachedIsNull = true;
  87. else if (nullCount != 0)
  88. {
  89. if (nonNullIndex+1 != numSymbols)
  90. {
  91. const byte * pos = symbols[nonNullIndex]->queryEndPtr();
  92. for (unsigned i1 = nonNullIndex+1; i1 < numSymbols; i1++)
  93. symbols[i1]->resetPosition(pos);
  94. }
  95. for (unsigned i2 = nonNullIndex; i2-- > 0;)
  96. {
  97. if (symbols[i2]->isNull())
  98. symbols[i2]->resetPosition(symbols[i2+1]->queryStartPtr());
  99. }
  100. }
  101. name = _name;
  102. size_t len = _features.values.length();
  103. features.info = _features.info;
  104. if (len)
  105. features.values.setOwn(len, _features.values.detach());
  106. if (numSymbols == 0)
  107. reducePtr = _reducePtr;
  108. else
  109. reducePtr = NULL;
  110. }
  111. NonTerminal::~NonTerminal()
  112. {
  113. if (resultRow)
  114. rtlReleaseRow(resultRow);
  115. //Try and kill tail recursion when freeing a very large match structure
  116. while (reduced.ordinality())
  117. doUnwindRelease(&reduced.popGet(), reduced);
  118. }
  119. void NonTerminal::resetPosition(const byte * pos)
  120. {
  121. assertex(cachedIsNull);
  122. reducePtr = pos;
  123. ForEachItemInRev(idx, reduced)
  124. reduced.item(idx).resetPosition(pos);
  125. }
  126. const byte * NonTerminal::queryStartPtr() const
  127. {
  128. if (reduced.ordinality())
  129. return reduced.item(0).queryStartPtr();
  130. return reducePtr;
  131. }
  132. const byte * NonTerminal::queryEndPtr() const
  133. {
  134. if (reduced.ordinality())
  135. return reduced.tos().queryEndPtr();
  136. return reducePtr;
  137. }
  138. GrammarSymbol * NonTerminal::queryChild(unsigned i)
  139. {
  140. if (reduced.isItem(i))
  141. return &reduced.item(i);
  142. return NULL;
  143. }
  144. //---------------------------------------------------------------------------
  145. PackedSymbol::PackedSymbol(GrammarSymbol * symbol) : GrammarSymbol(symbol->id)
  146. {
  147. //copy the features...
  148. assertex(!symbol->isNull());
  149. equivalents.append(*LINK(symbol));
  150. penalty = symbol->getPenalty();
  151. }
  152. void PackedSymbol::resetPosition(const byte * pos)
  153. {
  154. throwUnexpected();
  155. }
  156. GrammarSymbol * PackedSymbol::createMerged(GrammarSymbol * other)
  157. {
  158. assertex(!other->isNull());
  159. assertex(other->queryStartPtr() == queryStartPtr());
  160. assertex(other->queryEndPtr() == queryEndPtr());
  161. if (other->getPenalty() < penalty)
  162. penalty = other->getPenalty();
  163. if (other->isPacked())
  164. {
  165. //Can happen when two activity states are merged.
  166. for (unsigned i = 0; ; i++)
  167. {
  168. GrammarSymbol * child = queryPacked(i);
  169. if (!child)
  170. break;
  171. equivalents.append(*LINK(child));
  172. }
  173. }
  174. else
  175. equivalents.append(*LINK(other));
  176. return LINK(this);
  177. }
  178. GrammarSymbol * PackedSymbol::queryPacked(unsigned i)
  179. {
  180. if (equivalents.isItem(i))
  181. return &equivalents.item(i);
  182. return NULL;
  183. }
  184. const byte * PackedSymbol::queryStartPtr() const
  185. {
  186. return equivalents.item(0).queryStartPtr();
  187. }
  188. const byte * PackedSymbol::queryEndPtr() const
  189. {
  190. return equivalents.item(0).queryEndPtr();
  191. }
  192. //---------------------------------------------------------------------------
  193. TomitaMatchWalker::TomitaMatchWalker(const PackedSymbolChoice & _choice, GrammarSymbol * _symbol) : choice(_choice)
  194. {
  195. symbol = _symbol;
  196. while (symbol->isPacked())
  197. {
  198. symbol = symbol->queryPacked(choice.getInstance(symbol));
  199. }
  200. }
  201. IAtom * TomitaMatchWalker::queryName()
  202. {
  203. return symbol->queryName();
  204. }
  205. unsigned TomitaMatchWalker::queryID()
  206. {
  207. return symbol->getId();
  208. }
  209. size32_t TomitaMatchWalker::queryMatchSize()
  210. {
  211. return symbol->getLength();
  212. }
  213. const void * TomitaMatchWalker::queryMatchStart()
  214. {
  215. return symbol->queryStartPtr();
  216. }
  217. unsigned TomitaMatchWalker::numChildren()
  218. {
  219. return symbol->numChildren();
  220. }
  221. IMatchWalker * TomitaMatchWalker::getChild(unsigned idx)
  222. {
  223. GrammarSymbol * child = symbol->queryChild(idx);
  224. if (child)
  225. return new TomitaMatchWalker(choice, child);
  226. return NULL;
  227. }
  228. //---------------------------------------------------------------------------
  229. void PackedSymbolChoice::expandFirst(GrammarSymbol * symbol)
  230. {
  231. if (symbol->isPacked())
  232. {
  233. symbols.append(*LINK(symbol));
  234. branches.append(0);
  235. expandFirst(symbol->queryPacked(0));
  236. }
  237. else
  238. {
  239. unsigned max = symbol->numChildren();
  240. for (unsigned i = 0; i < max; i++)
  241. expandFirst(symbol->queryChild(i));
  242. }
  243. }
  244. void PackedSymbolChoice::first(GrammarSymbol * symbol)
  245. {
  246. branches.kill();
  247. symbols.kill();
  248. expandFirst(symbol);
  249. }
  250. void PackedSymbolChoice::expandBest(GrammarSymbol * symbol)
  251. {
  252. if (symbol->isPacked())
  253. {
  254. symbols.append(*LINK(symbol));
  255. unsigned bestIndex = 0;
  256. int bestPenalty = symbol->queryPacked(bestIndex)->getPenalty();
  257. for (unsigned j=1;;j++)
  258. {
  259. GrammarSymbol * cur = symbol->queryPacked(j);
  260. if (!cur) break;
  261. if (bestPenalty > cur->getPenalty())
  262. {
  263. bestIndex = j;
  264. bestPenalty = cur->getPenalty();
  265. }
  266. }
  267. branches.append(bestIndex);
  268. expandBest(symbol->queryPacked(bestIndex));
  269. }
  270. else
  271. {
  272. unsigned max = symbol->numChildren();
  273. for (unsigned i = 0; i < max; i++)
  274. expandBest(symbol->queryChild(i));
  275. }
  276. }
  277. void PackedSymbolChoice::selectBest(GrammarSymbol * symbol)
  278. {
  279. branches.kill();
  280. symbols.kill();
  281. expandBest(symbol);
  282. }
  283. bool PackedSymbolChoice::expandNext(unsigned & level, GrammarSymbol * symbol)
  284. {
  285. unsigned thisLevel = level;
  286. if (symbol->isPacked())
  287. {
  288. assertex(&symbols.item(thisLevel) == symbol);
  289. unsigned curBranch = branches.item(thisLevel);
  290. bool isLastLevel = (thisLevel == symbols.ordinality()-1);
  291. level++;
  292. if (!isLastLevel && expandNext(level, symbol->queryPacked(curBranch)))
  293. return true;
  294. curBranch++;
  295. if (symbol->queryPacked(curBranch))
  296. {
  297. branches.replace(curBranch, thisLevel);
  298. expandFirst(symbol->queryPacked(curBranch));
  299. return true;
  300. }
  301. branches.pop();
  302. symbols.pop();
  303. return false;
  304. }
  305. else
  306. {
  307. unsigned max = symbol->numChildren();
  308. for (unsigned i = 0; i < max; i++)
  309. if (!expandNext(level, symbol->queryChild(i)))
  310. return false;
  311. return true;
  312. }
  313. }
  314. bool PackedSymbolChoice::next(GrammarSymbol * symbol)
  315. {
  316. if (symbols.ordinality() == 0)
  317. return false;
  318. unsigned level = 0;
  319. return expandNext(level, symbol);
  320. }
  321. unsigned PackedSymbolChoice::getInstance(GrammarSymbol * symbol) const
  322. {
  323. unsigned match = symbols.find(*symbol);
  324. assertex(match != NotFound);
  325. return branches.item(match);
  326. }
  327. //---------------------------------------------------------------------------
  328. StackElement::StackElement(GrammarSymbol * _shifted, state_id _state, StackElement * _prev, LRParser * _pool)
  329. {
  330. shifted.set(_shifted);
  331. state = _state;
  332. prev.set(_prev);
  333. pool = _pool;
  334. }
  335. void StackElement::addSibling(StackElement * sib)
  336. {
  337. sib->sibling.setown(sibling.getClear());
  338. sibling.set(sib);
  339. }
  340. void StackElement::addToPool()
  341. {
  342. StackElementArray pending;
  343. gatherPoolPending(pending);
  344. doAddToPool();
  345. while (pending.ordinality())
  346. {
  347. Owned<StackElement> cur = &pending.popGet();
  348. cur->gatherPoolPending(pending);
  349. cur->doAddToPool();
  350. }
  351. }
  352. void StackElement::doAddToPool()
  353. {
  354. shifted.clear();
  355. prev.clear();
  356. sibling.clear();
  357. state = 0;
  358. pool->addFreeElement(this);
  359. pool = NULL;
  360. }
  361. void StackElement::Release()
  362. {
  363. if (pool && !IsShared())
  364. addToPool();
  365. CInterface::Release();
  366. }
  367. void StackElement::getDebugText(StringBuffer & s)
  368. {
  369. if (prev)
  370. prev->getDebugText(s);
  371. s.appendf("%p", this);
  372. s.append("[");
  373. if(shifted)
  374. {
  375. s.append(shifted->id).append(" ");
  376. s.appendf("{%p}", shifted->queryStartPtr());
  377. }
  378. s.append(state).append("] ");
  379. }
  380. bool StackElement::potentialPackNode(StackElement & other) const
  381. {
  382. GrammarSymbol * curSymbol = shifted;
  383. GrammarSymbol * nextSymbol = other.shifted;
  384. if (curSymbol && !curSymbol->isNull() && curSymbol->id == nextSymbol->id)
  385. {
  386. if (other.prev->state == prev->state)
  387. {
  388. if (curSymbol->queryStartPtr() == nextSymbol->queryStartPtr())
  389. {
  390. assertex(curSymbol->queryEndPtr() == nextSymbol->queryEndPtr());
  391. assertex(state == other.state);
  392. return true;
  393. }
  394. }
  395. }
  396. return false;
  397. }
  398. void StackElement::gatherPoolPending(CIArrayOf<StackElement> & pending)
  399. {
  400. if (prev && !prev->IsShared())
  401. pending.append(*prev.getClear());
  402. if (sibling && !sibling->IsShared())
  403. pending.append(*sibling.getClear());
  404. }
  405. //---------------------------------------------------------------------------
  406. LRActiveState::LRActiveState(unsigned _maxStates)
  407. {
  408. maxStates = _maxStates;
  409. cacheBytes = maxStates * sizeof(StackElement *);
  410. cache = new StackElement * [maxStates];
  411. beenReduced = new bool [maxStates];
  412. memset(beenReduced, 0, maxStates);
  413. memset(cache, 0, cacheBytes);
  414. }
  415. LRActiveState::~LRActiveState()
  416. {
  417. delete [] cache;
  418. delete [] beenReduced;
  419. }
  420. void LRActiveState::addElementOwn(StackElement * next, bool keepBest)
  421. {
  422. unsigned stateId = next->state;
  423. if (cache[stateId])
  424. {
  425. if (!mergePackedNode(stateId, next, keepBest))
  426. cache[stateId]->addSibling(next);
  427. next->Release();
  428. }
  429. else
  430. {
  431. cache[stateId] = next;
  432. elements.append(*next);
  433. }
  434. }
  435. void LRActiveState::clearReduced()
  436. {
  437. ForEachItemIn(idx, elements)
  438. beenReduced[elements.item(idx).state] = false;
  439. }
  440. bool LRActiveState::mergePackedNode(unsigned stateId, StackElement * next, bool keepBest)
  441. {
  442. StackElement * cur = cache[stateId];
  443. assertex(cur->state == next->state);
  444. assertex(!next->sibling);
  445. // If two symbols have the same id/attributes, and lie between the same states, then it is a locally
  446. // ambiguous tree, so common the symbol values up.
  447. GrammarSymbol * nextSymbol = next->shifted;
  448. if (!nextSymbol)
  449. return false; // MORE: Should this common???
  450. if (nextSymbol->isNull()) // Hard to know the position of these things...
  451. return false;
  452. StackElement * prev = NULL;
  453. do
  454. {
  455. GrammarSymbol * curSymbol = cur->shifted;
  456. if (curSymbol && !curSymbol->isNull() && (curSymbol->id == nextSymbol->id))
  457. {
  458. if (next->prev->state == cur->prev->state)
  459. {
  460. if (curSymbol->queryStartPtr() == nextSymbol->queryStartPtr())
  461. {
  462. assertex(curSymbol->queryEndPtr() == nextSymbol->queryEndPtr());
  463. assertex(cur->state == next->state);
  464. if (keepBest)
  465. {
  466. int curPenalty = curSymbol->getPenalty();
  467. int nextPenalty = nextSymbol->getPenalty();
  468. if (curPenalty != nextPenalty)
  469. {
  470. //Existing element is better=>throw away the new one.
  471. if (nextPenalty > curPenalty)
  472. return true;
  473. //The new element is better, throw away all the mismatches, by removing this element, and going around again
  474. if (prev)
  475. {
  476. cur = prev;
  477. prev->sibling.set(prev->sibling->sibling);
  478. }
  479. else
  480. {
  481. StackElement * sibling = cur->sibling.getClear();
  482. unsigned replacePos = elements.find(*cur);
  483. if (sibling)
  484. {
  485. //More than one element => replace the head with the (linked) sibling
  486. elements.replace(*sibling, replacePos);
  487. cache[stateId] = sibling;
  488. cur = sibling;
  489. continue;
  490. }
  491. else
  492. {
  493. elements.replace(*LINK(next), replacePos);
  494. cache[stateId] = next;
  495. return true;
  496. }
  497. }
  498. }
  499. }
  500. if (curSymbol->canMerge(nextSymbol))
  501. {
  502. cur->shifted.setown(curSymbol->createMerged(nextSymbol));
  503. #ifdef TRACING
  504. LOG(MCdebugProgress, unknownJob, "Nodes Merged: %p = %p, %p", cur->shifted.get(), curSymbol, nextSymbol);
  505. #endif
  506. return true;
  507. }
  508. }
  509. }
  510. }
  511. prev = cur;
  512. cur = cur->sibling;
  513. } while (cur);
  514. return false;
  515. }
  516. void LRActiveState::reinit()
  517. {
  518. memset(cache, 0, cacheBytes);
  519. elements.kill();
  520. }
  521. void LRActiveState::mergeIn(LRActiveState & other, bool keepBest)
  522. {
  523. ForEachItemIn(idx, other.elements)
  524. {
  525. Linked<StackElement> cur = &other.elements.item(idx);
  526. while (cur)
  527. {
  528. Linked<StackElement> next = cur->sibling;
  529. cur->sibling.clear();
  530. addElementOwn(cur.getClear(), keepBest);
  531. cur.setown(next.getClear());
  532. }
  533. }
  534. }
  535. //---------------------------------------------------------------------------
  536. LRParser::LRParser(const LRTable & _table, const TomitaParserCallback & _rowState) : table(_table), rowState(_rowState)
  537. {
  538. reduced = new LRActiveState(table.numStates);
  539. reducedOverflow[0] = new LRActiveState(table.numStates);
  540. reducedOverflow[1] = new LRActiveState(table.numStates);
  541. _clear(positions);
  542. init();
  543. }
  544. LRParser::~LRParser()
  545. {
  546. //Safely kill the free list.
  547. StackElement * head = nextFreeElement.getClear();
  548. while (head)
  549. {
  550. StackElement * next = head->sibling.getClear();
  551. head->Release();
  552. head = next;
  553. }
  554. for (position_t pos = 0; pos < MAX_POSITIONS; pos++)
  555. delete positions[pos];
  556. delete reduced;
  557. delete reducedOverflow[0];
  558. delete reducedOverflow[1];
  559. }
  560. void LRParser::addFreeElement(StackElement * element)
  561. {
  562. element->sibling.setown(nextFreeElement.getClear());
  563. nextFreeElement.set(element);
  564. }
  565. void LRParser::beginParse(bool _chooseBest)
  566. {
  567. chooseBest = _chooseBest;
  568. selectEndPosition(0);
  569. addStartState();
  570. accepted.kill();
  571. }
  572. void LRParser::endParse()
  573. {
  574. }
  575. void LRParser::clear()
  576. {
  577. activeInput = NULL;
  578. activeOutput = NULL;
  579. firstPosition = NotFound;
  580. endPosition = NotFound;
  581. }
  582. void LRParser::cleanupPosition(position_t pos)
  583. {
  584. unsigned index = pos % MAX_POSITIONS;
  585. positions[index]->reinit();
  586. }
  587. void LRParser::init()
  588. {
  589. chooseBest = false;
  590. reduced->reinit();
  591. reducedOverflow[0]->reinit();
  592. reducedOverflow[1]->reinit();
  593. clear();
  594. }
  595. void LRParser::addStartState()
  596. {
  597. activeOutput->addElementOwn(createState(NULL, (state_id)table.rootState, NULL), false);
  598. }
  599. StackElement * LRParser::createState(StackElement * prev, state_id nextState, GrammarSymbol * shifted)
  600. {
  601. StackElement * newState;
  602. if (nextFreeElement)
  603. {
  604. newState = nextFreeElement.getClear();
  605. nextFreeElement.setown(newState->sibling.getClear());
  606. newState->shifted.set(shifted);
  607. newState->prev.set(prev);
  608. newState->state = nextState;
  609. newState->pool = this;
  610. }
  611. else
  612. {
  613. newState = new StackElement(shifted, nextState, prev, this);
  614. }
  615. #ifdef TRACING
  616. LOG(MCdebugProgress, unknownJob, "%p: Push state %d symbol %d[%p] previous: %d[%p]", newState, nextState, shifted ? shifted->id : -1, shifted, prev ? prev->state : -1, prev);
  617. StringBuffer s;
  618. newState->getDebugText(s);
  619. s.newline();
  620. LOG(MCdebugProgress, unknownJob, s.str());
  621. #endif
  622. return newState;
  623. }
  624. void LRParser::doReductions(LRActiveState & active, GrammarSymbol * next)
  625. {
  626. //At the moment reduce can not remove items.
  627. //Done as a double loop to reduce calls to active.ordinality()
  628. unsigned max = active.elements.ordinality();
  629. for (unsigned i=0; i < max; )
  630. {
  631. for (; i < max; i++)
  632. {
  633. StackElement & cur = active.elements.item(i);
  634. active.markReduced(cur.state);
  635. reduce(cur, next);
  636. }
  637. max = active.elements.ordinality();
  638. }
  639. active.clearReduced();
  640. }
  641. void LRParser::doReductions(GrammarSymbol * next, bool singleToken)
  642. {
  643. //The reduction need to go to a separate list - since may be different for each token
  644. //but want to optimize case where only a single token.
  645. LRActiveState * output = activeInput;
  646. if (!singleToken)
  647. {
  648. output = reduced;
  649. reduced->reinit();
  650. }
  651. activeOutput = output;
  652. curOverflow = reducedOverflow[0];
  653. doReductions(*activeInput, next);
  654. if (!singleToken)
  655. doReductions(*reduced, next);
  656. //You get weird situations where reductions add to states thathave already been reduced,
  657. //so any clashes like that need to be sent to other lists, processed and merged in.
  658. unsigned overflowIndex = 0;
  659. while (curOverflow->elements.ordinality())
  660. {
  661. LRActiveState * oldOverflow = curOverflow;
  662. overflowIndex = 1-overflowIndex;
  663. activeOutput = oldOverflow;
  664. curOverflow = reducedOverflow[overflowIndex];
  665. doReductions(*oldOverflow, next);
  666. output->mergeIn(*oldOverflow, chooseBest);
  667. oldOverflow->reinit();
  668. }
  669. curOverflow = NULL;
  670. activeOutput = NULL;
  671. }
  672. void LRParser::process(GrammarSymbol * next, bool singleToken)
  673. {
  674. #ifdef TRACING
  675. LOG(MCdebugProgress, unknownJob, "Process token '%.*s' %d at position %d", next->queryEndPtr()-next->queryStartPtr(), next->queryStartPtr(), next->id, next->queryStartPtr()-rowState.inputText);
  676. #endif
  677. doReductions(next, singleToken);
  678. selectEndPosition((size32_t)(next->queryEndPtr()-rowState.inputText));
  679. doShifts(next, singleToken);
  680. }
  681. //Build up an array of the symbols to expand - may involve uncommoning nodes
  682. void LRParser::expandReduction(StackElement & element, LRProduction * production, unsigned numSymbols)
  683. {
  684. if (numSymbols == 0)
  685. {
  686. const byte * reducePtr = firstPosition+rowState.inputText;
  687. Owned<GrammarSymbol> reduced = production->reduce(reducedArgs, reducePtr, rowState);
  688. if (reduced)
  689. {
  690. state_id nextState = table.states[element.state]->getGoto(reduced->id);
  691. #ifdef TRACING
  692. StringBuffer s;
  693. for (unsigned i = 0; i < production->getNumSymbols(); i++)
  694. s.appendf("%p ", reducedArgs[i]);
  695. LOG(MCdebugProgress, unknownJob, "Reduce by production %d new element %p[%s]", reduced->id, reduced.get(), s.str());
  696. #endif
  697. //MORE: Some kind of recursion checking needed?
  698. StackElement * cached = activeOutput->cache[nextState];
  699. if (activeOutput->okToAddReduction(nextState))
  700. activeOutput->addElementOwn(createState(&element, nextState, reduced), chooseBest);
  701. else
  702. curOverflow->addElementOwn(createState(&element, nextState, reduced), chooseBest);
  703. }
  704. return;
  705. }
  706. StackElement * curElement = &element;
  707. do
  708. {
  709. reducedArgs[numSymbols-1] = curElement->shifted;
  710. expandReduction(*curElement->prev, production, numSymbols-1);
  711. curElement = curElement->sibling;
  712. } while (curElement);
  713. }
  714. LRActiveState * LRParser::getPosition(position_t pos)
  715. {
  716. unsigned index = pos % MAX_POSITIONS;
  717. return positions[index];
  718. }
  719. void LRParser::setPositionOwn(position_t pos, LRActiveState * value)
  720. {
  721. unsigned index = pos % MAX_POSITIONS;
  722. positions[index] = value;
  723. }
  724. void LRParser::reduce(StackElement & element, GrammarSymbol * next)
  725. {
  726. LRState & curState = *table.states[element.state];
  727. symbol_id nextId = next->id;
  728. unsigned numReductions = curState.numReductions(nextId);
  729. for (unsigned i = 0; i < numReductions; i++)
  730. {
  731. unsigned productionIndex = curState.queryReduction(nextId, i);
  732. LRProduction * production = &table.productions[productionIndex];
  733. expandReduction(element, production, production->getNumSymbols());
  734. }
  735. }
  736. void LRParser::doShifts(LRActiveState * active, GrammarSymbol * next)
  737. {
  738. symbol_id nextid = next->id;
  739. ForEachItemIn(idx, active->elements)
  740. {
  741. StackElement & cur = active->elements.item(idx);
  742. LRState & curState = *table.states[cur.state];
  743. state_id nextState = curState.getShift(nextid);
  744. if (nextState != NO_STATE)
  745. {
  746. #ifdef TRACING
  747. LOG(MCdebugProgress, unknownJob, "Shift to state %d", nextState);
  748. #endif
  749. activeOutput->addElementOwn(createState(&cur, nextState, next), chooseBest);
  750. }
  751. if (curState.canAccept(nextid))
  752. {
  753. for (StackElement * accept = &cur; accept; accept = accept->sibling)
  754. {
  755. GrammarSymbol * sym = accept->shifted;
  756. accepted.append(*LINK(sym));
  757. #ifdef TRACING
  758. LOG(MCdebugProgress, unknownJob, "Accepted %p[%p]", accept, sym);
  759. #endif
  760. }
  761. }
  762. }
  763. }
  764. void LRParser::doShifts(GrammarSymbol * next, bool singleToken)
  765. {
  766. doShifts(activeInput, next);
  767. if (!singleToken)
  768. doShifts(reduced, next);
  769. }
  770. position_t LRParser::getFirstPosition()
  771. {
  772. return firstPosition;
  773. }
  774. void LRParser::removePosition(position_t pos)
  775. {
  776. position_t cur = pos+1;
  777. cleanupPosition(pos);
  778. while (cur < endPosition)
  779. {
  780. LRActiveState * active = getPosition(cur);
  781. if (active && active->elements.ordinality())
  782. break;
  783. cur++;
  784. }
  785. if (pos == firstPosition)
  786. {
  787. firstPosition = cur;
  788. if (firstPosition == endPosition)
  789. {
  790. firstPosition = NotFound;
  791. endPosition = NotFound;
  792. }
  793. }
  794. else
  795. {
  796. assertex(!"This should work, but why is it happening???");
  797. }
  798. }
  799. void LRParser::selectEndPosition(unsigned offset)
  800. {
  801. if (firstPosition == endPosition)
  802. {
  803. firstPosition = offset;
  804. endPosition = offset;
  805. }
  806. assertex(offset >= firstPosition && offset < firstPosition+MAX_POSITIONS);
  807. if (offset >= endPosition)
  808. endPosition = offset+1;
  809. if (!getPosition(offset))
  810. setPositionOwn(offset, new LRActiveState(table.numStates));
  811. activeOutput = getPosition(offset);
  812. }
  813. void LRParser::selectStartPosition(unsigned offset)
  814. {
  815. assertex(offset >= firstPosition && offset < endPosition);
  816. activeInput = getPosition(offset);
  817. }
  818. //---------------------------------------------------------------------------
  819. #if 0
  820. void LRMultiParser::parse(IMultiTokenLexer & lexer)
  821. {
  822. selectEndPosition(0);
  823. addStartState();
  824. unsigned eofId = lexer.getEofId();
  825. GrammarSymbolArray tokens;
  826. bool done = false;
  827. while (!done)
  828. {
  829. position_t nextPosition = getFirstPosition();
  830. if (nextPosition == NotFound)
  831. break;
  832. selectStartPosition(nextPosition);
  833. tokens.kill();
  834. if (lexer.next(nextPosition, tokens)) // doesn't return whitespace.
  835. {
  836. ForEachItemIn(idx, tokens)
  837. {
  838. GrammarSymbol & curToken = tokens.item(idx);
  839. process(&curToken, tokens.ordinality() == 1);
  840. }
  841. if (tokens.ordinality() == 1 && tokens.item(0).id == eofId)
  842. done = true;
  843. }
  844. removePosition(nextPosition);
  845. }
  846. }
  847. #endif
  848. //---------------------------------------------------------------------------
  849. TomitaResultIterator::TomitaResultIterator(const TomitaStateInformation & _rowState, TomitaAlgorithm * _def) : rowState(_rowState), results(_def->matchInfo)
  850. {
  851. def = LINK(_def);
  852. matchFirst = (def->matchAction == INlpParseAlgorithm::NlpMatchFirst);
  853. singleMatchPerSymbol = def->chooseMin || def->chooseMax || def->chooseBest || matchFirst;
  854. }
  855. TomitaResultIterator::~TomitaResultIterator()
  856. {
  857. results.kill();
  858. def->Release();
  859. }
  860. int compareStartPtr(CInterface * const * pLeft, CInterface * const * pRight)
  861. {
  862. //try and make it different to improve sort performance, and so results are reproducible between windows and linux, but it can't be guaranteed.
  863. GrammarSymbol * left = static_cast<GrammarSymbol *>(*pLeft);
  864. GrammarSymbol * right = static_cast<GrammarSymbol *>(*pRight);
  865. int delta = (int)(left->queryStartPtr() - right->queryStartPtr()); // ok unless records can be 2Gb.
  866. if (delta != 0)
  867. return delta;
  868. delta = left->getPenalty() - right->getPenalty();
  869. if (delta)
  870. return delta;
  871. delta = left->getLength() - right->getLength();
  872. if (delta)
  873. return delta;
  874. return 0;
  875. }
  876. bool TomitaResultIterator::isBetter(const GrammarSymbol * left, const GrammarSymbol * right)
  877. {
  878. if (def->chooseMin || def->chooseMax)
  879. {
  880. unsigned leftLength = left->getLength();
  881. unsigned rightLength = right->getLength();
  882. if (leftLength != rightLength)
  883. {
  884. if (def->chooseMin)
  885. return leftLength < rightLength;
  886. else if (def->chooseMax)
  887. return leftLength > rightLength;
  888. }
  889. }
  890. if (def->chooseBest)
  891. {
  892. int leftPenalty = left->getPenalty();
  893. int rightPenalty = right->getPenalty();
  894. return (leftPenalty < rightPenalty);
  895. }
  896. return false;
  897. }
  898. void TomitaResultIterator::reset(const GrammarSymbolArray & _values)
  899. {
  900. values.kill();
  901. curIndex = NotFound;
  902. unsigned numValues = _values.ordinality();
  903. if (numValues == 0)
  904. {
  905. if (def->notMatchedOnly || def->notMatched)
  906. {
  907. //Add a dummy matched entry...
  908. values.append(*new Terminal(def->eofId, NULL, 0, NULL));
  909. }
  910. return;
  911. }
  912. if (def->notMatchedOnly)
  913. return;
  914. values.ensure(numValues);
  915. for (unsigned i = 0; i < numValues; i++)
  916. values.append(OLINK(_values.item(i)));
  917. if (def->scanAction != INlpParseAlgorithm::NlpScanWhole)
  918. values.sort(compareStartPtr);
  919. if (def->chooseMin || def->chooseMax || def->chooseBest)
  920. {
  921. GrammarSymbol * best = &values.item(0);
  922. for (unsigned j=1; j < numValues;)
  923. {
  924. GrammarSymbol & cur = values.item(j);
  925. if (cur.queryStartPtr() != best->queryStartPtr())
  926. {
  927. //Scan none: remove all entries that are at a different offset
  928. if (def->scanAction == INlpParseAlgorithm::NlpScanNone)
  929. {
  930. values.popn(numValues-j);
  931. break;
  932. }
  933. if (!def->singleChoicePerLine)
  934. {
  935. if (def->scanAction == INlpParseAlgorithm::NlpScanNext)
  936. {
  937. if (cur.queryStartPtr() < best->queryEndPtr())
  938. {
  939. numValues--;
  940. values.remove(j);
  941. continue;
  942. }
  943. }
  944. best = &cur;
  945. }
  946. }
  947. else
  948. {
  949. if (matchFirst)
  950. {
  951. numValues--;
  952. values.remove(j);
  953. continue;
  954. }
  955. }
  956. if (best != &cur)
  957. {
  958. if (isBetter(&cur, best))
  959. {
  960. values.zap(*best);
  961. best = &cur;
  962. }
  963. else
  964. values.remove(j);
  965. numValues--;
  966. //NB: j is now pointing at the next entry
  967. }
  968. else
  969. j++;
  970. }
  971. }
  972. else
  973. {
  974. GrammarSymbol * last = &values.item(0);
  975. for (unsigned j=1; j < numValues;)
  976. {
  977. GrammarSymbol & cur = values.item(j);
  978. if (cur.queryStartPtr() != last->queryStartPtr())
  979. {
  980. if (def->scanAction == INlpParseAlgorithm::NlpScanNone)
  981. {
  982. //Scan none: remove all entries that are at a different offset
  983. values.popn(numValues-j);
  984. break;
  985. }
  986. else if ((def->scanAction == INlpParseAlgorithm::NlpScanNext) &&
  987. (cur.queryStartPtr() < last->queryEndPtr()))
  988. {
  989. numValues--;
  990. values.remove(j);
  991. }
  992. else
  993. {
  994. last = &cur;
  995. j++;
  996. }
  997. }
  998. else
  999. {
  1000. if (matchFirst)
  1001. {
  1002. numValues--;
  1003. values.remove(j);
  1004. }
  1005. else
  1006. {
  1007. if (def->scanAction == INlpParseAlgorithm::NlpScanNext)
  1008. {
  1009. if (cur.queryEndPtr() < last->queryEndPtr())
  1010. last = &cur;
  1011. }
  1012. j++;
  1013. }
  1014. }
  1015. }
  1016. }
  1017. }
  1018. bool TomitaResultIterator::first()
  1019. {
  1020. curIndex = 0;
  1021. if (!isValid())
  1022. return false;
  1023. firstChoice();
  1024. return true;
  1025. }
  1026. void TomitaResultIterator::firstChoice()
  1027. {
  1028. if (def->chooseBest)
  1029. choice.selectBest(&values.item(curIndex));
  1030. else
  1031. choice.first(&values.item(curIndex));
  1032. }
  1033. bool TomitaResultIterator::next()
  1034. {
  1035. if (!isValid())
  1036. return false;
  1037. if (!singleMatchPerSymbol && choice.next(&values.item(curIndex)))
  1038. return true;
  1039. curIndex++;
  1040. if (!isValid())
  1041. return false;
  1042. firstChoice();
  1043. return true;
  1044. }
  1045. void TomitaResultIterator::invalidate()
  1046. {
  1047. curIndex = (unsigned)-1;
  1048. values.kill();
  1049. }
  1050. bool TomitaResultIterator::isValid()
  1051. {
  1052. return values.isItem(curIndex);
  1053. }
  1054. const void * TomitaResultIterator::getRow()
  1055. {
  1056. Owned<IMatchWalker> walker = getWalker();
  1057. results.extractResults(&values.item(curIndex), choice, rowState.inputText);
  1058. // results.extractResults(walker, rowState.inputText);
  1059. RtlDynamicRowBuilder rowBuilder(outputAllocator);
  1060. unsigned newSize = rowState.action->onMatch(rowBuilder, rowState.row, &results, walker);
  1061. assertex(newSize);
  1062. return rowBuilder.finalizeRowClear(newSize);
  1063. }
  1064. //---------------------------------------------------------------------------
  1065. TomitaParser::TomitaParser(ICodeContext * ctx, TomitaAlgorithm * _def, unsigned _activityId, INlpHelper * _helper, IHThorParseArg * arg)
  1066. : parser(_def->table, rowState), lexer(_def->tokenDfa, _def->skipDfa, _def->endTokenChars, _def->eofId), iter(rowState, _def)
  1067. {
  1068. assertex(ctx);
  1069. def = _def;
  1070. helper = _helper;
  1071. helperArg = arg;
  1072. eofId = def->eofId;
  1073. outputAllocator.setown(ctx->getRowAllocator(arg->queryOutputMeta(), _activityId));
  1074. iter.setAllocator(outputAllocator);
  1075. //Now ensure that allocators are created for each of the production call backs.
  1076. for (unsigned i=0; i < def->numProductions(); i++)
  1077. {
  1078. IOutputMetaData * rs = arg->queryProductionMeta(i);
  1079. if (rs && !rowState.queryAllocator(rs))
  1080. {
  1081. Owned<IEngineRowAllocator> allocator = ctx->getRowAllocator(rs, _activityId);
  1082. if (allocator)
  1083. rowState.addAllocator(*allocator);
  1084. }
  1085. }
  1086. }
  1087. bool TomitaParser::performMatch(IMatchedAction & action, const void * row, unsigned len, const void * data)
  1088. {
  1089. rowState.row = row;
  1090. rowState.lengthInputText = len;
  1091. rowState.inputText = (const byte *)data;
  1092. rowState.action = &action;
  1093. rowState.inputFormat = def->inputFormat;
  1094. rowState.helper = helper;
  1095. rowState.helperArg = helperArg;
  1096. lexer.setDocument(len, data);
  1097. parser.beginParse(def->chooseBest);
  1098. bool scanWhole = (def->scanAction == INlpParseAlgorithm::NlpScanWhole);
  1099. for (;;)
  1100. {
  1101. position_t nextPosition = parser.getFirstPosition();
  1102. if (nextPosition == NotFound)
  1103. break;
  1104. tokens.kill();
  1105. unsigned numTokens = lexer.next(nextPosition, tokens);
  1106. parser.selectStartPosition(nextPosition);
  1107. if (numTokens)
  1108. {
  1109. bool insertEnd = !scanWhole;
  1110. bool isOnlyToken = false;
  1111. if (numTokens == 1)
  1112. {
  1113. if (tokens.item(0).getId() == eofId)
  1114. {
  1115. insertEnd = false;
  1116. isOnlyToken = true;
  1117. }
  1118. else
  1119. isOnlyToken = scanWhole;
  1120. }
  1121. for (unsigned idx=0; idx < numTokens;idx++)
  1122. {
  1123. GrammarSymbol & curToken = tokens.item(idx);
  1124. parser.process(&curToken, isOnlyToken);
  1125. if (!scanWhole)
  1126. // a bit nasty - end position is still set up after the process() call.
  1127. parser.addStartState();
  1128. }
  1129. if (insertEnd)
  1130. {
  1131. Owned<Terminal> eofToken = new Terminal(eofId, NULL, 0, (const byte *)data + nextPosition);
  1132. parser.process(eofToken, false);
  1133. }
  1134. }
  1135. else if (!scanWhole && (nextPosition != len))
  1136. {
  1137. Owned<Terminal> eofToken = new Terminal(eofId, NULL, 0, (const byte *)data + nextPosition);
  1138. parser.process(eofToken, true);
  1139. parser.selectEndPosition(nextPosition+1);
  1140. parser.addStartState();
  1141. }
  1142. parser.removePosition(nextPosition);
  1143. if (nextPosition == len)
  1144. break;
  1145. }
  1146. parser.endParse();
  1147. iter.reset(parser.queryAccepted());
  1148. //MORE: Post filter lost of options.. including whole etc...
  1149. return parser.numAccepted() != 0;
  1150. }
  1151. INlpResultIterator * TomitaParser::queryResultIter()
  1152. {
  1153. return &iter;
  1154. }
  1155. void TomitaParser::reset()
  1156. {
  1157. iter.invalidate();
  1158. }
  1159. #if 0
  1160. To be done to finish multi parser
  1161. a) Save reductions on separate stack if >1 token at same position
  1162. b) Use two states for the single token case, and swap between to avoid copy.
  1163. Implementing a multiple lex parser:
  1164. 1) Parser maintain an ordered list of current start positions.
  1165. active[position-firstPosition] - use pointers into an array of states; States are cached on a list.
  1166. 2) lex from the first position in the list
  1167. 3) pass end position to parser - selects where new nodes get placed.
  1168. 4) Reductions have to be placed on to a separate list for each token, since dependant.
  1169. 4) Parser has associative array on position[x].
  1170. Items are processed from position x to position y
  1171. Notes:
  1172. Caching:
  1173. Can only prune a branch when shifting, which loses all elements simultaneously, so a singlely linked list suffices,
  1174. and solves link counting problems that a doubly linked list would have,
  1175. Local ambiguity packing:
  1176. When shifting a value, if it gets commoned up, and one of the common branches shifts the same symbol, and the preceeding state for that branch is the same, then common up that node.
  1177. ToDo:
  1178. * Do some more planning re:
  1179. o Augmented grammars
  1180. o Generating the lexer. Especially what we do about unknown words/multiple possible matches. [Other implictations if tokens do not necessarily lie on the same boundaries].
  1181. o Representing penalties and probabilities.
  1182. o Translating the regex syntax into parser input.
  1183. o Conditional reductions - where how/do they occur? What arguments do they need?
  1184. o Returning multiple rows from a single match?
  1185. * Parameterised patterns - how do they related to augmented grammars[do not], and what is needed to implement them?
  1186. * Design in detail the table generator
  1187. o LR or LALR?
  1188. o Pathological grammars e.g., S := | S | ... -> reread and understand doc. Can we cope?
  1189. * Use cases:
  1190. o MAX and BEST()
  1191. * Misc
  1192. Error if ": define()" is applied to a pattern
  1193. MAX,MIN in regex implementation
  1194. Stack problems with regex
  1195. #endif