123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371 |
- /*##############################################################################
- HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ############################################################################## */
- #include "jliball.hpp"
- #include "eclrtl.hpp"
- #include "rtlds_imp.hpp"
- #include "thortparse.ipp"
- //#define TRACING
- //---------------------------------------------------------------------------
- void doUnwindRelease(GrammarSymbol * symbol, CIArrayOf<GrammarSymbol> & pending)
- {
- if (!symbol->isPacked())
- {
- unsigned level = pending.ordinality();
- unsigned num = symbol->numChildren();
- for (unsigned i = 0; i < num; i++)
- pending.append(*LINK(symbol->queryChild(i)));
- if (!symbol->Release())
- pending.trunc(level);
- }
- else
- symbol->Release();
- }
- bool GrammarSymbol::canMerge(const GrammarSymbol * other)
- {
- //MORE: Shouldn't this be enabled at some point?
- return false;
- byte * leftRow = queryResultRow();
- byte * rightRow = other->queryResultRow();
- if (leftRow || rightRow)
- {
- assertex(leftRow && rightRow); // if in same state they really should have consistency here
- assertex(queryResultSize() == other->queryResultSize());
- if (memcmp(leftRow, rightRow, queryResultSize()) != 0)
- return false;
- }
- assertex(features.info == other->features.info);
- size_t len = features.values.length();
- return memcmp(features.values.get(), other->features.values.get(), len) == 0;
- }
- GrammarSymbol * GrammarSymbol::createMerged(GrammarSymbol * other)
- {
- Owned<GrammarSymbol> merged = new PackedSymbol(this);
- return merged->createMerged(other);
- }
- //---------------------------------------------------------------------------
- Terminal::Terminal(symbol_id _id, const FeatureInfo * featureInfo, unsigned _len, const byte * _start) : GrammarSymbol(_id)
- {
- if (featureInfo && featureInfo->getNum() != 0)
- {
- features.info = featureInfo;
- unsigned size = featureInfo->getSize();
- void * resultFeatures = features.values.allocate(size);
- memcpy(resultFeatures, featureInfo->defaults, size);
- }
- len = _len;
- start = _start;
- }
- //---------------------------------------------------------------------------
- NonTerminal::NonTerminal(symbol_id _id, IAtom * _name, FeatureValue & _features, unsigned numSymbols, GrammarSymbol * * symbols, const byte * _reducePtr, size32_t _resultSize, byte * _resultRow) : GrammarSymbol(_id), resultSize(_resultSize), resultRow(_resultRow)
- {
- unsigned nullCount = 0;
- unsigned nonNullIndex = 0;
- reduced.ensure(numSymbols);
- for (unsigned i= 0; i < numSymbols; i++)
- {
- GrammarSymbol * cur = symbols[i];
- if (cur->isNull())
- nullCount++;
- else
- nonNullIndex = i;
- reduced.append(*LINK(cur));
- addPenalty(cur->getPenalty());
- }
- cachedIsNull = false;
- if (nullCount == numSymbols)
- cachedIsNull = true;
- else if (nullCount != 0)
- {
- if (nonNullIndex+1 != numSymbols)
- {
- const byte * pos = symbols[nonNullIndex]->queryEndPtr();
- for (unsigned i1 = nonNullIndex+1; i1 < numSymbols; i1++)
- symbols[i1]->resetPosition(pos);
- }
- for (unsigned i2 = nonNullIndex; i2-- > 0;)
- {
- if (symbols[i2]->isNull())
- symbols[i2]->resetPosition(symbols[i2+1]->queryStartPtr());
- }
- }
- name = _name;
- size_t len = _features.values.length();
- features.info = _features.info;
- if (len)
- features.values.setOwn(len, _features.values.detach());
- if (numSymbols == 0)
- reducePtr = _reducePtr;
- else
- reducePtr = NULL;
- }
- NonTerminal::~NonTerminal()
- {
- if (resultRow)
- rtlReleaseRow(resultRow);
- //Try and kill tail recursion when freeing a very large match structure
- while (reduced.ordinality())
- doUnwindRelease(&reduced.popGet(), reduced);
- }
- void NonTerminal::resetPosition(const byte * pos)
- {
- assertex(cachedIsNull);
- reducePtr = pos;
- ForEachItemInRev(idx, reduced)
- reduced.item(idx).resetPosition(pos);
- }
- const byte * NonTerminal::queryStartPtr() const
- {
- if (reduced.ordinality())
- return reduced.item(0).queryStartPtr();
- return reducePtr;
- }
- const byte * NonTerminal::queryEndPtr() const
- {
- if (reduced.ordinality())
- return reduced.tos().queryEndPtr();
- return reducePtr;
- }
- GrammarSymbol * NonTerminal::queryChild(unsigned i)
- {
- if (reduced.isItem(i))
- return &reduced.item(i);
- return NULL;
- }
- //---------------------------------------------------------------------------
- PackedSymbol::PackedSymbol(GrammarSymbol * symbol) : GrammarSymbol(symbol->id)
- {
- //copy the features...
- assertex(!symbol->isNull());
- equivalents.append(*LINK(symbol));
- penalty = symbol->getPenalty();
- }
- void PackedSymbol::resetPosition(const byte * pos)
- {
- throwUnexpected();
- }
- GrammarSymbol * PackedSymbol::createMerged(GrammarSymbol * other)
- {
- assertex(!other->isNull());
- assertex(other->queryStartPtr() == queryStartPtr());
- assertex(other->queryEndPtr() == queryEndPtr());
- if (other->getPenalty() < penalty)
- penalty = other->getPenalty();
- if (other->isPacked())
- {
- //Can happen when two activity states are merged.
- for (unsigned i = 0; ; i++)
- {
- GrammarSymbol * child = queryPacked(i);
- if (!child)
- break;
- equivalents.append(*LINK(child));
- }
- }
- else
- equivalents.append(*LINK(other));
- return LINK(this);
- }
- GrammarSymbol * PackedSymbol::queryPacked(unsigned i)
- {
- if (equivalents.isItem(i))
- return &equivalents.item(i);
- return NULL;
- }
- const byte * PackedSymbol::queryStartPtr() const
- {
- return equivalents.item(0).queryStartPtr();
- }
- const byte * PackedSymbol::queryEndPtr() const
- {
- return equivalents.item(0).queryEndPtr();
- }
- //---------------------------------------------------------------------------
- TomitaMatchWalker::TomitaMatchWalker(const PackedSymbolChoice & _choice, GrammarSymbol * _symbol) : choice(_choice)
- {
- symbol = _symbol;
- while (symbol->isPacked())
- {
- symbol = symbol->queryPacked(choice.getInstance(symbol));
- }
- }
- IAtom * TomitaMatchWalker::queryName()
- {
- return symbol->queryName();
- }
- unsigned TomitaMatchWalker::queryID()
- {
- return symbol->getId();
- }
- size32_t TomitaMatchWalker::queryMatchSize()
- {
- return symbol->getLength();
- }
- const void * TomitaMatchWalker::queryMatchStart()
- {
- return symbol->queryStartPtr();
- }
- unsigned TomitaMatchWalker::numChildren()
- {
- return symbol->numChildren();
- }
- IMatchWalker * TomitaMatchWalker::getChild(unsigned idx)
- {
- GrammarSymbol * child = symbol->queryChild(idx);
- if (child)
- return new TomitaMatchWalker(choice, child);
- return NULL;
- }
- //---------------------------------------------------------------------------
- void PackedSymbolChoice::expandFirst(GrammarSymbol * symbol)
- {
- if (symbol->isPacked())
- {
- symbols.append(*LINK(symbol));
- branches.append(0);
- expandFirst(symbol->queryPacked(0));
- }
- else
- {
- unsigned max = symbol->numChildren();
- for (unsigned i = 0; i < max; i++)
- expandFirst(symbol->queryChild(i));
- }
- }
- void PackedSymbolChoice::first(GrammarSymbol * symbol)
- {
- branches.kill();
- symbols.kill();
- expandFirst(symbol);
- }
- void PackedSymbolChoice::expandBest(GrammarSymbol * symbol)
- {
- if (symbol->isPacked())
- {
- symbols.append(*LINK(symbol));
- unsigned bestIndex = 0;
- int bestPenalty = symbol->queryPacked(bestIndex)->getPenalty();
- for (unsigned j=1;;j++)
- {
- GrammarSymbol * cur = symbol->queryPacked(j);
- if (!cur) break;
- if (bestPenalty > cur->getPenalty())
- {
- bestIndex = j;
- bestPenalty = cur->getPenalty();
- }
- }
- branches.append(bestIndex);
- expandBest(symbol->queryPacked(bestIndex));
- }
- else
- {
- unsigned max = symbol->numChildren();
- for (unsigned i = 0; i < max; i++)
- expandBest(symbol->queryChild(i));
- }
- }
- void PackedSymbolChoice::selectBest(GrammarSymbol * symbol)
- {
- branches.kill();
- symbols.kill();
- expandBest(symbol);
- }
- bool PackedSymbolChoice::expandNext(unsigned & level, GrammarSymbol * symbol)
- {
- unsigned thisLevel = level;
- if (symbol->isPacked())
- {
- assertex(&symbols.item(thisLevel) == symbol);
- unsigned curBranch = branches.item(thisLevel);
- bool isLastLevel = (thisLevel == symbols.ordinality()-1);
- level++;
- if (!isLastLevel && expandNext(level, symbol->queryPacked(curBranch)))
- return true;
- curBranch++;
- if (symbol->queryPacked(curBranch))
- {
- branches.replace(curBranch, thisLevel);
- expandFirst(symbol->queryPacked(curBranch));
- return true;
- }
- branches.pop();
- symbols.pop();
- return false;
- }
- else
- {
- unsigned max = symbol->numChildren();
- for (unsigned i = 0; i < max; i++)
- if (!expandNext(level, symbol->queryChild(i)))
- return false;
- return true;
- }
- }
- bool PackedSymbolChoice::next(GrammarSymbol * symbol)
- {
- if (symbols.ordinality() == 0)
- return false;
- unsigned level = 0;
- return expandNext(level, symbol);
- }
- unsigned PackedSymbolChoice::getInstance(GrammarSymbol * symbol) const
- {
- unsigned match = symbols.find(*symbol);
- assertex(match != NotFound);
- return branches.item(match);
- }
- //---------------------------------------------------------------------------
- StackElement::StackElement(GrammarSymbol * _shifted, state_id _state, StackElement * _prev, LRParser * _pool)
- {
- shifted.set(_shifted);
- state = _state;
- prev.set(_prev);
- pool = _pool;
- }
- void StackElement::addSibling(StackElement * sib)
- {
- sib->sibling.setown(sibling.getClear());
- sibling.set(sib);
- }
- void StackElement::addToPool()
- {
- StackElementArray pending;
- gatherPoolPending(pending);
- doAddToPool();
- while (pending.ordinality())
- {
- Owned<StackElement> cur = &pending.popGet();
- cur->gatherPoolPending(pending);
- cur->doAddToPool();
- }
- }
- void StackElement::doAddToPool()
- {
- shifted.clear();
- prev.clear();
- sibling.clear();
- state = 0;
- pool->addFreeElement(this);
- pool = NULL;
- }
- void StackElement::Release()
- {
- if (pool && !IsShared())
- addToPool();
- CInterface::Release();
- }
- void StackElement::getDebugText(StringBuffer & s)
- {
- if (prev)
- prev->getDebugText(s);
- s.appendf("%p", this);
- s.append("[");
- if(shifted)
- {
- s.append(shifted->id).append(" ");
- s.appendf("{%p}", shifted->queryStartPtr());
- }
- s.append(state).append("] ");
- }
- bool StackElement::potentialPackNode(StackElement & other) const
- {
- GrammarSymbol * curSymbol = shifted;
- GrammarSymbol * nextSymbol = other.shifted;
- if (curSymbol && !curSymbol->isNull() && curSymbol->id == nextSymbol->id)
- {
- if (other.prev->state == prev->state)
- {
- if (curSymbol->queryStartPtr() == nextSymbol->queryStartPtr())
- {
- assertex(curSymbol->queryEndPtr() == nextSymbol->queryEndPtr());
- assertex(state == other.state);
- return true;
- }
- }
- }
- return false;
- }
- void StackElement::gatherPoolPending(CIArrayOf<StackElement> & pending)
- {
- if (prev && !prev->IsShared())
- pending.append(*prev.getClear());
- if (sibling && !sibling->IsShared())
- pending.append(*sibling.getClear());
- }
- //---------------------------------------------------------------------------
- LRActiveState::LRActiveState(unsigned _maxStates)
- {
- maxStates = _maxStates;
- cacheBytes = maxStates * sizeof(StackElement *);
- cache = new StackElement * [maxStates];
- beenReduced = new bool [maxStates];
- memset(beenReduced, 0, maxStates);
- memset(cache, 0, cacheBytes);
- }
- LRActiveState::~LRActiveState()
- {
- delete [] cache;
- delete [] beenReduced;
- }
- void LRActiveState::addElementOwn(StackElement * next, bool keepBest)
- {
- unsigned stateId = next->state;
- if (cache[stateId])
- {
- if (!mergePackedNode(stateId, next, keepBest))
- cache[stateId]->addSibling(next);
- next->Release();
- }
- else
- {
- cache[stateId] = next;
- elements.append(*next);
- }
- }
- void LRActiveState::clearReduced()
- {
- ForEachItemIn(idx, elements)
- beenReduced[elements.item(idx).state] = false;
- }
- bool LRActiveState::mergePackedNode(unsigned stateId, StackElement * next, bool keepBest)
- {
- StackElement * cur = cache[stateId];
- assertex(cur->state == next->state);
- assertex(!next->sibling);
- // If two symbols have the same id/attributes, and lie between the same states, then it is a locally
- // ambiguous tree, so common the symbol values up.
- GrammarSymbol * nextSymbol = next->shifted;
- if (!nextSymbol)
- return false; // MORE: Should this common???
- if (nextSymbol->isNull()) // Hard to know the position of these things...
- return false;
- StackElement * prev = NULL;
- do
- {
- GrammarSymbol * curSymbol = cur->shifted;
- if (curSymbol && !curSymbol->isNull() && (curSymbol->id == nextSymbol->id))
- {
- if (next->prev->state == cur->prev->state)
- {
- if (curSymbol->queryStartPtr() == nextSymbol->queryStartPtr())
- {
- assertex(curSymbol->queryEndPtr() == nextSymbol->queryEndPtr());
- assertex(cur->state == next->state);
- if (keepBest)
- {
- int curPenalty = curSymbol->getPenalty();
- int nextPenalty = nextSymbol->getPenalty();
- if (curPenalty != nextPenalty)
- {
- //Existing element is better=>throw away the new one.
- if (nextPenalty > curPenalty)
- return true;
- //The new element is better, throw away all the mismatches, by removing this element, and going around again
- if (prev)
- {
- cur = prev;
- prev->sibling.set(prev->sibling->sibling);
- }
- else
- {
- StackElement * sibling = cur->sibling.getClear();
- unsigned replacePos = elements.find(*cur);
- if (sibling)
- {
- //More than one element => replace the head with the (linked) sibling
- elements.replace(*sibling, replacePos);
- cache[stateId] = sibling;
- cur = sibling;
- continue;
- }
- else
- {
- elements.replace(*LINK(next), replacePos);
- cache[stateId] = next;
- return true;
- }
- }
- }
- }
- if (curSymbol->canMerge(nextSymbol))
- {
- cur->shifted.setown(curSymbol->createMerged(nextSymbol));
- #ifdef TRACING
- LOG(MCdebugProgress, unknownJob, "Nodes Merged: %p = %p, %p", cur->shifted.get(), curSymbol, nextSymbol);
- #endif
- return true;
- }
- }
- }
- }
- prev = cur;
- cur = cur->sibling;
- } while (cur);
- return false;
- }
-
- void LRActiveState::reinit()
- {
- memset(cache, 0, cacheBytes);
- elements.kill();
- }
- void LRActiveState::mergeIn(LRActiveState & other, bool keepBest)
- {
- ForEachItemIn(idx, other.elements)
- {
- Linked<StackElement> cur = &other.elements.item(idx);
- while (cur)
- {
- Linked<StackElement> next = cur->sibling;
- cur->sibling.clear();
- addElementOwn(cur.getClear(), keepBest);
- cur.setown(next.getClear());
- }
- }
- }
- //---------------------------------------------------------------------------
- LRParser::LRParser(const LRTable & _table, const TomitaParserCallback & _rowState) : table(_table), rowState(_rowState)
- {
- reduced = new LRActiveState(table.numStates);
- reducedOverflow[0] = new LRActiveState(table.numStates);
- reducedOverflow[1] = new LRActiveState(table.numStates);
- _clear(positions);
- init();
- }
- LRParser::~LRParser()
- {
- //Safely kill the free list.
- StackElement * head = nextFreeElement.getClear();
- while (head)
- {
- StackElement * next = head->sibling.getClear();
- head->Release();
- head = next;
- }
- for (position_t pos = 0; pos < MAX_POSITIONS; pos++)
- delete positions[pos];
- delete reduced;
- delete reducedOverflow[0];
- delete reducedOverflow[1];
- }
- void LRParser::addFreeElement(StackElement * element)
- {
- element->sibling.setown(nextFreeElement.getClear());
- nextFreeElement.set(element);
- }
- void LRParser::beginParse(bool _chooseBest)
- {
- chooseBest = _chooseBest;
- selectEndPosition(0);
- addStartState();
- accepted.kill();
- }
- void LRParser::endParse()
- {
- }
- void LRParser::clear()
- {
- activeInput = NULL;
- activeOutput = NULL;
- firstPosition = NotFound;
- endPosition = NotFound;
- }
- void LRParser::cleanupPosition(position_t pos)
- {
- unsigned index = pos % MAX_POSITIONS;
- positions[index]->reinit();
- }
- void LRParser::init()
- {
- chooseBest = false;
- reduced->reinit();
- reducedOverflow[0]->reinit();
- reducedOverflow[1]->reinit();
- clear();
- }
- void LRParser::addStartState()
- {
- activeOutput->addElementOwn(createState(NULL, (state_id)table.rootState, NULL), false);
- }
- StackElement * LRParser::createState(StackElement * prev, state_id nextState, GrammarSymbol * shifted)
- {
- StackElement * newState;
- if (nextFreeElement)
- {
- newState = nextFreeElement.getClear();
- nextFreeElement.setown(newState->sibling.getClear());
- newState->shifted.set(shifted);
- newState->prev.set(prev);
- newState->state = nextState;
- newState->pool = this;
- }
- else
- {
- newState = new StackElement(shifted, nextState, prev, this);
- }
- #ifdef TRACING
- LOG(MCdebugProgress, unknownJob, "%p: Push state %d symbol %d[%p] previous: %d[%p]", newState, nextState, shifted ? shifted->id : -1, shifted, prev ? prev->state : -1, prev);
- StringBuffer s;
- newState->getDebugText(s);
- s.newline();
- LOG(MCdebugProgress, unknownJob, s.str());
- #endif
- return newState;
- }
- void LRParser::doReductions(LRActiveState & active, GrammarSymbol * next)
- {
- //At the moment reduce can not remove items.
- //Done as a double loop to reduce calls to active.ordinality()
- unsigned max = active.elements.ordinality();
- for (unsigned i=0; i < max; )
- {
- for (; i < max; i++)
- {
- StackElement & cur = active.elements.item(i);
- active.markReduced(cur.state);
- reduce(cur, next);
- }
- max = active.elements.ordinality();
- }
- active.clearReduced();
- }
- void LRParser::doReductions(GrammarSymbol * next, bool singleToken)
- {
- //The reduction need to go to a separate list - since may be different for each token
- //but want to optimize case where only a single token.
- LRActiveState * output = activeInput;
- if (!singleToken)
- {
- output = reduced;
- reduced->reinit();
- }
- activeOutput = output;
- curOverflow = reducedOverflow[0];
- doReductions(*activeInput, next);
- if (!singleToken)
- doReductions(*reduced, next);
- //You get weird situations where reductions add to states thathave already been reduced,
- //so any clashes like that need to be sent to other lists, processed and merged in.
- unsigned overflowIndex = 0;
- while (curOverflow->elements.ordinality())
- {
- LRActiveState * oldOverflow = curOverflow;
- overflowIndex = 1-overflowIndex;
- activeOutput = oldOverflow;
- curOverflow = reducedOverflow[overflowIndex];
- doReductions(*oldOverflow, next);
- output->mergeIn(*oldOverflow, chooseBest);
- oldOverflow->reinit();
- }
- curOverflow = NULL;
- activeOutput = NULL;
- }
- void LRParser::process(GrammarSymbol * next, bool singleToken)
- {
- #ifdef TRACING
- LOG(MCdebugProgress, unknownJob, "Process token '%.*s' %d at position %d", next->queryEndPtr()-next->queryStartPtr(), next->queryStartPtr(), next->id, next->queryStartPtr()-rowState.inputText);
- #endif
- doReductions(next, singleToken);
- selectEndPosition((size32_t)(next->queryEndPtr()-rowState.inputText));
- doShifts(next, singleToken);
- }
- //Build up an array of the symbols to expand - may involve uncommoning nodes
- void LRParser::expandReduction(StackElement & element, LRProduction * production, unsigned numSymbols)
- {
- if (numSymbols == 0)
- {
- const byte * reducePtr = firstPosition+rowState.inputText;
- Owned<GrammarSymbol> reduced = production->reduce(reducedArgs, reducePtr, rowState);
- if (reduced)
- {
- state_id nextState = table.states[element.state]->getGoto(reduced->id);
- #ifdef TRACING
- StringBuffer s;
- for (unsigned i = 0; i < production->getNumSymbols(); i++)
- s.appendf("%p ", reducedArgs[i]);
- LOG(MCdebugProgress, unknownJob, "Reduce by production %d new element %p[%s]", reduced->id, reduced.get(), s.str());
- #endif
- //MORE: Some kind of recursion checking needed?
- StackElement * cached = activeOutput->cache[nextState];
- if (activeOutput->okToAddReduction(nextState))
- activeOutput->addElementOwn(createState(&element, nextState, reduced), chooseBest);
- else
- curOverflow->addElementOwn(createState(&element, nextState, reduced), chooseBest);
- }
- return;
- }
- StackElement * curElement = &element;
- do
- {
- reducedArgs[numSymbols-1] = curElement->shifted;
- expandReduction(*curElement->prev, production, numSymbols-1);
- curElement = curElement->sibling;
- } while (curElement);
- }
- LRActiveState * LRParser::getPosition(position_t pos)
- {
- unsigned index = pos % MAX_POSITIONS;
- return positions[index];
- }
- void LRParser::setPositionOwn(position_t pos, LRActiveState * value)
- {
- unsigned index = pos % MAX_POSITIONS;
- positions[index] = value;
- }
- void LRParser::reduce(StackElement & element, GrammarSymbol * next)
- {
- LRState & curState = *table.states[element.state];
- symbol_id nextId = next->id;
- unsigned numReductions = curState.numReductions(nextId);
- for (unsigned i = 0; i < numReductions; i++)
- {
- unsigned productionIndex = curState.queryReduction(nextId, i);
- LRProduction * production = &table.productions[productionIndex];
- expandReduction(element, production, production->getNumSymbols());
- }
- }
- void LRParser::doShifts(LRActiveState * active, GrammarSymbol * next)
- {
- symbol_id nextid = next->id;
- ForEachItemIn(idx, active->elements)
- {
- StackElement & cur = active->elements.item(idx);
- LRState & curState = *table.states[cur.state];
- state_id nextState = curState.getShift(nextid);
- if (nextState != NO_STATE)
- {
- #ifdef TRACING
- LOG(MCdebugProgress, unknownJob, "Shift to state %d", nextState);
- #endif
- activeOutput->addElementOwn(createState(&cur, nextState, next), chooseBest);
- }
- if (curState.canAccept(nextid))
- {
- for (StackElement * accept = &cur; accept; accept = accept->sibling)
- {
- GrammarSymbol * sym = accept->shifted;
- accepted.append(*LINK(sym));
- #ifdef TRACING
- LOG(MCdebugProgress, unknownJob, "Accepted %p[%p]", accept, sym);
- #endif
- }
- }
- }
- }
- void LRParser::doShifts(GrammarSymbol * next, bool singleToken)
- {
- doShifts(activeInput, next);
- if (!singleToken)
- doShifts(reduced, next);
- }
- position_t LRParser::getFirstPosition()
- {
- return firstPosition;
- }
- void LRParser::removePosition(position_t pos)
- {
- position_t cur = pos+1;
- cleanupPosition(pos);
- while (cur < endPosition)
- {
- LRActiveState * active = getPosition(cur);
- if (active && active->elements.ordinality())
- break;
- cur++;
- }
- if (pos == firstPosition)
- {
- firstPosition = cur;
- if (firstPosition == endPosition)
- {
- firstPosition = NotFound;
- endPosition = NotFound;
- }
- }
- else
- {
- assertex(!"This should work, but why is it happening???");
- }
- }
- void LRParser::selectEndPosition(unsigned offset)
- {
- if (firstPosition == endPosition)
- {
- firstPosition = offset;
- endPosition = offset;
- }
- assertex(offset >= firstPosition && offset < firstPosition+MAX_POSITIONS);
- if (offset >= endPosition)
- endPosition = offset+1;
- if (!getPosition(offset))
- setPositionOwn(offset, new LRActiveState(table.numStates));
- activeOutput = getPosition(offset);
- }
- void LRParser::selectStartPosition(unsigned offset)
- {
- assertex(offset >= firstPosition && offset < endPosition);
- activeInput = getPosition(offset);
- }
- //---------------------------------------------------------------------------
- #if 0
- void LRMultiParser::parse(IMultiTokenLexer & lexer)
- {
- selectEndPosition(0);
- addStartState();
- unsigned eofId = lexer.getEofId();
- GrammarSymbolArray tokens;
- bool done = false;
- while (!done)
- {
- position_t nextPosition = getFirstPosition();
- if (nextPosition == NotFound)
- break;
- selectStartPosition(nextPosition);
- tokens.kill();
- if (lexer.next(nextPosition, tokens)) // doesn't return whitespace.
- {
- ForEachItemIn(idx, tokens)
- {
- GrammarSymbol & curToken = tokens.item(idx);
-
- process(&curToken, tokens.ordinality() == 1);
- }
- if (tokens.ordinality() == 1 && tokens.item(0).id == eofId)
- done = true;
- }
- removePosition(nextPosition);
- }
- }
- #endif
- //---------------------------------------------------------------------------
- TomitaResultIterator::TomitaResultIterator(const TomitaStateInformation & _rowState, TomitaAlgorithm * _def) : rowState(_rowState), results(_def->matchInfo)
- {
- def = LINK(_def);
- matchFirst = (def->matchAction == INlpParseAlgorithm::NlpMatchFirst);
- singleMatchPerSymbol = def->chooseMin || def->chooseMax || def->chooseBest || matchFirst;
- }
- TomitaResultIterator::~TomitaResultIterator()
- {
- results.kill();
- def->Release();
- }
- int compareStartPtr(CInterface * const * pLeft, CInterface * const * pRight)
- {
- //try and make it different to improve sort performance, and so results are reproducible between windows and linux, but it can't be guaranteed.
- GrammarSymbol * left = static_cast<GrammarSymbol *>(*pLeft);
- GrammarSymbol * right = static_cast<GrammarSymbol *>(*pRight);
- int delta = (int)(left->queryStartPtr() - right->queryStartPtr()); // ok unless records can be 2Gb.
- if (delta != 0)
- return delta;
- delta = left->getPenalty() - right->getPenalty();
- if (delta)
- return delta;
- delta = left->getLength() - right->getLength();
- if (delta)
- return delta;
- return 0;
- }
- bool TomitaResultIterator::isBetter(const GrammarSymbol * left, const GrammarSymbol * right)
- {
- if (def->chooseMin || def->chooseMax)
- {
- unsigned leftLength = left->getLength();
- unsigned rightLength = right->getLength();
- if (leftLength != rightLength)
- {
- if (def->chooseMin)
- return leftLength < rightLength;
- else if (def->chooseMax)
- return leftLength > rightLength;
- }
- }
- if (def->chooseBest)
- {
- int leftPenalty = left->getPenalty();
- int rightPenalty = right->getPenalty();
- return (leftPenalty < rightPenalty);
- }
- return false;
- }
- void TomitaResultIterator::reset(const GrammarSymbolArray & _values)
- {
- values.kill();
- curIndex = NotFound;
- unsigned numValues = _values.ordinality();
- if (numValues == 0)
- {
- if (def->notMatchedOnly || def->notMatched)
- {
- //Add a dummy matched entry...
- values.append(*new Terminal(def->eofId, NULL, 0, NULL));
- }
- return;
- }
- if (def->notMatchedOnly)
- return;
- values.ensure(numValues);
- for (unsigned i = 0; i < numValues; i++)
- values.append(OLINK(_values.item(i)));
- if (def->scanAction != INlpParseAlgorithm::NlpScanWhole)
- values.sort(compareStartPtr);
- if (def->chooseMin || def->chooseMax || def->chooseBest)
- {
- GrammarSymbol * best = &values.item(0);
- for (unsigned j=1; j < numValues;)
- {
- GrammarSymbol & cur = values.item(j);
- if (cur.queryStartPtr() != best->queryStartPtr())
- {
- //Scan none: remove all entries that are at a different offset
- if (def->scanAction == INlpParseAlgorithm::NlpScanNone)
- {
- values.popn(numValues-j);
- break;
- }
- if (!def->singleChoicePerLine)
- {
- if (def->scanAction == INlpParseAlgorithm::NlpScanNext)
- {
- if (cur.queryStartPtr() < best->queryEndPtr())
- {
- numValues--;
- values.remove(j);
- continue;
- }
- }
- best = &cur;
- }
- }
- else
- {
- if (matchFirst)
- {
- numValues--;
- values.remove(j);
- continue;
- }
- }
- if (best != &cur)
- {
- if (isBetter(&cur, best))
- {
- values.zap(*best);
- best = &cur;
- }
- else
- values.remove(j);
- numValues--;
- //NB: j is now pointing at the next entry
- }
- else
- j++;
- }
- }
- else
- {
- GrammarSymbol * last = &values.item(0);
- for (unsigned j=1; j < numValues;)
- {
- GrammarSymbol & cur = values.item(j);
- if (cur.queryStartPtr() != last->queryStartPtr())
- {
- if (def->scanAction == INlpParseAlgorithm::NlpScanNone)
- {
- //Scan none: remove all entries that are at a different offset
- values.popn(numValues-j);
- break;
- }
- else if ((def->scanAction == INlpParseAlgorithm::NlpScanNext) &&
- (cur.queryStartPtr() < last->queryEndPtr()))
- {
- numValues--;
- values.remove(j);
- }
- else
- {
- last = &cur;
- j++;
- }
- }
- else
- {
- if (matchFirst)
- {
- numValues--;
- values.remove(j);
- }
- else
- {
- if (def->scanAction == INlpParseAlgorithm::NlpScanNext)
- {
- if (cur.queryEndPtr() < last->queryEndPtr())
- last = &cur;
- }
- j++;
- }
- }
- }
- }
- }
- bool TomitaResultIterator::first()
- {
- curIndex = 0;
- if (!isValid())
- return false;
- firstChoice();
- return true;
- }
- void TomitaResultIterator::firstChoice()
- {
- if (def->chooseBest)
- choice.selectBest(&values.item(curIndex));
- else
- choice.first(&values.item(curIndex));
- }
- bool TomitaResultIterator::next()
- {
- if (!isValid())
- return false;
- if (!singleMatchPerSymbol && choice.next(&values.item(curIndex)))
- return true;
- curIndex++;
- if (!isValid())
- return false;
-
- firstChoice();
- return true;
- }
- void TomitaResultIterator::invalidate()
- {
- curIndex = (unsigned)-1;
- values.kill();
- }
- bool TomitaResultIterator::isValid()
- {
- return values.isItem(curIndex);
- }
- const void * TomitaResultIterator::getRow()
- {
- Owned<IMatchWalker> walker = getWalker();
- results.extractResults(&values.item(curIndex), choice, rowState.inputText);
- // results.extractResults(walker, rowState.inputText);
- RtlDynamicRowBuilder rowBuilder(outputAllocator);
- unsigned newSize = rowState.action->onMatch(rowBuilder, rowState.row, &results, walker);
- assertex(newSize);
- return rowBuilder.finalizeRowClear(newSize);
- }
- //---------------------------------------------------------------------------
- TomitaParser::TomitaParser(ICodeContext * ctx, TomitaAlgorithm * _def, unsigned _activityId, INlpHelper * _helper, IHThorParseArg * arg)
- : parser(_def->table, rowState), lexer(_def->tokenDfa, _def->skipDfa, _def->endTokenChars, _def->eofId), iter(rowState, _def)
- {
- assertex(ctx);
- def = _def;
- helper = _helper;
- helperArg = arg;
- eofId = def->eofId;
- outputAllocator.setown(ctx->getRowAllocator(arg->queryOutputMeta(), _activityId));
- iter.setAllocator(outputAllocator);
- //Now ensure that allocators are created for each of the production call backs.
- for (unsigned i=0; i < def->numProductions(); i++)
- {
- IOutputMetaData * rs = arg->queryProductionMeta(i);
- if (rs && !rowState.queryAllocator(rs))
- {
- Owned<IEngineRowAllocator> allocator = ctx->getRowAllocator(rs, _activityId);
- if (allocator)
- rowState.addAllocator(*allocator);
- }
- }
- }
- bool TomitaParser::performMatch(IMatchedAction & action, const void * row, unsigned len, const void * data)
- {
- rowState.row = row;
- rowState.lengthInputText = len;
- rowState.inputText = (const byte *)data;
- rowState.action = &action;
- rowState.inputFormat = def->inputFormat;
- rowState.helper = helper;
- rowState.helperArg = helperArg;
- lexer.setDocument(len, data);
- parser.beginParse(def->chooseBest);
- bool scanWhole = (def->scanAction == INlpParseAlgorithm::NlpScanWhole);
- for (;;)
- {
- position_t nextPosition = parser.getFirstPosition();
- if (nextPosition == NotFound)
- break;
- tokens.kill();
- unsigned numTokens = lexer.next(nextPosition, tokens);
- parser.selectStartPosition(nextPosition);
- if (numTokens)
- {
- bool insertEnd = !scanWhole;
- bool isOnlyToken = false;
- if (numTokens == 1)
- {
- if (tokens.item(0).getId() == eofId)
- {
- insertEnd = false;
- isOnlyToken = true;
- }
- else
- isOnlyToken = scanWhole;
- }
- for (unsigned idx=0; idx < numTokens;idx++)
- {
- GrammarSymbol & curToken = tokens.item(idx);
-
- parser.process(&curToken, isOnlyToken);
- if (!scanWhole)
- // a bit nasty - end position is still set up after the process() call.
- parser.addStartState();
- }
- if (insertEnd)
- {
- Owned<Terminal> eofToken = new Terminal(eofId, NULL, 0, (const byte *)data + nextPosition);
- parser.process(eofToken, false);
- }
- }
- else if (!scanWhole && (nextPosition != len))
- {
- Owned<Terminal> eofToken = new Terminal(eofId, NULL, 0, (const byte *)data + nextPosition);
- parser.process(eofToken, true);
- parser.selectEndPosition(nextPosition+1);
- parser.addStartState();
- }
- parser.removePosition(nextPosition);
- if (nextPosition == len)
- break;
- }
- parser.endParse();
- iter.reset(parser.queryAccepted());
- //MORE: Post filter lost of options.. including whole etc...
- return parser.numAccepted() != 0;
- }
- INlpResultIterator * TomitaParser::queryResultIter()
- {
- return &iter;
- }
- void TomitaParser::reset()
- {
- iter.invalidate();
- }
- #if 0
- To be done to finish multi parser
- a) Save reductions on separate stack if >1 token at same position
- b) Use two states for the single token case, and swap between to avoid copy.
- Implementing a multiple lex parser:
- 1) Parser maintain an ordered list of current start positions.
- active[position-firstPosition] - use pointers into an array of states; States are cached on a list.
- 2) lex from the first position in the list
- 3) pass end position to parser - selects where new nodes get placed.
- 4) Reductions have to be placed on to a separate list for each token, since dependant.
- 4) Parser has associative array on position[x].
- Items are processed from position x to position y
- Notes:
- Caching:
- Can only prune a branch when shifting, which loses all elements simultaneously, so a singlely linked list suffices,
- and solves link counting problems that a doubly linked list would have,
- Local ambiguity packing:
- When shifting a value, if it gets commoned up, and one of the common branches shifts the same symbol, and the preceeding state for that branch is the same, then common up that node.
- ToDo:
- * Do some more planning re:
- o Augmented grammars
- o Generating the lexer. Especially what we do about unknown words/multiple possible matches. [Other implictations if tokens do not necessarily lie on the same boundaries].
- o Representing penalties and probabilities.
- o Translating the regex syntax into parser input.
- o Conditional reductions - where how/do they occur? What arguments do they need?
- o Returning multiple rows from a single match?
- * Parameterised patterns - how do they related to augmented grammars[do not], and what is needed to implement them?
- * Design in detail the table generator
- o LR or LALR?
- o Pathological grammars e.g., S := | S | ... -> reread and understand doc. Can we cope?
- * Use cases:
- o MAX and BEST()
- * Misc
- Error if ": define()" is applied to a pattern
- MAX,MIN in regex implementation
- Stack problems with regex
- #endif
|