thortparse.ipp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #ifndef __THORTPARSE_IPP_
  14. #define __THORTPARSE_IPP_
  15. #include "thorparse.ipp"
  16. #include "thortalgo.ipp"
  17. #define MAX_POSITIONS MAX_TOKEN_LENGTH
  18. typedef CIArrayOf<GrammarSymbol> GrammarSymbolArray;
  19. //---------------------------------------------------------------------------
  20. class MultiLexer
  21. {
  22. friend class TomitaContext;
  23. public:
  24. MultiLexer(const AsciiDfa & _tokens, const AsciiDfa & _skip, const UnsignedArray & _endTokenChars, unsigned _eofId);
  25. unsigned getEofId() { return eofId; }
  26. unsigned next(position_t pos, GrammarSymbolArray & symbols);
  27. void setDocument(size32_t len, const void * _start);
  28. protected:
  29. GrammarSymbol * createToken(symbol_id id, unsigned len, const byte * start);
  30. position_t skipWhitespace(position_t pos);
  31. protected:
  32. const AsciiDfa & skip;
  33. const AsciiDfa & tokens;
  34. bool isEndToken[256];
  35. unsigned eofId;
  36. struct
  37. {
  38. const byte * start;
  39. const byte * end;
  40. } state;
  41. };
  42. //---------------------------------------------------------------------------
  43. class GrammarSymbol : implements IMatchedElement, public CInterface
  44. {
  45. public:
  46. GrammarSymbol(symbol_id _id) { id = _id; penalty = 0; }
  47. IMPLEMENT_IINTERFACE
  48. bool canMerge(const GrammarSymbol * other);
  49. virtual GrammarSymbol * createMerged(GrammarSymbol * other);
  50. inline symbol_id getId() const { return id; }
  51. inline position_t getLength() const { return position_t(queryEndPtr() - queryStartPtr()); }
  52. inline int getPenalty() const { return penalty; }
  53. inline void addPenalty(int delta) { penalty += delta; }
  54. virtual bool isNull() const = 0;
  55. virtual bool isPacked() const { return false; }
  56. virtual unsigned numChildren() const { return 0; }
  57. virtual GrammarSymbol * queryChild(unsigned i) { return NULL; }
  58. virtual IAtom * queryName() const { return NULL; }
  59. virtual GrammarSymbol * queryPacked(unsigned i) { return NULL; }
  60. virtual size32_t queryResultSize() const { return 0; }
  61. virtual byte * queryResultRow() const { return NULL; }
  62. virtual void resetPosition(const byte * pos) = 0;
  63. virtual const byte * queryStartPtr() const = 0;
  64. virtual const byte * queryEndPtr() const = 0;
  65. virtual const byte * queryRow() const { return queryResultRow(); }
  66. public:
  67. symbol_id id;
  68. int penalty;
  69. FeatureValue features;
  70. };
  71. class Terminal : public GrammarSymbol
  72. {
  73. public:
  74. Terminal(symbol_id _id, const FeatureInfo * _feature, unsigned _len, const byte * _start);
  75. virtual GrammarSymbol * merge(GrammarSymbol * other) { UNIMPLEMENTED; }
  76. virtual bool isNull() const { return false; }
  77. virtual const byte * queryStartPtr() const { return start; }
  78. virtual const byte * queryEndPtr() const { return start+len; }
  79. virtual void resetPosition(const byte * pos) { UNIMPLEMENTED; }
  80. protected:
  81. const byte * start;
  82. unsigned len;
  83. };
  84. //This can be deleted once we switch to an exclusively link counted row thor
  85. class NonTerminal : public GrammarSymbol
  86. {
  87. public:
  88. NonTerminal(symbol_id id, IAtom * _name, FeatureValue & _features, unsigned numSymbols, GrammarSymbol * * symbols, const byte * _reducePtr, size32_t _resultSize, byte * _resultRow);
  89. ~NonTerminal();
  90. virtual bool isNull() const { return cachedIsNull; }
  91. virtual const byte * queryStartPtr() const;
  92. virtual const byte * queryEndPtr() const;
  93. virtual IAtom * queryName() const { return name; }
  94. virtual void resetPosition(const byte * pos);
  95. virtual unsigned numChildren() const { return reduced.ordinality(); }
  96. virtual GrammarSymbol * queryChild(unsigned i);
  97. virtual size32_t queryResultSize() const { return resultSize; }
  98. virtual byte * queryResultRow() const { return resultRow; }
  99. protected:
  100. const byte * reducePtr;
  101. byte * resultRow;
  102. IAtom * name;
  103. size32_t resultSize; // This really shouldn't be needed - it is needed to support old style record cloning, and doRowsMatch - which needs to be done with a helper.
  104. CIArrayOf<GrammarSymbol> reduced;
  105. bool cachedIsNull;
  106. };
  107. class PackedSymbol : public GrammarSymbol
  108. {
  109. public:
  110. PackedSymbol(GrammarSymbol * symbol);
  111. virtual GrammarSymbol * createMerged(GrammarSymbol * other);
  112. virtual const byte * queryStartPtr() const;
  113. virtual const byte * queryEndPtr() const;
  114. virtual void resetPosition(const byte * pos);
  115. virtual bool isNull() const { return false; }
  116. virtual bool isPacked() const { return true; }
  117. virtual unsigned numChildren() const { UNIMPLEMENTED; }
  118. virtual GrammarSymbol * queryChild(unsigned i) { UNIMPLEMENTED; }
  119. virtual IAtom * queryName() const { return equivalents.item(0).queryName(); }
  120. virtual GrammarSymbol * queryPacked(unsigned i);
  121. private:
  122. GrammarSymbolArray equivalents;
  123. };
  124. //---------------------------------------------------------------------------
  125. class PackedSymbolChoice;
  126. class TomitaMatchWalker : implements IMatchWalker, public CInterface
  127. {
  128. public:
  129. TomitaMatchWalker(const PackedSymbolChoice & _choice, GrammarSymbol * _symbol);
  130. IMPLEMENT_IINTERFACE
  131. virtual IAtom * queryName();
  132. virtual unsigned queryID();
  133. virtual size32_t queryMatchSize();
  134. virtual const void * queryMatchStart();
  135. virtual unsigned numChildren();
  136. virtual IMatchWalker * getChild(unsigned idx);
  137. protected:
  138. const PackedSymbolChoice & choice;
  139. GrammarSymbol * symbol;
  140. };
  141. class PackedSymbolChoice
  142. {
  143. public:
  144. void first(GrammarSymbol * symbol);
  145. bool next(GrammarSymbol * symbol);
  146. unsigned getInstance(GrammarSymbol * symbol) const;
  147. void selectBest(GrammarSymbol * symbol);
  148. protected:
  149. void expandBest(GrammarSymbol * symbol);
  150. void expandFirst(GrammarSymbol * symbol);
  151. bool expandNext(unsigned & level, GrammarSymbol * symbol);
  152. public:
  153. GrammarSymbolArray symbols;
  154. UnsignedArray branches;
  155. };
  156. //---------------------------------------------------------------------------
  157. class LRParser;
  158. class StackElement : public CInterface
  159. {
  160. public:
  161. StackElement(GrammarSymbol * _shifted, state_id _state, StackElement * _prev, LRParser * _pool);
  162. virtual void Release();
  163. void addToPool();
  164. void addSibling(StackElement * sib);
  165. void getDebugText(StringBuffer & s);
  166. bool potentialPackNode(StackElement & other) const;
  167. protected:
  168. void doAddToPool();
  169. void gatherPoolPending(CIArrayOf<StackElement> & pending);
  170. public:
  171. Linked<GrammarSymbol> shifted;
  172. state_id state;
  173. Linked<StackElement> prev;
  174. Linked<StackElement> sibling;
  175. LRParser * pool;
  176. };
  177. typedef CIArrayOf<StackElement> StackElementArray;
  178. //would it be better to not link count this??
  179. class LRActiveState
  180. {
  181. public:
  182. LRActiveState(unsigned maxStates);
  183. ~LRActiveState();
  184. void addElementOwn(StackElement * next, bool keepBest);
  185. void clearReduced();
  186. void markReduced(state_id id) { beenReduced[id] = true; }
  187. void mergeIn(LRActiveState & other, bool keepBest);
  188. bool mergePackedNode(unsigned stateId, StackElement * next, bool keepBest);
  189. bool okToAddReduction(state_id id) { return !beenReduced[id]; }
  190. void reinit();
  191. public:
  192. StackElement * * cache;// [MAX_STATES]; // elements in here are NOT linked.
  193. bool * beenReduced;
  194. StackElementArray elements;
  195. unsigned cacheBytes;
  196. unsigned maxStates;
  197. };
  198. class LRParser
  199. {
  200. public:
  201. LRParser(const LRTable & _table, const TomitaParserCallback & _rowState);
  202. ~LRParser();
  203. void init();
  204. void addStartState();
  205. void process(GrammarSymbol * next, bool singleToken);
  206. unsigned numAccepted() { return accepted.ordinality(); }
  207. const GrammarSymbolArray & queryAccepted() { return accepted; }
  208. // Code to support DAG lex graphs...
  209. position_t getFirstPosition();
  210. void removePosition(position_t pos);
  211. void beginParse(bool _chooseBest);
  212. void endParse();
  213. void selectEndPosition(unsigned offset);
  214. void selectStartPosition(unsigned offset);
  215. void addFreeElement(StackElement * element);
  216. protected:
  217. StackElement * createState(StackElement * prev, state_id nextState, GrammarSymbol * shifted);
  218. void cleanupPosition(position_t pos);
  219. void clear();
  220. void doReductions(LRActiveState & active, GrammarSymbol * next);
  221. void doReductions(GrammarSymbol * next, bool singleToken);
  222. void doShifts(LRActiveState * active, GrammarSymbol * next);
  223. void doShifts(GrammarSymbol * next, bool singleToken);
  224. void expandReduction(StackElement & element, LRProduction * production, unsigned numSymbols);
  225. LRActiveState * getPosition(position_t pos);
  226. void setPositionOwn(position_t pos, LRActiveState * value);
  227. void reduce(StackElement & element, GrammarSymbol * next);
  228. protected:
  229. LRActiveState * activeInput;
  230. LRActiveState * activeOutput;
  231. LRActiveState * reduced; // temporary array used while reducing
  232. LRActiveState * reducedOverflow[2]; // temporary array used while reducing
  233. LRActiveState * curOverflow; // temporary array used while reducing
  234. //These are common to a lexer position.
  235. const LRTable & table;
  236. StackElementArray shiftPending; // temporary array used while shifting.
  237. GrammarSymbol * reducedArgs[MAX_PRODUCTION_LENGTH]; // preallocated
  238. GrammarSymbolArray accepted;
  239. //Position management
  240. LRActiveState * positions[MAX_POSITIONS];
  241. position_t firstPosition;
  242. position_t endPosition;
  243. const TomitaParserCallback & rowState; // valid between beginParse() and endParse()
  244. Owned<StackElement> nextFreeElement;
  245. bool chooseBest;
  246. };
  247. class TomitaResultIterator : public CInterface, public INlpResultIterator
  248. {
  249. public:
  250. TomitaResultIterator(const TomitaStateInformation & _rowState, TomitaAlgorithm * _def);
  251. ~TomitaResultIterator();
  252. void reset(const GrammarSymbolArray & values);
  253. virtual bool first();
  254. virtual bool next();
  255. virtual bool isValid();
  256. virtual const void * getRow();
  257. void setAllocator(IEngineRowAllocator * _outputAllocator) { outputAllocator.set(_outputAllocator); }
  258. void invalidate();
  259. protected:
  260. TomitaMatchWalker * getWalker() { return new TomitaMatchWalker(choice, &values.item(curIndex)); }
  261. void firstChoice();
  262. bool isBetter(const GrammarSymbol * left, const GrammarSymbol * right);
  263. protected:
  264. const TomitaStateInformation & rowState; // currently a 1:1 correspondence between Parser and Iterator. Needs to be contained if that changes
  265. Owned<IEngineRowAllocator> outputAllocator;
  266. TomitaAlgorithm * def;
  267. PackedSymbolChoice choice;
  268. unsigned curIndex;
  269. GrammarSymbolArray values;
  270. CTomitaMatchedResults results;
  271. bool singleMatchPerSymbol;
  272. bool matchFirst;
  273. };
  274. class TomitaParser : public CInterface, public INlpParser
  275. {
  276. public:
  277. TomitaParser(ICodeContext * ctx, TomitaAlgorithm * _def, unsigned _activityId, INlpHelper * _helper, IHThorParseArg * arg);
  278. IMPLEMENT_IINTERFACE
  279. virtual bool performMatch(IMatchedAction & action, const void * in, unsigned len, const void * data);
  280. virtual INlpResultIterator * queryResultIter();
  281. virtual void reset();
  282. public:
  283. TomitaParserCallback rowState; // keep first since passed as reference to parser constructor
  284. Owned<IEngineRowAllocator> outputAllocator;
  285. TomitaAlgorithm * def;
  286. INlpHelper * helper;
  287. unsigned eofId;
  288. unsigned activityId;
  289. MultiLexer lexer;
  290. LRParser parser;
  291. TomitaResultIterator iter;
  292. StackElementArray accepted;
  293. GrammarSymbolArray tokens;
  294. IHThorParseArg * helperArg;
  295. };
  296. #endif