thorparse.ipp 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #ifndef __THORPARSE_IPP_
  14. #define __THORPARSE_IPP_
  15. //MORE: Most of this should really be in a thorrparse.ipp instead. Move later.
  16. #include "thorrparse.hpp"
  17. #include "eclhelper.hpp"
//MORE: How do we know whether we are processing unicode?
  19. class NlpState;
  20. interface INlpMatchedAction
  21. {
  22. public:
  23. virtual bool onMatch(NlpState & matched) = 0;
  24. };
  25. class THORHELPER_API NlpState
  26. {
  27. public:
  28. NlpState(INlpMatchedAction * _action, NlpInputFormat _inputFormat, size32_t len, const void * text);
  29. void pushMatch(MatchState & match, MatchSaveState & save);
  30. void popMatch(const MatchSaveState & save);
  31. void markFinish(MatchSaveState & save);
  32. void unmarkFinish(const MatchSaveState & save);
  33. public:
  34. const byte * cur;
  35. const byte * start;
  36. const byte * end;
  37. MatchState top;
  38. MatchState * curMatch;
  39. MatchState * * next;
  40. INlpMatchedAction * matchAction;
  41. NlpInputFormat inputFormat;
  42. unsigned charSize;
  43. int score;
  44. };
  45. #define UNKNOWN_INSTANCE ((unsigned)-1)
  46. class NlpMatchSearchInstance
  47. {
  48. public:
  49. unsigned lastExactMatchDepth;
  50. unsigned nextIndex;
  51. };
  52. class THORHELPER_API NlpMatchPath : public CInterface
  53. {
  54. public:
  55. NlpMatchPath(MemoryBuffer & in);
  56. NlpMatchPath(const UnsignedArray & _ids, const UnsignedArray & _indices);
  57. ~NlpMatchPath();
  58. void serialize(MemoryBuffer & buffer) const;
  59. inline unsigned numItems() const { return ids.ordinality(); }
  60. inline unsigned getId(unsigned i) const { return ids.item(i); }
  61. inline unsigned getIndex(unsigned i) const { return indices.item(i); }
  62. inline bool matchAny(unsigned i) const { return indices.item(i) == UNKNOWN_INSTANCE; }
  63. inline unsigned nextExactMatchIndex(unsigned from) const
  64. {
  65. for (unsigned i=from; i < indices.ordinality(); i++)
  66. {
  67. unsigned cur = indices.item(i);
  68. if (cur != UNKNOWN_INSTANCE)
  69. return cur;
  70. }
  71. return 0;
  72. }
  73. protected:
  74. UnsignedArray ids;
  75. UnsignedArray indices;
  76. };
  77. class CMatchedElement : public IMatchedElement, public CInterface
  78. {
  79. public:
  80. CMatchedElement(MatchState * _cur) { cur = _cur; }
  81. IMPLEMENT_IINTERFACE
  82. virtual const byte * queryStartPtr() const { return cur->start; }
  83. virtual const byte * queryEndPtr() const { return cur->end; }
  84. virtual const byte * queryRow() const { return NULL; }
  85. protected:
  86. MatchState * cur;
  87. };
  88. class NoMatchElement : public IMatchedElement, public CInterface
  89. {
  90. public:
  91. IMPLEMENT_IINTERFACE
  92. virtual const byte * queryStartPtr() const { return ptr; }
  93. virtual const byte * queryEndPtr() const { return ptr; }
  94. virtual const byte * queryRow() const { return NULL; }
  95. public:
  96. const byte * ptr;
  97. };
  98. class MatchWalker2MatchedElement : public IMatchedElement, public CInterface
  99. {
  100. public:
  101. MatchWalker2MatchedElement(IMatchWalker * _cur) { cur.set(_cur); }
  102. IMPLEMENT_IINTERFACE
  103. virtual const byte * queryStartPtr() const { return (const byte *)cur->queryMatchStart(); }
  104. virtual const byte * queryEndPtr() const { return (const byte *)cur->queryMatchStart() + cur->queryMatchSize(); }
  105. virtual const byte * queryRow() const { return NULL; }
  106. protected:
  107. Owned<IMatchWalker> cur;
  108. };
  109. class THORHELPER_API CMatchedResultInfo : public CInterface
  110. {
  111. friend class CMatchedResults;
  112. public:
  113. CMatchedResultInfo();
  114. virtual void deserialize(MemoryBuffer & buffer);
  115. virtual void serialize(MemoryBuffer & buffer) const;
  116. virtual void addResult(const UnsignedArray & ids, const UnsignedArray & indices) = 0;
  117. void setFormat(NlpInputFormat value) { inputFormat = value; }
  118. protected:
  119. virtual NlpMatchPath * createMatchPath(MemoryBuffer & in) = 0;
  120. public:
  121. CIArrayOf<NlpMatchPath> matchResults;
  122. byte inputFormat;
  123. };
  124. class THORHELPER_API CMatchedResults : implements IMatchedResults, public CInterface
  125. {
  126. public:
  127. CMatchedResults(CMatchedResultInfo * _def);
  128. ~CMatchedResults();
  129. IMPLEMENT_IINTERFACE
  130. void kill();
  131. //IMatchedResults
  132. virtual bool getMatched(unsigned idx);
  133. virtual size32_t getMatchLength(unsigned idx);
  134. virtual size32_t getMatchPosition(unsigned idx);
  135. virtual void getMatchText(size32_t & outlen, char * & out, unsigned idx);
  136. virtual void getMatchUnicode(size32_t & outlen, UChar * & out, unsigned idx);
  137. virtual void getMatchUtf8(size32_t & outlen, char * & out, unsigned idx);
  138. virtual byte * queryMatchRow(unsigned idx);
  139. virtual byte * queryRootResult();
  140. protected:
  141. CMatchedResultInfo * def;
  142. IMatchedElement * * matched;
  143. const byte * in;
  144. NoMatchElement notMatched;
  145. const byte * rootResult;
  146. };
  147. class NlpMatchWalker : public IMatchWalker, public CInterface
  148. {
  149. public:
  150. NlpMatchWalker(MatchState * state) { curMatch = state; }
  151. IMPLEMENT_IINTERFACE
  152. virtual IAtom * queryName();
  153. virtual unsigned queryID() { return curMatch->id; }
  154. virtual size32_t queryMatchSize();
  155. virtual const void * queryMatchStart();
  156. virtual unsigned numChildren();
  157. virtual IMatchWalker * getChild(unsigned idx);
  158. protected:
  159. MatchState * curMatch;
  160. };
  161. class THORHELPER_API NlpAlgorithm : public CInterface, implements INlpParseAlgorithm
  162. {
  163. public:
  164. NlpAlgorithm(CMatchedResultInfo * _matched);
  165. ~NlpAlgorithm();
  166. IMPLEMENT_IINTERFACE
  167. virtual void serialize(MemoryBuffer & out);
  168. void deserialize(MemoryBuffer & in);
  169. virtual void setChoose(bool _chooseMin, bool _chooseMax, bool _chooseBest, bool _singleChoicePerLine);
  170. virtual void setJoin(bool _notMatched, bool _notMatchedOnly);
  171. virtual void setLimit(size32_t _maxLength);
  172. virtual void setOptions(MatchAction _matchAction, ScanAction _scanAction, NlpInputFormat _inputFormat, unsigned _keepLimit, unsigned _atMostLimit);
  173. public:
  174. MatchAction matchAction;
  175. ScanAction scanAction;
  176. NlpInputFormat inputFormat;
  177. bool addedSeparators;
  178. unsigned keepLimit;
  179. unsigned atMostLimit;
  180. byte charWidth;
  181. CMatchedResultInfo * matchInfo;
  182. size32_t maxLength;
  183. bool notMatched;
  184. bool notMatchedOnly;
  185. bool chooseMin;
  186. bool chooseMax;
  187. bool chooseBest;
  188. bool singleChoicePerLine;
  189. };
  190. //---------------------------------------------------------------------------
  191. class THORHELPER_API IDfaPattern : public IInterface
  192. {
  193. public:
  194. virtual void init(unsigned numStates) = 0;
  195. virtual void beginState(unsigned id) = 0;
  196. virtual void setStateAccept(unsigned id) = 0;
  197. virtual void endState() = 0;
  198. virtual void addTransition(unsigned next, unsigned nextState) = 0;
  199. virtual void finished() = 0;
  200. virtual void init(unsigned numStates, unsigned approxTransitions) = 0;
  201. };
  202. struct AsciiDfaState
  203. {
  204. inline bool accepts() const { return acceptID != NotFound; }
  205. unsigned delta;
  206. byte min;
  207. byte max;
  208. unsigned acceptID;
  209. };
  210. class THORHELPER_API AsciiDfa
  211. {
  212. friend class AsciiDfaBuilder;
  213. public:
  214. AsciiDfa();
  215. ~AsciiDfa();
  216. void init(unsigned _numStates);
  217. void deserialize(MemoryBuffer & in);
  218. void serialize(MemoryBuffer & out);
  219. unsigned getNumStates() const { return numStates; }
  220. unsigned getNumTransitions() const { return numTransitions; }
  221. unsigned getAccepts(const AsciiDfaState & state, unsigned idx) const;
  222. const AsciiDfaState * queryStates() const { return states; }
  223. unsigned * queryTransitions() const { return transitions; }
  224. void setEmpty();
  225. void toXML(StringBuffer & out, unsigned detail);
  226. protected:
  227. unsigned numAccepts;
  228. unsigned numStates;
  229. unsigned numTransitions;
  230. unsigned * accepts;
  231. AsciiDfaState * states;
  232. unsigned * transitions;
  233. };
  234. unsigned getMaximumMatchLength(AsciiDfa & dfa, unsigned len, const byte * start);
  235. class THORHELPER_API AsciiDfaBuilder : public CInterface, implements IDfaPattern
  236. {
  237. public:
  238. AsciiDfaBuilder(AsciiDfa & _dfa);
  239. IMPLEMENT_IINTERFACE
  240. virtual void addTransition(unsigned next, unsigned nextState);
  241. virtual void init(unsigned _numStates);
  242. virtual void init(unsigned numStates, unsigned approxTransitions);
  243. virtual void beginState(unsigned id);
  244. virtual void setStateAccept(unsigned id);
  245. virtual void endState();
  246. virtual void finished();
  247. private:
  248. void reallocTransitions(unsigned level);
  249. protected:
  250. AsciiDfa & dfa;
  251. unsigned curState;
  252. unsigned firstTransition;
  253. unsigned maxTransitions;
  254. UnsignedArray accepts;
  255. };
  256. void deserializeBoolArray(unsigned len, bool * values, MemoryBuffer & in);
  257. void serializeBoolArray(MemoryBuffer & out, unsigned len, const bool * values);
  258. INlpParseAlgorithm * createRegexParser(MemoryBuffer & buffer, IOutputMetaData * outRecordSize, byte kind);
  259. INlpParseAlgorithm * createTomitaParser(MemoryBuffer & buffer, IOutputMetaData * outRecordSize);
#endif /* __THORPARSE_IPP_ */