thorparse.ipp 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #ifndef __THORPARSE_IPP_
  14. #define __THORPARSE_IPP_
  15. //MORE: Most of this should really be in a thorrparse.ipp instead. Move later.
  16. #include "unicode/utf.h"
  17. #include "thorrparse.hpp"
  18. #include "eclhelper.hpp"
//MORE: How do we know whether we are processing unicode?
  20. class NlpState;
  21. interface INlpMatchedAction
  22. {
  23. public:
  24. virtual bool onMatch(NlpState & matched) = 0;
  25. };
  26. class THORHELPER_API NlpState
  27. {
  28. public:
  29. NlpState(INlpMatchedAction * _action, NlpInputFormat _inputFormat, size32_t len, const void * text);
  30. void pushMatch(MatchState & match, MatchSaveState & save);
  31. void popMatch(const MatchSaveState & save);
  32. void markFinish(MatchSaveState & save);
  33. void unmarkFinish(const MatchSaveState & save);
  34. public:
  35. const byte * cur;
  36. const byte * start;
  37. const byte * end;
  38. MatchState top;
  39. MatchState * curMatch;
  40. MatchState * * next;
  41. INlpMatchedAction * matchAction;
  42. NlpInputFormat inputFormat;
  43. unsigned charSize;
  44. int score;
  45. };
  46. #define UNKNOWN_INSTANCE ((unsigned)-1)
  47. class NlpMatchSearchInstance
  48. {
  49. public:
  50. unsigned lastExactMatchDepth;
  51. unsigned nextIndex;
  52. };
  53. class THORHELPER_API NlpMatchPath : public CInterface
  54. {
  55. public:
  56. NlpMatchPath(MemoryBuffer & in);
  57. NlpMatchPath(const UnsignedArray & _ids, const UnsignedArray & _indices);
  58. ~NlpMatchPath();
  59. void serialize(MemoryBuffer & buffer) const;
  60. inline unsigned numItems() const { return ids.ordinality(); }
  61. inline unsigned getId(unsigned i) const { return ids.item(i); }
  62. inline unsigned getIndex(unsigned i) const { return indices.item(i); }
  63. inline bool matchAny(unsigned i) const { return indices.item(i) == UNKNOWN_INSTANCE; }
  64. inline unsigned nextExactMatchIndex(unsigned from) const
  65. {
  66. for (unsigned i=from; i < indices.ordinality(); i++)
  67. {
  68. unsigned cur = indices.item(i);
  69. if (cur != UNKNOWN_INSTANCE)
  70. return cur;
  71. }
  72. return 0;
  73. }
  74. protected:
  75. UnsignedArray ids;
  76. UnsignedArray indices;
  77. };
  78. class CMatchedElement : public CInterface, public IMatchedElement
  79. {
  80. public:
  81. CMatchedElement(MatchState * _cur) { cur = _cur; }
  82. IMPLEMENT_IINTERFACE
  83. virtual const byte * queryStartPtr() const { return cur->start; }
  84. virtual const byte * queryEndPtr() const { return cur->end; }
  85. virtual const byte * queryRow() const { return NULL; }
  86. protected:
  87. MatchState * cur;
  88. };
  89. class NoMatchElement : public CInterface, public IMatchedElement
  90. {
  91. public:
  92. IMPLEMENT_IINTERFACE
  93. virtual const byte * queryStartPtr() const { return ptr; }
  94. virtual const byte * queryEndPtr() const { return ptr; }
  95. virtual const byte * queryRow() const { return NULL; }
  96. public:
  97. const byte * ptr;
  98. };
  99. class MatchWalker2MatchedElement : public CInterface, public IMatchedElement
  100. {
  101. public:
  102. MatchWalker2MatchedElement(IMatchWalker * _cur) { cur.set(_cur); }
  103. IMPLEMENT_IINTERFACE
  104. virtual const byte * queryStartPtr() const { return (const byte *)cur->queryMatchStart(); }
  105. virtual const byte * queryEndPtr() const { return (const byte *)cur->queryMatchStart() + cur->queryMatchSize(); }
  106. virtual const byte * queryRow() const { return NULL; }
  107. protected:
  108. Owned<IMatchWalker> cur;
  109. };
  110. class THORHELPER_API CMatchedResultInfo : public CInterface
  111. {
  112. friend class CMatchedResults;
  113. public:
  114. CMatchedResultInfo();
  115. virtual void deserialize(MemoryBuffer & buffer);
  116. virtual void serialize(MemoryBuffer & buffer) const;
  117. virtual void addResult(const UnsignedArray & ids, const UnsignedArray & indices) = 0;
  118. void setFormat(NlpInputFormat value) { inputFormat = value; }
  119. protected:
  120. virtual NlpMatchPath * createMatchPath(MemoryBuffer & in) = 0;
  121. public:
  122. CIArrayOf<NlpMatchPath> matchResults;
  123. byte inputFormat;
  124. };
  125. class THORHELPER_API CMatchedResults : public CInterface, implements IMatchedResults
  126. {
  127. public:
  128. CMatchedResults(CMatchedResultInfo * _def);
  129. ~CMatchedResults();
  130. IMPLEMENT_IINTERFACE
  131. void kill();
  132. //IMatchedResults
  133. virtual bool getMatched(unsigned idx);
  134. virtual size32_t getMatchLength(unsigned idx);
  135. virtual size32_t getMatchPosition(unsigned idx);
  136. virtual void getMatchText(size32_t & outlen, char * & out, unsigned idx);
  137. virtual void getMatchUnicode(size32_t & outlen, UChar * & out, unsigned idx);
  138. virtual void getMatchUtf8(size32_t & outlen, char * & out, unsigned idx);
  139. virtual byte * queryMatchRow(unsigned idx);
  140. virtual byte * queryRootResult();
  141. protected:
  142. CMatchedResultInfo * def;
  143. IMatchedElement * * matched;
  144. const byte * in;
  145. NoMatchElement notMatched;
  146. const byte * rootResult;
  147. };
  148. class NlpMatchWalker : public CInterface, public IMatchWalker
  149. {
  150. public:
  151. NlpMatchWalker(MatchState * state) { curMatch = state; }
  152. IMPLEMENT_IINTERFACE
  153. virtual IAtom * queryName();
  154. virtual unsigned queryID() { return curMatch->id; }
  155. virtual size32_t queryMatchSize();
  156. virtual const void * queryMatchStart();
  157. virtual unsigned numChildren();
  158. virtual IMatchWalker * getChild(unsigned idx);
  159. protected:
  160. MatchState * curMatch;
  161. };
  162. class THORHELPER_API NlpAlgorithm : public CInterface, implements INlpParseAlgorithm
  163. {
  164. public:
  165. NlpAlgorithm(CMatchedResultInfo * _matched);
  166. ~NlpAlgorithm();
  167. IMPLEMENT_IINTERFACE
  168. virtual void serialize(MemoryBuffer & out);
  169. void deserialize(MemoryBuffer & in);
  170. virtual void setChoose(bool _chooseMin, bool _chooseMax, bool _chooseBest, bool _singleChoicePerLine);
  171. virtual void setJoin(bool _notMatched, bool _notMatchedOnly);
  172. virtual void setLimit(size32_t _maxLength);
  173. virtual void setOptions(MatchAction _matchAction, ScanAction _scanAction, NlpInputFormat _inputFormat, unsigned _keepLimit, unsigned _atMostLimit);
  174. public:
  175. MatchAction matchAction;
  176. ScanAction scanAction;
  177. NlpInputFormat inputFormat;
  178. bool addedSeparators;
  179. unsigned keepLimit;
  180. unsigned atMostLimit;
  181. byte charWidth;
  182. CMatchedResultInfo * matchInfo;
  183. size32_t maxLength;
  184. bool notMatched;
  185. bool notMatchedOnly;
  186. bool chooseMin;
  187. bool chooseMax;
  188. bool chooseBest;
  189. bool singleChoicePerLine;
  190. };
  191. //---------------------------------------------------------------------------
  192. class THORHELPER_API IDfaPattern : public IInterface
  193. {
  194. public:
  195. virtual void init(unsigned numStates) = 0;
  196. virtual void beginState(unsigned id) = 0;
  197. virtual void setStateAccept(unsigned id) = 0;
  198. virtual void endState() = 0;
  199. virtual void addTransition(unsigned next, unsigned nextState) = 0;
  200. virtual void finished() = 0;
  201. virtual void init(unsigned numStates, unsigned approxTransitions) = 0;
  202. };
  203. struct AsciiDfaState
  204. {
  205. inline bool accepts() const { return acceptID != NotFound; }
  206. unsigned delta;
  207. byte min;
  208. byte max;
  209. unsigned acceptID;
  210. };
  211. class THORHELPER_API AsciiDfa
  212. {
  213. friend class AsciiDfaBuilder;
  214. public:
  215. AsciiDfa();
  216. ~AsciiDfa();
  217. void init(unsigned _numStates);
  218. void deserialize(MemoryBuffer & in);
  219. void serialize(MemoryBuffer & out);
  220. unsigned getNumStates() const { return numStates; }
  221. unsigned getNumTransitions() const { return numTransitions; }
  222. unsigned getAccepts(const AsciiDfaState & state, unsigned idx) const;
  223. const AsciiDfaState * queryStates() const { return states; }
  224. unsigned * queryTransitions() const { return transitions; }
  225. void setEmpty();
  226. void toXML(StringBuffer & out, unsigned detail);
  227. protected:
  228. unsigned numAccepts;
  229. unsigned numStates;
  230. unsigned numTransitions;
  231. unsigned * accepts;
  232. AsciiDfaState * states;
  233. unsigned * transitions;
  234. };
  235. unsigned getMaximumMatchLength(AsciiDfa & dfa, unsigned len, const byte * start);
  236. class THORHELPER_API AsciiDfaBuilder : public CInterface, implements IDfaPattern
  237. {
  238. public:
  239. AsciiDfaBuilder(AsciiDfa & _dfa);
  240. IMPLEMENT_IINTERFACE
  241. virtual void addTransition(unsigned next, unsigned nextState);
  242. virtual void init(unsigned _numStates);
  243. virtual void init(unsigned numStates, unsigned approxTransitions);
  244. virtual void beginState(unsigned id);
  245. virtual void setStateAccept(unsigned id);
  246. virtual void endState();
  247. virtual void finished();
  248. private:
  249. void reallocTransitions(unsigned level);
  250. protected:
  251. AsciiDfa & dfa;
  252. unsigned curState;
  253. unsigned firstTransition;
  254. unsigned maxTransitions;
  255. UnsignedArray accepts;
  256. };
  257. void deserializeBoolArray(unsigned len, bool * values, MemoryBuffer & in);
  258. void serializeBoolArray(MemoryBuffer & out, unsigned len, const bool * values);
  259. INlpParseAlgorithm * createRegexParser(MemoryBuffer & buffer, IOutputMetaData * outRecordSize, byte kind);
  260. INlpParseAlgorithm * createTomitaParser(MemoryBuffer & buffer, IOutputMetaData * outRecordSize);
#endif /* __THORPARSE_IPP_ */