hqlregex.ipp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #ifndef __HQLREGEX_IPP_
  14. #define __HQLREGEX_IPP_
  15. #include "hqlnlp.ipp"
  16. //---------------------------------------------------------------------------
  17. #define NO_DFA_SCORE ((unsigned)-1)
  18. class HqlRegexExpr;
  19. typedef CIArrayOf<HqlRegexExpr> HqlRegexExprArray;
  20. class RegexContext;
  21. struct GenerateRegexCtx;
  22. class HqlNamedRegex : public CInterface
  23. {
  24. friend class HqlRegexExpr;
  25. public:
  26. HqlNamedRegex(IHqlExpression * _expr, IAtom * _name, IHqlExpression * _searchExpr, node_operator _kind, bool _caseSensitive, bool _isMatched);
  27. ~HqlNamedRegex();
  28. void addBeginEnd(const ParseInformation & options);
  29. void analyseNullLeadTrail();
  30. void calcDfaScore();
  31. void calculateNext();
  32. bool canBeNull();
  33. void cleanup();
  34. void createDFAs(RegexContext & ctx);
  35. void expandNamedSymbols();
  36. void expandRecursion(RegexContext & ctx, HqlNamedRegex * self);
  37. void generateDFAs();
  38. void generateRegex(GenerateRegexCtx & info);
  39. unsigned getDfaScore();
  40. RegexPattern * queryRootPattern();
  41. void insertSeparators(IHqlExpression * separator, RegexContext * ctx);
  42. void markDFAs(unsigned complexity);
  43. bool matches(IHqlExpression * _def, IAtom * _name, bool _caseSensitive) { return searchExpr == _def && name == _name && caseSensitive == _caseSensitive; }
  44. bool matches(IHqlExpression * _def, IAtom * _name, node_operator _op, bool _caseSensitive) { return searchExpr == _def && name == _name && kind == _op && caseSensitive == _caseSensitive; }
  45. bool matchesDefine(IHqlExpression * name, bool _caseSensitive);
  46. void mergeCreateSets();
  47. bool queryExpandInline();
  48. bool queryExpandedRecursion() { return doneExpandRecursion; }
  49. void setRegexOwn(HqlRegexExpr * _def);
  50. void addUse() { numUses++; }
  51. void addRecursiveUse() { numUses++; isRecursive = true; isMatched = true; }
  52. bool isUsed() { return (numUses != 0); }
  53. void removeUse() { numUses--; }
  54. unsigned getMinLength() const { return limit.minLength; }
  55. unsigned getMaxLength() const { return limit.maxLength; }
  56. const LengthLimit & queryLimit() { return limit; }
  57. protected:
  58. unsigned numUses;
  59. IAtom * name;
  60. LengthLimit limit;
  61. node_operator kind;
  62. OwnedHqlExpr searchExpr;
  63. OwnedHqlExpr expr;
  64. Owned<HqlRegexExpr> def;
  65. Owned<RegexNamed> created;
  66. OwnedHqlExpr defineName;
  67. bool isMatched:1;
  68. bool isRecursive:1;
  69. bool doneExpandNamed:1;
  70. bool doneCalcDfaScore:1;
  71. bool doneCreateDFA:1;
  72. bool doneExpandRecursion:1;
  73. bool doneMarkDfas:1;
  74. bool noGenerate:1;
  75. bool caseSensitive:1;
  76. };
  77. class HqlRegexHashTable : public SuperHashTable
  78. {
  79. public:
  80. using SuperHashTable::add;
  81. using SuperHashTable::find;
  82. ~HqlRegexHashTable() { _releaseAll(); }
  83. private:
  84. virtual void onAdd(void *et);
  85. virtual void onRemove(void *et);
  86. virtual unsigned getHashFromElement(const void *et) const;
  87. virtual unsigned getHashFromFindParam(const void *fp) const;
  88. virtual const void * getFindParam(const void *et) const;
  89. virtual bool matchesFindParam(const void *et, const void *key, unsigned fphash) const;
  90. };
  91. class HqlRegexUniqueArray
  92. {
  93. public:
  94. void append(HqlRegexExpr & expr);
  95. unsigned ordinality() const { return array.ordinality(); }
  96. HqlRegexExpr & item(unsigned i) const { return array.item(i); }
  97. void kill() { array.kill(); hash.kill(); }
  98. protected:
  99. HqlRegexExprArray array;
  100. HqlRegexHashTable hash;
  101. };
  102. //Can't use the IHqlExpressions because instances can't be commoned up....
  103. class HqlDfaInfo;
  104. class SymbolArray;
  105. class HqlRegexExpr : public CInterface
  106. {
  107. friend class HqlNamedRegex;
  108. friend class HqlComplexRegexExpr;
  109. public:
  110. HqlRegexExpr(const ParseInformation & _options, IHqlExpression * _expr, bool _caseSensitive);
  111. HqlRegexExpr(const ParseInformation & _options, node_operator _op, IHqlExpression * _expr, bool _caseSensitive);
  112. ~HqlRegexExpr();
  113. //Functions like IHqlExpression....
  114. node_operator getOperator() const { return op; }
  115. HqlRegexExpr * queryChild(unsigned idx) const { return args.isItem(idx) ? &args.item(idx) : NULL; }
  116. unsigned numChildren() const { return args.ordinality(); }
  117. void addOperand(HqlRegexExpr * _arg) { args.append(*_arg); }
  118. void replaceOperand(HqlRegexExpr * _arg, unsigned idx) { args.replace(*_arg, idx); }
  119. virtual void setNamed(HqlNamedRegex * _named) { UNIMPLEMENTED; }
  120. virtual void setSubPattern(HqlNamedRegex * _sub) { UNIMPLEMENTED; }
  121. bool canBeNull() { return limit.canBeNull(); }
  122. const LengthLimit & queryLimit() { return limit; }
  123. unsigned getAcceptId();
  124. void setAcceptId(unsigned id) { dfaScore = id; } // change later...
  125. unsigned getRepeatMin();
  126. unsigned getRepeatMax();
  127. bool isStandardRepeat();
  128. unique_id_t queryUid() { return uid; }
  129. virtual HqlNamedRegex * queryNamed() { return NULL; }
  130. virtual HqlNamedRegex * querySubPattern() { return NULL; }
  131. virtual void analyseNullLeadTrail() = 0;
  132. virtual void gatherLeading(HqlRegexUniqueArray & target) = 0;
  133. virtual void gatherTrailing(HqlRegexUniqueArray & target) = 0;
  134. virtual unsigned numTrailing() const = 0;
  135. virtual HqlRegexExpr & queryTrailing(unsigned idx) = 0;
  136. //General processing functions
  137. void calcDfaScore();
  138. void calculateNext();
  139. bool canConsume(unsigned next);
  140. virtual void cleanup();
  141. virtual HqlRegexExpr * clone();
  142. void connectRegex();
  143. HqlRegexExpr * createDFAs(RegexContext & ctx);
  144. HqlRegexExpr * expandAsDFA();
  145. HqlRegexExpr * expandAsRepeatedDFA(unsigned count);
  146. HqlRegexExpr * expandNamedSymbols();
  147. void generateDFA(IDfaPattern * pattern);
  148. void generateDFAs();
  149. void generateRegex(GenerateRegexCtx & info);
  150. HqlRegexExpr * insertSeparators(IHqlExpression * separator, RegexContext * ctx);
  151. void markDFAs(unsigned complexity);
  152. HqlRegexExpr * mergeCreateSets();
  153. HqlRegexExpr * expandRecursion(RegexContext & ctx, HqlNamedRegex * self);
  154. void gatherConsumeSymbols(SymbolArray & next);
  155. unsigned getHash()
  156. {
  157. HqlRegexExpr * key = this;
  158. return hashc((const byte *)&key, sizeof(key), 0);
  159. }
  160. protected:
  161. bool isWorthConvertingToDFA();
  162. bool isSimpleStringList();
  163. private:
  164. void init();
  165. protected:
  166. unsigned uid;
  167. OwnedHqlExpr expr;
  168. node_operator op; // always create an expression and remove this?
  169. HqlRegexExprArray args; // move to complex!
  170. Owned<RegexPattern> created;
  171. const ParseInformation & options;
  172. HqlDfaInfo * dfa; // pointer because forward declared
  173. // create a derived HqlDfaRegexExpr and move this there..
  174. HqlRegexUniqueArray following;
  175. LengthLimit limit; // move to complex!
  176. unsigned dfaScore; // move to complex!
  177. bool createDFA:1;
  178. bool cleaned:1;
  179. bool connected:1;
  180. bool analysed:1;
  181. bool added:1;
  182. bool caseSensitive:1;
  183. };
  184. //MORE: Should move some of the logic to complex + remove the
  185. class HqlSimpleRegexExpr : public HqlRegexExpr
  186. {
  187. friend class HqlNamedRegex;
  188. public:
  189. HqlSimpleRegexExpr(const ParseInformation & _options, IHqlExpression * _expr, bool _caseSensitive) : HqlRegexExpr(_options, _expr, _caseSensitive) {}
  190. HqlSimpleRegexExpr(const ParseInformation & _options, node_operator _op, IHqlExpression * _expr, bool _caseSensitive) : HqlRegexExpr(_options, _op, _expr, _caseSensitive) {}
  191. virtual void analyseNullLeadTrail();
  192. virtual void gatherLeading(HqlRegexUniqueArray & target);
  193. virtual void gatherTrailing(HqlRegexUniqueArray & target);
  194. virtual unsigned numTrailing() const;
  195. virtual HqlRegexExpr & queryTrailing(unsigned idx);
  196. };
  197. class HqlComplexRegexExpr : public HqlRegexExpr
  198. {
  199. friend class HqlNamedRegex;
  200. public:
  201. HqlComplexRegexExpr(const ParseInformation & _options, IHqlExpression * _expr, bool _caseSensitive) : HqlRegexExpr(_options, _expr, _caseSensitive) {}
  202. HqlComplexRegexExpr(const ParseInformation & _options, node_operator _op, IHqlExpression * _expr, bool _caseSensitive) : HqlRegexExpr(_options, _op, _expr, _caseSensitive) {}
  203. virtual void analyseNullLeadTrail();
  204. virtual void gatherLeading(HqlRegexUniqueArray & target);
  205. virtual void gatherTrailing(HqlRegexUniqueArray & target);
  206. virtual unsigned numTrailing() const { return trailing.ordinality(); }
  207. virtual HqlRegexExpr & queryTrailing(unsigned idx) { return trailing.item(idx); }
  208. virtual HqlNamedRegex * queryNamed() { return named; }
  209. virtual HqlNamedRegex * querySubPattern() { return subPattern; }
  210. virtual void setNamed(HqlNamedRegex * _named) { named.set(_named); }
  211. virtual void setSubPattern(HqlNamedRegex * _sub) { subPattern.set(_sub); }
  212. virtual void cleanup();
  213. private:
  214. Owned<HqlNamedRegex> named;
  215. Owned<HqlNamedRegex> subPattern;
  216. HqlRegexUniqueArray leading;
  217. HqlRegexUniqueArray trailing;
  218. };
  219. class HqlRegexExprSet
  220. {
  221. public:
  222. void add(HqlRegexExpr * value);
  223. int compare(const HqlRegexExprSet & _other) const;
  224. bool equals(const HqlRegexExprSet & _other) const;
  225. inline unsigned ordinality() const { return values.ordinality(); }
  226. inline HqlRegexExpr & item(unsigned i) { return values.item(i); }
  227. private:
  228. CICopyArrayOf<HqlRegexExpr> values;
  229. };
  230. class HqlDfaState;
  231. class HqlDfaTransition : public CInterface
  232. {
  233. public:
  234. HqlDfaTransition(unsigned _code, HqlDfaState * _next) { code = _code; next = _next; }
  235. unsigned code;
  236. HqlDfaState * next;
  237. };
  238. class HqlDfaState : public CInterface
  239. {
  240. public:
  241. HqlDfaState(unsigned _id) { id = _id; }
  242. bool isAccepting();
  243. public:
  244. unsigned id;
  245. HqlRegexExprSet position;
  246. CIArrayOf<HqlDfaTransition> transitions;
  247. };
  248. class HqlDfaInfo : public CInterface
  249. {
  250. public:
  251. CIArrayOf<HqlDfaState> states;
  252. CIArrayOf<HqlDfaState> orderedStates;
  253. };
  254. class RegexContext : public NlpParseContext
  255. {
  256. friend class HqlNamedRegex;
  257. friend class HqlRegexExpr;
  258. public:
  259. RegexContext(IHqlExpression * _expr, IWorkUnit * wu, const HqlCppOptions & options, byte _algorithm);
  260. ~RegexContext();
  261. virtual void compileSearchPattern();
  262. virtual void getDebugText(StringBuffer & s, unsigned detail);
  263. virtual bool isGrammarAmbiguous() const { return false; }
  264. virtual INlpParseAlgorithm * queryParser() { return &parser; }
  265. HqlNamedRegex * queryDefine(IHqlExpression * defineName, bool caseSensitive);
  266. //When used to generate a lexer - should be split out of RegexContext when I have the time
  267. void addLexerToken(unsigned id, IHqlExpression * pattern);
  268. void beginLexer();
  269. void generateLexer(IDfaPattern * builder);
  270. protected:
  271. void analysePattern();
  272. void buildStructure();
  273. HqlRegexExpr * createFollow(HqlRegexExpr * start, IHqlExpression * body, node_operator endOp, IHqlExpression * endExprs, bool caseSensitive);
  274. HqlRegexExpr * createStructure(IHqlExpression * expr, bool caseSensitive);
  275. void expandRecursion();
  276. void generateRegex();
  277. void insertSeparators();
  278. void optimizePattern();
  279. void optimizeSpotDFA();
  280. HqlNamedRegex * queryNamed(IHqlExpression * defn, IAtom * name, node_operator op, bool caseSensitive);
  281. HqlNamedRegex * createNamed(IHqlExpression * expr, IAtom * name, node_operator op, bool caseSensitive);
  282. protected:
  283. CIArrayOf<HqlNamedRegex> named;
  284. CIArrayOf<HqlNamedRegex> deadNamed;
  285. Owned<HqlNamedRegex> root;
  286. RegexAlgorithm parser;
  287. bool createLexer;
  288. Owned<HqlNamedRegex> lexerNamed;
  289. Owned<HqlRegexExpr> lexerRoot;
  290. Owned<HqlRegexExpr> lexerOr;
  291. };
  292. struct GenerateRegexCtx
  293. {
  294. GenerateRegexCtx(const RegexContext & _ctx, const ParseInformation & _info, RegexIdAllocator & _idAllocator) : regex(_ctx), info(_info), idAllocator(_idAllocator) { namedKind = no_none; }
  295. const RegexContext & regex;
  296. const ParseInformation & info;
  297. RegexIdAllocator & idAllocator;
  298. node_operator namedKind;
  299. };
  300. #endif