hqlnlp.ipp 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. /*##############################################################################
  2. Copyright (C) 2011 HPCC Systems.
  3. All rights reserved. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU Affero General Public License as
  5. published by the Free Software Foundation, either version 3 of the
  6. License, or (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU Affero General Public License for more details.
  11. You should have received a copy of the GNU Affero General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. ############################################################################## */
  14. #ifndef __HQLNLP_IPP_
  15. #define __HQLNLP_IPP_
  16. #include "thorparse.ipp"
  17. #include "thorregex.hpp"
  18. #include "thorrparse.ipp"
  19. #include "hqlhtcpp.ipp"
  20. //---------------------------------------------------------------------------
  // Sentinel value: (unsigned)-1 is the largest value an unsigned can hold.
  // NOTE(review): presumably means "no DFA score has been computed" - confirm against its users.
  21. #define NO_DFA_SCORE ((unsigned)-1)
  // Mapping type held by RegexIdAllocator (its `map` member) to associate expressions with regexid_t values.
  // NOTE(review): the meaning of the four MapBetween template arguments is defined in a header not shown here.
  22. typedef MapBetween<LinkedHqlExpr, IHqlExpression *, regexid_t, regexid_t> RegexIdMapping;
  23. class RegexIdAllocator
  24. {
  25. public:
  26. RegexIdAllocator() { nextId = 0; }
  27. regexid_t queryID(IHqlExpression * expr, _ATOM name);
  28. void setID(IHqlExpression * expr, _ATOM name, regexid_t id);
  29. protected:
  30. IHqlExpression * createKey(IHqlExpression * expr, _ATOM name);
  31. protected:
  32. unsigned nextId;
  33. RegexIdMapping map;
  34. };
  35. struct LengthLimit
  36. {
  37. LengthLimit() { minLength = 0; maxLength = PATTERN_UNLIMITED_LENGTH; containsAssertion = false; }
  38. bool canBeNull() { return (minLength == 0) && !containsAssertion; }
  39. unsigned minLength;
  40. unsigned maxLength;
  41. bool containsAssertion;
  42. };
  // Forward declaration; the full definition of ParseInformation follows MatchReference.
  43. struct ParseInformation;
  // One match reference built from an expression, stored as two expression arrays
  // (names and indices). Derives from CInterface. All member functions are
  // implemented outside this header, so the notes below rely on signatures only.
  44. class MatchReference : public CInterface
  45. {
  46. public:
  // Constructs the reference from expr.
  // NOTE(review): presumably fills names/indices via expand() - confirm in the implementation.
  47. MatchReference(IHqlExpression * expr);
  // Takes idAllocator plus two output arrays by non-const reference.
  // NOTE(review): presumably appends the ids and index values for this reference - confirm.
  48. void compileMatched(RegexIdAllocator & idAllocator, UnsignedArray & ids, UnsignedArray & indexValues);
  // Compares this reference with another one; const, so it does not modify this object.
  49. bool equals(const MatchReference & _other) const;
  // Writes a debug description into out and returns a StringBuffer reference
  // (presumably out itself, to allow chaining - confirm).
  50. StringBuffer & getDebugText(StringBuffer & out, RegexIdAllocator & idAllocator);
  // Writes a path for this reference into the supplied buffer.
  51. void getPath(StringBuffer & path);
  52. protected:
  // Internal helper that processes expr; isLast is supplied by the caller.
  // NOTE(review): exact semantics of isLast are not visible in this header.
  53. void expand(IHqlExpression * expr, bool isLast);
  54. public:
  // Public data: the expressions making up the reference.
  // NOTE(review): whether names and indices are kept the same length is not visible here.
  55. HqlExprArray names;
  56. HqlExprArray indices;
  57. };
  58. struct ParseInformation
  59. {
  60. public:
  61. NlpInputFormat inputFormat() const
  62. {
  63. switch (type)
  64. {
  65. case type_string: return NlpAscii;
  66. case type_utf8: return NlpUtf8;
  67. case type_unicode: return NlpUnicode;
  68. }
  69. throwUnexpected();
  70. }
  71. public:
  72. OwnedHqlExpr separator;
  73. unsigned charSize;
  74. type_t type;
  75. bool caseSensitive;
  76. bool expandRepeatAnyAsDfa;
  77. unsigned dfaComplexity;
  78. bool addedSeparators;
  79. unsigned dfaRepeatMax;
  80. unsigned dfaRepeatMaxScore;
  81. unique_id_t uidBase;
  82. };
  83. class NlpParseContext : public CInterface
  84. {
  85. public:
  86. NlpParseContext(IHqlExpression * _expr, IWorkUnit * _wu, const HqlCppOptions & options, ITimeReporter * _timeReporter);
  87. void addAllMatched();
  88. virtual unsigned addMatchReference(IHqlExpression * expr);
  89. void buildProductions(HqlCppTranslator & translator, BuildCtx & classctx, BuildCtx & startctx);
  90. void buildValidators(HqlCppTranslator & translator, BuildCtx & classctx);
  91. void extractValidates(IHqlExpression * expr);
  92. bool isMatched(IHqlExpression * expr, _ATOM name);
  93. virtual void compileSearchPattern() = 0;
  94. virtual void getDebugText(StringBuffer & s, unsigned detail) = 0;
  95. virtual bool isGrammarAmbiguous() const = 0;
  96. virtual INlpParseAlgorithm * queryParser() = 0;
  97. bool isCaseSensitive() const { return info.caseSensitive; }
  98. type_t searchType() const { return info.type; }
  99. IWorkUnit * wu() { return workunit; }
  100. protected:
  101. void checkValidMatches();
  102. void compileMatched(NlpAlgorithm & parser);
  103. void extractMatchedSymbols(IHqlExpression * expr);
  104. bool isValidMatch(MatchReference & match, unsigned depth, IHqlExpression * pattern);
  105. unsigned getValidatorIndex(IHqlExpression * expr) const { return validators.find(*expr); }
  106. IHqlExpression * queryValidateExpr(IHqlExpression * expr) const;
  107. void setParserOptions(INlpParseAlgorithm & parser);
  108. private:
  109. void doExtractValidates(IHqlExpression * expr);
  110. protected:
  111. ParseInformation info;
  112. OwnedHqlExpr expr;
  113. CIArrayOf<MatchReference> matches;
  114. RegexIdAllocator idAllocator;
  115. HqlExprArray matchedSymbols;
  116. HqlExprArray productions;
  117. HqlExprArray validators;
  118. bool allMatched;
  119. IWorkUnit * workunit;
  120. Linked<ITimeReporter> timeReporter;
  121. };
  // Derives a minimum and maximum length from a range expression and stores them
  // in the two output references.
  // NOTE(review): how charLength influences the result is not visible in this header.
  122. void getCheckRange(IHqlExpression * range, unsigned & minLength, unsigned & maxLength, unsigned charLength);
  // Classification returned by getValidateKind(): string, unicode, or either.
  123. enum ValidateKind { ValidateIsString, ValidateIsUnicode, ValidateIsEither };
  // Returns the ValidateKind for the given expression.
  124. ValidateKind getValidateKind(IHqlExpression * expr);
  // Factory functions returning an NlpParseContext for the given expression.
  // The regex variant takes an extra `algorithm` byte; the Tomita variant does not.
  // NOTE(review): ownership of the returned pointer is not stated here - presumably
  // the caller receives a new object it must release; confirm in the implementation.
  125. NlpParseContext * createRegexContext(IHqlExpression * expr, IWorkUnit * wu, const HqlCppOptions & options, ITimeReporter * timeReporter, byte algorithm);
  126. NlpParseContext * createTomitaContext(IHqlExpression * expr, IWorkUnit * wu, const HqlCppOptions & options, ITimeReporter * timeReporter);
  127. #endif