hqlnlp.ipp 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158
  /*##############################################################################
      HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
      Licensed under the Apache License, Version 2.0 (the "License");
      you may not use this file except in compliance with the License.
      You may obtain a copy of the License at
         http://www.apache.org/licenses/LICENSE-2.0
      Unless required by applicable law or agreed to in writing, software
      distributed under the License is distributed on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      See the License for the specific language governing permissions and
      limitations under the License.
  ############################################################################## */
  13. #ifndef __HQLNLP_IPP_
  14. #define __HQLNLP_IPP_
  15. #include "thorparse.ipp"
  16. #include "thorregex.hpp"
  17. #include "thorralgo.ipp"
  18. #include "hqlhtcpp.ipp"
  19. //---------------------------------------------------------------------------
  20. #define NO_DFA_SCORE ((unsigned)-1)
  21. typedef MapBetween<LinkedHqlExpr, IHqlExpression *, regexid_t, regexid_t> RegexIdMapping;
  22. class RegexIdAllocator
  23. {
  24. public:
  25. RegexIdAllocator() { nextId = 0; }
  26. regexid_t queryID(IHqlExpression * expr, IAtom * name);
  27. void setID(IHqlExpression * expr, IAtom * name, regexid_t id);
  28. protected:
  29. IHqlExpression * createKey(IHqlExpression * expr, IAtom * name);
  30. protected:
  31. unsigned nextId;
  32. RegexIdMapping map;
  33. };
  34. struct LengthLimit
  35. {
  36. LengthLimit() { minLength = 0; maxLength = PATTERN_UNLIMITED_LENGTH; containsAssertion = false; }
  37. bool canBeNull() { return (minLength == 0) && !containsAssertion; }
  38. unsigned minLength;
  39. unsigned maxLength;
  40. bool containsAssertion;
  41. };
  42. struct ParseInformation;
  43. class MatchReference : public CInterface
  44. {
  45. public:
  46. MatchReference(IHqlExpression * expr);
  47. void compileMatched(RegexIdAllocator & idAllocator, UnsignedArray & ids, UnsignedArray & indexValues);
  48. bool equals(const MatchReference & _other) const;
  49. StringBuffer & getDebugText(StringBuffer & out, RegexIdAllocator & idAllocator);
  50. void getPath(StringBuffer & path);
  51. protected:
  52. void expand(IHqlExpression * expr, bool isLast);
  53. public:
  54. HqlExprArray names;
  55. HqlExprArray indices;
  56. };
  57. struct ParseInformation
  58. {
  59. public:
  60. NlpInputFormat inputFormat() const
  61. {
  62. switch (type)
  63. {
  64. case type_string: return NlpAscii;
  65. case type_utf8: return NlpUtf8;
  66. case type_unicode: return NlpUnicode;
  67. }
  68. throwUnexpected();
  69. }
  70. public:
  71. OwnedHqlExpr separator;
  72. unsigned charSize;
  73. type_t type;
  74. bool caseSensitive;
  75. bool expandRepeatAnyAsDfa;
  76. unsigned dfaComplexity;
  77. bool addedSeparators;
  78. unsigned dfaRepeatMax;
  79. unsigned dfaRepeatMaxScore;
  80. unique_id_t uidBase;
  81. };
  82. class NlpParseContext : public CInterface
  83. {
  84. public:
  85. NlpParseContext(IHqlExpression * _expr, IWorkUnit * _wu, const HqlCppOptions & options);
  86. void addAllMatched();
  87. virtual unsigned addMatchReference(IHqlExpression * expr);
  88. void buildProductions(HqlCppTranslator & translator, BuildCtx & classctx, BuildCtx & startctx);
  89. void buildValidators(HqlCppTranslator & translator, BuildCtx & classctx);
  90. void extractValidates(IHqlExpression * expr);
  91. bool isMatched(IHqlExpression * expr, IAtom * name);
  92. virtual void compileSearchPattern() = 0;
  93. virtual void getDebugText(StringBuffer & s, unsigned detail) = 0;
  94. virtual bool isGrammarAmbiguous() const = 0;
  95. virtual INlpParseAlgorithm * queryParser() = 0;
  96. bool isCaseSensitive() const { return info.caseSensitive; }
  97. type_t searchType() const { return info.type; }
  98. IWorkUnit * wu() { return workunit; }
  99. protected:
  100. void checkValidMatches();
  101. void compileMatched(NlpAlgorithm & parser);
  102. void extractMatchedSymbols(IHqlExpression * expr);
  103. bool isValidMatch(MatchReference & match, unsigned depth, IHqlExpression * pattern);
  104. unsigned getValidatorIndex(IHqlExpression * expr) const { return validators.find(*expr); }
  105. IHqlExpression * queryValidateExpr(IHqlExpression * expr) const;
  106. void setParserOptions(INlpParseAlgorithm & parser);
  107. private:
  108. void doExtractValidates(IHqlExpression * expr);
  109. protected:
  110. ParseInformation info;
  111. OwnedHqlExpr expr;
  112. CIArrayOf<MatchReference> matches;
  113. RegexIdAllocator idAllocator;
  114. HqlExprArray matchedSymbols;
  115. HqlExprArray productions;
  116. HqlExprArray validators;
  117. bool allMatched;
  118. IWorkUnit * workunit;
  119. };
  120. void getCheckRange(IHqlExpression * range, unsigned & minLength, unsigned & maxLength, unsigned charLength);
  121. enum ValidateKind { ValidateIsString, ValidateIsUnicode, ValidateIsEither };
  122. ValidateKind getValidateKind(IHqlExpression * expr);
  123. NlpParseContext * createRegexContext(IHqlExpression * expr, IWorkUnit * wu, const HqlCppOptions & options, byte algorithm);
  124. NlpParseContext * createTomitaContext(IHqlExpression * expr, IWorkUnit * wu, const HqlCppOptions & options);
  125. #endif