thorrparse.hpp 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #ifndef __THORRPARSE_HPP_
  14. #define __THORRPARSE_HPP_
  15. #include "thorparse.hpp"
  16. class RegexPattern;
  17. class RegexNamed;
  18. class RegexState;
  19. struct RegexSerializeState;
  20. struct RegexXmlState;
  21. typedef CIArrayOf<RegexPattern> RegexPatternArray;
  22. typedef CIArrayOf<RegexNamed> RegexNamedArray;
  23. typedef CopyCIArrayOf<RegexPattern> RegexPatternCopyArray;
  // Kind tag for each regex pattern node; used for serialization.
  // NOTE(review): because these values are used for serialization, new kinds
  // should presumably be appended rather than inserted, so existing numeric
  // values do not shift - confirm against the (de)serialization code.
  enum ThorRegexKind
  {
      ThorRegexNone,
      ThorRegexNull,
      ThorRegexAnyChar,
      ThorRegexAsciiDFA,
      ThorRegexUnicodeDFA,
      ThorRegexAscii,
      ThorRegexAsciiI,
      ThorRegexAsciiSet,
      ThorRegexAsciiISet,
      ThorRegexUnicode,
      ThorRegexUnicodeI,
      ThorRegexUnicodeSet,
      ThorRegexUnicodeISet,
      ThorRegexStart,
      ThorRegexFinish,
      ThorRegexBeginToken,
      ThorRegexEndToken,
      ThorRegexBeginSeparator,
      ThorRegexEndSeparator,
      ThorRegexRepeat,                // used for repeats with counts.
      ThorRegexBeginCheck,
      ThorRegexAssertNext,
      ThorRegexAssertPrev,
      ThorRegexCheckLength,
      ThorRegexCheck,
      ThorRegexValidateAscAsAsc,
      ThorRegexValidateUniAsAsc,
      ThorRegexValidateAscAsUni,
      ThorRegexValidateUniAsUni,
      ThorRegexNamed,
      ThorRegexEndNamed,
      ThorRegexEndNested,
      ThorRegexDone,
      ThorRegexRecursive,
      ThorRegexEndRecursive,
      ThorRegexPenalty,
      ThorRegexRepeatAny,
      ThorRegexMax,                   // despite its name, NOT the last enumerator: more kinds follow
      ThorRegexValidateUtf8AsAsc,
      ThorRegexValidateUtf8AsUni,
      ThorRegexUtf8,
      ThorRegexUtf8I,
      // Temporary - not serialized...
      ThorRegexRepeatInstance
  };
  // Result code returned by the match functions.
  // RegexMatchContinue is processed the same as RegexMatchBacktrack, but is
  // returned in different contexts.
  enum RegexMatchAction
  {
      RegexMatchDone,
      RegexMatchBacktrack,
      RegexMatchBacktrackToken,
      RegexMatchContinue
  };
  53. class ActiveStage;
  54. class RegexMatchStateSave;
  55. class THORHELPER_API RegexPattern : public CInterface
  56. {
  57. public:
  58. RegexPattern();
  59. virtual ThorRegexKind getKind() = 0;
  60. virtual RegexMatchAction match(RegexState & state) = 0;
  61. virtual RegexMatchAction beginMatch(RegexState & state) = 0;
  62. virtual RegexMatchAction nextAction(ActiveStage & stage, RegexState & state);
  63. virtual void killStage(ActiveStage & stage, RegexState & state);
  64. //serialization code....
  65. virtual void dispose(); // needed to free the structure because it is a cyclic graph
  66. virtual bool gather(RegexSerializeState & state);
  67. virtual void serializePattern(MemoryBuffer & out);
  68. virtual void serializeLinks(MemoryBuffer & out, RegexSerializeState & state);
  69. virtual void deserializePattern(MemoryBuffer & in);
  70. virtual void deserializeLinks(MemoryBuffer & in, RegexSerializeState & state);
  71. virtual void toXML(StringBuffer & out, RegexXmlState & state);
  72. virtual void toXMLattr(StringBuffer & out, RegexXmlState & state);
  73. //construction
  74. virtual void addLink(RegexPattern * link) { next.append(OLINK(*link)); }
  75. virtual void setBody(RegexNamed * name) { UNIMPLEMENTED; }
  76. virtual void setSubPattern(RegexPattern * _pattern) { UNIMPLEMENTED; }
  77. public:
  78. RegexMatchAction traceMatch(RegexState & state);
  79. void gatherNext(RegexSerializeState & state);
  80. void clearGathered() { gathered = false; }
  81. virtual void getTraceText(StringBuffer & s);
  82. protected:
  83. inline RegexMatchAction pushMatched(RegexState & state);
  84. RegexMatchAction matchNext(RegexState & state);
  85. inline RegexMatchAction markFinishContinueMatch(RegexState & state);
  86. RegexMatchAction nextChild(ActiveStage & stage, RegexState & state);
  87. ActiveStage & pushStage(RegexState & state);
  88. ActiveStage & pushStageBeginMatch(RegexState & state, RegexMatchStateSave * matched);
  89. RegexMatchAction pushStageEndMatch(RegexState & state);
  90. void cleanupBeginMatch(ActiveStage & stage, RegexState & state);
  91. void cleanupEndMatch(ActiveStage & stage, RegexState & state);
  92. protected:
  93. RegexPatternArray next;
  94. bool gathered;
  95. };
  96. typedef Owned<RegexPattern> OwnedRegexPattern;
  97. class MatchState;
  98. class THORHELPER_API RegexNamed : public CInterface
  99. {
  100. public:
  101. RegexNamed() { name = NULL; id = 0; }
  102. RegexNamed(IAtom * _name, regexid_t _id) { name = _name; id = _id; }
  103. inline IAtom * queryName() { return name; }
  104. inline regexid_t queryID() { return id; }
  105. RegexMatchAction match(RegexState & state, RegexPattern * instance);
  106. RegexMatchAction match(RegexState & state, RegexPattern * instance, MatchState & match);
  107. RegexMatchAction beginMatch(RegexState & state);
  108. //serialization...
  109. void dispose(); // needed to free the structure because it is a cyclic graph
  110. void gather(RegexSerializeState & state);
  111. void serializePattern(MemoryBuffer & out);
  112. void serializeLinks(MemoryBuffer & out, RegexSerializeState & state);
  113. void deserializePattern(MemoryBuffer & in);
  114. void deserializeLinks(MemoryBuffer & in, RegexSerializeState & state);
  115. void toXML(StringBuffer & out, RegexXmlState & state);
  116. //construction
  117. void setFirst(RegexPattern * value) { first.set(value); }
  118. protected:
  119. OwnedRegexPattern first;
  120. IAtom * name;
  121. regexid_t id;
  122. };
  123. extern THORHELPER_API void serializeRegex(MemoryBuffer & out, RegexPattern * root);
  124. extern THORHELPER_API RegexPattern * deserializeRegex(MemoryBuffer & in);
  125. extern THORHELPER_API void regexToXml(StringBuffer & out, RegexPattern * root, unsigned detail);
  126. extern THORHELPER_API bool isAsciiMatch(unsigned code, unsigned next);
  127. extern THORHELPER_API bool isUnicodeMatch(unsigned code, unsigned next);
  128. #endif /* __THORRPARSE_HPP_ */