thorparse.hpp 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #ifndef __THORPARSE_HPP_
  14. #define __THORPARSE_HPP_
  // Linkage decoration for symbols declared in this header: expands to
  // DECL_EXPORT when THORHELPER_EXPORTS is defined, and to DECL_IMPORT
  // otherwise. (DECL_EXPORT / DECL_IMPORT are defined outside this file.)
  #if defined(THORHELPER_EXPORTS)
  #  define THORHELPER_API DECL_EXPORT
  #else
  #  define THORHELPER_API DECL_IMPORT
  #endif
  // Unsigned id type; used in this header for MatchState::id.
  typedef unsigned int regexid_t;

  // Anonymous enumeration with values 0, 1 and 2.
  // NOTE(review): the names suggest these select a parsing algorithm
  // (stack-based regex, Tomita, heap-based regex) -- confirm against users.
  enum
  {
      NLPAregexStack = 0,
      NLPAtomita     = 1,
      NLPAregexHeap  = 2
  };
  22. interface IMatchWalker : public IInterface
  23. {
  24. public:
  25. virtual IAtom * queryName() = 0;
  26. virtual unsigned queryID() = 0;
  27. virtual size32_t queryMatchSize() = 0;
  28. virtual const void * queryMatchStart() = 0;
  29. virtual unsigned numChildren() = 0;
  30. virtual IMatchWalker * getChild(unsigned idx) = 0;
  31. };
  32. interface IMatchedResults;
  33. class ARowBuilder;
  34. interface IMatchedAction
  35. {
  36. public:
  37. virtual size32_t onMatch(ARowBuilder & rowBuilder, const void * in, IMatchedResults * results, IMatchWalker * walker) = 0;
  38. };
  39. interface IMatchedElement : public IInterface
  40. {
  41. virtual const byte * queryStartPtr() const = 0;
  42. virtual const byte * queryEndPtr() const = 0;
  43. virtual const byte * queryRow() const = 0;
  44. };
  45. class RegexNamed;
  46. extern IAtom * separatorTagAtom;
  47. //MORE: Remove the vmt to make constructing more efficient... use id and name fields instead.
  48. class THORHELPER_API MatchState
  49. {
  50. public:
  51. MatchState() { next = NULL; firstChild = NULL; name = NULL; id = 0; } // other fields get filled in later.
  52. MatchState(IAtom * _name, regexid_t _id) { next = NULL; firstChild = NULL; name = _name; id = _id; } // other fields get filled in later.
  53. inline IAtom * queryName() { return name; }
  54. inline regexid_t queryID() { return id; }
  55. inline void reset(IAtom * _name, regexid_t _id) { next = NULL; firstChild = NULL; name = _name; id = _id; }
  56. public:
  57. const byte * start;
  58. const byte * end;
  59. MatchState * next;
  60. MatchState * firstChild;
  61. MatchState * parent;
  62. IAtom * name;
  63. regexid_t id;
  64. };
  65. class MatchSaveState
  66. {
  67. public:
  68. MatchState * savedMatch;
  69. MatchState * * savedNext;
  70. };
  71. interface INlpResultIterator
  72. {
  73. virtual bool first() = 0;
  74. virtual bool next() = 0;
  75. virtual bool isValid() = 0;
  76. virtual const void * getRow() = 0; // returns linked row.
  77. };
  78. interface INlpParser : public IInterface
  79. {
  80. public:
  81. // Currently has state, to remove it pass an iterator class to performMatch()
  82. virtual bool performMatch(IMatchedAction & action, const void * record, unsigned len, const void * data) = 0;
  83. virtual void reset() = 0;
  84. // only valid after performMatch has been called, and whilst the parameters passed to performMatch aren't freed.
  85. virtual INlpResultIterator * queryResultIter() = 0;
  86. };
  87. interface INlpHelper;
  88. interface IHThorParseArg;
  89. interface IResourceContext;
  90. interface ICodeContext;
  91. interface IOutputMetaData;
  // Input text format passed to INlpParseAlgorithm::setOptions().
  enum NlpInputFormat
  {
      NlpAscii   = 0,
      NlpUnicode = 1,
      NlpUtf8    = 2
  };
  93. interface INlpParseAlgorithm : public IInterface
  94. {
  95. enum MatchAction { NlpMatchFirst, NlpMatchAll };
  96. enum ScanAction { NlpScanWhole, NlpScanNone, NlpScanNext, NlpScanAll };
  97. public:
  98. //MORE: This should be implemented so that we can have interchangable algorithms,
  99. //and so they can be implemented as add on bits of the system.
  100. virtual void setOptions(MatchAction _matchAction, ScanAction _scanAction, NlpInputFormat _inputFormat, unsigned _keepLimit, unsigned _atMostLimit) = 0;
  101. virtual void setChoose(bool _chooseMin, bool _chooseMax, bool _chooseBest, bool _chooseBestScan) = 0;
  102. virtual void setJoin(bool _notMatched, bool _notMatchedOnly) = 0;
  103. virtual void setLimit(size32_t _maxLength) = 0;
  104. virtual void serialize(MemoryBuffer & out) = 0;
  105. virtual void init(IHThorParseArg & arg) = 0;
  106. virtual INlpParser * createParser(ICodeContext * ctx, unsigned activityId, INlpHelper * helper, IHThorParseArg * arg) = 0;
  107. };
  108. extern THORHELPER_API INlpParseAlgorithm * createThorParser(MemoryBuffer & buffer, IOutputMetaData * outRecordSize);
  109. extern THORHELPER_API INlpParseAlgorithm * createThorParser(IResourceContext *ctx, IHThorParseArg & helper);
  110. extern THORHELPER_API void getDefaultParseTree(IMatchWalker * walker, unsigned & len, char * & text);
  111. extern THORHELPER_API void getXmlParseTree(IMatchWalker * walker, unsigned & len, char * & text);
  112. #endif /* __THORPARSE_HPP_ */