thortlex.cpp

/*##############################################################################

    HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
############################################################################## */
#include "jliball.hpp"
#include "eclrtl.hpp"
#include "thortparse.ipp"

//---------------------------------------------------------------------------
// Record which characters mark the end of a token; these allow next() to emit
// candidate matches as soon as a separator character is reached.
MultiLexer::MultiLexer(const AsciiDfa & _tokens, const AsciiDfa & _skip, const UnsignedArray & _endTokenChars, unsigned _eofId) : tokens(_tokens), skip(_skip)
{
    eofId = _eofId;
    _clear(isEndToken);
    ForEachItemIn(idx, _endTokenChars)
    {
        unsigned next = _endTokenChars.item(idx);
        if (next < 256)
            isEndToken[next] = true;
    }
}
// Wrap a matched range of the source text as a terminal symbol for the parser.
GrammarSymbol * MultiLexer::createToken(symbol_id id, unsigned len, const byte * start)
{
    const FeatureInfo * feature = NULL; // features[id];
    return new Terminal(id, feature, len, start);
}
// Run the skip DFA from pos and return the offset just past the longest
// run of skippable (whitespace) text.
position_t MultiLexer::skipWhitespace(position_t pos)
{
    const AsciiDfaState * states = skip.queryStates();
    unsigned * transitions = skip.queryTransitions();
    unsigned activeState = 0;
    const byte * cur = state.start+pos;
    const byte * end = state.end;
    const byte * best = cur;
    for (;;)
    {
        const AsciiDfaState & curState = states[activeState];
        if (curState.accepts())
            best = cur;                 // longest skippable prefix seen so far
        if (cur == end)
            break;
        byte next = *cur++;
        if ((next < curState.min) || (next > curState.max))
            break;
        activeState = transitions[curState.delta + next];
        if (activeState == NotFound)
            break;
    }
    return (size32_t)(best - state.start);
}
// Append all candidate tokens that match at pos, and return the number of
// symbols now in the array.
unsigned MultiLexer::next(position_t pos, GrammarSymbolArray & symbols)
{
    const byte * start = state.start + skipWhitespace(pos);
    const byte * end = state.end;
    if (start == end)
    {
        // End of input - emit the end-of-file token.
        symbols.append(*createToken(eofId, 0, start));
        return 1;
    }

    const byte * cur = start;
    unsigned activeState = 0;
    const AsciiDfaState * states = tokens.queryStates();
    unsigned * transitions = tokens.queryTransitions();
    const byte * best = NULL;
    const AsciiDfaState * bestState = NULL;
    for (;;)
    {
        const AsciiDfaState & curState = states[activeState];
        if (curState.accepts())
        {
            // Remember the longest accepting match seen so far.
            best = cur;
            bestState = &curState;
        }
        if (cur == end)
            break;
        byte next = *cur++;
        if ((activeState != 0) && isEndToken[next])
        {
            // An end-of-token character was reached while inside a match -
            // emit a token for every id the current state accepts.
            if (curState.accepts())
            {
                for (unsigned i=0;;i++)
                {
                    unsigned id = tokens.getAccepts(curState, i);
                    if (id == NotFound)
                        break;
                    symbols.append(*createToken(id, (size32_t)(cur-start), start));
                }
                best = NULL;            // already emitted - don't add it again below
            }
        }
        if ((next < curState.min) || (next > curState.max))
            break;
        activeState = transitions[curState.delta + next];
        if (activeState == NotFound)
            break;
    }
    if (best)
    {
        // Emit a token for every id accepted by the longest match.
        for (unsigned i=0;;i++)
        {
            unsigned id = tokens.getAccepts(*bestState, i);
            if (id == NotFound)
                break;
            symbols.append(*createToken(id, (size32_t)(best-start), start));
        }
    }
    return symbols.ordinality();
}
// Record the extent of the text that is to be tokenised.
void MultiLexer::setDocument(size32_t len, const void * _start)
{
    state.start = (const byte *)_start;
    state.end = state.start + len;
}
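
// A minimal usage sketch (not part of the original source): the names
// tokenDfa, skipDfa, endChars, EOF_ID, inputLength and inputText below are
// illustrative stand-ins for whatever the generated parser tables provide.
// next() appends every candidate token that matches at the requested offset,
// so a caller is expected to consider each alternative in turn.
#if 0
    MultiLexer lexer(tokenDfa, skipDfa, endChars, EOF_ID);
    lexer.setDocument(inputLength, inputText);

    GrammarSymbolArray candidates;
    unsigned numMatches = lexer.next(0, candidates);    // candidate tokens at offset 0
    for (unsigned i = 0; i < numMatches; i++)
    {
        GrammarSymbol & token = candidates.item(i);
        // ... offer each alternative token to the parser driver ...
    }
#endif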

#if 0
ToDo:
* Do some more planning re:
  o Augmented grammars
  o Generating the lexer. Especially what we do about unknown words/multiple possible matches. [Other implications if tokens do not necessarily lie on the same boundaries.]
  o Representing penalties and probabilities.
  o Translating the regex syntax into parser input.
  o Conditional reductions - where/how do they occur? What arguments do they need?
  o Returning multiple rows from a single match?
* Parameterised patterns - how do they relate to augmented grammars [they do not], and what is needed to implement them?
* Design the table generator in detail
  o LR or LALR?
  o Pathological grammars e.g., S := | S | ... -> reread and understand the doc. Can we cope?
* Use cases:
  o MAX and BEST()
* Misc
  Error if ": define()" is applied to a pattern
  MAX,MIN in regex implementation
  Stack problems with regex
#endif