123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170 |
- /*##############################################################################
- HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ############################################################################## */
- #include "jliball.hpp"
- #include "eclrtl.hpp"
- #include "thortparse.ipp"
- //---------------------------------------------------------------------------
- MultiLexer::MultiLexer(const AsciiDfa & _tokens, const AsciiDfa & _skip, const UnsignedArray & _endTokenChars, unsigned _eofId) : tokens(_tokens), skip(_skip)
- {
- eofId = _eofId;
- _clear(isEndToken);
- ForEachItemIn(idx, _endTokenChars)
- {
- unsigned next = _endTokenChars.item(idx);
- if (next < 256)
- isEndToken[next] = true;
- }
- }
- GrammarSymbol * MultiLexer::createToken(symbol_id id, unsigned len, const byte * start)
- {
- const FeatureInfo * feature = NULL; // features[id];
- return new Terminal(id, feature, len, start);
- }
- position_t MultiLexer::skipWhitespace(position_t pos)
- {
- const AsciiDfaState * states = skip.queryStates();
- unsigned * transitions = skip.queryTransitions();
- unsigned activeState = 0;
- const byte * cur = state.start+pos;
- const byte * end = state.end;
- const byte * best = cur;
- for (;;)
- {
- const AsciiDfaState & curState = states[activeState];
- if (curState.accepts())
- best = cur;
- if (cur == end)
- break;
- byte next = *cur++;
- if ((next < curState.min) || (next > curState.max))
- break;
- activeState = transitions[curState.delta + next];
- if (activeState == NotFound)
- break;
- }
- return (size32_t)(best - state.start);
- }
- unsigned MultiLexer::next(position_t pos, GrammarSymbolArray & symbols)
- {
- const byte * start = state.start + skipWhitespace(pos);
- const byte * end = state.end;
- if (start == end)
- {
- symbols.append(*createToken(eofId, 0, start));
- return 1;
- }
- const byte * cur = start;
- unsigned activeState = 0;
- const AsciiDfaState * states = tokens.queryStates();
- unsigned * transitions = tokens.queryTransitions();
- const byte * best = NULL;
- const AsciiDfaState * bestState = NULL;
- for (;;)
- {
- const AsciiDfaState & curState = states[activeState];
- if (curState.accepts())
- {
- best = cur;
- bestState = &curState;
- }
- if (cur == end)
- break;
- byte next = *cur++;
- if ((activeState != 0) && isEndToken[next])
- {
- if (curState.accepts())
- {
- for (unsigned i=0;;i++)
- {
- unsigned id = tokens.getAccepts(curState, i);
- if (id == NotFound)
- break;
- symbols.append(*createToken(id, (size32_t)(cur-start), start));
- }
- best = NULL;
- }
- }
- if ((next < curState.min) || (next > curState.max))
- break;
- activeState = transitions[curState.delta + next];
- if (activeState == NotFound)
- break;
- }
- if (best)
- {
- for (unsigned i=0;;i++)
- {
- unsigned id = tokens.getAccepts(*bestState, i);
- if (id == NotFound)
- break;
- symbols.append(*createToken(id, (size32_t)(best-start), start));
- }
- }
- return symbols.ordinality();
- }
- void MultiLexer::setDocument(size32_t len, const void * _start)
- {
- state.start = (const byte *)_start;
- state.end = state.start + len;
- }
- #if 0
- ToDo:
- * Do some more planning re:
- o Augmented grammars
- o Generating the lexer. Especially what we do about unknown words/multiple possible matches. [Other implictations if tokens do not necessarily lie on the same boundaries].
- o Representing penalties and probabilities.
- o Translating the regex syntax into parser input.
- o Conditional reductions - where how/do they occur? What arguments do they need?
- o Returning multiple rows from a single match?
- * Parameterised patterns - how do they related to augmented grammars[do not], and what is needed to implement them?
- * Design in detail the table generator
- o LR or LALR?
- o Pathological grammars e.g., S := | S | ... -> reread and understand doc. Can we cope?
- * Use cases:
- o MAX and BEST()
- * Misc
- Error if ": define()" is applied to a pattern
- MAX,MIN in regex implementation
- Stack problems with regex
- #endif
|