123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653 |
- /*##############################################################################
- HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ############################################################################## */
- #include "jliball.hpp"
- #include "thorparse.ipp"
- #include "thorregex.hpp"
- #include "eclrtl.hpp"
- #include "eclhelper.hpp"
- IAtom * separatorTagAtom;
- MODULE_INIT(INIT_PRIORITY_STANDARD)
- {
- separatorTagAtom = createAtom("<separator>");
- return true;
- }
- //---------------------------------------------------------------------------
- void deserializeBoolArray(unsigned len, bool * values, MemoryBuffer & in)
- {
- for (unsigned i = 0; i < len; i+= 8)
- {
- unsigned char next;
- in.read(next);
- unsigned max = i+8 <= len ? 8 : len - i;
- for (unsigned j=0; j<max; j++)
- values[i+j] = (next & (1 << j)) != 0;
- }
- }
- void serializeBoolArray(MemoryBuffer & out, unsigned len, const bool * values)
- {
- for (unsigned i = 0; i < len; i+= 8)
- {
- unsigned char next = 0;
- unsigned max = i+8 <= len ? 8 : len - i;
- for (unsigned j=0; j<max; j++)
- if (values[i+j]) next |= (1 << j);
- out.append(next);
- }
- }
- //---------------------------------------------------------------------------
-
- NlpState::NlpState(INlpMatchedAction * _action, NlpInputFormat _inputFormat, size32_t len, const void * text)
- {
- matchAction = _action;
- inputFormat = _inputFormat;
- charSize = (inputFormat == NlpUnicode) ? sizeof(UChar) : sizeof(char);
- start = (const byte *)text;
- cur = start;
- end = start + len;
- curMatch = ⊤
- next = &top.firstChild;
- top.start = start;
- top.parent = NULL;
- }
- void NlpState::pushMatch(MatchState & match, MatchSaveState & save)
- {
- save.savedNext = next;
- save.savedMatch = curMatch;
- match.parent = curMatch;
- match.start = cur;
- *next = &match;
- next = &match.firstChild;
- curMatch = &match;
- };
- void NlpState::popMatch(const MatchSaveState & save)
- {
- next = save.savedNext;
- *next = NULL;
- curMatch = save.savedMatch;
- }
- void NlpState::markFinish(MatchSaveState & save)
- {
- save.savedMatch = curMatch;
- save.savedNext = next;
- curMatch->end = cur;
- next = &curMatch->next;
- curMatch = curMatch->parent;
- }
- void NlpState::unmarkFinish(const MatchSaveState & save)
- {
- next = save.savedNext;
- curMatch = save.savedMatch;
- }
- //---------------------------------------------------------------------------
-
- NlpMatchPath::NlpMatchPath(const UnsignedArray & _ids, const UnsignedArray & _indices)
- {
- assert(_ids.ordinality() == _indices.ordinality());
- ForEachItemIn(idx, _ids)
- {
- ids.append(_ids.item(idx));
- indices.append(_indices.item(idx));
- }
- }
- NlpMatchPath::NlpMatchPath(MemoryBuffer & in)
- {
- unsigned num;
- in.read(num);
- for (unsigned idx = 0; idx < num; idx++)
- {
- unsigned index, id;
- in.read(id);
- in.read(index);
- ids.append(id);
- indices.append(index);
- }
- }
- NlpMatchPath::~NlpMatchPath()
- {
- }
- void NlpMatchPath::serialize(MemoryBuffer & out) const
- {
- unsigned num = ids.ordinality();
- out.append(num);
- for (unsigned idx = 0; idx < num; idx++)
- {
- out.append(ids.item(idx));
- out.append(indices.item(idx));
- }
- }
- //---------------------------------------------------------------------------
- CMatchedResultInfo::CMatchedResultInfo()
- {
- inputFormat = NlpAscii;
- }
- void CMatchedResultInfo::deserialize(MemoryBuffer & in)
- {
- unsigned num;
- in.read(inputFormat);
- in.read(num);
- for (unsigned idx = 0; idx < num; idx++)
- {
- NlpMatchPath & cur = *createMatchPath(in);
- matchResults.append(cur);
- }
- }
- void CMatchedResultInfo::serialize(MemoryBuffer & out) const
- {
- unsigned num = matchResults.ordinality();
- out.append(inputFormat);
- out.append(num);
- for (unsigned idx = 0; idx < num; idx++)
- matchResults.item(idx).serialize(out);
- }
- //---------------------------------------------------------------------------
- CMatchedResults::CMatchedResults(CMatchedResultInfo * _def)
- {
- in = NULL;
- def = _def;
- unsigned num = def->matchResults.ordinality();
- matched = new IMatchedElement *[num];
- for (unsigned i=0; i<num; i++)
- matched[i] = NULL;
- }
- CMatchedResults::~CMatchedResults()
- {
- kill();
- }
- bool CMatchedResults::getMatched(unsigned idx)
- {
- return matched[idx] != ¬Matched;
- }
- size32_t CMatchedResults::getMatchLength(unsigned idx)
- {
- const IMatchedElement * cur = matched[idx];
- const byte * start = cur->queryStartPtr();
- size32_t size = (size32_t)(cur->queryEndPtr() - start);
- size32_t len;
- switch (def->inputFormat)
- {
- case NlpAscii:
- len = size;
- break;
- case NlpUtf8:
- len = rtlUtf8Length(size, start);
- break;
- case NlpUnicode:
- len = size / sizeof(UChar);
- break;
- default:
- throwUnexpected();
- }
- return len;
- }
- size32_t CMatchedResults::getMatchPosition(unsigned idx)
- {
- IMatchedElement * cur = matched[idx];
- if (cur == ¬Matched)
- return 0;
- size32_t pos = (size32_t)(cur->queryStartPtr() - in);
- switch (def->inputFormat)
- {
- case NlpUtf8:
- pos = rtlUtf8Length(pos, in);
- break;
- case NlpUnicode:
- pos = pos / sizeof(UChar);
- break;
- }
- return pos+1;
- }
- void CMatchedResults::getMatchText(size32_t & outlen, char * & out, unsigned idx)
- {
- const IMatchedElement * cur = matched[idx];
- const byte * start = cur->queryStartPtr();
- size32_t size = (size32_t)(cur->queryEndPtr() - start);
- switch (def->inputFormat)
- {
- case NlpAscii:
- rtlStrToStrX(outlen, out, size, start);
- break;
- case NlpUtf8:
- {
- //could use codepage2codepage if worried about efficiency...
- unsigned len = rtlUtf8Length(size, start);
- rtlUtf8ToStrX(outlen, out, len, (const char *)start);
- break;
- }
- case NlpUnicode:
- rtlUnicodeToStrX(outlen, out, size/sizeof(UChar), (const UChar *)start);
- break;
- }
- }
- void CMatchedResults::getMatchUnicode(size32_t & outlen, UChar * & out, unsigned idx)
- {
- const IMatchedElement * cur = matched[idx];
- const byte * start = cur->queryStartPtr();
- size32_t size = (size32_t)(cur->queryEndPtr() - start);
- switch (def->inputFormat)
- {
- case NlpAscii:
- rtlStrToUnicodeX(outlen, out, size, (const char *)start);
- break;
- case NlpUtf8:
- {
- //could use codepage2codepage if worried about efficiency...
- unsigned len = rtlUtf8Length(size, start);
- rtlUtf8ToUnicodeX(outlen, out, len, (const char *)start);
- break;
- }
- break;
- case NlpUnicode:
- rtlUnicodeToUnicodeX(outlen, out, size/sizeof(UChar), (const UChar*)start);
- break;
- }
- }
- void CMatchedResults::getMatchUtf8(size32_t & outlen, char * & out, unsigned idx)
- {
- const IMatchedElement * cur = matched[idx];
- const byte * start = cur->queryStartPtr();
- size32_t size = (size32_t)(cur->queryEndPtr() - start);
- switch (def->inputFormat)
- {
- case NlpAscii:
- rtlStrToUtf8X(outlen, out, size, (const char *)start);
- break;
- case NlpUtf8:
- {
- //could use codepage2codepage if worried about efficiency...
- unsigned len = rtlUtf8Length(size, start);
- rtlUtf8ToUtf8X(outlen, out, len, (const char *)start);
- break;
- }
- case NlpUnicode:
- rtlUnicodeToUtf8X(outlen, out, size/sizeof(UChar), (const UChar*)start);
- break;
- }
- }
- byte * CMatchedResults::queryMatchRow(unsigned idx)
- {
- const IMatchedElement * cur = matched[idx];
- return (byte *)cur->queryRow();
- }
- //MORE: Allow access to attributes at any location on the tree.
- byte * CMatchedResults::queryRootResult()
- {
- return (byte *)rootResult;
- }
- void CMatchedResults::kill()
- {
- if (matched)
- {
- unsigned num = def->matchResults.ordinality();
- for (unsigned i=0; i < num; i++)
- ::Release(matched[i]);
- delete [] matched;
- matched = NULL;
- }
- }
- //---------------------------------------------------------------------------
- IAtom * NlpMatchWalker::queryName()
- {
- return curMatch->queryName();
- }
- size32_t NlpMatchWalker::queryMatchSize()
- {
- return (size32_t)(curMatch->end - curMatch->start);
- }
- const void * NlpMatchWalker::queryMatchStart()
- {
- return curMatch->start;
- }
- unsigned NlpMatchWalker::numChildren()
- {
- unsigned count = 0;
- MatchState * cur = curMatch->firstChild;
- while (cur)
- {
- count++;
- cur = cur->next;
- }
- return count;
- }
- IMatchWalker * NlpMatchWalker::getChild(unsigned numToSkip)
- {
- MatchState * cur = curMatch->firstChild;
- while (cur && numToSkip)
- {
- numToSkip--;
- cur = cur->next;
- }
- if (cur)
- return new NlpMatchWalker(cur);
- return NULL;
- }
- //------------------------------------------------------
- static bool hasChildren(IMatchWalker * walker)
- {
- for (unsigned i=0;;i++)
- {
- Owned<IMatchWalker> child = walker->getChild(i);
- if (!child)
- return false;
- if (child->queryName() != separatorTagAtom)
- return true;
- }
- }
- static StringBuffer & getElementText(StringBuffer & s, IMatchWalker * walker)
- {
- unsigned len = walker->queryMatchSize();
- const char * text = (const char *)walker->queryMatchStart();
- return s.append(len, text);
- }
- static void expandElementText(StringBuffer & s, IMatchWalker * walker)
- {
- getElementText(s.append('"'), walker).append('"');
- }
- static void getDefaultParseTree(StringBuffer & s, IMatchWalker * cur)
- {
- IAtom * name = cur->queryName();
- if (name != separatorTagAtom)
- {
- if (name)
- {
- StringBuffer lowerName;
- lowerName.append(name).toLowerCase();
- s.append(lowerName);
- }
- if (hasChildren(cur))
- {
- s.append("[");
- for (unsigned i=0;;i++)
- {
- Owned<IMatchWalker> child = cur->getChild(i);
- if (!child)
- break;
- getDefaultParseTree(s, child);
- s.append(" ");
- }
- s.setLength(s.length()-1);
- s.append("]");
- }
- else
- expandElementText(s, cur);
- }
- }
- void getDefaultParseTree(IMatchWalker * walker, unsigned & len, char * & text)
- {
- StringBuffer s;
- getDefaultParseTree(s, walker);
- len = s.length();
- text = s.detach();
- }
- static void getXmlParseTree(StringBuffer & s, IMatchWalker * walker, unsigned indent)
- {
- IAtom * name = walker->queryName();
- if (name != separatorTagAtom)
- {
- unsigned max = walker->numChildren();
- if (!name)
- {
- if (hasChildren(walker))
- {
- for (unsigned i=0; i<max; i++)
- {
- Owned<IMatchWalker> child = walker->getChild(i);
- getXmlParseTree(s, child, indent);
- }
- }
- else
- getElementText(s, walker);
- }
- else
- {
- StringBuffer lowerName;
- lowerName.append(name).toLowerCase();
- s.pad(indent).append('<').append(lowerName).append('>');
- if (hasChildren(walker))
- {
- s.newline();
- for (unsigned i=0; i<max; i++)
- {
- Owned<IMatchWalker> child = walker->getChild(i);
- getXmlParseTree(s, child, indent+1);
- }
- s.pad(indent);
- }
- else
- getElementText(s, walker);
- s.append("</").append(lowerName).append('>').newline();
- }
- }
- }
- void getXmlParseTree(IMatchWalker * walker, unsigned & len, char * & text)
- {
- StringBuffer s;
- getXmlParseTree(s, walker, 0);
- len = s.length();
- text = s.detach();
- }
- //------------------------------------------------------
- unsigned getMaximumMatchLength(AsciiDfa & dfa, unsigned len, const byte * start)
- {
- const byte * cur = start;
- const byte * end = start+len;
- unsigned activeState = 0;
- const AsciiDfaState * states = dfa.queryStates();
- unsigned * transitions = dfa.queryTransitions();
- const byte * best = NULL;
- for (;;)
- {
- if (states[activeState].accepts())
- best = cur;
- if (cur == end)
- break;
- byte next = *cur++;
- if (next < states[activeState].min)
- break;
- if (next > states[activeState].max)
- break;
- activeState = transitions[states[activeState].delta + next];
- if (activeState == NotFound)
- break;
- }
- if (best)
- return (size32_t)(best-start);
- return NotFound;
- }
- //------------------------------------------------------
- NlpAlgorithm::NlpAlgorithm(CMatchedResultInfo * _matched)
- {
- matchInfo = _matched;
- addedSeparators = false;
- notMatched = false;
- notMatchedOnly = false;
- chooseMin = false;
- chooseMax = false;
- chooseBest = false;
- singleChoicePerLine = false;
- inputFormat = NlpAscii;
- keepLimit = UINT_MAX;
- atMostLimit = UINT_MAX;
- charWidth = sizeof(char);
- }
- NlpAlgorithm::~NlpAlgorithm()
- {
- ::Release(matchInfo);
- }
- void NlpAlgorithm::setOptions(MatchAction _matchAction, ScanAction _scanAction, NlpInputFormat _inputFormat, unsigned _keepLimit, unsigned _atMostLimit)
- {
- matchAction = _matchAction;
- scanAction = _scanAction;
- inputFormat = _inputFormat;
- keepLimit = _keepLimit ? _keepLimit : UINT_MAX;
- atMostLimit = _atMostLimit ? _atMostLimit : UINT_MAX;
- charWidth = (inputFormat == NlpUnicode) ? sizeof(UChar) : sizeof(char);
- }
- void NlpAlgorithm::setChoose(bool _chooseMin, bool _chooseMax, bool _chooseBest, bool _singleChoicePerLine)
- {
- chooseMin = _chooseMin;
- chooseMax = _chooseMax;
- chooseBest = _chooseBest;
- singleChoicePerLine = _singleChoicePerLine;
- }
- void NlpAlgorithm::setJoin(bool _notMatched, bool _notMatchedOnly)
- {
- notMatched = _notMatched;
- notMatchedOnly = _notMatchedOnly;
- }
- void NlpAlgorithm::setLimit(unsigned _maxLength)
- {
- maxLength = _maxLength;
- }
- void NlpAlgorithm::serialize(MemoryBuffer & out)
- {
- out.append((unsigned)matchAction);
- out.append((unsigned)scanAction);
- out.append((byte)inputFormat);
- out.append(keepLimit);
- out.append(atMostLimit);
- out.append(charWidth);
- out.append(addedSeparators);
- out.append(notMatched);
- out.append(notMatchedOnly);
- out.append(chooseMin);
- out.append(chooseMax);
- out.append(chooseBest);
- out.append(singleChoicePerLine);
- out.append(maxLength);
- matchInfo->serialize(out);
- }
- void NlpAlgorithm::deserialize(MemoryBuffer & in)
- {
- unsigned temp;
- byte tempByte;
- in.read(temp); matchAction = (MatchAction)temp;
- in.read(temp); scanAction = (ScanAction)temp;
- in.read(tempByte); inputFormat = (NlpInputFormat)tempByte;
- in.read(keepLimit);
- in.read(atMostLimit);
- in.read(charWidth);
- in.read(addedSeparators);
- in.read(notMatched);
- in.read(notMatchedOnly);
- in.read(chooseMin);
- in.read(chooseMax);
- in.read(chooseBest);
- in.read(singleChoicePerLine);
- in.read(maxLength);
- matchInfo->deserialize(in);
- }
- INlpParseAlgorithm * createThorParser(MemoryBuffer & buffer, IOutputMetaData * outRecordSize)
- {
- byte kind;
- buffer.read(kind);
- switch (kind)
- {
- case NLPAregexStack:
- case NLPAregexHeap:
- return createRegexParser(buffer, outRecordSize, kind);
- case NLPAtomita:
- return createTomitaParser(buffer, outRecordSize);
- default:
- UNIMPLEMENTED;
- }
- }
- INlpParseAlgorithm * createThorParser(IResourceContext *ctx, IHThorParseArg & helper)
- {
- unsigned len;
- const void * data;
- helper.queryCompiled(ctx, len, data);
- MemoryBuffer compressed, buffer;
- compressed.setBuffer(len, (void *)data, false);
- decompressToBuffer(buffer, compressed);
- INlpParseAlgorithm * algorithm = createThorParser(buffer, helper.queryOutputMeta());
- algorithm->init(helper);
- return algorithm;
- }
|