RONCC
/
Big-Data-HPC-Platform


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652
							/*##############################################################################

    Copyright (C) 2011 HPCC Systems.

    All rights reserved. This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
############################################################################## */

#include "jliball.hpp"
#include "thorparse.ipp"
#include "thorregex.hpp"
#include "eclrtl.hpp"
#include "eclhelper.hpp"

// Atom for the "<separator>" tag.  The parse-tree formatters later in this
// file compare node names against it and skip any node that carries it.
_ATOM separatorTagAtom;
MODULE_INIT(INIT_PRIORITY_STANDARD)
{
    // Create the atom once so later comparisons are plain pointer compares.
    separatorTagAtom = createAtom("<separator>");
    return true;
}

//---------------------------------------------------------------------------
// Reads `len` booleans from `in` into `values`.  The booleans are stored
// eight to a byte; bit j of each byte holds the j'th value of its group.
void deserializeBoolArray(unsigned len, bool * values, MemoryBuffer & in)
{
    unsigned base = 0;
    while (base < len)
    {
        unsigned char packed;
        in.read(packed);

        // The last byte may hold fewer than eight meaningful bits.
        unsigned bitsInByte = (base+8 <= len) ? 8 : (len - base);
        for (unsigned bit = 0; bit < bitsInByte; bit++)
            values[base+bit] = ((packed >> bit) & 1) != 0;

        base += 8;
    }
}


// Writes `len` booleans from `values` to `out`, eight to a byte.  Bit j of
// each byte is set when the j'th value of that group is true; unused bits in
// the final byte are left clear.
void serializeBoolArray(MemoryBuffer & out, unsigned len, const bool * values)
{
    unsigned base = 0;
    while (base < len)
    {
        unsigned bitsInByte = (base+8 <= len) ? 8 : (len - base);
        unsigned char packed = 0;
        for (unsigned bit = 0; bit < bitsInByte; bit++)
        {
            if (values[base+bit])
                packed |= (1 << bit);
        }
        out.append(packed);

        base += 8;
    }
}

//---------------------------------------------------------------------------
    
// Sets up the matching state over the `len` bytes at `text`.  The built-in
// `top` node becomes the current match, and its firstChild link is where the
// first pushed match will be attached.
NlpState::NlpState(INlpMatchedAction * _action, NlpInputFormat _inputFormat, size32_t len, const void * text)
{
    matchAction = _action;
    inputFormat = _inputFormat;
    if (inputFormat == NlpUnicode)
        charSize = sizeof(UChar);
    else
        charSize = sizeof(char);

    // Input window: [start, end), with the cursor at the beginning.
    start = (const byte *)text;
    end = start + len;
    cur = start;

    // Root of the match tree.
    top.parent = NULL;
    top.start = start;
    curMatch = &top;
    next = &top.firstChild;
}

// Attaches `match` at the current insertion point, starting at the cursor,
// and makes it the current match.  The previous position is stored in `save`
// so that popMatch() can undo the operation.
void NlpState::pushMatch(MatchState & match, MatchSaveState & save)
{
    // Remember where we were.
    save.savedMatch = curMatch;
    save.savedNext = next;

    // Hook the new node into the tree ...
    match.start = cur;
    match.parent = curMatch;
    *next = &match;

    // ... and descend into it.
    curMatch = &match;
    next = &match.firstChild;
}

// Reverses a pushMatch(): restores the saved position and clears the link
// that pointed at the pushed match.
void NlpState::popMatch(const MatchSaveState & save)
{
    curMatch = save.savedMatch;
    next = save.savedNext;
    *next = NULL;
}

// Closes the current match at the cursor and steps back up to its parent.
// The next match will be linked in as a sibling of the one just finished.
// The previous position is stored in `save` for unmarkFinish().
void NlpState::markFinish(MatchSaveState & save)
{
    MatchState * finished = curMatch;

    save.savedNext = next;
    save.savedMatch = finished;

    finished->end = cur;
    next = &finished->next;
    curMatch = finished->parent;
}

// Restores the position recorded by markFinish().
void NlpState::unmarkFinish(const MatchSaveState & save)
{
    curMatch = save.savedMatch;
    next = save.savedNext;
}

//---------------------------------------------------------------------------
    
// Builds a path from parallel arrays of ids and indices.  Both arrays must
// have the same number of entries.
NlpMatchPath::NlpMatchPath(const UnsignedArray & _ids, const UnsignedArray & _indices)
{
    assert(_ids.ordinality() == _indices.ordinality());

    const unsigned count = _ids.ordinality();
    for (unsigned pos = 0; pos < count; pos++)
    {
        ids.append(_ids.item(pos));
        indices.append(_indices.item(pos));
    }
}


// Reads a path written by serialize(): an entry count followed by one
// (id, index) pair per entry.
NlpMatchPath::NlpMatchPath(MemoryBuffer & in)
{
    unsigned remaining;
    in.read(remaining);
    while (remaining--)
    {
        unsigned nextId;
        unsigned nextIndex;
        in.read(nextId);
        in.read(nextIndex);
        ids.append(nextId);
        indices.append(nextIndex);
    }
}

// Nothing to do explicitly; the member arrays clean up after themselves.
NlpMatchPath::~NlpMatchPath() {}

// Writes the path as an entry count followed by one (id, index) pair per
// entry - the layout read back by the MemoryBuffer constructor.
void NlpMatchPath::serialize(MemoryBuffer & out) const
{
    unsigned count = ids.ordinality();
    out.append(count);

    unsigned pos = 0;
    while (pos < count)
    {
        out.append(ids.item(pos));
        out.append(indices.item(pos));
        pos++;
    }
}

//---------------------------------------------------------------------------

// The input format is NlpAscii until deserialize() replaces it.
CMatchedResultInfo::CMatchedResultInfo()
{
    this->inputFormat = NlpAscii;
}

// Reads the input format, then a count of match paths, then each path.
// Each path is created with createMatchPath() and appended to matchResults.
void CMatchedResultInfo::deserialize(MemoryBuffer & in)
{
    in.read(inputFormat);

    unsigned count;
    in.read(count);
    for (unsigned remaining = count; remaining != 0; remaining--)
    {
        NlpMatchPath * path = createMatchPath(in);
        matchResults.append(*path);
    }
}

// Writes the input format, the number of match paths, then each path in
// order - the layout read back by deserialize().
void CMatchedResultInfo::serialize(MemoryBuffer & out) const
{
    out.append(inputFormat);

    unsigned count = matchResults.ordinality();
    out.append(count);

    unsigned pos = 0;
    while (pos < count)
    {
        matchResults.item(pos).serialize(out);
        pos++;
    }
}

//---------------------------------------------------------------------------

// Allocates one result slot per match path in `_def`; every slot starts
// out NULL.
CMatchedResults::CMatchedResults(CMatchedResultInfo * _def)
{
    def = _def;
    in = NULL;

    const unsigned numSlots = def->matchResults.ordinality();
    matched = new IMatchedElement *[numSlots];
    for (IMatchedElement * * slot = matched; slot != matched + numSlots; slot++)
        *slot = NULL;
}

// All cleanup is delegated to kill().
CMatchedResults::~CMatchedResults()
{
    this->kill();
}

// True unless slot `idx` holds the shared notMatched placeholder.
bool CMatchedResults::getMatched(unsigned idx)
{
    const bool isPlaceholder = (matched[idx] == &notMatched);
    return !isPlaceholder;
}

// Returns the length of match `idx` in units that depend on the input format:
//   NlpAscii   - the size in bytes
//   NlpUtf8    - the value rtlUtf8Length() reports for the matched bytes
//   NlpUnicode - the size in bytes divided by sizeof(UChar)
// Any other format yields 0.  Previously that path returned an uninitialised
// local, which is undefined behaviour.
size32_t CMatchedResults::getMatchLength(unsigned idx)
{
    const IMatchedElement * cur = matched[idx];
    const byte * start = cur->queryStartPtr();
    size32_t size = (size32_t)(cur->queryEndPtr() - start);
    size32_t len = 0;   // defined result for an unrecognised input format

    switch (def->inputFormat)
    {
    case NlpAscii:
        len = size;
        break;
    case NlpUtf8:
        len = rtlUtf8Length(size, start);
        break;
    case NlpUnicode:
        len = size / sizeof(UChar);
        break;
    }
    return len;
}


// Returns the 1-based position of match `idx` relative to the start of the
// input (`in`), or 0 when the slot holds the notMatched placeholder.  The
// byte offset is converted for NlpUtf8 (via rtlUtf8Length) and NlpUnicode
// (divided by sizeof(UChar)); other formats use the byte offset directly.
size32_t CMatchedResults::getMatchPosition(unsigned idx)
{
    IMatchedElement * element = matched[idx];
    if (element == &notMatched)
        return 0;

    size32_t offset = (size32_t)(element->queryStartPtr() - in);
    if (def->inputFormat == NlpUtf8)
        offset = rtlUtf8Length(offset, in);
    else if (def->inputFormat == NlpUnicode)
        offset = offset / sizeof(UChar);

    return offset+1;
}

// Returns the text of match `idx` in `out`/`outlen` as a string, using the
// rtl conversion routine that corresponds to the input format.  `out` and
// `outlen` are left untouched for any other format.
void CMatchedResults::getMatchText(size32_t & outlen, char * & out, unsigned idx)
{
    const IMatchedElement * element = matched[idx];
    const byte * textStart = element->queryStartPtr();
    size32_t numBytes = (size32_t)(element->queryEndPtr() - textStart);

    if (def->inputFormat == NlpAscii)
    {
        rtlStrToStrX(outlen, out, numBytes, textStart);
    }
    else if (def->inputFormat == NlpUtf8)
    {
        //could use codepage2codepage if worried about efficiency...
        unsigned utf8Len = rtlUtf8Length(numBytes, textStart);
        rtlUtf8ToStrX(outlen, out, utf8Len, (const char *)textStart);
    }
    else if (def->inputFormat == NlpUnicode)
    {
        rtlUnicodeToStrX(outlen, out, numBytes/sizeof(UChar), (const UChar *)textStart);
    }
}

// Returns the text of match `idx` in `out`/`outlen` as unicode, using the
// rtl conversion routine that corresponds to the input format.  `out` and
// `outlen` are left untouched for any other format.
void CMatchedResults::getMatchUnicode(size32_t & outlen, UChar * & out, unsigned idx)
{
    const IMatchedElement * element = matched[idx];
    const byte * textStart = element->queryStartPtr();
    size32_t numBytes = (size32_t)(element->queryEndPtr() - textStart);

    if (def->inputFormat == NlpAscii)
    {
        rtlStrToUnicodeX(outlen, out, numBytes, (const char *)textStart);
    }
    else if (def->inputFormat == NlpUtf8)
    {
        //could use codepage2codepage if worried about efficiency...
        unsigned utf8Len = rtlUtf8Length(numBytes, textStart);
        rtlUtf8ToUnicodeX(outlen, out, utf8Len, (const char *)textStart);
    }
    else if (def->inputFormat == NlpUnicode)
    {
        rtlUnicodeToUnicodeX(outlen, out, numBytes/sizeof(UChar), (const UChar*)textStart);
    }
}

// Returns the text of match `idx` in `out`/`outlen` as utf8, using the rtl
// conversion routine that corresponds to the input format.  `out` and
// `outlen` are left untouched for any other format.
void CMatchedResults::getMatchUtf8(size32_t & outlen, char * & out, unsigned idx)
{
    const IMatchedElement * element = matched[idx];
    const byte * textStart = element->queryStartPtr();
    size32_t numBytes = (size32_t)(element->queryEndPtr() - textStart);

    if (def->inputFormat == NlpAscii)
    {
        rtlStrToUtf8X(outlen, out, numBytes, (const char *)textStart);
    }
    else if (def->inputFormat == NlpUtf8)
    {
        //could use codepage2codepage if worried about efficiency...
        unsigned utf8Len = rtlUtf8Length(numBytes, textStart);
        rtlUtf8ToUtf8X(outlen, out, utf8Len, (const char *)textStart);
    }
    else if (def->inputFormat == NlpUnicode)
    {
        rtlUnicodeToUtf8X(outlen, out, numBytes/sizeof(UChar), (const UChar*)textStart);
    }
}

// Returns the row reported by the element in slot `idx`, cast to byte *.
byte * CMatchedResults::queryMatchRow(unsigned idx)
{
    const IMatchedElement * element = matched[idx];
    byte * row = (byte *)element->queryRow();
    return row;
}

//MORE: Allow access to attributes at any location on the tree.
// Returns the stored root result, cast to byte *.
byte * CMatchedResults::queryRootResult()
{
    byte * root = (byte *)rootResult;
    return root;
}

// Calls ::Release on every slot, frees the slot array and clears the
// pointer.  Does nothing if the array has already been freed.
void CMatchedResults::kill()
{
    if (!matched)
        return;

    const unsigned numSlots = def->matchResults.ordinality();
    unsigned pos = 0;
    while (pos < numSlots)
    {
        ::Release(matched[pos]);
        pos++;
    }

    delete [] matched;
    matched = NULL;
}

//---------------------------------------------------------------------------


// Name of the match this walker is positioned on.
_ATOM NlpMatchWalker::queryName()
{
    _ATOM matchName = curMatch->queryName();
    return matchName;
}

// Distance between the start and end pointers of the current match.
size32_t NlpMatchWalker::queryMatchSize()
{
    size32_t matchSize = (size32_t)(curMatch->end - curMatch->start);
    return matchSize;
}

// Pointer to the first byte of the current match.
const void * NlpMatchWalker::queryMatchStart()
{
    const void * matchStart = curMatch->start;
    return matchStart;
}

unsigned NlpMatchWalker::numChildren()
{
    unsigned count = 0;
    MatchState * cur = curMatch->firstChild;
    while (cur)
    {
        count++;
        cur = cur->next;
    }
    return count;
}

// Returns a new walker for the child reached by skipping `numToSkip`
// siblings from the first child, or NULL if there are not that many.
IMatchWalker * NlpMatchWalker::getChild(unsigned numToSkip)
{
    MatchState * child = curMatch->firstChild;
    for (unsigned skipped = 0; child && (skipped < numToSkip); skipped++)
        child = child->next;

    if (!child)
        return NULL;
    return new NlpMatchWalker(child);
}

//------------------------------------------------------

// True if `walker` has at least one child that is not a separator node.
static bool hasChildren(IMatchWalker * walker)
{
    unsigned pos = 0;
    for (;;)
    {
        Owned<IMatchWalker> child = walker->getChild(pos);
        if (!child)
            return false;       // ran out of children
        if (child->queryName() != separatorTagAtom)
            return true;
        pos++;
    }
}

// Appends the raw bytes of the walker's current match to `s`; returns `s`.
static StringBuffer & getElementText(StringBuffer & s, IMatchWalker * walker)
{
    unsigned numBytes = walker->queryMatchSize();
    const char * matchText = (const char *)walker->queryMatchStart();
    s.append(numBytes, matchText);
    return s;
}


// Appends the walker's matched text to `s`, wrapped in double quotes.
static void expandElementText(StringBuffer & s, IMatchWalker * walker)
{
    s.append('"');
    getElementText(s, walker);
    s.append('"');
}

// Appends the default text form of the subtree at `cur` to `s`:
//   - separator nodes produce nothing
//   - a named node contributes its lower-cased name
//   - a node with non-separator children is followed by "[child child ...]"
//   - any other node is followed by its matched text in double quotes
static void getDefaultParseTree(StringBuffer & s, IMatchWalker * cur)
{
    _ATOM name = cur->queryName();
    if (name == separatorTagAtom)
        return;

    if (name)
    {
        StringBuffer lowerName;
        lowerName.append(name).toLowerCase();
        s.append(lowerName);
    }

    if (!hasChildren(cur))
    {
        expandElementText(s, cur);
        return;
    }

    s.append("[");
    unsigned childNum = 0;
    for (;;)
    {
        Owned<IMatchWalker> child = cur->getChild(childNum);
        if (!child)
            break;

        getDefaultParseTree(s, child);
        s.append(" ");
        childNum++;
    }
    // Drop the space added after the final child.
    s.setLength(s.length()-1);
    s.append("]");
}


// Builds the default text form of the parse tree at `walker`.  The detached
// buffer is returned in `text` and its length in `len`.
void getDefaultParseTree(IMatchWalker * walker, unsigned & len, char * & text)
{
    StringBuffer tree;
    getDefaultParseTree(tree, walker);

    // The length must be captured before the buffer is detached.
    len = tree.length();
    text = tree.detach();
}


// Appends an XML-style form of the subtree at `walker` to `s`.
//   - separator nodes produce nothing
//   - unnamed nodes are transparent: their children (or, with no
//     non-separator children, their matched text) are emitted at the same
//     indent
//   - named nodes become <name>...</name> using the lower-cased name, with
//     children on following lines one indent level deeper
static void getXmlParseTree(StringBuffer & s, IMatchWalker * walker, unsigned indent)
{
    _ATOM name = walker->queryName();
    if (name == separatorTagAtom)
        return;

    unsigned numKids = walker->numChildren();
    bool nested = hasChildren(walker);

    if (!name)
    {
        if (!nested)
        {
            getElementText(s, walker);
            return;
        }
        for (unsigned kid = 0; kid < numKids; kid++)
        {
            Owned<IMatchWalker> child = walker->getChild(kid);
            getXmlParseTree(s, child, indent);
        }
        return;
    }

    StringBuffer tagName;
    tagName.append(name).toLowerCase();

    s.pad(indent).append('<').append(tagName).append('>');
    if (nested)
    {
        s.newline();
        for (unsigned kid = 0; kid < numKids; kid++)
        {
            Owned<IMatchWalker> child = walker->getChild(kid);
            getXmlParseTree(s, child, indent+1);
        }
        // Line the closing tag up with the opening one.
        s.pad(indent);
    }
    else
    {
        getElementText(s, walker);
    }
    s.append("</").append(tagName).append('>').newline();
}


// Builds the XML-style form of the parse tree at `walker`, starting at indent
// 0.  The detached buffer is returned in `text` and its length in `len`.
void getXmlParseTree(IMatchWalker * walker, unsigned & len, char * & text)
{
    StringBuffer xml;
    getXmlParseTree(xml, walker, 0);

    // The length must be captured before the buffer is detached.
    len = xml.length();
    text = xml.detach();
}

//------------------------------------------------------

// Runs `dfa` over the `len` bytes at `start`, beginning in state 0, and
// returns the offset of the last position at which the active state accepted.
// Returns NotFound if no accepting state was seen.  The scan stops at the
// end of the input, on a byte outside the active state's [min,max] range, or
// when the transition table yields NotFound.
unsigned getMaximumMatchLength(AsciiDfa & dfa, unsigned len, const byte * start)
{
    const byte * pos = start;
    const byte * limit = start+len;
    unsigned state = 0;
    const AsciiDfaState * states = dfa.queryStates();
    unsigned * transitions = dfa.queryTransitions();
    const byte * lastAccept = NULL;

    for (;;)
    {
        const AsciiDfaState & active = states[state];
        if (active.accepts())
            lastAccept = pos;
        if (pos == limit)
            break;

        byte ch = *pos++;
        if ((ch < active.min) || (ch > active.max))
            break;

        state = transitions[active.delta + ch];
        if (state == NotFound)
            break;
    }

    if (!lastAccept)
        return NotFound;
    return (size32_t)(lastAccept-start);
}

//------------------------------------------------------

// Stores `_matched` (passed to ::Release by the destructor) and applies the
// defaults: ascii input, no limits, every option flag off.
// NOTE(review): matchAction, scanAction and maxLength are not assigned here;
// within this file they are only set by setOptions(), setLimit() and
// deserialize() - confirm callers always use one of those before serialize().
NlpAlgorithm::NlpAlgorithm(CMatchedResultInfo * _matched)
{
    matchInfo = _matched;

    // Input description.
    inputFormat = NlpAscii;
    charWidth = sizeof(char);

    // Limits - UINT_MAX means unlimited.
    keepLimit = UINT_MAX;
    atMostLimit = UINT_MAX;

    // Option flags.
    addedSeparators = false;
    notMatched = false;
    notMatchedOnly = false;
    chooseMin = false;
    chooseMax = false;
    chooseBest = false;
    singleChoicePerLine = false;
}

// Passes the match info supplied at construction to ::Release.
NlpAlgorithm::~NlpAlgorithm()
{
    ::Release(this->matchInfo);
}

// Records the match/scan actions and input format.  A limit of 0 is stored
// as UINT_MAX, and the character width is derived from the input format.
void NlpAlgorithm::setOptions(MatchAction _matchAction, ScanAction _scanAction, NlpInputFormat _inputFormat, unsigned _keepLimit, unsigned _atMostLimit)
{
    matchAction = _matchAction;
    scanAction = _scanAction;
    inputFormat = _inputFormat;

    if (_keepLimit)
        keepLimit = _keepLimit;
    else
        keepLimit = UINT_MAX;

    if (_atMostLimit)
        atMostLimit = _atMostLimit;
    else
        atMostLimit = UINT_MAX;

    if (inputFormat == NlpUnicode)
        charWidth = sizeof(UChar);
    else
        charWidth = sizeof(char);
}

void NlpAlgorithm::setChoose(bool _chooseMin, bool _chooseMax, bool _chooseBest, bool _singleChoicePerLine)
{
    chooseMin = _chooseMin;
    chooseMax = _chooseMax;
    chooseBest = _chooseBest;
    singleChoicePerLine = _singleChoicePerLine;
}

void NlpAlgorithm::setJoin(bool _notMatched, bool _notMatchedOnly)
{
    notMatched = _notMatched;
    notMatchedOnly = _notMatchedOnly;
}

void NlpAlgorithm::setLimit(unsigned _maxLength)
{
    maxLength = _maxLength;
}

// Writes the algorithm's options to `out`, followed by the match info.
// The field order and widths here define the wire format: deserialize()
// reads the fields back in exactly this order, so any change must be made
// in both functions.
void NlpAlgorithm::serialize(MemoryBuffer & out)
{
    // The two actions are written as unsigned, the input format as one byte.
    out.append((unsigned)matchAction);
    out.append((unsigned)scanAction);
    out.append((byte)inputFormat);
    out.append(keepLimit);
    out.append(atMostLimit);
    out.append(charWidth);
    out.append(addedSeparators);
    out.append(notMatched);
    out.append(notMatchedOnly);
    out.append(chooseMin);
    out.append(chooseMax);
    out.append(chooseBest);
    out.append(singleChoicePerLine);
    out.append(maxLength);
    // The match info is written last.
    matchInfo->serialize(out);
}

// Reads the options written by serialize() from `in`, then the match info.
// The fields must be read in exactly the order serialize() writes them.
void NlpAlgorithm::deserialize(MemoryBuffer & in)
{
    // The enum-typed fields are read through temporaries of the serialized
    // width and then cast back.
    unsigned temp;
    byte tempByte;
    in.read(temp); matchAction = (MatchAction)temp;
    in.read(temp); scanAction = (ScanAction)temp;
    in.read(tempByte); inputFormat = (NlpInputFormat)tempByte;
    in.read(keepLimit);
    in.read(atMostLimit);
    in.read(charWidth);
    in.read(addedSeparators);
    in.read(notMatched);
    in.read(notMatchedOnly);
    in.read(chooseMin);
    in.read(chooseMax);
    in.read(chooseBest);
    in.read(singleChoicePerLine);
    in.read(maxLength);
    // The match info is read last.
    matchInfo->deserialize(in);
}

// Creates a parser from its serialized form.  The first byte of `buffer`
// selects the implementation; the rest is passed to that implementation's
// factory.  Unknown kinds hit UNIMPLEMENTED.
INlpParseAlgorithm * createThorParser(MemoryBuffer & buffer, IOutputMetaData * outRecordSize)
{
    byte kind;
    buffer.read(kind);

    // Both regex variants share one factory, which is told which was requested.
    if ((kind == NLPAregexStack) || (kind == NLPAregexHeap))
        return createRegexParser(buffer, outRecordSize, kind);

    if (kind == NLPAtomita)
        return createTomitaParser(buffer, outRecordSize);

    UNIMPLEMENTED;
}

// Creates a parser from the compiled, compressed definition held by `helper`.
// The data returned by helper.queryCompiled() is decompressed, passed to the
// MemoryBuffer overload of createThorParser(), and the resulting algorithm is
// initialised with `helper` before being returned.
//
// The algorithm is held in an Owned<> until init() has completed, so it is
// released rather than leaked if init() throws.
INlpParseAlgorithm * createThorParser(IResourceContext *ctx, IHThorParseArg & helper)
{
    unsigned len;
    const void * data;
    helper.queryCompiled(ctx, len, data);

    // Wrap the compiled data without taking ownership, then expand it.
    MemoryBuffer compressed, buffer;
    compressed.setBuffer(len, (void *)data, false);
    decompressToBuffer(buffer, compressed);

    Owned<INlpParseAlgorithm> algorithm = createThorParser(buffer, helper.queryOutputMeta());
    algorithm->init(helper);
    return algorithm.getClear();
}