فهرست منبع

Refactor regex parse to improve performance

- Use a persistent cache for each thread to prevent arrays and
  objects being reallocated on the heap
- Simplify the matching code to avoid copying blocks of indices
- Improve both versions of the regex code.
- Fix possible problems when matching complex paths.

Signed-off-by: Gavin Halliday <gavin.halliday@lexisnexis.com>
Gavin Halliday 13 سال پیش
والد
کامیت
55b79d1650

+ 0 - 86
common/thorhelper/thorparse.cpp

@@ -114,7 +114,6 @@ NlpMatchPath::NlpMatchPath(const UnsignedArray & _ids, const UnsignedArray & _in
         ids.append(_ids.item(idx));
         indices.append(_indices.item(idx));
     }
-    init();
 }
 
 
@@ -130,83 +129,12 @@ NlpMatchPath::NlpMatchPath(MemoryBuffer & in)
         ids.append(id);
         indices.append(index);
     }
-    init();
 }
 
 NlpMatchPath::~NlpMatchPath()
 {
-    delete [] searchIndices;
 }
 
-void NlpMatchPath::init()
-{
-    maxDepth = ids.ordinality();
-    searchIndices = new unsigned[maxDepth];
-}
-
-
-IMatchWalker * NlpMatchPath::findInChildren(IMatchWalker * top, regexid_t id)
-{
-    for (unsigned i = 0;; i++)
-    {
-        Owned<IMatchWalker> child = top->getChild(i);
-        if (!child)
-            return NULL;
-        IMatchWalker * ret = find(child, id);
-        if (ret)
-            return ret;
-    }
-    return NULL;
-}
-
-IMatchWalker * NlpMatchPath::find(IMatchWalker * top, regexid_t id)
-{
-    unsigned savedSearchDepth = maxSearchDepth;
-    if (top->queryID() == id)
-    {
-        unsigned thisLevelIndex = searchIndices[pathIndex];
-        if ((thisLevelIndex == UNKNOWN_INSTANCE) || (thisLevelIndex == 1))
-        {
-            pathIndex++;
-            if (pathIndex == ids.ordinality())
-            {
-                maxSearchDepth = pathIndex;
-                return LINK(top);
-            }
-
-            if (thisLevelIndex == 1)
-                maxSearchDepth = pathIndex;
-            return findInChildren(top, ids.item(pathIndex));
-        }
-        else
-            searchIndices[pathIndex]--;
-    }
-    else
-    {
-        Owned<IMatchWalker> ret = findInChildren(top, id);
-        //return if matched another level - may have failed to match, or matched completely
-        if (savedSearchDepth != maxSearchDepth)
-            return ret.getClear();
-    }
-    return NULL;
-}
-
-IMatchedElement * NlpMatchPath::getMatch(IMatchWalker * top)
-{
-    CriticalBlock procedure(cs);
-
-    //MORE: We could allocate searchIndices on the stack and pass as a parameter
-    memcpy(searchIndices, indices.getArray(), sizeof(*searchIndices)*maxDepth);
-    pathIndex = 0;
-    maxSearchDepth = 0;
-    Owned<IMatchWalker> state = find(top, ids.item(0));
-    if (!state)
-        return NULL;
-    return new MatchWalker2MatchedElement(state);
-}
-
-
-
 void NlpMatchPath::serialize(MemoryBuffer & out) const
 {
     unsigned num = ids.ordinality();
@@ -263,20 +191,6 @@ CMatchedResults::~CMatchedResults()
     kill();
 }
 
-//MORE: Implement one that works directly on grammar symbols
-void CMatchedResults::extractResults(IMatchWalker * top, const byte * _in, const byte * _rootResult)
-{
-    in = _in;
-    rootResult = _rootResult;
-    notMatched.ptr = in;
-    ForEachItemIn(idx, def->matchResults)
-    {
-        ::Release(matched[idx]);
-        matched[idx] = def->matchResults.item(idx).getMatch(top);
-        if (!matched[idx]) matched[idx] = LINK(&notMatched);
-    }
-}
-
 bool CMatchedResults::getMatched(unsigned idx)              
 { 
     return matched[idx] != &notMatched; 

+ 2 - 0
common/thorhelper/thorparse.hpp

@@ -70,6 +70,8 @@ public:
     inline _ATOM queryName()                              { return name; }
     inline regexid_t queryID()                            { return id; }
 
+    inline void reset(_ATOM _name, regexid_t _id) { next = NULL; firstChild = NULL; name = _name; id = _id; } // Re-initialise for reuse: clears the next/firstChild links and sets name/id; start and end are not touched here.
+
 public:
     const byte * start;
     const byte * end;

+ 22 - 12
common/thorhelper/thorparse.ipp

@@ -59,6 +59,13 @@ public:
 
 #define UNKNOWN_INSTANCE    ((unsigned)-1)
 
+class NlpMatchSearchInstance // Mutable state for a single path search; has no constructor, so callers must set both fields before searching.
+{
+public:
+    unsigned lastExactMatchDepth; // Set to depth+1 by find() when an indexed path element, or the final path element, is matched.
+    unsigned nextIndex; // Countdown for the next indexed element: find() accepts an instance when this is 1, otherwise decrements it.
+};
+
 class THORHELPER_API NlpMatchPath : public CInterface
 {
 public:
@@ -68,21 +75,25 @@ public:
 
     void serialize(MemoryBuffer & buffer) const;
 
-    IMatchedElement * getMatch(IMatchWalker * top);
+    inline unsigned numItems() const { return ids.ordinality(); } // Number of elements in the path.
+    inline unsigned getId(unsigned i) const { return ids.item(i); } // Id stored for path element i.
+    inline unsigned getIndex(unsigned i) const { return indices.item(i); } // Instance index stored for path element i (may be UNKNOWN_INSTANCE).
+    inline bool matchAny(unsigned i) const { return indices.item(i) == UNKNOWN_INSTANCE; } // True when element i carries no specific instance index.
+
+    inline unsigned nextExactMatchIndex(unsigned from) const // Returns the first stored index at or after position from that is not UNKNOWN_INSTANCE.
+    {
+        for (unsigned i=from; i < indices.ordinality(); i++)
+        {
+            unsigned cur = indices.item(i);
+            if (cur != UNKNOWN_INSTANCE)
+                return cur;
+        }
+        return 0; // No indexed element remains from position from onwards.
+    }
 
 protected:
-    void init();
-    IMatchWalker * find(IMatchWalker * top, regexid_t id);
-    IMatchWalker * findInChildren(IMatchWalker * top, regexid_t id);
-    
-protected:
-    unsigned maxDepth;
-    unsigned pathIndex;
-    unsigned maxSearchDepth;
-    unsigned * searchIndices;
     UnsignedArray ids;
     UnsignedArray indices;
-    CriticalSection cs;
 };
 
 
@@ -156,7 +167,6 @@ public:
     ~CMatchedResults();
     IMPLEMENT_IINTERFACE
     
-    void extractResults(IMatchWalker * top, const byte * _in, const byte * _rootResult = NULL);
     void kill();
 
     //IMatchedResults

+ 108 - 49
common/thorhelper/thorrparse.cpp

@@ -237,40 +237,47 @@ static void removeTrailingSeparator(MatchState & matched)
         matched.end = child->start;
 }
 
-MatchState * RegexMatchPath::find(MatchState * top, regexid_t id)
+MatchState * RegexMatchSearchInstance::find(MatchState * top, const NlpMatchPath & path, unsigned depth)
 {
-    unsigned savedSearchDepth = maxSearchDepth;
+    regexid_t id = path.getId(depth);
     do
     {
         if (top->queryID() == id)
         {
-            unsigned thisLevelIndex = searchIndices[pathIndex];
-            if ((thisLevelIndex == UNKNOWN_INSTANCE) || (thisLevelIndex == 1))
+            bool matchAny = path.matchAny(depth);
+            if (matchAny || (nextIndex == 1))
             {
-
-                pathIndex++;
-                if (pathIndex == ids.ordinality())
+                if (depth+1 == path.numItems())
                 {
-                    maxSearchDepth = pathIndex;
+                    lastExactMatchDepth = depth+1;
                     return top;
                 }
 
-                if (thisLevelIndex == 1)
-                    maxSearchDepth = pathIndex;
+                if (!matchAny)
+                {
+                    lastExactMatchDepth = depth+1;
+                    nextIndex = path.nextExactMatchIndex(depth+1);
+                }
+
+                MatchState * ret = NULL;
+                unsigned prevExactMatchDepth = lastExactMatchDepth;
                 if (top->firstChild)
-                    return find(top->firstChild, ids.item(pathIndex));
-                return NULL;
+                    ret = find(top->firstChild, path, depth+1);
+                //If must match a child, or one of children had a required match then we have a result
+                if (!matchAny || (prevExactMatchDepth != lastExactMatchDepth))
+                    return ret;
             }
             else
-                searchIndices[pathIndex]--;
+                nextIndex--;
         }
         else
         {
             if (top->firstChild)
             {
-                MatchState * ret = find(top->firstChild, id);
+                unsigned prevExactMatchDepth = lastExactMatchDepth;
+                MatchState * ret = find(top->firstChild, path, depth);
                 //return if matched another level - may have failed to match, or matched completely
-                if (savedSearchDepth != maxSearchDepth)
+                if (prevExactMatchDepth != lastExactMatchDepth)
                     return ret;
             }
         }
@@ -279,14 +286,12 @@ MatchState * RegexMatchPath::find(MatchState * top, regexid_t id)
     return NULL;
 }
 
-IMatchedElement * RegexMatchPath::getMatch(MatchState * top, bool removeTrailingSep)
+IMatchedElement * RegexMatchPath::getMatch(MatchState * top, bool removeTrailingSep) const
 {
-    CriticalBlock procedure(cs);
-    //MORE: We could allocate searchIndices on the stack and pass as a parameter
-    memcpy(searchIndices, indices.getArray(), sizeof(*searchIndices)*maxDepth);
-    pathIndex = 0;
-    maxSearchDepth = 0;
-    MatchState * state = find(top, ids.item(0));
+    RegexMatchSearchInstance search;
+    search.lastExactMatchDepth = 0;
+    search.nextIndex = nextExactMatchIndex(0);
+    MatchState * state = search.find(top, *this, 0);
     if (!state)
         return NULL;
     if (removeTrailingSep)
@@ -454,7 +459,8 @@ void RegexPattern::cleanupBeginMatch(ActiveStage & stage, RegexState & state)
     DBGLOG("%*s[%p]Pop Begin Match", patternDepth, "", stage.pattern);
 #endif
     state.popMatch(stage.extra.matched->save);
-    delete stage.extra.matched;
+    state.cache.destroyStateSave(stage.extra.matched);
+    stage.extra.matched = NULL;
 }
 
 RegexMatchAction RegexPattern::pushStageEndMatch(RegexState & state)
@@ -1669,7 +1675,7 @@ RegexMatchAction RegexBeginCheckPattern::match(RegexState & state)
 
 RegexMatchAction RegexBeginCheckPattern::beginMatch(RegexState & state)
 {
-    RegexMatchStateSave * matched = new RegexMatchStateSave(NULL, 0);
+    RegexMatchStateSave * matched = state.cache.createStateSave(NULL, 0); // Taken from the state cache instead of new; presumably handed back via destroyStateSave in cleanupBeginMatch - confirm.
     pushStageBeginMatch(state, matched);
     return RegexMatchContinue;
 }
@@ -1978,9 +1984,9 @@ RegexMatchAction RegexNamedPattern::match(RegexState & state)
     return def->match(state, &end, matched);
 #else
     //Allocate on the heap to make a stack fault less likely
-    RegexMatchState * matched = new RegexMatchState(def);
+    RegexMatchState * matched = state.cache.createState(def);
     RegexMatchAction ret = def->match(state, &end, *matched);
-    delete matched;
+    state.cache.destroyState(matched);
     return ret;
 #endif
 }
@@ -2057,7 +2063,7 @@ void RegexNamedPattern::RegexEndNamedPattern::killStage(ActiveStage & stage, Reg
 
 RegexMatchAction RegexNamedPattern::beginMatch(RegexState & state)
 {
-    RegexMatchStateSave * matched = new RegexMatchStateSave(def);
+    RegexMatchStateSave * matched = state.cache.createStateSave(def);
     ActiveStage & stage = pushStageBeginMatch(state, matched);
     stage.setState(RSfinished);                 // so children don't get processed.
     state.namedStack.append(end);
@@ -2089,7 +2095,7 @@ RegexMatchAction RegexBeginSeparatorPattern::match(RegexState & state)
 
 RegexMatchAction RegexBeginSeparatorPattern::beginMatch(RegexState & state)
 {
-    RegexMatchStateSave * matched = new RegexMatchStateSave(separatorTagAtom, 0);
+    RegexMatchStateSave * matched = state.cache.createStateSave(separatorTagAtom, 0); // Taken from the state cache instead of new; tagged with the separator atom and id 0.
     pushStageBeginMatch(state, matched);
     return RegexMatchContinue;
 }
@@ -2566,11 +2572,12 @@ RegexMatchAction RegexAsciiDfaPattern::match(RegexState & state)
     }
     else
     {
-        ConstPointerArray matches;
+        ConstPointerArray & potentialMatches = state.cache.potentialMatches;
+        unsigned prevPotentialMatches = potentialMatches.ordinality();
         loop
         {
             if (states[activeState].accepts())
-                matches.append(cur);
+                potentialMatches.append(cur);
             if (cur == end)
                 break;
             byte next = *cur++;
@@ -2583,13 +2590,15 @@ RegexMatchAction RegexAsciiDfaPattern::match(RegexState & state)
                 break;
         }
 
-        while (matches.ordinality())
+        while (potentialMatches.ordinality() > prevPotentialMatches)
         {
-            state.cur = (const byte *)matches.tos();
-            matches.pop();
+            state.cur = (const byte *)potentialMatches.pop();
             RegexMatchAction ret = matchNext(state);
             if (ret != RegexMatchBacktrack)
+            {
+                potentialMatches.trunc(prevPotentialMatches);
                 return ret;
+            }
         }
         return RegexMatchBacktrack;
     }
@@ -2618,10 +2627,11 @@ void RegexAsciiDfaPattern::toXMLattr(StringBuffer & out, RegexXmlState & state)
         out.append(" token");
 }
 
-
 void RegexAsciiDfaPattern::killStage(ActiveStage & stage, RegexState & state)
 {
-    delete stage.extra.matches;
+    ConstPointerArray & potentialMatches = state.cache.potentialMatches; // Candidate-position array held in the state cache.
+    unsigned prevPotentialMatches = stage.extra.prevPotentialMatches; // Array length recorded by beginMatch before this stage appended anything.
+    potentialMatches.trunc(prevPotentialMatches); // Drop any candidates this stage left on the array.
 }
 
 RegexMatchAction RegexAsciiDfaPattern::beginMatch(RegexState & state)
@@ -2632,8 +2642,8 @@ RegexMatchAction RegexAsciiDfaPattern::beginMatch(RegexState & state)
     const AsciiDfaState * states = dfa.queryStates();
     unsigned * transitions = dfa.queryTransitions();
     const byte * best = NULL;
-    ConstPointerArray * matches = NULL;
-
+    ConstPointerArray & potentialMatches = state.cache.potentialMatches;
+    const unsigned prevPotentialMatches = potentialMatches.ordinality();
     loop
     {
         if (states[activeState].accepts())
@@ -2642,12 +2652,9 @@ RegexMatchAction RegexAsciiDfaPattern::beginMatch(RegexState & state)
                 best = cur;
             else
             {
-                if (!matches)
-                {
-                    matches = new ConstPointerArray;
-                    matches->append(best);
-                }
-                matches->append(cur);
+                if (prevPotentialMatches == potentialMatches.ordinality())
+                    potentialMatches.append(best);
+                potentialMatches.append(cur);
             }
         }
         if (cur == end)
@@ -2666,10 +2673,12 @@ RegexMatchAction RegexAsciiDfaPattern::beginMatch(RegexState & state)
         return RegexMatchBacktrack;
 
     ActiveStage & stage = pushStage(state);
-    stage.extra.matches = matches;
+    stage.extra.prevPotentialMatches = prevPotentialMatches;
     if (matchesToken)
         stage.flags |= RFbeginToken; 
-    if (!matches)
+
+    //Only a single match, therefore no need to backtrack.
+    if (prevPotentialMatches == potentialMatches.ordinality())
     {
         stage.followPosition = best;
         stage.setMatched();
@@ -2683,14 +2692,16 @@ RegexMatchAction RegexAsciiDfaPattern::beginMatch(RegexState & state)
 
 RegexMatchAction RegexAsciiDfaPattern::nextAction(ActiveStage & stage, RegexState & state)
 {
+    ConstPointerArray & potentialMatches = state.cache.potentialMatches;
+    unsigned prevPotentialMatches = stage.extra.prevPotentialMatches;
+    assertex(prevPotentialMatches <= potentialMatches.ordinality());
     switch (stage.getState())
     {
     case RSretry:
         {
-            ConstPointerArray * matches = stage.extra.matches;
-            if (matches && matches->ordinality())
+            if (prevPotentialMatches < potentialMatches.ordinality())
             {
-                stage.followPosition = (const byte *)matches->pop();
+                stage.followPosition = (const byte *)potentialMatches.pop();
                 stage.setMatched();
                 return RegexMatchContinue;
             }
@@ -3108,6 +3119,54 @@ RegexPattern * deserializeRegex(MemoryBuffer & in)
 
 //---------------------------------------------------------------------------
 
+RegexMatchState * RegexStateCache::createState(RegexNamed * def) // Returns a match state for def, recycling a cached object when one is available.
+{
+    if (matchStates.ordinality())
+    {
+        RegexMatchState * ret = &matchStates.popGet(); // Remove a previously released object from the cache.
+        ret->reset(def); // Re-initialise it with the name and id of def.
+        return ret;
+    }
+
+    return new RegexMatchState(def); // Cache is empty: allocate a fresh object.
+}
+
+void RegexStateCache::destroyState(RegexMatchState * state) // Despite the name nothing is deleted here: the object is stored so createState can reuse it.
+{
+    matchStates.append(*state);
+}
+
+
+RegexMatchStateSave * RegexStateCache::createStateSave(RegexNamed * def) // Returns a state-save object for def, recycling a cached one when available.
+{
+    if (matchStateSaves.ordinality())
+    {
+        RegexMatchStateSave * ret = &matchStateSaves.popGet(); // Remove a previously released object from the cache.
+        ret->reset(def); // Re-initialise the recycled object for def.
+        return ret;
+    }
+
+    return new RegexMatchStateSave(def); // Cache is empty: allocate a fresh object.
+}
+
+RegexMatchStateSave * RegexStateCache::createStateSave(_ATOM _name, regexid_t _id) // As the RegexNamed overload, but takes the name and id directly.
+{
+    if (matchStateSaves.ordinality())
+    {
+        RegexMatchStateSave * ret = &matchStateSaves.popGet(); // Remove a previously released object from the cache.
+        ret->reset(_name, _id); // Re-initialise the recycled object with the supplied name and id.
+        return ret;
+    }
+
+    return new RegexMatchStateSave(_name, _id); // Cache is empty: allocate a fresh object.
+}
+
+void RegexStateCache::destroyStateSave(RegexMatchStateSave * state) // Despite the name nothing is deleted here: the object is stored so createStateSave can reuse it.
+{
+    matchStateSaves.append(*state);
+}
+
+
 void RegexState::processPattern(RegexPattern * grammar)
 {
     if (implementation == NLPAregexStack)
@@ -3331,7 +3390,7 @@ bool RegexParser::performMatch(IMatchedAction & action, const void * row, unsign
         const byte * endData = start + len;
         const byte * end = endData - algo->minPatternLength;
 
-        RegexState state(algo->kind, helper, this, algo->inputFormat, len, start);
+        RegexState state(cache, algo->kind, helper, this, algo->inputFormat, len, start);
         state.row = row;
         state.processor = &action;
         state.best = NULL;

+ 36 - 9
common/thorhelper/thorrparse.ipp

@@ -30,17 +30,20 @@
 //#define TRACE_REGEX
 #endif
 
+
+class RegexMatchSearchInstance : public NlpMatchSearchInstance // Search state used to locate a path within a MatchState tree.
+{
+public:
+    MatchState * find(MatchState * top, const NlpMatchPath & path, unsigned depth); // Looks for path element depth starting at top; returns the matching node or NULL.
+};
+
 class THORHELPER_API RegexMatchPath : public NlpMatchPath
 {
 public:
     RegexMatchPath(MemoryBuffer & in) : NlpMatchPath(in) {}
     RegexMatchPath(const UnsignedArray & _ids, const UnsignedArray & _indices) : NlpMatchPath(_ids, _indices) {}
 
-    IMatchedElement * getMatch(MatchState * top, bool removeTrailingSeparator);
-
-protected:
-    void init();
-    MatchState * find(MatchState * top, regexid_t id);
+    IMatchedElement * getMatch(MatchState * top, bool removeTrailingSeparator) const; // const: the search state is kept in a local RegexMatchSearchInstance rather than in members of this class.
 };
 
 class THORHELPER_API CRegexMatchedResultInfo : public CMatchedResultInfo
@@ -112,7 +115,7 @@ public:
         MatchSaveState saved;
         RegexMatchStateSave * matched;
         RegexPattern * nextPattern;
-        ConstPointerArray * matches;
+        unsigned prevPotentialMatches;
         RegexRepeatInstance * repeatInstance;
         const byte * limit;
     } extra;
@@ -130,12 +133,16 @@ MAKECopyArrayOf(ActiveStage, ActiveStage &, ActiveStageArray);
 
 class RegexState;
 // Used to represent a single match in the regular expression tree.  Also 
-class THORHELPER_API RegexMatchState : public MatchState
+class THORHELPER_API RegexMatchState : public CInterface, public MatchState // CInterface base added in the same change that stores these objects in a CIArrayOf cache.
 {
 public:
     RegexMatchState() : MatchState() { }
     RegexMatchState(_ATOM _name, regexid_t _id) : MatchState(_name, _id) { }
     RegexMatchState(RegexNamed * owner) : MatchState(owner->queryName(), owner->queryID()) {}
+    IMPLEMENT_IINTERFACE
+
+    using MatchState::reset; // Keep the (name, id) overload visible alongside the overload declared below.
+    void reset(RegexNamed * owner) { MatchState::reset(owner->queryName(), owner->queryID()); } // Re-initialise from the name and id of owner.
 };
 
 class THORHELPER_API RegexMatchStateSave : public RegexMatchState
@@ -150,11 +157,29 @@ public:
 };
 
 struct RegexMatchInfo;
+class RegexStateCache // Holds reusable match-state objects and a shared scratch array; RegexState keeps a reference to one.
+{
+public:
+    RegexMatchState * createState(RegexNamed * def); // Recycle or allocate a RegexMatchState for def.
+    RegexMatchStateSave * createStateSave(RegexNamed * def); // Recycle or allocate a RegexMatchStateSave for def.
+    RegexMatchStateSave * createStateSave(_ATOM _name, regexid_t _id); // Recycle or allocate a RegexMatchStateSave from a name and id.
+    void destroyState(RegexMatchState * state); // Hand a state back for later reuse.
+    void destroyStateSave(RegexMatchStateSave * state); // Hand a state-save back for later reuse.
+
+    CIArrayOf<RegexMatchState> matchStates; // Released RegexMatchState objects awaiting reuse.
+    CIArrayOf<RegexMatchStateSave> matchStateSaves; // Released RegexMatchStateSave objects awaiting reuse.
+    ConstPointerArray potentialMatches; // Candidate match positions appended and popped by the DFA pattern code.
+};
+
 class RegexState : public NlpState
 {
 public:
-    RegexState(unsigned _implementation, INlpHelper * _helper, INlpMatchedAction * _action, NlpInputFormat _inputFormat, size32_t _len, const void * _text) : NlpState(_action, _inputFormat, _len, _text) { implementation = _implementation; numMatched = 0; curActiveStage = NotFound; helper = _helper; }
-    RegexState(const RegexState & _state, INlpMatchedAction * _action, size32_t _len, const void * _text) : NlpState(_action, _state.inputFormat, _len, _text) 
+    RegexState(RegexStateCache & _cache, unsigned _implementation, INlpHelper * _helper, INlpMatchedAction * _action, NlpInputFormat _inputFormat, size32_t _len, const void * _text)
+    : NlpState(_action, _inputFormat, _len, _text), cache(_cache)
+    { implementation = _implementation; numMatched = 0; curActiveStage = NotFound; helper = _helper; }
+
+    RegexState(const RegexState & _state, INlpMatchedAction * _action, size32_t _len, const void * _text)
+    : NlpState(_action, _state.inputFormat, _len, _text), cache(_state.cache)
     { 
         implementation = _state.implementation; numMatched = 0; curActiveStage = NotFound; helper = _state.helper;
     }
@@ -168,6 +193,7 @@ protected:
     inline ActiveStage & topStage()                         { return stages.item(curActiveStage); }
 
 public:
+    RegexStateCache & cache;
     RegexPatternCopyArray stack;
     IMatchedAction * processor;
     const byte * nextScanPosition;
@@ -1121,6 +1147,7 @@ public:
     RegexAlgorithm * algo;
     RegexMatches results;
     CRegexMatchedResults matched;
+    RegexStateCache cache;
     unsigned charWidth;
 };
 

+ 28 - 30
common/thorhelper/thortalgo.cpp

@@ -45,65 +45,63 @@ void TomitaStateInformation::set(const TomitaStateInformation & other)
 
 //---------------------------------------------------------------------------
 
-GrammarSymbol * TomitaMatchPath::findInChildren(GrammarSymbol * top, regexid_t id)
+GrammarSymbol * TomitaMatchSearchInstance::findInChildren(GrammarSymbol * top, const TomitaMatchPath & path, unsigned depth) // Tries each child of top in turn for path element depth.
 {
+    unsigned prevExactMatchDepth = lastExactMatchDepth; // Snapshot so that a change made during a child search can be detected.
     for (unsigned i = 0;; i++)
     {
         GrammarSymbol * child = top->queryChild(i);
         if (!child)
             return NULL;
-        GrammarSymbol * ret = find(child, id);
-        if (ret)
+        GrammarSymbol * ret = find(child, path, depth);
+        if (prevExactMatchDepth != lastExactMatchDepth) // The child search committed to a match level, so its result is final even when ret is NULL.
             return ret;
     }
     return NULL;
 }
 
-GrammarSymbol * TomitaMatchPath::find(GrammarSymbol * top, regexid_t id)
+GrammarSymbol * TomitaMatchSearchInstance::find(GrammarSymbol * top, const TomitaMatchPath & path, unsigned depth) // Searches top and its descendants for path element depth onwards; returns the final matched symbol or NULL.
 {
-    unsigned savedSearchDepth = maxSearchDepth;
     if (top->isPacked())
         top = top->queryPacked(choices->getInstance(top));
 
+    regexid_t id = path.getId(depth); // Id that the current path element must match.
     if (top->getId() == id)
     {
-        unsigned thisLevelIndex = searchIndices[pathIndex];
-        if ((thisLevelIndex == UNKNOWN_INSTANCE) || (thisLevelIndex == 1))
+        bool matchAny = path.matchAny(depth); // True when this path element has no specific instance index.
+        if (matchAny || (nextIndex == 1)) // Accept when any instance will do, or when the countdown has reached the wanted instance.
         {
-            pathIndex++;
-            if (pathIndex == ids.ordinality())
+            if (depth+1 == path.numItems()) // This was the last path element: the search is complete.
             {
-                maxSearchDepth = pathIndex;
+                lastExactMatchDepth = depth+1; // Record completion so callers stop trying siblings.
                 return top;
             }
 
-            if (thisLevelIndex == 1)
-                maxSearchDepth = pathIndex;
-            return findInChildren(top, ids.item(pathIndex));
+            if (!matchAny)
+            {
+                lastExactMatchDepth = depth+1; // An indexed element was matched: commit to this subtree.
+                nextIndex = path.nextExactMatchIndex(depth+1); // Load the countdown for the next indexed element, if any.
+            }
+
+            return findInChildren(top, path, depth+1);
         }
         else
-            searchIndices[pathIndex]--;
+        {
+            nextIndex--; // Matching id but not yet the wanted instance: count it and give up on this symbol.
+            return NULL;
+        }
     }
     else
-    {
-        GrammarSymbol * ret = findInChildren(top, id);
-        //return if matched another level - may have failed to match, or matched completely
-        if (savedSearchDepth != maxSearchDepth)
-            return ret;
-    }
-    return NULL;
+        return findInChildren(top, path, depth); // Id differs: keep looking for the same path element among the children.
 }
 
-IMatchedElement * TomitaMatchPath::getMatch(GrammarSymbol * top, PackedSymbolChoice & choice)
+IMatchedElement * TomitaMatchPath::getMatch(GrammarSymbol * top, PackedSymbolChoice & choice) const
 {
-    CriticalBlock procedure(cs);
-
-    //MORE: We could allocate searchIndices on the stack and pass as a parameter
-    memcpy(searchIndices, indices.getArray(), sizeof(*searchIndices)*maxDepth);
-    pathIndex = 0;
-    maxSearchDepth = 0;
-    choices = &choice;
-    GrammarSymbol * state = find(top, ids.item(0));
+    TomitaMatchSearchInstance search;
+    search.lastExactMatchDepth = 0;
+    search.nextIndex = nextExactMatchIndex(0);
+    search.choices = &choice;
+    GrammarSymbol * state = search.find(top, *this, 0);
     if (!state)
         return NULL;
     return LINK(state);

+ 14 - 9
common/thorhelper/thortalgo.ipp

@@ -62,22 +62,27 @@ public:
     NlpInputFormat inputFormat;
 };
 
-class THORHELPER_API TomitaMatchPath : public NlpMatchPath
+class TomitaMatchPath;
+class TomitaMatchSearchInstance : public NlpMatchSearchInstance // Search state used to locate a path within a GrammarSymbol tree.
 {
 public:
-    TomitaMatchPath(MemoryBuffer & in) : NlpMatchPath(in) { choices = NULL; }
-    TomitaMatchPath(const UnsignedArray & _ids, const UnsignedArray & _indices) : NlpMatchPath(_ids, _indices) { choices = NULL; }
+    TomitaMatchSearchInstance() { choices = NULL; } // Only choices is initialised here; the inherited counters are set by the caller.
 
-    IMatchedElement * getMatch(GrammarSymbol * top, PackedSymbolChoice & choice);
+    GrammarSymbol * find(GrammarSymbol * top, const TomitaMatchPath & path, unsigned depth); // Search top and its descendants for path element depth onwards.
+    GrammarSymbol * findInChildren(GrammarSymbol * top, const TomitaMatchPath & path, unsigned depth); // Apply find to each child of top in turn.
 
-protected:
-    GrammarSymbol * find(GrammarSymbol * top, regexid_t id);
-    GrammarSymbol * findInChildren(GrammarSymbol * top, regexid_t id);
-    
-protected:
     PackedSymbolChoice * choices;
 };
 
+class THORHELPER_API TomitaMatchPath : public NlpMatchPath // Path description for the Tomita parser; the per-search state is held in TomitaMatchSearchInstance.
+{
+public:
+    TomitaMatchPath(MemoryBuffer & in) : NlpMatchPath(in) { }
+    TomitaMatchPath(const UnsignedArray & _ids, const UnsignedArray & _indices) : NlpMatchPath(_ids, _indices) { }
+
+    IMatchedElement * getMatch(GrammarSymbol * top, PackedSymbolChoice & choice) const; // const: runs the search through a local TomitaMatchSearchInstance.
+};
+
 
 class THORHELPER_API CTomitaMatchedResultInfo : public CMatchedResultInfo
 {

+ 58 - 0
ecl/regress/pat19.ecl

@@ -0,0 +1,58 @@
+/*##############################################################################
+
+    Copyright (C) 2011 HPCC Systems.
+
+    All rights reserved. This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as
+    published by the Free Software Foundation, either version 3 of the
+    License, or (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+############################################################################## */
+
+token word := PATTERN('[a-z0-9]')+;
+token a := word;
+token b := word;
+rule c := a b;
+rule d := a b;
+rule e := c;
+rule f := e d;
+rule g := f f;
+
+//i.e.
+//g[f[e[c[a b]] d[a b]] f[e[c[a b]] d[a b]]]
+
+infile := dataset([
+        {'w1 w2 w3 w4 w5 w6 w7 w8'}
+        ], { string line });
+
+
+results := // Each field is the matched text followed by a colon and a literal; presumably the literal is the expected match - confirm against the key files.
+    record
+        '\'' + MATCHTEXT(word) + ':w1\'';
+        '\'' + MATCHTEXT(word[2]) + ':w2\'';
+        '\'' + MATCHTEXT(word[8]) + ':w8\'';
+        '\'' + MATCHTEXT(word[9]) + ':\''; // The input line holds only eight words.
+        '\'' + MATCHTEXT(a/word[1]) + ':w1\'';
+        '\'' + MATCHTEXT(a[3]/word[1]) + ':w5\'';
+        '\'' + MATCHTEXT(a[3]/word[2]) + ':\'';
+        '\'' + MATCHTEXT(a[2]/word) + ':w3\'';
+        '\'' + MATCHTEXT(e/a[2]/word) + ':w5\'';
+        '\'' + MATCHTEXT(g/f/e/c/a/word) + ':w1\'';
+        '\'' + MATCHTEXT(g/f[2]/e/c/a/word) + ':w5\'';
+        '\'' + MATCHTEXT(f[1]/e/c/b/word) + ':w2\'';
+        '\'' + MATCHTEXT(f[1]/b[2]/word) + ':w4\'';
+        '\'' + MATCHTEXT(f[1]/c/b[2]/word) + ':\'';
+        '\'' + MATCHTEXT(f[1]/c/b[3]/word) + ':\'';
+        '\'' + MATCHTEXT(g/f[1]/b[2]/word) + ':w4\'';
+        '\'' + MATCHTEXT(g/f[1]/c/b[3]/word) + ':\'';
+        '\'' + MATCHTEXT(a/word[3]) + ':w5\'';
+    end;
+
+output(PARSE(infile,line,g,results,whole,nocase,skip([' ',',',';','\t','.']*)));

+ 1 - 1
ecl/regress/pat5.ecl

@@ -39,6 +39,7 @@ infile := dataset([
 
 results :=
     record
+        '\'' + MATCHTEXT(noun/patWord[2]) + '\'';
         '\'' + MATCHTEXT(patWord[1]) + '\'';
         '\'' + MATCHTEXT(patWord[2]) + '\'';
         '\'' + MATCHTEXT(patWord[3]) + '\'';
@@ -46,7 +47,6 @@ results :=
         '\'' + MATCHTEXT(noun[2]) + '\'';
         '\'' + MATCHTEXT(noun[3]) + '\'';
         '\'' + MATCHTEXT(noun/patWord[1]) + '\'';
-        '\'' + MATCHTEXT(noun/patWord[2]) + '\'';
         '\'' + MATCHTEXT(noun/patWord[3]) + '\'';
         '\'' + MATCHTEXT(noun[1]/patWord[1]) + '\'';
         '\'' + MATCHTEXT(noun[1]/patWord[2]) + '\'';

+ 1 - 1
system/jlib/jarray.hpp

@@ -121,7 +121,7 @@ public:
     aindex_t bAdd(MEMBER & newItem, CompareFunc, bool & isNew);
     aindex_t bSearch(const MEMBER & key, CompareFunc) const;
     aindex_t find(PARAM) const;
-    MEMBER *getArray(aindex_t = 0);
+    MEMBER *getArray(aindex_t = 0) const;
     void sort(CompareFunc);
     void swap(aindex_t pos1, aindex_t pos2);
 };

+ 1 - 1
system/jlib/jarray.tpp

@@ -85,7 +85,7 @@ aindex_t BaseArrayOf<MEMBER, PARAM>::bSearch(const MEMBER & key, CompareFunc cf)
 }
 
 template <class MEMBER, class PARAM>
-MEMBER *BaseArrayOf<MEMBER, PARAM>::getArray(aindex_t pos)
+MEMBER *BaseArrayOf<MEMBER, PARAM>::getArray(aindex_t pos) const
 {
    MEMBER * head= (MEMBER *)SELF::_head;
    assertex(pos <= SELF::used);

تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
+ 3 - 0
testing/ecl/key/pat19.xml


تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
+ 3 - 0
testing/ecl/key/tpat19.xml


+ 58 - 0
testing/ecl/pat19.ecl

@@ -0,0 +1,58 @@
+/*##############################################################################
+
+    Copyright (C) 2011 HPCC Systems.
+
+    All rights reserved. This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as
+    published by the Free Software Foundation, either version 3 of the
+    License, or (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+############################################################################## */
+
+token word := PATTERN('[a-z0-9]')+;
+token a := word;
+token b := word;
+rule c := a b;
+rule d := a b;
+rule e := c;
+rule f := e d;
+rule g := f f;
+
+//i.e.
+//g[f[e[c[a b]] d[a b]] f[e[c[a b]] d[a b]]]
+
+infile := dataset([
+        {'w1 w2 w3 w4 w5 w6 w7 w8'}
+        ], { string line });
+
+
+results := // Each field is the matched text followed by a colon and a literal; presumably the literal is the expected match - confirm against the key files.
+    record
+        '\'' + MATCHTEXT(word) + ':w1\'';
+        '\'' + MATCHTEXT(word[2]) + ':w2\'';
+        '\'' + MATCHTEXT(word[8]) + ':w8\'';
+        '\'' + MATCHTEXT(word[9]) + ':\''; // The input line holds only eight words.
+        '\'' + MATCHTEXT(a/word[1]) + ':w1\'';
+        '\'' + MATCHTEXT(a[3]/word[1]) + ':w5\'';
+        '\'' + MATCHTEXT(a[3]/word[2]) + ':\'';
+        '\'' + MATCHTEXT(a[2]/word) + ':w3\'';
+        '\'' + MATCHTEXT(e/a[2]/word) + ':w5\'';
+        '\'' + MATCHTEXT(g/f/e/c/a/word) + ':w1\'';
+        '\'' + MATCHTEXT(g/f[2]/e/c/a/word) + ':w5\'';
+        '\'' + MATCHTEXT(f[1]/e/c/b/word) + ':w2\'';
+        '\'' + MATCHTEXT(f[1]/b[2]/word) + ':w4\'';
+        '\'' + MATCHTEXT(f[1]/c/b[2]/word) + ':\'';
+        '\'' + MATCHTEXT(f[1]/c/b[3]/word) + ':\'';
+        '\'' + MATCHTEXT(g/f[1]/b[2]/word) + ':w4\'';
+        '\'' + MATCHTEXT(g/f[1]/c/b[3]/word) + ':\'';
+        '\'' + MATCHTEXT(a/word[3]) + ':w5\'';
+    end;
+
+output(PARSE(infile,line,g,results,whole,nocase,skip([' ',',',';','\t','.']*)));

+ 58 - 0
testing/ecl/tpat19.ecl

@@ -0,0 +1,58 @@
+/*##############################################################################
+
+    Copyright (C) 2011 HPCC Systems.
+
+    All rights reserved. This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as
+    published by the Free Software Foundation, either version 3 of the
+    License, or (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+############################################################################## */
+
+token word := PATTERN('[a-z0-9]')+;
+token a := word;
+token b := word;
+rule c := a b;
+rule d := a b;
+rule e := c;
+rule f := e d;
+rule g := f f;
+
+//i.e.
+//g[f[e[c[a b]] d[a b]] f[e[c[a b]] d[a b]]]
+
+infile := dataset([
+        {'w1 w2 w3 w4 w5 w6 w7 w8'}
+        ], { string line });
+
+
+results := // Each field is the matched text followed by a colon and a literal; presumably the literal is the expected match - confirm against the key files.
+    record
+        '\'' + MATCHTEXT(word) + ':w1\'';
+        '\'' + MATCHTEXT(word[2]) + ':w2\'';
+        '\'' + MATCHTEXT(word[8]) + ':w8\'';
+        '\'' + MATCHTEXT(word[9]) + ':\''; // The input line holds only eight words.
+        '\'' + MATCHTEXT(a/word[1]) + ':w1\'';
+        '\'' + MATCHTEXT(a[3]/word[1]) + ':w5\'';
+        '\'' + MATCHTEXT(a[3]/word[2]) + ':\'';
+        '\'' + MATCHTEXT(a[2]/word) + ':w3\'';
+        '\'' + MATCHTEXT(e/a[2]/word) + ':w5\'';
+        '\'' + MATCHTEXT(g/f/e/c/a/word) + ':w1\'';
+        '\'' + MATCHTEXT(g/f[2]/e/c/a/word) + ':w5\'';
+        '\'' + MATCHTEXT(f[1]/e/c/b/word) + ':w2\'';
+        '\'' + MATCHTEXT(f[1]/b[2]/word) + ':w4\'';
+        '\'' + MATCHTEXT(f[1]/c/b[2]/word) + ':\'';
+        '\'' + MATCHTEXT(f[1]/c/b[3]/word) + ':\'';
+        '\'' + MATCHTEXT(g/f[1]/b[2]/word) + ':w4\'';
+        '\'' + MATCHTEXT(g/f[1]/c/b[3]/word) + ':\'';
+        '\'' + MATCHTEXT(a/word[3]) + ':w5\'';
+    end;
+
+output(PARSE(infile,line,g,results,whole,nocase,skip([' ',',',';','\t','.']*), parse));