瀏覽代碼

HPCC-9217 Remove legacy implementations of editdistance functions

Signed-off-by: Gavin Halliday <gavin.halliday@lexisnexis.com>
Gavin Halliday 12 年之前
父節點
當前提交
f87f471183
共有 2 個文件被更改,包括 7 次插入235 次删除
  1. 7 233
      plugins/stringlib/stringlib.cpp
  2. 0 2
      plugins/stringlib/stringlib.hpp

+ 7 - 233
plugins/stringlib/stringlib.cpp

@@ -69,8 +69,8 @@ static const char * EclDefinition =
 "  boolean StringWildExactMatch(const string src, const string _pattern, boolean _noCase) : c, pure,entrypoint='slStringWildExactMatch'; \n"
 "  boolean StringContains(const string src, const string _pattern, boolean _noCase) : c, pure,entrypoint='slStringContains'; \n"
 "  string StringExtractMultiple(const string src, unsigned8 mask) : c,pure,entrypoint='slStringExtractMultiple'; \n"
-"  unsigned integer4 EditDistance(const string l, const string r) : c, pure,entrypoint='slEditDistance'; \n"
-"  boolean EditDistanceWithinRadius(const string l, const string r, unsigned4 radius) : c,pure,entrypoint='slEditDistanceWithinRadius'; \n"
+"  unsigned integer4 EditDistance(const string l, const string r) : c, pure,entrypoint='slEditDistanceV2'; \n"
+"  boolean EditDistanceWithinRadius(const string l, const string r, unsigned4 radius) : c,pure,entrypoint='slEditDistanceWithinRadiusV2'; \n"
 "  unsigned integer4 EditDistanceV2(const string l, const string r) : c, pure,entrypoint='slEditDistanceV2'; \n"
 "  boolean EditDistanceWithinRadiusV2(const string l, const string r, unsigned4 radius) : c,pure,entrypoint='slEditDistanceWithinRadiusV2'; \n"
 "  string StringGetNthWord(const string src, unsigned4 n) : c, pure,entrypoint='slStringGetNthWord'; \n"
@@ -158,134 +158,10 @@ inline unsigned min3(unsigned a, unsigned b, unsigned c)
     return mi;
 }
 
-class CEditDistance
-{
-private:
-    unsigned char da[256][256];
-
-public:
-    unsigned editDistance(unsigned leftLen, const char * left, unsigned rightLen, const char * right)
-    {
-        unsigned i, j, cost;
-        char l_i, r_j;
-
-        clip(leftLen, left);
-        clip(rightLen, right);
-
-        if (leftLen > 255)
-        {
-            leftLen = 255;
-        }
-        if (rightLen > 255)
-        {
-            rightLen = 255;
-        }
-
-        if (leftLen == 0)
-        {
-            return rightLen;
-        }
-        if (rightLen == 0)
-        {
-            return leftLen;
-        }
-
-        for (i = 0; i <= leftLen; i++)
-        {
-            da[i][0] = i;
-        }
-
-        for (j = 0; j <= rightLen; j++)
-        {
-            da[0][j] = j;
-        }
-
-        for (i = 1; i <= leftLen; i++)
-        {
-            l_i = left[i - 1];
-
-            for (j = 1; j <= rightLen; j++)
-            {
-                    r_j = right[j - 1];
-                    cost = (l_i == r_j) ? 0 : 1;
-                    da[i][j] = min3(da[i-1][j]+1, da[i][j-1]+1, da[i-1][j-1] + cost);
-            }
-        }
-
-        return da[leftLen][rightLen];
-    }
-
-    unsigned editDistance(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius)
-    {
-        unsigned i, j, cost;
-        char l_i, r_j;
-
-        clip(leftLen, left);
-        clip(rightLen, right);
-
-        if (leftLen > 255)
-        {
-            leftLen = 255;
-        }
-        if (rightLen > 255)
-        {
-            rightLen = 255;
-        }
-
-        if (leftLen == 0)
-        {
-            return rightLen;
-        }
-        if (rightLen == 0)
-        {
-            return leftLen;
-        }
-
-        if (leftLen > rightLen)
-        {
-            const char *tstr = left;
-            left = right;
-            right = tstr;
-
-            unsigned tlen = leftLen;
-            leftLen = rightLen;
-            rightLen = tlen;
-        }
-
-        for (i = 0; i <= leftLen; i++)
-        {
-            da[i][0] = i;
-        }
-
-        for (j = 0; j <= rightLen; j++)
-        {
-            da[0][j] = j;
-        }
-
-        for (i = 1; i <= leftLen; i++)
-        {
-            l_i = left[i - 1];
-
-            for (j = 1; j <= rightLen; j++)
-            {
-                    r_j = right[j - 1];
-                    cost = (l_i == r_j) ? 0 : 1;
-                    da[i][j] = min3(da[i-1][j]+1, da[i][j-1]+1, da[i-1][j-1] + cost);
-            }
-
-            // bail out early if ed can't possibly be <= radius
-            if ((da[i][rightLen] - (leftLen - i)) > radius)
-                return da[i][rightLen];
-        }
-
-        return da[leftLen][rightLen];
-    }
-};
-
-//--- Optimized versions of the edit distance functions above.
+//--- Optimized versions of the edit distance functions
 inline unsigned mask(unsigned x) { return x & 1; }
 
-unsigned editDistanceV2(unsigned leftLen, const char * left, unsigned rightLen, const char * right)
+unsigned editDistance(unsigned leftLen, const char * left, unsigned rightLen, const char * right)
 {
     unsigned i, j;
 
@@ -340,89 +216,13 @@ unsigned editDistanceV2(unsigned leftLen, const char * left, unsigned rightLen,
     return da[mask(leftLen-1)][rightLen-1];
 }
 
-unsigned editDistanceV2(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius)
-{
-    unsigned i, j;
-
-    clip(leftLen, left);
-    clip(rightLen, right);
-
-    unsigned minED = (leftLen < rightLen)? rightLen - leftLen: leftLen - rightLen;
-    if (minED > radius)
-        return minED;
-
-    if (leftLen > 255)
-        leftLen = 255;
-
-    if (rightLen > 255)
-        rightLen = 255;
-
-    //Checking for leading common substrings actually slows the function down.
-    if (leftLen == 0)
-        return rightLen;
-
-    if (rightLen == 0)
-        return leftLen;
-
-    /*
-    This function applies two optimizations over the function above.
-    a) Adding a character (next row) can at most decrease the edit distance by 1, so short circuit when
-       there is no possibility of getting within the distance.
-    b) We only need to evaluate the matrix da[i-radius..i+radius][j-radius..j+radius]
-       not taking into account values outside that range [can use max value to prevent access]
-    */
-
-    //Optimize the storage requirements by
-    //i) Only storing two stripes
-    //ii) Calculate, but don't store the row comparing against the null string
-    //NB: A byte array is ok because the +1 is added after the minimum, and that will always include 254 as an option.
-    unsigned char da[2][255];
-    char r_0 = right[0];
-    char l_0 = left[0];
-    bool matched_l0 = false;
-    for (j = 0; j < rightLen; j++)
-    {
-        if (right[j] == l_0) matched_l0 = true;
-        da[0][j] = (matched_l0) ? j : j+1;
-    }
-
-    bool matched_r0 = (l_0 == r_0);
-    for (i = 1; i < leftLen; i++)
-    {
-        char l_i = left[i];
-        if (l_i == r_0)
-            matched_r0 = true;
-
-        byte da_i_0 = matched_r0 ? i : i+1;
-        da[mask(i)][0] = da_i_0;
-        byte da_i_prevj = da_i_0;
-
-        unsigned first = (i > radius) ? i-radius : 1;
-        unsigned last = (i + radius > rightLen) ? rightLen : i + radius;
-        for (j = 1; j < rightLen; j++)
-        {
-            char r_j = right[j];
-            unsigned char next = (l_i == r_j) ? da[mask(i-1)][j-1] :
-                        min3(da[mask(i-1)][j], da_i_prevj, da[mask(i-1)][j-1]) + 1;
-            da[mask(i)][j] = next;
-            da_i_prevj = next;
-        }
-
-        // bail out early if ed can't possibly be <= radius
-        unsigned maxdelta = (leftLen - (i+1));
-        if (da_i_prevj > radius + maxdelta)         // if da_i_prvj - maxdelta > radius can't ever get low enough
-            return da_i_prevj;
-    }
-
-    return da[mask(leftLen-1)][rightLen-1];
-}
 
 //This could be further improved in the following ways:
 // * Only use 2*radius bytes of temporary storage - I doubt it is worth it.
 // * special case edit1 - you could use variables for the 6 interesting array elements, and get
 //   rid of the array completely.  You could also unwind the first (and last) iterations.
 // * I suspect the early exit condition could be improved depending on the lengths of the strings.
-extern STRINGLIB_API unsigned editDistanceV3(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius)
+extern STRINGLIB_API unsigned editDistanceWithinRadius(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius)
 {
     if (radius >= 255)
         return 255;
@@ -1127,41 +927,15 @@ STRINGLIB_API bool STRINGLIB_CALL slStringContains(unsigned srcLen, const char *
     return true;
 }
 
-STRINGLIB_API unsigned STRINGLIB_CALL slEditDistance(unsigned leftLen, const char * left, unsigned rightLen, const char * right)
-{
-    CEditDistance * ed = new CEditDistance();
-    unsigned rval = ed->editDistance(leftLen, left, rightLen, right);
-    delete ed;
-    return rval;
-}
-
-
-STRINGLIB_API bool STRINGLIB_CALL slEditDistanceWithinRadius(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius)
-{
-    unsigned minED = (leftLen < rightLen)? rightLen - leftLen: leftLen - rightLen;
-
-    if (minED > radius)
-    {
-        return false;
-    }
-    else
-    {
-        CEditDistance *ed = new CEditDistance();
-        unsigned rval = ed->editDistance(leftLen, left, rightLen, right, radius);
-        delete ed;
-        return (rval <= radius);
-    }
-}
-
 STRINGLIB_API unsigned STRINGLIB_CALL slEditDistanceV2(unsigned leftLen, const char * left, unsigned rightLen, const char * right)
 {
-    return nsStringlib::editDistanceV2(leftLen, left, rightLen, right);
+    return nsStringlib::editDistance(leftLen, left, rightLen, right);
 }
 
 
 STRINGLIB_API bool STRINGLIB_CALL slEditDistanceWithinRadiusV2(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius)
 {
-    return nsStringlib::editDistanceV3(leftLen, left, rightLen, right, radius) <= radius;
+    return nsStringlib::editDistanceWithinRadius(leftLen, left, rightLen, right, radius) <= radius;
 }
 
 inline bool isWordSeparator(char x)

+ 0 - 2
plugins/stringlib/stringlib.hpp

@@ -75,8 +75,6 @@ STRINGLIB_API bool STRINGLIB_CALL slStringWildMatch(unsigned srcLen, const char
 STRINGLIB_API bool STRINGLIB_CALL slStringWildExactMatch(unsigned srcLen, const char * src, unsigned patLen, const char * pat, bool noCase);
 STRINGLIB_API bool STRINGLIB_CALL slStringContains(unsigned srcLen, const char * src, unsigned sampleLen, const char * sample, bool noCase);
 STRINGLIB_API void STRINGLIB_CALL slStringExtractMultiple(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src, unsigned __int64 mask);
-STRINGLIB_API unsigned STRINGLIB_CALL slEditDistance(unsigned leftLen, const char * left, unsigned rightLen, const char * right);
-STRINGLIB_API bool STRINGLIB_CALL slEditDistanceWithinRadius(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius);
 STRINGLIB_API unsigned STRINGLIB_CALL slEditDistanceV2(unsigned leftLen, const char * left, unsigned rightLen, const char * right);
 STRINGLIB_API bool STRINGLIB_CALL slEditDistanceWithinRadiusV2(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius);
 STRINGLIB_API void STRINGLIB_CALL slStringGetNthWord(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src, unsigned n);