Explorar el Código

HPCC-21909 Allow the maximum search to be specified for EditDistance

Signed-off-by: Gavin Halliday <gavin.halliday@lexisnexis.com>
Gavin Halliday hace 6 años
padre
commit
a58a40a91b

+ 6 - 4
ecllibrary/std/Str.ecl

@@ -305,11 +305,13 @@ EXPORT STRING CombineWords(SET OF STRING words, STRING separator) := lib_stringl
  * 
  * 
  * @param _left         The first string to be compared.
  * @param _left         The first string to be compared.
  * @param _right        The second string to be compared.
  * @param _right        The second string to be compared.
- * @return              The minimum edit distance between the two strings.
+ * @param radius        The maximum edit distance that is acceptable, or 0 for no limit.  Defaults to 0.
+ * @return              The minimum edit distance between the two strings.  Edit distances above radius will
+ *                      return an arbitrary value larger than radius.
  */
  */
 
 
-EXPORT UNSIGNED4 EditDistance(STRING _left, STRING _right) :=
-    lib_stringlib.StringLib.EditDistanceV2(_left, _right);
+EXPORT UNSIGNED4 EditDistance(STRING _left, STRING _right, UNSIGNED4 radius = 0) :=
+    lib_stringlib.StringLib.EditDistanceV3(_left, _right, radius);  // forwards to the stringlib plugin's radius-aware EditDistanceV3; radius defaults to 0
 
 
 /**
 /**
  * Returns true if the minimum edit distance between the two strings is within a specific range.
  * Returns true if the minimum edit distance between the two strings is within a specific range.
@@ -317,7 +319,7 @@ EXPORT UNSIGNED4 EditDistance(STRING _left, STRING _right) :=
  * 
  * 
  * @param _left         The first string to be compared.
  * @param _left         The first string to be compared.
  * @param _right        The second string to be compared.
  * @param _right        The second string to be compared.
- * @param radius        The maximum edit distance that is accepable.
+ * @param radius        The maximum edit distance that is acceptable.
  * @return              Whether or not the two strings are within the given specified edit distance.
  * @return              Whether or not the two strings are within the given specified edit distance.
  */
  */
 
 

+ 6 - 4
ecllibrary/std/Uni.ecl

@@ -339,11 +339,13 @@ EXPORT BOOLEAN Contains(unicode src, unicode _pattern, boolean _noCase) :=
  * @param _left         The first string to be compared.
  * @param _left         The first string to be compared.
  * @param _right        The second string to be compared.
  * @param _right        The second string to be compared.
  * @param localename    The locale to use for the comparison.  Defaults to ''.
  * @param localename    The locale to use for the comparison.  Defaults to ''.
- * @return              The minimum edit distance between the two strings.
+ * @param radius        The maximum edit distance that is acceptable, or 0 for no limit.  Defaults to 0.
+ * @return              The minimum edit distance between the two strings.  Edit distances above radius will
+ *                      return an arbitrary value larger than radius.
  */
  */
 
 
-EXPORT UNSIGNED4 EditDistance(unicode _left, unicode _right, varstring localename = '') :=
-    lib_unicodelib.UnicodeLib.UnicodeLocaleEditDistance(_left, _right, localename);
+EXPORT UNSIGNED4 EditDistance(unicode _left, unicode _right, varstring localename = '', UNSIGNED4 radius = 0) :=
+    lib_unicodelib.UnicodeLib.UnicodeLocaleEditDistanceV2(_left, _right, localename, radius);  // radius comes after localename and both have defaults, so two- and three-argument calls remain valid
 
 
 /**
 /**
  * Returns true if the minimum edit distance between the two strings is within a specific range.
  * Returns true if the minimum edit distance between the two strings is within a specific range.
@@ -351,7 +353,7 @@ EXPORT UNSIGNED4 EditDistance(unicode _left, unicode _right, varstring localenam
  * 
  * 
  * @param _left         The first string to be compared.
  * @param _left         The first string to be compared.
  * @param _right        The second string to be compared.
  * @param _right        The second string to be compared.
- * @param radius        The maximum edit distance that is accepable.
+ * @param radius        The maximum edit distance that is acceptable.
  * @param localename    The locale to use for the comparison.  Defaults to ''.
  * @param localename    The locale to use for the comparison.  Defaults to ''.
  * @return              Whether or not the two strings are within the given specified edit distance.
  * @return              Whether or not the two strings are within the given specified edit distance.
  */
  */

+ 69 - 30
ecllibrary/teststd/str/TestEditDistance.ecl

@@ -13,35 +13,74 @@ EXPORT TestEditDistance := MODULE
   STRING manyDigits := digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+
   STRING manyDigits := digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+
                        digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits;
                        digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits;
 
 
-  EXPORT TestConst := MODULE
-    EXPORT Test01 := ASSERT(Str.EditDistance('','') = 0, CONST);
-    EXPORT Test02 := ASSERT(Str.EditDistance('','                ') = 0, CONST);
-    EXPORT Test03 := ASSERT(Str.EditDistance('                ','') = 0, CONST);
-    EXPORT Test04 := ASSERT(Str.EditDistance('a ','                ') = 1, CONST);
-    //EXPORT Test05 := ASSERT(Str.EditDistance(' a ','   ') = 1, CONST);
-    EXPORT Test06 := ASSERT(Str.EditDistance('Aprs  ','APp') = 3, CONST);
-    EXPORT Test07 := ASSERT(Str.EditDistance('abcd','acbd') = 2, CONST);
-    EXPORT Test08 := ASSERT(Str.EditDistance('abcd','abd') = 1, CONST);
-    EXPORT Test09 := ASSERT(Str.EditDistance('abcd','abc') = 1, CONST);
-    EXPORT Test10 := ASSERT(Str.EditDistance('abcd','bcd') = 1, CONST);
-    EXPORT Test11 := ASSERT(Str.EditDistance('abcd','abcde') = 1, CONST);
-    EXPORT Test12 := ASSERT(Str.EditDistance('abcd','aabcd') = 1, CONST);
-    EXPORT Test13 := ASSERT(Str.EditDistance('abcd',' abcd') = 1, CONST);
-    EXPORT Test14 := ASSERT(Str.EditDistance('abcd','a bcd') = 1, CONST);
-    EXPORT Test15 := ASSERT(Str.EditDistance('abcd','adcd') = 1, CONST);
-    EXPORT Test16 := ASSERT(Str.EditDistance('abcd','') = 4, CONST);
-    EXPORT Test17 := ASSERT(Str.EditDistance(alpha,'') = 26, CONST);
-    EXPORT Test18 := ASSERT(Str.EditDistance(manyAlpha,'') = 255, CONST);       //overflow
-    EXPORT Test19 := ASSERT(Str.EditDistance(alpha,digits) = 26, CONST);
-    EXPORT Test20 := ASSERT(Str.EditDistance(manyAlpha,digits) = 255, CONST);   //overflow
-    EXPORT Test21 := ASSERT(Str.EditDistance(manyAlpha,manyDigits) = 255, CONST);   //overflow
-    EXPORT Test22 := ASSERT(Str.EditDistance(alpha,manyDigits) = 250, CONST);
-    EXPORT Test23 := ASSERT(Str.EditDistance(alpha,manyDigits+'12345') = 255, CONST);
-    EXPORT Test24 := ASSERT(Str.EditDistance(alpha,manyDigits+'123456') = 255, CONST);
-    EXPORT Test25 := ASSERT(Str.EditDistance('123456789','987654321') = 8, CONST);
-    EXPORT Test26 := ASSERT(Str.EditDistance(largeAlpha,manyDigits) = 250, CONST);  //overflow
-    EXPORT Test27 := ASSERT(Str.EditDistance(largeAlpha+'abcdefghijklmnopqrst',manyDigits) = 254, CONST);
-    EXPORT Test28 := ASSERT(Str.EditDistance(largeAlpha+'abcdefghijklmnopqrstu',manyDigits) = 255, CONST);
-  END;
+  EXPORT TestConst := [
+    ASSERT(Str.EditDistance('','') = 0, CONST);
+    ASSERT(Str.EditDistance('','                ') = 0, CONST);
+    ASSERT(Str.EditDistance('                ','') = 0, CONST);
+    ASSERT(Str.EditDistance('a ','                ') = 1, CONST);
+    //ASSERT(Str.EditDistance(' a ','   ') = 1, CONST);
+    ASSERT(Str.EditDistance('Aprs  ','APp') = 3, CONST);
+    ASSERT(Str.EditDistance('abcd','acbd') = 2, CONST);
+    ASSERT(Str.EditDistance('abcd','abd') = 1, CONST);
+    ASSERT(Str.EditDistance('abcd','abc') = 1, CONST);
+    ASSERT(Str.EditDistance('abcd','bcd') = 1, CONST);
+    ASSERT(Str.EditDistance('abcd','abcde') = 1, CONST);
+    ASSERT(Str.EditDistance('abcd','aabcd') = 1, CONST);
+    ASSERT(Str.EditDistance('abcd',' abcd') = 1, CONST);
+    ASSERT(Str.EditDistance('abcd','a bcd') = 1, CONST);
+    ASSERT(Str.EditDistance('abcd','adcd') = 1, CONST);
+    ASSERT(Str.EditDistance('abcd','adca') = 2, CONST);
+    ASSERT(Str.EditDistance('gavin','aving') = 2, CONST);
+    ASSERT(Str.EditDistance('abcdefgh','cdefgha') = 3, CONST);
+    ASSERT(Str.EditDistance('abcdefgh','abcdfgha') = 2, CONST);
+    ASSERT(Str.EditDistance('abcd','') = 4, CONST);
+    ASSERT(Str.EditDistance(alpha,'') = 26, CONST);
+    ASSERT(Str.EditDistance(manyAlpha,'') = 255, CONST);       //overflow
+    ASSERT(Str.EditDistance(alpha,digits) = 26, CONST);
+    ASSERT(Str.EditDistance(manyAlpha,digits) = 255, CONST);   //overflow
+    ASSERT(Str.EditDistance(manyAlpha,manyDigits) = 255, CONST);   //overflow
+    ASSERT(Str.EditDistance(alpha,manyDigits) = 250, CONST);
+    ASSERT(Str.EditDistance(alpha,manyDigits+'12345') = 255, CONST);
+    ASSERT(Str.EditDistance(alpha,manyDigits+'123456') = 255, CONST);
+    ASSERT(Str.EditDistance('123456789','987654321') = 8, CONST);
+    ASSERT(Str.EditDistance(largeAlpha,manyDigits) = 250, CONST);  //overflow
+    ASSERT(Str.EditDistance(largeAlpha+'abcdefghijklmnopqrst',manyDigits) = 254, CONST);
+    ASSERT(Str.EditDistance(largeAlpha+'abcdefghijklmnopqrstu',manyDigits) = 255, CONST);
+
+    ASSERT(Str.EditDistance('','', 1) = 0, CONST);
+    ASSERT(Str.EditDistance('','                ', 1) = 0, CONST);
+    ASSERT(Str.EditDistance('                ','', 1) = 0, CONST);
+    ASSERT(Str.EditDistance('a ','                ', 1) = 1, CONST);
+    //ASSERT(Str.EditDistance(' a ','   ', 1) = 1, CONST);
+    ASSERT(Str.EditDistance('Aprs  ','APp', 1) > 1, CONST);
+    ASSERT(Str.EditDistance('abcd','acbd', 1) > 1, CONST);   // true distance is 2; above the radius only "> radius" is documented
+    ASSERT(Str.EditDistance('abcd','abd', 1) = 1, CONST);
+    ASSERT(Str.EditDistance('abcd','abc', 1) = 1, CONST);
+    ASSERT(Str.EditDistance('abcd','bcd', 1) = 1, CONST);
+    ASSERT(Str.EditDistance('abcd','abcde', 1) = 1, CONST);
+    ASSERT(Str.EditDistance('abcd','aabcd', 1) = 1, CONST);
+    ASSERT(Str.EditDistance('abcd',' abcd', 1) = 1, CONST);
+    ASSERT(Str.EditDistance('abcd','a bcd', 1) = 1, CONST);
+    ASSERT(Str.EditDistance('abcd','adcd', 1) = 1, CONST);
+    ASSERT(Str.EditDistance('abcd','adca', 1) > 1, CONST);
+    ASSERT(Str.EditDistance('gavin','aving', 1) > 1, CONST);
+    ASSERT(Str.EditDistance('abcdefgh','cdefgha', 1) > 1, CONST);
+    ASSERT(Str.EditDistance('abcdefgh','abcdfgha', 1) > 1, CONST);   // radius argument added so this line exercises the limited search
+    ASSERT(Str.EditDistance('abcd','', 1) > 1, CONST);
+    ASSERT(Str.EditDistance(alpha,'', 1) > 1, CONST);
+    ASSERT(Str.EditDistance(manyAlpha,'', 1) > 1, CONST);       //overflow
+    ASSERT(Str.EditDistance(alpha,digits, 1) > 1, CONST);
+    ASSERT(Str.EditDistance(manyAlpha,digits, 1) > 1, CONST);   //overflow
+    ASSERT(Str.EditDistance(manyAlpha,manyDigits, 1) > 1, CONST);   //overflow
+    ASSERT(Str.EditDistance(alpha,manyDigits, 1) > 1, CONST);
+    ASSERT(Str.EditDistance(alpha,manyDigits+'12345', 1) > 1, CONST);
+    ASSERT(Str.EditDistance(alpha,manyDigits+'123456', 1) > 1, CONST);
+    ASSERT(Str.EditDistance('123456789','987654321', 1) > 1, CONST);
+    ASSERT(Str.EditDistance(largeAlpha,manyDigits, 1) > 1, CONST);  //overflow
+    ASSERT(Str.EditDistance(largeAlpha+'abcdefghijklmnopqrst',manyDigits, 1) > 1, CONST);
+    ASSERT(Str.EditDistance(largeAlpha+'abcdefghijklmnopqrstu',manyDigits, 1) > 1, CONST);
+
+    EVALUATE('Done')
+  ];
 
 
 END;
 END;

+ 111 - 63
ecllibrary/teststd/uni/TestEditDistance.ecl

@@ -12,73 +12,121 @@ EXPORT TestEditDistance := MODULE
   UNICODE manyDigits := digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+
   UNICODE manyDigits := digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+
                        digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits;
                        digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits+digits;
 
 
-  EXPORT TestConst := MODULE
-    EXPORT Test01 := ASSERT(Uni.EditDistance(U'',U'') = 0, CONST);
-    EXPORT Test01a := ASSERT(Uni.EditDistance(U'',U'','en') = 0, CONST);
-    EXPORT Test02 := ASSERT(Uni.EditDistance(U'',U'                ') = 0, CONST);
-    EXPORT Test02a := ASSERT(Uni.EditDistance(U'',U'                ','en') = 0, CONST);
-    EXPORT Test03 := ASSERT(Uni.EditDistance(U'                ',U'') = 0, CONST);
-    EXPORT Test03a := ASSERT(Uni.EditDistance(U'                ',U'','en') = 0, CONST);
-    EXPORT Test04 := ASSERT(Uni.EditDistance(U'a ',U'                ') = 1, CONST);
-    EXPORT Test04a := ASSERT(Uni.EditDistance(U'a ',U'                ','en') = 1, CONST);
-    //EXPORT Test05 := ASSERT(Uni.EditDistance(U' a ',U'   ') = 1, CONST);
-    EXPORT Test06 := ASSERT(Uni.EditDistance(U'Aprs  ',U'APp') = 3, CONST);
-    EXPORT Test06a := ASSERT(Uni.EditDistance(U'Aprs  ',U'APp','en') = 3, CONST);
-    EXPORT Test07 := ASSERT(Uni.EditDistance(U'abcd',U'acbd') = 2, CONST);
-    EXPORT Test07a := ASSERT(Uni.EditDistance(U'abcd',U'acbd','en') = 2, CONST);
-    EXPORT Test08 := ASSERT(Uni.EditDistance(U'abcd',U'abd') = 1, CONST);
-    EXPORT Test08a := ASSERT(Uni.EditDistance(U'abcd',U'abd','en') = 1, CONST);
-    EXPORT Test09 := ASSERT(Uni.EditDistance(U'abcd',U'abc') = 1, CONST);
-    EXPORT Test09a := ASSERT(Uni.EditDistance(U'abcd',U'abc','en') = 1, CONST);
-    EXPORT Test10 := ASSERT(Uni.EditDistance(U'abcd',U'bcd') = 1, CONST);
-    EXPORT Test10a := ASSERT(Uni.EditDistance(U'abcd',U'bcd','en') = 1, CONST);
-    EXPORT Test11 := ASSERT(Uni.EditDistance(U'abcd',U'abcde') = 1, CONST);
-    EXPORT Test11a := ASSERT(Uni.EditDistance(U'abcd',U'abcde','en') = 1, CONST);
-    EXPORT Test12 := ASSERT(Uni.EditDistance(U'abcd',U'aabcd') = 1, CONST);
-    EXPORT Test12a := ASSERT(Uni.EditDistance(U'abcd',U'aabcd','en') = 1, CONST);
-    EXPORT Test13 := ASSERT(Uni.EditDistance(U'abcd',U' abcd') = 1, CONST);
-    EXPORT Test13a := ASSERT(Uni.EditDistance(U'abcd',U' abcd','en') = 1, CONST);
-    EXPORT Test14 := ASSERT(Uni.EditDistance(U'abcd',U'a bcd') = 1, CONST);
-    EXPORT Test14a := ASSERT(Uni.EditDistance(U'abcd',U'a bcd','en') = 1, CONST);
-    EXPORT Test15 := ASSERT(Uni.EditDistance(U'abcd',U'adcd') = 1, CONST);
-    EXPORT Test15a := ASSERT(Uni.EditDistance(U'abcd',U'adcd','en') = 1, CONST);
-    EXPORT Test16 := ASSERT(Uni.EditDistance(U'abcd',U'') = 4, CONST);
-    EXPORT Test16a := ASSERT(Uni.EditDistance(U'abcd',U'','en') = 4, CONST);
-    EXPORT Test17 := ASSERT(Uni.EditDistance(alpha,U'') = 26, CONST);
-    EXPORT Test17a := ASSERT(Uni.EditDistance(alpha,U'','en') = 26, CONST);
-    EXPORT Test18 := ASSERT(Uni.EditDistance(manyAlpha,U'') = 255, CONST);      //overflow
-    EXPORT Test18a := ASSERT(Uni.EditDistance(manyAlpha,U'','en') = 255, CONST);      //overflow
-    EXPORT Test19 := ASSERT(Uni.EditDistance(alpha,digits) = 26, CONST);
-    EXPORT Test19a := ASSERT(Uni.EditDistance(alpha,digits,'en') = 26, CONST);
-    EXPORT Test20 := ASSERT(Uni.EditDistance(manyAlpha,digits) = 255, CONST);   //overflow
-    EXPORT Test20a := ASSERT(Uni.EditDistance(manyAlpha,digits,'en') = 255, CONST);   //overflow
-    EXPORT Test21 := ASSERT(Uni.EditDistance(manyAlpha,manyDigits) = 255, CONST);   //overflow
-    EXPORT Test21a := ASSERT(Uni.EditDistance(manyAlpha,manyDigits,'en') = 255, CONST);   //overflow
-    EXPORT Test22 := ASSERT(Uni.EditDistance(alpha,manyDigits) = 250, CONST);
-    EXPORT Test22a := ASSERT(Uni.EditDistance(alpha,manyDigits,'en') = 250, CONST);
-    EXPORT Test23 := ASSERT(Uni.EditDistance(alpha,manyDigits+U'12345') = 255, CONST);
-    EXPORT Test23a := ASSERT(Uni.EditDistance(alpha,manyDigits+U'12345','en') = 255, CONST);
-    EXPORT Test24 := ASSERT(Uni.EditDistance(alpha,manyDigits+U'123456') = 255, CONST);
-    EXPORT Test24a := ASSERT(Uni.EditDistance(alpha,manyDigits+U'123456','en') = 255, CONST);
-    EXPORT Test25 := ASSERT(Uni.EditDistance(U'123456789',U'987654321') = 8, CONST);
-    EXPORT Test25a := ASSERT(Uni.EditDistance(U'123456789',U'987654321','en') = 8, CONST);
-    EXPORT Test26 := ASSERT(Uni.EditDistance(U'AVILÉS',U'AVILES') = 1, CONST);
-    EXPORT Test26a := ASSERT(Uni.EditDistance(U'AVILÉS',U'AVILES','en') = 1, CONST);
-    EXPORT Test27 := ASSERT(Uni.EditDistance(U'MOMBRU',U'MOMBRÚ') = 1, CONST);
-    EXPORT Test27a := ASSERT(Uni.EditDistance(U'MOMBRU',U'MOMBRÚ','en') = 1, CONST);
-    EXPORT Test28 := ASSERT(Uni.EditDistance(U'BLVAREZ',U'ÁLVAREZ') = 1, CONST);
-    EXPORT Test28a := ASSERT(Uni.EditDistance(U'BLVAREZ',U'ÁLVAREZ','en') = 1, CONST);
+  EXPORT TestConst := [
+    ASSERT(Uni.EditDistance(U'',U'') = 0, CONST);
+    ASSERT(Uni.EditDistance(U'',U'','en') = 0, CONST);
+    ASSERT(Uni.EditDistance(U'',U'                ') = 0, CONST);
+    ASSERT(Uni.EditDistance(U'',U'                ','en') = 0, CONST);
+    ASSERT(Uni.EditDistance(U'                ',U'') = 0, CONST);
+    ASSERT(Uni.EditDistance(U'                ',U'','en') = 0, CONST);
+    ASSERT(Uni.EditDistance(U'a ',U'                ') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'a ',U'                ','en') = 1, CONST);
+    //ASSERT(Uni.EditDistance(U' a ',U'   ') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'Aprs  ',U'APp') = 3, CONST);
+    ASSERT(Uni.EditDistance(U'Aprs  ',U'APp','en') = 3, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'acbd') = 2, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'acbd','en') = 2, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'abd') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'abd','en') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'abc') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'abc','en') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'bcd') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'bcd','en') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'abcde') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'abcde','en') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'aabcd') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'aabcd','en') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U' abcd') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U' abcd','en') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'a bcd') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'a bcd','en') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'adcd') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'adcd','en') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'') = 4, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'','en') = 4, CONST);
+    ASSERT(Uni.EditDistance(U'gavin',U'aving') = 2, CONST);
+    ASSERT(Uni.EditDistance(U'abcdefgh',U'cdefgha') = 3, CONST);
+    ASSERT(Uni.EditDistance(U'abcdefgh',U'abcdfgha') = 2, CONST);
+    ASSERT(Uni.EditDistance(alpha,U'') = 26, CONST);
+    ASSERT(Uni.EditDistance(alpha,U'','en') = 26, CONST);
+    ASSERT(Uni.EditDistance(manyAlpha,U'') = 255, CONST);      //overflow
+    ASSERT(Uni.EditDistance(manyAlpha,U'','en') = 255, CONST);      //overflow
+    ASSERT(Uni.EditDistance(alpha,digits) = 26, CONST);
+    ASSERT(Uni.EditDistance(alpha,digits,'en') = 26, CONST);
+    ASSERT(Uni.EditDistance(manyAlpha,digits) = 255, CONST);   //overflow
+    ASSERT(Uni.EditDistance(manyAlpha,digits,'en') = 255, CONST);   //overflow
+    ASSERT(Uni.EditDistance(manyAlpha,manyDigits) = 255, CONST);   //overflow
+    ASSERT(Uni.EditDistance(manyAlpha,manyDigits,'en') = 255, CONST);   //overflow
+    ASSERT(Uni.EditDistance(alpha,manyDigits) = 250, CONST);
+    ASSERT(Uni.EditDistance(alpha,manyDigits,'en') = 250, CONST);
+    ASSERT(Uni.EditDistance(alpha,manyDigits+U'12345') = 255, CONST);
+    ASSERT(Uni.EditDistance(alpha,manyDigits+U'12345','en') = 255, CONST);
+    ASSERT(Uni.EditDistance(alpha,manyDigits+U'123456') = 255, CONST);
+    ASSERT(Uni.EditDistance(alpha,manyDigits+U'123456','en') = 255, CONST);
+    ASSERT(Uni.EditDistance(U'123456789',U'987654321') = 8, CONST);
+    ASSERT(Uni.EditDistance(U'123456789',U'987654321','en') = 8, CONST);
+    ASSERT(Uni.EditDistance(U'AVILÉS',U'AVILES') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'AVILÉS',U'AVILES','en') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'MOMBRU',U'MOMBRÚ') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'MOMBRU',U'MOMBRÚ','en') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'BLVAREZ',U'ÁLVAREZ') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'BLVAREZ',U'ÁLVAREZ','en') = 1, CONST);
     // when character's encoding is from 0x00ffff - 0x10ffff range: 0x1D306 ; Description=TETRAGRAM FOR CENTER (Tai Xuan Jing Symbols)
     // when character's encoding is from 0x00ffff - 0x10ffff range: 0x1D306 ; Description=TETRAGRAM FOR CENTER (Tai Xuan Jing Symbols)
     // UTF-16 representation is xD834,xDF06 (2 16-bit surrogates)
     // UTF-16 representation is xD834,xDF06 (2 16-bit surrogates)
-    EXPORT Test29 := ASSERT(Uni.EditDistance(U'\uD834\uDF06XXX',U'XXXX') = 1, CONST);
-    EXPORT Test29a := ASSERT(Uni.EditDistance(U'\uD834\uDF06XXX',U'XXXX','en') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'\uD834\uDF06XXX',U'XXXX') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'\uD834\uDF06XXX',U'XXXX','en') = 1, CONST);
     // NFC (normalized form composed) for accented characters uses multiple 16-bit code units
     // NFC (normalized form composed) for accented characters uses multiple 16-bit code units
     // for example: Ḍ̛ is encoded as 0x1E0C,0x031B, and Ḍ̛̇ as 0x1E0C,0x031B,0x0307
     // for example: Ḍ̛ is encoded as 0x1E0C,0x031B, and Ḍ̛̇ as 0x1E0C,0x031B,0x0307
     // These are the cases where the fast function version (ToDo) does not work correctly, but this one does
     // These are the cases where the fast function version (ToDo) does not work correctly, but this one does
-    EXPORT Test30 := ASSERT(Uni.EditDistance(U'\u1E0C\u031BDDD',U'DDDD') = 2, CONST);
-    EXPORT Test30a := ASSERT(Uni.EditDistance(U'\u1E0C\u031BDDD',U'DDDD','en') = 1, CONST);
+    ASSERT(Uni.EditDistance(U'\u1E0C\u031BDDD',U'DDDD') = 2, CONST);
+    ASSERT(Uni.EditDistance(U'\u1E0C\u031BDDD',U'DDDD','en') = 1, CONST);
     // Lithuanian 'i dot acute' is encoded as 0069 0307 0301
     // Lithuanian 'i dot acute' is encoded as 0069 0307 0301
-    EXPORT Test31 := ASSERT(Uni.EditDistance(U'\u0069\u0307\u0301DDD',U'DDDD') = 3, CONST);
-    EXPORT Test31a := ASSERT(Uni.EditDistance(U'\u0069\u0307\u0301DDD',U'DDDD','lt') = 1, CONST);
-  END;
+    ASSERT(Uni.EditDistance(U'\u0069\u0307\u0301DDD',U'DDDD') = 3, CONST);
+    ASSERT(Uni.EditDistance(U'\u0069\u0307\u0301DDD',U'DDDD','lt') = 1, CONST);
+
+    ASSERT(Uni.EditDistance(U'',U'','en', 1) = 0, CONST);
+    ASSERT(Uni.EditDistance(U'',U'                ','en', 1) = 0, CONST);
+    ASSERT(Uni.EditDistance(U'                ',U'','en', 1) = 0, CONST);
+    ASSERT(Uni.EditDistance(U'a ',U'                ','en', 1) = 1, CONST);
+    ASSERT(Uni.EditDistance(U'Aprs  ',U'APp','en', 1) > 1, CONST);   // true distance is 3; above the radius only "> radius" is documented
+    ASSERT(Uni.EditDistance(U'abcd',U'acbd','en', 1) > 1, CONST);    // true distance is 2; same reasoning
+    ASSERT(Uni.EditDistance(U'abcd',U'abd','en', 1) = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'abc','en', 1) = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'bcd','en', 1) = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'abcde','en', 1) = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'aabcd','en', 1) = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U' abcd','en', 1) = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'a bcd','en', 1) = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'adcd','en', 1) = 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcd',U'','en', 1) > 1, CONST);   // true distance is 4; above the radius only "> radius" is documented
+    ASSERT(Uni.EditDistance(U'gavin',U'aving', 'en', 1) > 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcdefgh',U'cdefgha', 'en', 1) > 1, CONST);
+    ASSERT(Uni.EditDistance(U'abcdefgh',U'abcdfgha', 'en', 1) > 1, CONST);
+    ASSERT(Uni.EditDistance(alpha,U'','en', 1) > 1, CONST);
+    ASSERT(Uni.EditDistance(manyAlpha,U'','en', 1) > 1, CONST);      //overflow
+    ASSERT(Uni.EditDistance(alpha,digits,'en', 1) > 1, CONST);
+    ASSERT(Uni.EditDistance(manyAlpha,digits,'en', 1) > 1, CONST);   //overflow
+    ASSERT(Uni.EditDistance(manyAlpha,manyDigits,'en', 1) > 1, CONST);   //overflow
+    ASSERT(Uni.EditDistance(alpha,manyDigits,'en', 1) > 1, CONST);
+    ASSERT(Uni.EditDistance(alpha,manyDigits+U'12345','en', 1) > 1, CONST);
+    ASSERT(Uni.EditDistance(alpha,manyDigits+U'123456','en', 1) > 1, CONST);
+    ASSERT(Uni.EditDistance(U'123456789',U'987654321','en', 1) > 1, CONST);
+    ASSERT(Uni.EditDistance(U'123456789',U'987654321','en', 7) > 7, CONST);
+    ASSERT(Uni.EditDistance(U'123456789',U'987654321','en', 8) = 8, CONST);
+    ASSERT(Uni.EditDistance(U'AVILÉS',U'AVILES','en', 1) = 1, CONST);
+    ASSERT(Uni.EditDistance(U'MOMBRU',U'MOMBRÚ','en', 1) = 1, CONST);
+    ASSERT(Uni.EditDistance(U'BLVAREZ',U'ÁLVAREZ','en', 1) = 1, CONST);
+    // when character's encoding is from 0x00ffff - 0x10ffff range: 0x1D306 ; Description=TETRAGRAM FOR CENTER (Tai Xuan Jing Symbols)
+    // UTF-16 representation is xD834,xDF06 (2 16-bit surrogates)
+    ASSERT(Uni.EditDistance(U'\uD834\uDF06XXX',U'XXXX','en', 1) = 1, CONST);
+    // NFC (normalized form composed) for accented characters uses multiple 16-bit code units
+    // for example: Ḍ̛ is encoded as 0x1E0C,0x031B, and Ḍ̛̇ as 0x1E0C,0x031B,0x0307
+    // These are the cases where the fast function version (ToDo) does not work correctly, but this one does
+    ASSERT(Uni.EditDistance(U'\u1E0C\u031BDDD',U'DDDD','en', 1) = 1, CONST);
+    // Lithuanian 'i dot acute' is encoded as 0069 0307 0301
+    ASSERT(Uni.EditDistance(U'\u0069\u0307\u0301DDD',U'DDDD','lt', 1) = 1, CONST);
+
+    EVALUATE('Done')
+  ];
+
 END;
 END;

+ 9 - 0
plugins/stringlib/stringlib.cpp

@@ -76,6 +76,7 @@ static const char * EclDefinition =
 "  unsigned integer4 EditDistance(const string l, const string r) : c, time, pure,entrypoint='slEditDistanceV2'; \n"
 "  unsigned integer4 EditDistance(const string l, const string r) : c, time, pure,entrypoint='slEditDistanceV2'; \n"
 "  boolean EditDistanceWithinRadius(const string l, const string r, unsigned4 radius) : c,time,pure,entrypoint='slEditDistanceWithinRadiusV2'; \n"
 "  boolean EditDistanceWithinRadius(const string l, const string r, unsigned4 radius) : c,time,pure,entrypoint='slEditDistanceWithinRadiusV2'; \n"
 "  unsigned integer4 EditDistanceV2(const string l, const string r) : c,time,pure,entrypoint='slEditDistanceV2'; \n"
 "  unsigned integer4 EditDistanceV2(const string l, const string r) : c,time,pure,entrypoint='slEditDistanceV2'; \n"
+"  unsigned integer4 EditDistanceV3(const string l, const string r, unsigned4 radius) : c,time,pure,entrypoint='slEditDistanceV3'; \n"
 "  boolean EditDistanceWithinRadiusV2(const string l, const string r, unsigned4 radius) : c,time,pure,entrypoint='slEditDistanceWithinRadiusV2'; \n"
 "  boolean EditDistanceWithinRadiusV2(const string l, const string r, unsigned4 radius) : c,time,pure,entrypoint='slEditDistanceWithinRadiusV2'; \n"
 "  string StringGetNthWord(const string src, unsigned4 n) : c, pure,entrypoint='slStringGetNthWord'; \n"
 "  string StringGetNthWord(const string src, unsigned4 n) : c, pure,entrypoint='slStringGetNthWord'; \n"
 "  string StringExcludeLastWord(const string src) : c, pure,entrypoint='slStringExcludeLastWord'; \n"
 "  string StringExcludeLastWord(const string src) : c, pure,entrypoint='slStringExcludeLastWord'; \n"
@@ -952,6 +953,14 @@ STRINGLIB_API unsigned STRINGLIB_CALL slEditDistanceV2(unsigned leftLen, const c
 }
 }
 
 
 
 
+STRINGLIB_API unsigned STRINGLIB_CALL slEditDistanceV3(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius) // Edit distance between left and right with an optional search limit.
+{
+    if (radius == 0) // 0 selects the plain search, which takes no radius argument
+        return nsStringlib::editDistance(leftLen, left, rightLen, right);
+    else // non-zero: pass the caller's radius through to the bounded helper and return its result unchanged
+        return nsStringlib::editDistanceWithinRadius(leftLen, left, rightLen, right, radius);
+}
+
 STRINGLIB_API bool STRINGLIB_CALL slEditDistanceWithinRadiusV2(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius)
 STRINGLIB_API bool STRINGLIB_CALL slEditDistanceWithinRadiusV2(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius)
 {
 {
     return nsStringlib::editDistanceWithinRadius(leftLen, left, rightLen, right, radius) <= radius;
     return nsStringlib::editDistanceWithinRadius(leftLen, left, rightLen, right, radius) <= radius;

+ 1 - 0
plugins/stringlib/stringlib.hpp

@@ -76,6 +76,7 @@ STRINGLIB_API bool STRINGLIB_CALL slStringWildExactMatch(unsigned srcLen, const
 STRINGLIB_API bool STRINGLIB_CALL slStringContains(unsigned srcLen, const char * src, unsigned sampleLen, const char * sample, bool noCase);
 STRINGLIB_API bool STRINGLIB_CALL slStringContains(unsigned srcLen, const char * src, unsigned sampleLen, const char * sample, bool noCase);
 STRINGLIB_API void STRINGLIB_CALL slStringExtractMultiple(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src, unsigned __int64 mask);
 STRINGLIB_API void STRINGLIB_CALL slStringExtractMultiple(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src, unsigned __int64 mask);
 STRINGLIB_API unsigned STRINGLIB_CALL slEditDistanceV2(unsigned leftLen, const char * left, unsigned rightLen, const char * right);
 STRINGLIB_API unsigned STRINGLIB_CALL slEditDistanceV2(unsigned leftLen, const char * left, unsigned rightLen, const char * right);
+STRINGLIB_API unsigned STRINGLIB_CALL slEditDistanceV3(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius);
 STRINGLIB_API bool STRINGLIB_CALL slEditDistanceWithinRadiusV2(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius);
 STRINGLIB_API bool STRINGLIB_CALL slEditDistanceWithinRadiusV2(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius);
 STRINGLIB_API void STRINGLIB_CALL slStringGetNthWord(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src, unsigned n);
 STRINGLIB_API void STRINGLIB_CALL slStringGetNthWord(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src, unsigned n);
 STRINGLIB_API unsigned STRINGLIB_CALL slStringWordCount(unsigned srcLen, const char * src);
 STRINGLIB_API unsigned STRINGLIB_CALL slStringWordCount(unsigned srcLen, const char * src);

+ 11 - 2
plugins/unicodelib/unicodelib.cpp

@@ -96,6 +96,7 @@ static const char * EclDefinition =
 "  boolean UnicodeWildMatch(const unicode src, const unicode _pattern, boolean _noCase) : c, pure,entrypoint='ulUnicodeWildMatch', hole; \n"
 "  boolean UnicodeWildMatch(const unicode src, const unicode _pattern, boolean _noCase) : c, pure,entrypoint='ulUnicodeWildMatch', hole; \n"
 "  boolean UnicodeContains(const unicode src, const unicode _pattern, boolean _noCase) : c, pure,entrypoint='ulUnicodeContains', hole; \n"
 "  boolean UnicodeContains(const unicode src, const unicode _pattern, boolean _noCase) : c, pure,entrypoint='ulUnicodeContains', hole; \n"
 "  unsigned4 UnicodeLocaleEditDistance(const unicode left, const unicode right, const varstring localename) : c,time,pure,entrypoint='ulUnicodeLocaleEditDistance', hole; \n"
 "  unsigned4 UnicodeLocaleEditDistance(const unicode left, const unicode right, const varstring localename) : c,time,pure,entrypoint='ulUnicodeLocaleEditDistance', hole; \n"
+"  unsigned4 UnicodeLocaleEditDistanceV2(const unicode left, const unicode right, const varstring localename, unsigned4 radius) : c,time,pure,entrypoint='ulUnicodeLocaleEditDistanceV2', hole; \n"
 "  boolean UnicodeLocaleEditDistanceWithinRadius(const unicode left, const unicode right, unsigned4 radius,  const varstring localename) : c,time,pure,entrypoint='ulUnicodeLocaleEditDistanceWithinRadius', hole; \n"
 "  boolean UnicodeLocaleEditDistanceWithinRadius(const unicode left, const unicode right, unsigned4 radius,  const varstring localename) : c,time,pure,entrypoint='ulUnicodeLocaleEditDistanceWithinRadius', hole; \n"
 "  unsigned4 UnicodeLocaleWordCount(const unicode text, const varstring localename) : c, pure,entrypoint='ulUnicodeLocaleWordCount', hole; \n"
 "  unsigned4 UnicodeLocaleWordCount(const unicode text, const varstring localename) : c, pure,entrypoint='ulUnicodeLocaleWordCount', hole; \n"
 "  unicode UnicodeLocaleGetNthWord(const unicode text, unsigned4 n, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleGetNthWord';\n"
 "  unicode UnicodeLocaleGetNthWord(const unicode text, unsigned4 n, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleGetNthWord';\n"
@@ -1751,6 +1752,12 @@ UNICODELIB_API void UNICODELIB_CALL ulUnicodeCleanAccents(unsigned & tgtLen, UCh
 
 
 UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleEditDistance(unsigned leftLen, UChar const * left, unsigned rightLen, UChar const * right, char const * localename)
 UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleEditDistance(unsigned leftLen, UChar const * left, unsigned rightLen, UChar const * right, char const * localename)
 {
 {
+    return ulUnicodeLocaleEditDistanceV2(leftLen, left, rightLen, right, localename, 0);
+}
+
+
+UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleEditDistanceV2(unsigned leftLen, UChar const * left, unsigned rightLen, UChar const * right, char const * localename, unsigned radius)
+{
     BreakIterator* bi = 0;
     BreakIterator* bi = 0;
     if (localename && *localename)
     if (localename && *localename)
     {
     {
@@ -1762,8 +1769,10 @@ UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleEditDistance(unsigned lef
     UnicodeString uLeft(false, left, leftLen); // Readonly-aliasing UChar* constructor.
     UnicodeString uLeft(false, left, leftLen); // Readonly-aliasing UChar* constructor.
     UnicodeString uRight(false, right, rightLen);
     UnicodeString uRight(false, right, rightLen);
 
 
-    unsigned distance = nsUnicodelib::unicodeEditDistanceV4(uLeft, uRight, 254, bi);
-    return distance;
+    if (radius == 0)
+        return nsUnicodelib::unicodeEditDistanceV4(uLeft, uRight, 254, bi);
+    else
+        return nsUnicodelib::unicodeEditDistanceV4(uLeft, uRight, radius, bi);
 }
 }
 
 
 
 

+ 1 - 0
plugins/unicodelib/unicodelib.hpp

@@ -98,6 +98,7 @@ UNICODELIB_API void UNICODELIB_CALL ulUnicodeCleanSpaces80(UChar * tgt, unsigned
 UNICODELIB_API bool UNICODELIB_CALL ulUnicodeWildMatch(unsigned srcLen, UChar const * src, unsigned patLen, UChar const * pat, bool noCase);
 UNICODELIB_API bool UNICODELIB_CALL ulUnicodeWildMatch(unsigned srcLen, UChar const * src, unsigned patLen, UChar const * pat, bool noCase);
 UNICODELIB_API bool UNICODELIB_CALL ulUnicodeContains(unsigned srcLen, UChar const * src, unsigned patLen, UChar const * pat, bool noCase);
 UNICODELIB_API bool UNICODELIB_CALL ulUnicodeContains(unsigned srcLen, UChar const * src, unsigned patLen, UChar const * pat, bool noCase);
 UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleEditDistance(unsigned leftLen, UChar const * left, unsigned rightLen, UChar const * right,char const * localename);
 UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleEditDistance(unsigned leftLen, UChar const * left, unsigned rightLen, UChar const * right,char const * localename);
+UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleEditDistanceV2(unsigned leftLen, UChar const * left, unsigned rightLen, UChar const * right,char const * localename, unsigned radius);
 UNICODELIB_API bool UNICODELIB_CALL ulUnicodeLocaleEditDistanceWithinRadius(unsigned leftLen, UChar const * left, unsigned rightLen, UChar const * right, unsigned radius,char const * localename);
 UNICODELIB_API bool UNICODELIB_CALL ulUnicodeLocaleEditDistanceWithinRadius(unsigned leftLen, UChar const * left, unsigned rightLen, UChar const * right, unsigned radius,char const * localename);
 UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleWordCount(unsigned textLen, UChar const * text,char const * localename);
 UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleWordCount(unsigned textLen, UChar const * text,char const * localename);
 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleGetNthWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned n, char const * localename);
 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleGetNthWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned n, char const * localename);