浏览代码

Merge pull request #11222 from ghalliday/issue18043

HPCC-18043 Unicode Implementation for CountWords

Reviewed-By: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 7 年之前
父节点
当前提交
87e014cd52

+ 13 - 0
ecllibrary/std/Uni.ecl

@@ -504,4 +504,17 @@ EXPORT Repeat(unicode src, unsigned4 n) :=
 EXPORT unsigned4 FindCount(unicode src, unicode hit, string form) :=
     lib_unicodelib.UnicodeLib.UnicodeLocaleFindCount(src, hit, form);
 
+/**
+ * Returns the number of words in the source string, where words are separated by one or more occurrences of
+ * the delimiter string.  Neither string has spaces stripped before matching.  When allowBlankItems is FALSE
+ * (the default) leading, trailing and consecutive delimiters do not produce empty words; when it is TRUE the
+ * empty items between delimiters are counted as well.  The result is 0 if either src or delim is empty.
+ *
+ * @param src               The string being searched in.
+ * @param delim             The string used to separate words.
+ * @param allowBlankItems   Indicates if empty/blank string items are included in the count.
+ * @return                  The number of delimited tokens in the source string.
+ */
+
+EXPORT unsigned4 CountWords(unicode src, unicode delim, boolean allowBlankItems = FALSE) :=
+    lib_unicodelib.UnicodeLib.UnicodeLocaleCountWords(src, delim, allowBlankItems);
+
 END;

+ 75 - 0
ecllibrary/teststd/uni/TestCountWords.ecl

@@ -0,0 +1,75 @@
+/*##############################################################################
+## HPCC SYSTEMS software Copyright (C) 2017 HPCC Systems®.  All rights reserved.
+############################################################################## */
+
+IMPORT Std.Uni;
+
+EXPORT TestCountWords := MODULE  //Tests for Uni.CountWords: each ASSERT pins the expected count for one source/delimiter pair.
+
+  EXPORT TestConst := MODULE
+    //Check action when the source string, the delimiter, or the resulting list of words is empty.
+    EXPORT Test01 := ASSERT(Uni.CountWords(U'', U'') = 0);
+    EXPORT Test01b := ASSERT(Uni.CountWords(U'abcde', U'') = 0);  //An empty delimiter gives 0.
+    EXPORT Test01c := ASSERT(Uni.CountWords(U'', U'abc', TRUE) = 0);  //An empty source gives 0 even when blank items are allowed.
+    EXPORT Test02 := ASSERT(Uni.CountWords(U'x', U'x') = 0);
+    EXPORT Test03 := ASSERT(Uni.CountWords(U'x', U' ') = 1);
+    EXPORT Test04 := ASSERT(Uni.CountWords(U' ', U' ') = 0);
+    EXPORT Test05 := ASSERT(Uni.CountWords(U'  ', U' ') = 0);
+    EXPORT Test06 := ASSERT(Uni.CountWords(U'x ', U' ') = 1);
+    EXPORT Test07 := ASSERT(Uni.CountWords(U' x', U' ') = 1);
+    EXPORT Test08 := ASSERT(Uni.CountWords(U' x ', U' ') = 1);
+    EXPORT Test09 := ASSERT(Uni.CountWords(U' abc def ', U' ') = 2);
+    EXPORT Test10 := ASSERT(Uni.CountWords(U' abc   def ', U' ') = 2);
+    EXPORT Test11 := ASSERT(Uni.CountWords(U' a b c   def ', U' ') = 4);
+    EXPORT Test12 := ASSERT(Uni.CountWords(U' abc   def', U' ') = 2);
+    EXPORT Test13 := ASSERT(Uni.CountWords(U'$', U'$$') = 1);  //A source shorter than the delimiter is a single word.
+    EXPORT Test14 := ASSERT(Uni.CountWords(U'$x', U'$$') = 1);
+    EXPORT Test15 := ASSERT(Uni.CountWords(U'$$', U'$$') = 0);
+    EXPORT Test16 := ASSERT(Uni.CountWords(U'$$$', U'$$') = 1);  //The trailing '$' left after one delimiter is a word.
+    EXPORT Test17 := ASSERT(Uni.CountWords(U'$$$$', U'$$') = 0);
+    EXPORT Test18 := ASSERT(Uni.CountWords(U'$$x$$', U'$$') = 1);
+    EXPORT Test19 := ASSERT(Uni.CountWords(U'$$x$$y', U'$$') = 2);
+    EXPORT Test20 := ASSERT(Uni.CountWords(U'$$x$$xy', U'$$') = 2);
+    EXPORT Test21 := ASSERT(Uni.CountWords(U'a,c,d', U',', TRUE) = 3);
+    EXPORT Test21a := ASSERT(Uni.CountWords(U'a,c,d', U',', FALSE) = 3);
+    EXPORT Test22 := ASSERT(Uni.CountWords(U'a,,d', U',', TRUE) = 3);  //The blank item between the commas is counted.
+    EXPORT Test22a := ASSERT(Uni.CountWords(U'a,,d', U',', FALSE) = 2);
+    EXPORT Test23 := ASSERT(Uni.CountWords(U',,,', U',', TRUE) = 4);
+    EXPORT Test23a := ASSERT(Uni.CountWords(U',,,', U',', FALSE) = 0);
+    EXPORT Test24 := ASSERT(Uni.CountWords(U' \377ABCDEF FEDCBA ', U' ') = 2);
+    //Check action on a string containing punctuation characters.
+    EXPORT Test25 := ASSERT(Uni.CountWords(U' ,&%$@ ',U'%$') = 2);
+    //Check action on a string containing an apostrophe.
+    EXPORT Test26 := ASSERT(Uni.CountWords(U'I couldn\'t hear you!',U'\'') = 2);
+    //Check action on a string containing different variations/combinations of numbers and other characters.
+    EXPORT Test27 := ASSERT(Uni.CountWords(U'1 234 123abc 23.6 abc123',U'2') = 5);
+    //Test other space characters (< 0x20): they only separate words when they are the delimiter.
+    EXPORT Test28 := ASSERT(Uni.CountWords(U'an\nt\tdef',U' ') = 1);
+    EXPORT Test29 := ASSERT(Uni.CountWords(U'  a n\nt \t  def    ',U't') = 2);
+    //Check action on a string containing Latin diacritical marks.
+    EXPORT Test30 := ASSERT(Uni.CountWords(U'À à',U'À') = 1);
+    EXPORT Test31 := ASSERT(Uni.CountWords(U'ȭ š',U'ȭ') = 1);
+    //Check action on a string containing Spanish words with Latin accents.
+    //Translation: "The deceased changed the girls"
+    EXPORT Test32 := ASSERT(Uni.CountWords(U'El difunto cambió las niñas',U'cambió') = 2);
+    //Check action on a string containing Chinese characters.
+    //Translation: "I am a computer"
+    EXPORT Test33 := ASSERT(Uni.CountWords(U'我是電腦',U'是') = 2);
+    //Check action on a string containing Modern Greek characters.
+    //Translation: "Do you come here often?"
+    EXPORT Test34 := ASSERT(Uni.CountWords(U' Έρχεσαι συχνά εδώ; ',U'χ') = 3);
+    //Test cases 35 and 36 exercise bidirectional (right-to-left) scripts: Arabic and Hebrew.
+    //Check action on Arabic lettering with accent marks. Bidirectional.
+    //Translation: "Good morning"
+    EXPORT Test35 := ASSERT(Uni.CountWords(U'صباح الخير',U'ا') = 3);
+    //Check action on Hebrew lettering with accent marks (called pointing). Bidirectional.
+    //Translation: (not a phrase, 2 different words separated by a space)
+    EXPORT Test36 := ASSERT(Uni.CountWords(U'קָמָץ שִׁי״ן',U'קָ') = 1);
+    //Check action on surrogate pairs (characters that occupy two UTF-16 code units).
+    EXPORT Test37 := ASSERT(Uni.CountWords(U'x𐐀x𐐀',U'𐐀') = 2);
+    EXPORT Test38 := ASSERT(Uni.CountWords(U'𐐀',U'𐐀') = 0);
+    EXPORT Test39 := ASSERT(Uni.CountWords(U'x',U'𐐀') = 1);
+    EXPORT Test40 := ASSERT(Uni.CountWords(U'𐐀xx𐐀𐐀',U'x') = 2);
+  END;
+
+END;

+ 63 - 0
plugins/unicodelib/unicodelib.cpp

@@ -94,6 +94,7 @@ static const char * EclDefinition =
 "  unicode UnicodeLocaleRemoveSuffix(const unicode src, const unicode suff, const string form) :c,pure,entrypoint='ulUnicodeLocaleRemoveSuffix';\n"
 "  unicode UnicodeLocaleRepeat(const unicode src, unsigned4 n) : c, pure,entrypoint='ulUnicodeLocaleRepeat'; \n"
 "  unsigned4 UnicodeLocaleFindCount(const unicode src, const unicode hit, const string form) :c,pure,entrypoint='ulUnicodeLocaleFindCount';\n"
+"  unsigned4 UnicodeLocaleCountWords(const unicode src, const unicode delim, boolean allowBlankItems) : c,pure,entrypoint='ulUnicodeLocaleCountWords', hole;\n"
 "END;\n";
 
 static const char * compatibleVersions[] = {
@@ -950,6 +951,62 @@ unsigned findCount(UnicodeString const & source, UnicodeString const & seek)
     return matches;
 }
 
+/*
+ * Counts the words in source that are separated by the delimiter held in delim/delimLen.
+ *
+ * Returns 0 when either the source or the delimiter is empty, and 1 when the source has fewer code points
+ * than the delimiter.  Otherwise the source is scanned from the start: a run of non-delimiter characters is
+ * one word, and when allowBlankItems is true every delimiter occurrence closes an item even if that item is
+ * empty, with one further item counted at the end of the source.
+ */
+unsigned countDelimitedWords(UnicodeString const & source, unsigned delimLen, UChar const * delim, bool allowBlankItems)
+{
+    UnicodeString const delimiter(delim, delimLen);
+    if (source.isEmpty() || delimiter.isEmpty())
+        return 0;
+
+    //A source with fewer code points than the delimiter is reported as a single word
+    int32_t const delimCodePoints = delimiter.countChar32();
+    if (source.countChar32() < delimCodePoints)
+        return 1;
+
+    //Offsets below are in code units, although the iterator is advanced a code point at a time
+    int32_t const sourceUnits = source.length();
+    int32_t const lastStart = sourceUnits - delimiter.length();
+    UChar32 const firstDelimChar = delimiter.char32At(0);
+    StringCharacterIterator cursor(source);
+    int32_t pos = 0;
+    int32_t words = 0;
+    bool inWord = false;
+    while (pos <= lastStart)
+    {
+        //Only do the full comparison when the first code point of the delimiter matches
+        bool atDelimiter = false;
+        if (cursor.current32() == firstDelimChar)
+        {
+            int32_t const matchEnd = source.moveIndex32(pos, delimCodePoints);
+            atDelimiter = (source.compareCodePointOrder(pos, matchEnd - pos, delimiter) == 0);
+        }
+
+        if (!atDelimiter)
+        {
+            //Ordinary character: it belongs to the current word
+            inWord = true;
+            pos = cursor.move32(1, CharacterIterator::kCurrent);
+            continue;
+        }
+
+        //Delimiter found: close the current word (or blank item) and skip over the delimiter
+        if (inWord || allowBlankItems)
+        {
+            words++;
+            inWord = false;
+        }
+        pos = cursor.move32(delimCodePoints, CharacterIterator::kCurrent);
+    }
+
+    /*The code unit length of the source is compared here (not its code point count) because the
+     *iterator position is a code unit offset.  Characters left after the last possible delimiter
+     *start form a final word.
+     */
+    if (inWord || (pos != sourceUnits) || allowBlankItems)
+        words++;
+
+    return words;
+}
+
+
 void excludeNthWord(RuleBasedBreakIterator& bi, UnicodeString & source, unsigned n)
 {
     bi.setText(source);
@@ -1842,3 +1899,9 @@ UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleFindCount(unsigned srcLen
 
     return findCount(source, sought);
 }
+
+/*
+ * Exported entry point: wraps the src buffer in a UnicodeString and returns the number of words in it
+ * that are separated by delim, as computed by countDelimitedWords.
+ */
+UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleCountWords(unsigned srcLen, UChar const * src, unsigned delimLen, UChar const * delim, bool allowBlankItems)
+{
+    return countDelimitedWords(UnicodeString(src, srcLen), delimLen, delim, allowBlankItems);
+}

+ 1 - 0
plugins/unicodelib/unicodelib.hpp

@@ -110,6 +110,7 @@ UNICODELIB_API void UNICODELIB_CALL ulUnicodeVersion(unsigned & tgtLen, char * &
 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleRemoveSuffix(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned suffLen, UChar const * suff, unsigned formLen, char const * form);
 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleRepeat(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned n);
 UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleFindCount(unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned formLen, char const * form);
+UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleCountWords(unsigned srcLen, UChar const * src, unsigned delimLen, UChar const * delim, bool allowBlankItems);
 }
 
 #endif