7 years ago · b34d649f43
--- a/ecllibrary/std/Uni.ecl
+++ b/ecllibrary/std/Uni.ecl
@@ -517,4 +517,17 @@ EXPORT unsigned4 FindCount(unicode src, unicode hit, string form) :=
 
																 EXPORT unsigned4 CountWords(unicode src, unicode delim, boolean allowBlankItems = FALSE) :=
															
 
																     lib_unicodelib.UnicodeLib.UnicodeLocaleCountWords(src, delim, allowBlankItems);
															
 
																+/**
																+ * Splits a unicode string into its delimited words and returns them as a set of unicode strings.
																+ * Words are separated by one or more occurrences of the delimiter string.  No spaces are stripped
																+ * from either the source or the delimiter before matching.
																+ *
																+ * @param src               The string being split.
																+ * @param delim             The string used to separate words.
																+ * @param allowBlankItems   Indicates if empty/blank items are included in the result.  Defaults to FALSE.
																+ * @return                  A SET OF UNICODE whose members are the delimited words, in source order.
																+ */
																+
																+EXPORT SET OF UNICODE SplitWords(unicode src, unicode delim, boolean allowBlankItems = FALSE) :=
																+    lib_unicodelib.UnicodeLib.UnicodeLocaleSplitWords(src, delim, allowBlankItems);
															
 
																+
															
 
																 END;
															
--- a/ecllibrary/teststd/uni/TestCountWords.ecl
+++ b/ecllibrary/teststd/uni/TestCountWords.ecl
@@ -9,7 +9,7 @@ EXPORT TestCountWords := MODULE
 
																   EXPORT TestConst := MODULE
															
 
																     //Check action on strings with no entries: empty source string, search string, or return string.
															
 
																     EXPORT Test01 := ASSERT(Uni.CountWords(U'', U'') = 0);
															
 
																-    EXPORT Test01b := ASSERT(Uni.CountWords(U'abcde', U'') = 0);
															
 
																+    EXPORT Test01b := ASSERT(Uni.CountWords(U'abcde', U'') = 1);
															
 
																     EXPORT Test01c := ASSERT(Uni.CountWords(U'', U'abc', TRUE) = 0);
															
 
																     EXPORT Test02 := ASSERT(Uni.CountWords(U'x', U'x') = 0);
															
 
																     EXPORT Test03 := ASSERT(Uni.CountWords(U'x', U' ') = 1);
															
--- a/ecllibrary/teststd/uni/TestSplitWords.ecl
+++ b/ecllibrary/teststd/uni/TestSplitWords.ecl
@@ -0,0 +1,72 @@
 
																+/*##############################################################################
															
 
																+## HPCC SYSTEMS software Copyright (C) 2017 HPCC Systems®.  All rights reserved.
															
 
																+############################################################################## */
															
 
																+
															
 
																+IMPORT Std.Uni;
															
 
																+
															
 
																+EXPORT TestSplitWords := MODULE
															
 
																+  EXPORT TestRuntime := MODULE
															
 
																+    //Check action when the source string, the delimiter, or the resulting set is empty.
															
 
																+    EXPORT Test01 := ASSERT(Uni.SplitWords(U'', U'') = []);
															
 
																+    EXPORT Test01b := ASSERT(Uni.SplitWords(U'abcde', U'') = [U'abcde']);
															
 
																+    EXPORT Test01c := ASSERT(Uni.SplitWords(U'', U'abc', TRUE) = []);
															
 
																+    EXPORT Test02 := ASSERT(Uni.SplitWords(U'x', U'x') = []);
															
 
																+    EXPORT Test03 := ASSERT(Uni.SplitWords(U'x', U' ') = [U'x']);
															
 
																+    EXPORT Test04 := ASSERT(Uni.SplitWords(U' ', U' ') = []);
															
 
																+    EXPORT Test05 := ASSERT(Uni.SplitWords(U'  ', U' ') = []);
															
 
																+    EXPORT Test06 := ASSERT(Uni.SplitWords(U'x ', U' ') = [U'x']);
															
 
																+    EXPORT Test07 := ASSERT(Uni.SplitWords(U' x', U' ') = [U'x']);
															
 
																+    EXPORT Test08 := ASSERT(Uni.SplitWords(U' x ', U' ') = [U'x']);
															
 
																+    EXPORT Test09 := ASSERT(Uni.SplitWords(U' abc def ', U' ') = [U'abc',U'def']);
															
 
																+    EXPORT Test10 := ASSERT(Uni.SplitWords(U' abc   def ', U' ') = [U'abc',U'def']);
															
 
																+    EXPORT Test11 := ASSERT(Uni.SplitWords(U' a b c   def ', U' ') = [U'a', U'b', U'c',U'def']);
															
 
																+    EXPORT Test12 := ASSERT(Uni.SplitWords(U' abc   def', U' ') = [U'abc',U'def']);
															
 
																+    EXPORT Test13 := ASSERT(Uni.SplitWords(U'$', U'$$') = [U'$']);
															
 
																+    EXPORT Test14 := ASSERT(Uni.SplitWords(U'$x', U'$$') = [U'$x']);
															
 
																+    EXPORT Test15 := ASSERT(Uni.SplitWords(U'$$', U'$$') = []);
															
 
																+    EXPORT Test16 := ASSERT(Uni.SplitWords(U'$$$', U'$$') = [U'$']);
															
 
																+    EXPORT Test17 := ASSERT(Uni.SplitWords(U'$$$$', U'$$') = []);
															
 
																+    EXPORT Test18 := ASSERT(Uni.SplitWords(U'$$x$$', U'$$') = [U'x']);
															
 
																+    EXPORT Test19 := ASSERT(Uni.SplitWords(U'$$x$$y', U'$$') = [U'x',U'y']);
															
 
																+    EXPORT Test21 := ASSERT(Uni.SplitWords(U'a,c,d', U',', TRUE) = [U'a',U'c',U'd']);
															
 
																+    EXPORT Test21a := ASSERT(Uni.SplitWords(U'a,c,d', U',', FALSE) = [U'a',U'c',U'd']);
															
 
																+    EXPORT Test22 := ASSERT(Uni.SplitWords(U'a,,d', U',', TRUE) = [U'a',U'',U'd']);
															
 
																+    EXPORT Test22a := ASSERT(Uni.SplitWords(U'a,,d', U',', FALSE) = [U'a',U'd']);
															
 
																+    EXPORT Test23 := ASSERT(Uni.SplitWords(U',,,', U',', TRUE) = [U'',U'',U'',U'']);
															
 
																+    EXPORT Test23a := ASSERT(Uni.SplitWords(U',,,', U',', FALSE) = []);
															
 
																+    EXPORT Test24 := ASSERT(Uni.SplitWords(U' \377ABCDEF FEDCBA ', U' ') = [U'\377ABCDEF',U'FEDCBA']);
															
 
																+    //Check action on a string containing punctuation characters, split on a multi-character delimiter.
															
 
																+    EXPORT Test25 := ASSERT(Uni.SplitWords(U' ,&%$@ ',U'%$') = [U' ,&',U'@ ']);
															
 
																+    //Check action on a string containing an apostrophe, using the apostrophe as the delimiter.
															
 
																+    EXPORT Test26 := ASSERT(Uni.SplitWords(U'I couldn\'t hear you!',U'\'') = [U'I couldn',U't hear you!']);
															
 
																+    //Check action on a string mixing digits with other characters, using a digit as the delimiter.
															
 
																+    EXPORT Test27 := ASSERT(Uni.SplitWords(U'1 234 123abc 23.6 abc123',U'2') = [U'1 ',U'34 1',U'3abc ',U'3.6 abc1',U'3']);
															
 
																+    //Check that other whitespace characters (< 0x20) are treated as ordinary characters, not as the space delimiter.
															
 
																+    EXPORT Test28 := ASSERT(Uni.SplitWords(U'an\nt\tdef',U' ') = [U'an\nt\tdef']);
															
 
																+    EXPORT Test29 := ASSERT(Uni.SplitWords(U'  a n\nt \t  def    ',U't') = [U'  a n\n',U' \t  def    ']);
															
 
																+    //Check action on a string containing Latin diacritical marks.
															
 
																+    EXPORT Test30 := ASSERT(Uni.SplitWords(U'À à',U'À') = [U' à']);
															
 
																+    EXPORT Test31 := ASSERT(Uni.SplitWords(U'ȭ š',U'ȭ') = [U' š']);
															
 
																+    //Check action on a string containing Spanish words with Latin accents, split on a whole word.
															
 
																+    //Translation: "The deceased changed the girls"
															
 
																+    EXPORT Test32 := ASSERT(Uni.SplitWords(U'El difunto cambió las niñas',U'cambió') = [U'El difunto ',U' las niñas']);
															
 
																+    //Check action on a string containing Chinese characters.
															
 
																+    //Translation: "I am a computer"
															
 
																+    EXPORT Test33 := ASSERT(Uni.SplitWords(U'我是電腦',U'是') = [U'我',U'電腦']);
															
 
																+    //Check action on a string containing Modern Greek characters.
															
 
																+    //Translation: "Do you come here often?"
															
 
																+    EXPORT Test34 := ASSERT(Uni.SplitWords(U' Έρχεσαι συχνά εδώ; ',U'χ') = [U' Έρ',U'εσαι συ',U'νά εδώ; ']);
															
 
																+    //Test cases 35 and 36 exercise bidirectional (right-to-left) scripts: Arabic and Hebrew.
															
 
																+    //Check action on Arabic lettering with accent marks. Bidirectional.
															
 
																+    //Translation: "Good morning"
															
 
																+    EXPORT Test35 := ASSERT(Uni.SplitWords(U'صباح الخير',U'ا') = [U'صب',U'ح ',U'لخير']);
															
 
																+    //Check action on Hebrew lettering with accent marks (called pointing). Bidirectional.
															
 
																+    //Translation: (not a phrase, 2 different words separated by a space)
															
 
																+    EXPORT Test36 := ASSERT(Uni.SplitWords(U'קָמָץ שִׁי״ן',U'קָ') = [U'מָץ שִׁי״ן']);
															
 
																+    //Check action on characters outside the Basic Multilingual Plane (encoded as surrogate pairs in UTF-16).
															
 
																+    EXPORT Test37 := ASSERT(Uni.SplitWords(U'x𐐀x𐐀',U'𐐀') = [U'x',U'x']);
															
 
																+    EXPORT Test38 := ASSERT(Uni.SplitWords(U'𐐀',U'𐐀') = []);
															
 
																+    EXPORT Test39 := ASSERT(Uni.SplitWords(U'x',U'𐐀') = [U'x']);
															
 
																+    EXPORT Test40 := ASSERT(Uni.SplitWords(U'𐐀xx𐐀𐐀',U'x') = [U'𐐀',U'𐐀𐐀']);
															
 
																+  END;
															
 
																+END;
															
--- a/plugins/unicodelib/unicodelib.cpp
+++ b/plugins/unicodelib/unicodelib.cpp
@@ -94,7 +94,8 @@ static const char * EclDefinition =
 
																 "  unicode UnicodeLocaleRemoveSuffix(const unicode src, const unicode suff, const string form) :c,pure,entrypoint='ulUnicodeLocaleRemoveSuffix';\n"
															
 
																 "  unicode UnicodeLocaleRepeat(const unicode src, unsigned4 n) : c, pure,entrypoint='ulUnicodeLocaleRepeat'; \n"
															
 
																 "  unsigned4 UnicodeLocaleFindCount(const unicode src, const unicode hit, const string form) :c,pure,entrypoint='ulUnicodeLocaleFindCount';\n"
															
 
																-"  unsigned4 UnicodeLocaleCountWords(const unicode src, const unicode delim, boolean allowBlankItems) : c,pure,entrypoint='ulUnicodeLocaleCountWords', hole;\n"
															
 
																+"  unsigned4 UnicodeLocaleCountWords(const unicode src, const unicode delim, boolean allowBlankItems) : c,pure,entrypoint='ulUnicodeLocaleCountWords';\n"
															
 
																+"  SET OF UNICODE UnicodeLocaleSplitWords(const unicode src, const unicode delim, boolean allowBlankItems) : c,pure,entrypoint='ulUnicodeLocaleSplitWords';\n"
															
 
																 "END;\n";
															
 
																 static const char * compatibleVersions[] = {
															
@@ -954,12 +955,12 @@ unsigned findCount(UnicodeString const & source, UnicodeString const & seek)
 
																 unsigned countDelimitedWords(UnicodeString const & source, unsigned delimLen, UChar const * delim, bool allowBlankItems)
															
 
																 {
															
 
																     UnicodeString const delimiter(delim, delimLen);
															
 
																-    if (source.isEmpty() || delimiter.isEmpty())
															
 
																+    if (source.isEmpty())
															
 
																         return 0;
															
 
																     int32_t sourceLength = source.countChar32();
															
 
																     int32_t delimiterLength = delimiter.countChar32();
															
 
																-    if (sourceLength < delimiterLength)
															
 
																+    if ((sourceLength < delimiterLength) || (delimLen == 0))
															
 
																         return 1;
															
 
																     bool startedWord = false;
															
@@ -1006,6 +1007,62 @@ unsigned countDelimitedWords(UnicodeString const & source, unsigned delimLen, UC
 
																     return wordCount;
															
 
																 }
															
 
																+/**
																+ * Appends one element to the serialized result: the element length (as an unsigned value)
																+ * followed by that many UChar code units copied out of source.
																+ *
																+ * @param result    Buffer the element is appended to.
																+ * @param source    String the code units are copied from.
																+ * @param from      Code-unit index of the first unit to copy.
																+ * @param length    Number of code units to copy (may be 0 for a blank item).
																+ */
																+static void appendUnicode(MemoryBuffer & result, const UnicodeString & source, int32_t from, int32_t length)
																+{
																+    const int32_t limit = from + length;
																+
																+    // Length prefix first, then room for the characters themselves.
																+    result.append((unsigned)length);
																+    void * slot = result.reserve(length * sizeof(UChar));
																+    source.extractBetween(from, limit, (UChar *)slot, 0);
																+}
															
 
																+
															
 
																+/**
																+ * Splits source into the words separated by the delimiter and appends each word to result
																+ * via appendUnicode.
																+ *
																+ *  - An empty source appends nothing.
																+ *  - An empty delimiter, or a delimiter with more code points than source, appends the
																+ *    whole of source as a single word.
																+ *  - Zero-length words (adjacent, leading or trailing delimiters) are only appended when
																+ *    allowBlankItems is true.
																+ *
																+ * @param result            Buffer receiving the words.
																+ * @param source            The string being split.
																+ * @param delimLen          Length of delim in UChar code units.
																+ * @param delim             The delimiter string.
																+ * @param allowBlankItems   Whether zero-length words are emitted.
																+ */
																+void splitWords(MemoryBuffer & result, const UnicodeString & source, unsigned delimLen, UChar const * delim, bool allowBlankItems)
																+{
																+    if (source.isEmpty())
																+        return;
																+
																+    const UnicodeString delimiter(delim, delimLen);
																+    const int32_t sourcePoints = source.countChar32();
																+    const int32_t delimPoints = delimiter.countChar32();
																+    const int32_t sourceUnits = source.length();
																+    if ((delimLen == 0) || (sourcePoints < delimPoints))
																+    {
																+        // Nothing can match: the whole source is the only word.
																+        appendUnicode(result, source, 0, sourceUnits);
																+        return;
																+    }
																+
																+    const int32_t lastStart = sourceUnits - delimPoints;
																+    int32_t wordStart = 0;
																+    int32_t pos = 0;
																+    StringCharacterIterator cursor(source);
																+    while (pos <= lastStart)
																+    {
																+        // Number of code points to advance: one, unless a delimiter is consumed.
																+        int32_t step = 1;
																+        if (source.char32At(pos) == delimiter.char32At(0))
																+        {
																+            const int32_t matchEnd = source.moveIndex32(pos, delimPoints);
																+            if (source.compareCodePointOrder(pos, matchEnd - pos, delimiter) == 0)
																+            {
																+                if (allowBlankItems || (wordStart != pos))
																+                    appendUnicode(result, source, wordStart, pos - wordStart);
																+
																+                wordStart = matchEnd;
																+                step = delimPoints;
																+            }
																+        }
																+        pos = cursor.move32(step, CharacterIterator::kCurrent);
																+    }
																+
																+    /* The iterator advances by code points but reports its position in code units, so the
																+     * tail of the string is measured with source.length() rather than the code point count.
																+     */
																+    if (allowBlankItems || (wordStart != pos) || (pos != sourceUnits))
																+        appendUnicode(result, source, wordStart, sourceUnits - wordStart);
																+}
															
 
																+
															
 
																 void excludeNthWord(RuleBasedBreakIterator& bi, UnicodeString & source, unsigned n)
															
 
																 {
															
@@ -1905,3 +1962,13 @@ UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleCountWords(unsigned srcLe
 
																     UnicodeString const processed(src, srcLen);
															
 
																     return countDelimitedWords(processed, delimLen, delim, allowBlankItems);
															
 
																 }
															
 
																+
															
 
																+/**
																+ * Plugin entry point for UnicodeLocaleSplitWords: splits src on delim and returns the
																+ * words through the (isAllResult, lenResult, result) output parameters.
																+ *
																+ * @param isAllResult       Always set to false.
																+ * @param lenResult         Receives the byte length of the buffer built by splitWords.
																+ * @param result            Receives the buffer detached from the local MemoryBuffer.
																+ * @param srcLen            Length of src in UChar code units.
																+ * @param src               The string being split.
																+ * @param delimLen          Length of delim in UChar code units.
																+ * @param delim             The delimiter string.
																+ * @param allowBlankItems   Whether zero-length words are included.
																+ */
																+UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleSplitWords(bool & isAllResult,size32_t & lenResult,void * & result, unsigned srcLen, UChar const * src, unsigned delimLen, UChar const * delim, bool allowBlankItems)
																+{
																+    MemoryBuffer words;
																+    const UnicodeString text(src, srcLen);
																+    splitWords(words, text, delimLen, delim, allowBlankItems);
																+
																+    // Read the length before detach(); presumably detach() leaves the buffer empty - TODO confirm.
																+    lenResult = words.length();
																+    result = words.detach();
																+    isAllResult = false;
																+}
															
--- a/plugins/unicodelib/unicodelib.hpp
+++ b/plugins/unicodelib/unicodelib.hpp
@@ -111,6 +111,7 @@ UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleRemoveSuffix(unsigned & tgtLe
 
																 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleRepeat(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned n);
															
 
																 UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleFindCount(unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned formLen, char const * form);
															
 
																 UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleCountWords(unsigned srcLen, UChar const * src, unsigned delimLen, UChar const * delim, bool allowBlankItems);
															
 
																+UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleSplitWords(bool & isAllResult,size32_t & lenResult,void * & result, unsigned srcLen, UChar const * src, unsigned delimLen, UChar const * delim, bool allowBlankItems);
															
 
																 }
															
 
																 #endif