Browse Source

HPCC-18044 Unicode Implementation for SplitWords

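- Adds testcases to ecllibrary/teststd/uni/TestSplitWords.ecl
- Adds documentation to ecllibrary/std/Uni.ecl
- Adds code to plugins/unicodelib/unicodelib.cpp and .hpp
- Changes CountWords to return 1 (previously 0) for a non-empty source with an empty delimiter, and updates TestCountWords.ecl to match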

Signed-off-by: Gavin Halliday <gavin.halliday@lexisnexis.com>
David Skaff 7 years ago
parent
commit
b34d649f43

+ 13 - 0
ecllibrary/std/Uni.ecl

@@ -517,4 +517,17 @@ EXPORT unsigned4 FindCount(unicode src, unicode hit, string form) :=
 EXPORT unsigned4 CountWords(unicode src, unicode delim, boolean allowBlankItems = FALSE) :=
 EXPORT unsigned4 CountWords(unicode src, unicode delim, boolean allowBlankItems = FALSE) :=
     lib_unicodelib.UnicodeLib.UnicodeLocaleCountWords(src, delim, allowBlankItems);
     lib_unicodelib.UnicodeLib.UnicodeLocaleCountWords(src, delim, allowBlankItems);
 
 
+/**
+ * Splits a unicode string into the words separated by a delimiter string and returns them as a set of unicode
+ * strings. No spaces are stripped from either string before matching. An empty source yields an empty set; an
+ * empty delimiter yields the whole source as a single word. Unless allowBlankItems is TRUE, adjacent delimiters
+ * are treated as a single separator, so no empty items are returned.
+ *
+ * @param src               The string being split.
+ * @param delim             The string used to separate words.
+ * @param allowBlankItems   Indicates if empty/blank string items are included in the results. Defaults to FALSE.
+ * @return                  A set of unicode strings whose members are the delimited words.
+ */
+
+EXPORT SplitWords(unicode src, unicode delim, boolean allowBlankItems = FALSE) :=
+    lib_unicodelib.UnicodeLib.UnicodeLocaleSplitWords(src, delim, allowBlankItems);
+
 END;
 END;

+ 1 - 1
ecllibrary/teststd/uni/TestCountWords.ecl

@@ -9,7 +9,7 @@ EXPORT TestCountWords := MODULE
   EXPORT TestConst := MODULE
   EXPORT TestConst := MODULE
     //Check action on strings with no entries: empty source string, search string, or return string.
     //Check action on strings with no entries: empty source string, search string, or return string.
     EXPORT Test01 := ASSERT(Uni.CountWords(U'', U'') = 0);
     EXPORT Test01 := ASSERT(Uni.CountWords(U'', U'') = 0);
-    EXPORT Test01b := ASSERT(Uni.CountWords(U'abcde', U'') = 0);
+    EXPORT Test01b := ASSERT(Uni.CountWords(U'abcde', U'') = 1);
     EXPORT Test01c := ASSERT(Uni.CountWords(U'', U'abc', TRUE) = 0);
     EXPORT Test01c := ASSERT(Uni.CountWords(U'', U'abc', TRUE) = 0);
     EXPORT Test02 := ASSERT(Uni.CountWords(U'x', U'x') = 0);
     EXPORT Test02 := ASSERT(Uni.CountWords(U'x', U'x') = 0);
     EXPORT Test03 := ASSERT(Uni.CountWords(U'x', U' ') = 1);
     EXPORT Test03 := ASSERT(Uni.CountWords(U'x', U' ') = 1);

+ 72 - 0
ecllibrary/teststd/uni/TestSplitWords.ecl

@@ -0,0 +1,72 @@
+/*##############################################################################
+## HPCC SYSTEMS software Copyright (C) 2017 HPCC Systems®.  All rights reserved.
+############################################################################## */
+
+IMPORT Std.Uni;
+
+EXPORT TestSplitWords := MODULE
+  EXPORT TestRuntime := MODULE
+    //Check action on strings with no entries: empty source string, empty delimiter, or empty result.
+    EXPORT Test01 := ASSERT(Uni.SplitWords(U'', U'') = []);
+    EXPORT Test01b := ASSERT(Uni.SplitWords(U'abcde', U'') = [U'abcde']);
+    EXPORT Test01c := ASSERT(Uni.SplitWords(U'', U'abc', TRUE) = []);
+    EXPORT Test02 := ASSERT(Uni.SplitWords(U'x', U'x') = []);
+    EXPORT Test03 := ASSERT(Uni.SplitWords(U'x', U' ') = [U'x']);
+    EXPORT Test04 := ASSERT(Uni.SplitWords(U' ', U' ') = []);
+    EXPORT Test05 := ASSERT(Uni.SplitWords(U'  ', U' ') = []);
+    EXPORT Test06 := ASSERT(Uni.SplitWords(U'x ', U' ') = [U'x']);
+    EXPORT Test07 := ASSERT(Uni.SplitWords(U' x', U' ') = [U'x']);
+    EXPORT Test08 := ASSERT(Uni.SplitWords(U' x ', U' ') = [U'x']);
+    EXPORT Test09 := ASSERT(Uni.SplitWords(U' abc def ', U' ') = [U'abc',U'def']);
+    EXPORT Test10 := ASSERT(Uni.SplitWords(U' abc   def ', U' ') = [U'abc',U'def']);
+    EXPORT Test11 := ASSERT(Uni.SplitWords(U' a b c   def ', U' ') = [U'a', U'b', U'c',U'def']);
+    EXPORT Test12 := ASSERT(Uni.SplitWords(U' abc   def', U' ') = [U'abc',U'def']);
+    EXPORT Test13 := ASSERT(Uni.SplitWords(U'$', U'$$') = [U'$']);
+    EXPORT Test14 := ASSERT(Uni.SplitWords(U'$x', U'$$') = [U'$x']);
+    EXPORT Test15 := ASSERT(Uni.SplitWords(U'$$', U'$$') = []);
+    EXPORT Test16 := ASSERT(Uni.SplitWords(U'$$$', U'$$') = [U'$']);
+    EXPORT Test17 := ASSERT(Uni.SplitWords(U'$$$$', U'$$') = []);
+    EXPORT Test18 := ASSERT(Uni.SplitWords(U'$$x$$', U'$$') = [U'x']);
+    EXPORT Test19 := ASSERT(Uni.SplitWords(U'$$x$$y', U'$$') = [U'x',U'y']);
+    EXPORT Test21 := ASSERT(Uni.SplitWords(U'a,c,d', U',', TRUE) = [U'a',U'c',U'd']);
+    EXPORT Test21a := ASSERT(Uni.SplitWords(U'a,c,d', U',', FALSE) = [U'a',U'c',U'd']);
+    EXPORT Test22 := ASSERT(Uni.SplitWords(U'a,,d', U',', TRUE) = [U'a',U'',U'd']);
+    EXPORT Test22a := ASSERT(Uni.SplitWords(U'a,,d', U',', FALSE) = [U'a',U'd']);
+    EXPORT Test23 := ASSERT(Uni.SplitWords(U',,,', U',', TRUE) = [U'',U'',U'',U'']);
+    EXPORT Test23a := ASSERT(Uni.SplitWords(U',,,', U',', FALSE) = []);
+    EXPORT Test24 := ASSERT(Uni.SplitWords(U' \377ABCDEF FEDCBA ', U' ') = [U'\377ABCDEF',U'FEDCBA']);
+    //Check action on a string containing punctuation characters.
+    EXPORT Test25 := ASSERT(Uni.SplitWords(U' ,&%$@ ',U'%$') = [U' ,&',U'@ ']);
+    //Check action on a string containing an apostrophe.
+    EXPORT Test26 := ASSERT(Uni.SplitWords(U'I couldn\'t hear you!',U'\'') = [U'I couldn',U't hear you!']);
+    //Check action on a string containing different variations/combinations of numbers and other characters.
+    EXPORT Test27 := ASSERT(Uni.SplitWords(U'1 234 123abc 23.6 abc123',U'2') = [U'1 ',U'34 1',U'3abc ',U'3.6 abc1',U'3']);
+    //Test other space characters (< 0x20).
+    EXPORT Test28 := ASSERT(Uni.SplitWords(U'an\nt\tdef',U' ') = [U'an\nt\tdef']);
+    EXPORT Test29 := ASSERT(Uni.SplitWords(U'  a n\nt \t  def    ',U't') = [U'  a n\n',U' \t  def    ']);
+    //Check action on a string containing latin diacritical marks.
+    EXPORT Test30 := ASSERT(Uni.SplitWords(U'À à',U'À') = [U' à']);
+    EXPORT Test31 := ASSERT(Uni.SplitWords(U'ȭ š',U'ȭ') = [U' š']);
+    //Check action on a string containing Spanish words with latin accents.
+    //Translation: "The deceased changed the girls"
+    EXPORT Test32 := ASSERT(Uni.SplitWords(U'El difunto cambió las niñas',U'cambió') = [U'El difunto ',U' las niñas']);
+    //Check action on a string containing Chinese characters.
+    //Translation: "I am a computer"
+    EXPORT Test33 := ASSERT(Uni.SplitWords(U'我是電腦',U'是') = [U'我',U'電腦']);
+    //Check action on a string containing Modern Greek characters.
+    //Translation: "Do you come here often?"
+    EXPORT Test34 := ASSERT(Uni.SplitWords(U' Έρχεσαι συχνά εδώ; ',U'χ') = [U' Έρ',U'εσαι συ',U'νά εδώ; ']);
+    //Testcases 35 and 36 test for bidirectional capabilities with scripts in Arabic and Hebrew.
+    //Check action on Arabic lettering with accent marks. Bidirectional.
+    //Translation: "Good morning"
+    EXPORT Test35 := ASSERT(Uni.SplitWords(U'صباح الخير',U'ا') = [U'صب',U'ح ',U'لخير']);
+    //Check action on Hebrew lettering with accent marks (called pointing). Bidirectional.
+    //Translation: (not a phrase, 2 different words separated by a space)
+    EXPORT Test36 := ASSERT(Uni.SplitWords(U'קָמָץ שִׁי״ן',U'קָ') = [U'מָץ שִׁי״ן']);
+    //Check action on surrogate pairs.
+    EXPORT Test37 := ASSERT(Uni.SplitWords(U'x𐐀x𐐀',U'𐐀') = [U'x',U'x']);
+    EXPORT Test38 := ASSERT(Uni.SplitWords(U'𐐀',U'𐐀') = []);
+    EXPORT Test39 := ASSERT(Uni.SplitWords(U'x',U'𐐀') = [U'x']);
+    EXPORT Test40 := ASSERT(Uni.SplitWords(U'𐐀xx𐐀𐐀',U'x') = [U'𐐀',U'𐐀𐐀']);
+  END;
+END;

+ 70 - 3
plugins/unicodelib/unicodelib.cpp

@@ -94,7 +94,8 @@ static const char * EclDefinition =
 "  unicode UnicodeLocaleRemoveSuffix(const unicode src, const unicode suff, const string form) :c,pure,entrypoint='ulUnicodeLocaleRemoveSuffix';\n"
 "  unicode UnicodeLocaleRemoveSuffix(const unicode src, const unicode suff, const string form) :c,pure,entrypoint='ulUnicodeLocaleRemoveSuffix';\n"
 "  unicode UnicodeLocaleRepeat(const unicode src, unsigned4 n) : c, pure,entrypoint='ulUnicodeLocaleRepeat'; \n"
 "  unicode UnicodeLocaleRepeat(const unicode src, unsigned4 n) : c, pure,entrypoint='ulUnicodeLocaleRepeat'; \n"
 "  unsigned4 UnicodeLocaleFindCount(const unicode src, const unicode hit, const string form) :c,pure,entrypoint='ulUnicodeLocaleFindCount';\n"
 "  unsigned4 UnicodeLocaleFindCount(const unicode src, const unicode hit, const string form) :c,pure,entrypoint='ulUnicodeLocaleFindCount';\n"
-"  unsigned4 UnicodeLocaleCountWords(const unicode src, const unicode delim, boolean allowBlankItems) : c,pure,entrypoint='ulUnicodeLocaleCountWords', hole;\n"
+"  unsigned4 UnicodeLocaleCountWords(const unicode src, const unicode delim, boolean allowBlankItems) : c,pure,entrypoint='ulUnicodeLocaleCountWords';\n"
+"  SET OF UNICODE UnicodeLocaleSplitWords(const unicode src, const unicode delim, boolean allowBlankItems) : c,pure,entrypoint='ulUnicodeLocaleSplitWords';\n"
 "END;\n";
 "END;\n";
 
 
 static const char * compatibleVersions[] = {
 static const char * compatibleVersions[] = {
@@ -954,12 +955,12 @@ unsigned findCount(UnicodeString const & source, UnicodeString const & seek)
 unsigned countDelimitedWords(UnicodeString const & source, unsigned delimLen, UChar const * delim, bool allowBlankItems)
 unsigned countDelimitedWords(UnicodeString const & source, unsigned delimLen, UChar const * delim, bool allowBlankItems)
 {
 {
     UnicodeString const delimiter(delim, delimLen);
     UnicodeString const delimiter(delim, delimLen);
-    if (source.isEmpty() || delimiter.isEmpty())
+    if (source.isEmpty())
         return 0;
         return 0;
 
 
     int32_t sourceLength = source.countChar32();
     int32_t sourceLength = source.countChar32();
     int32_t delimiterLength = delimiter.countChar32();
     int32_t delimiterLength = delimiter.countChar32();
-    if (sourceLength < delimiterLength)
+    if ((sourceLength < delimiterLength) || (delimLen == 0))
         return 1;
         return 1;
 
 
     bool startedWord = false;
     bool startedWord = false;
@@ -1006,6 +1007,62 @@ unsigned countDelimitedWords(UnicodeString const & source, unsigned delimLen, UC
     return wordCount;
     return wordCount;
 }
 }
 
 
+static void appendUnicode(MemoryBuffer & result, const UnicodeString & source, int32_t from, int32_t length)
+{   /* Appends one item to result: the item length as an unsigned value, followed by the
+     * characters of source in the range [from, from+length).
+     * NOTE(review): from/length are presumably offsets in UTF-16 code units, as the callers
+     * pass source.length() and iterator indexes - confirm against the ICU UnicodeString API.
+     */
+    result.append((unsigned)length);  // length prefix goes in front of the character data
+    UChar * target = (UChar *)result.reserve(length * sizeof(UChar));  // room for the characters, filled in below
+    source.extractBetween(from, from+length, target, 0);  // copy the characters into the reserved space
+}
+
+void splitWords(MemoryBuffer & result, const UnicodeString & source, unsigned delimLen, UChar const * delim, bool allowBlankItems)
+{   /* Splits source on every occurrence of the delimiter and appends each word to result via
+     * appendUnicode(). An empty source appends nothing. If the delimiter is empty, or has more
+     * code points than the source, the whole source is appended as a single word. Empty words
+     * (between adjacent delimiters, or at the start or end of the source) are appended only
+     * when allowBlankItems is true.
+     */
+    if (source.isEmpty())
+        return;
+
+    const UnicodeString delimiter(delim, delimLen);
+    int32_t sourceLength = source.countChar32();  // both lengths are counted in code points
+    int32_t delimiterLength = delimiter.countChar32();
+    if ((sourceLength < delimiterLength) || (delimLen == 0))
+    {
+        appendUnicode(result, source, 0, source.length());  // no split possible: the source is the only word
+        return;
+    }
+
+    int32_t startWord = 0;  // index where the word currently being collected begins
+    int32_t idx = 0;  // current scan position; kept equal to the iterator's position
+    int32_t max = source.length() - delimiterLength;  // NOTE(review): mixes source.length() with a code-point count; looks harmless because each candidate is re-checked below - confirm
+    StringCharacterIterator it(source);
+    while (idx <= max)
+    {
+        if (source.char32At(idx) == delimiter.char32At(0))  // first code point matches: check the whole delimiter
+        {
+            int32_t endPos = source.moveIndex32(idx, delimiterLength);  // index delimiterLength code points past idx
+            if (source.compareCodePointOrder(idx, endPos - idx, delimiter) == 0)
+            {
+                if ((startWord != idx)|| allowBlankItems)  // an empty word in front of the delimiter is kept only if blanks are allowed
+                    appendUnicode(result, source, startWord, idx - startWord);
+
+                startWord = endPos;  // the next word starts straight after the delimiter
+                idx = it.move32(delimiterLength, CharacterIterator::kCurrent);  // step over the whole delimiter
+            }
+            else
+            {
+                idx = it.move32(1, CharacterIterator::kCurrent);
+            }
+        }
+        else
+        {
+            idx = it.move32(1, CharacterIterator::kCurrent);  // advance by one code point
+        }
+    }
+
+    /* source.length() is used instead of sourceLength because the iterator's index is measured in code units,
+     * even though it advances by code points. Append whatever follows the last delimiter; an empty tail is
+     * only appended when blank items are allowed.
+     */
+    if ((startWord != idx) || (idx != source.length()) || allowBlankItems)
+        appendUnicode(result, source, startWord, source.length() - startWord);
+}
+
 
 
 void excludeNthWord(RuleBasedBreakIterator& bi, UnicodeString & source, unsigned n)
 void excludeNthWord(RuleBasedBreakIterator& bi, UnicodeString & source, unsigned n)
 {
 {
@@ -1905,3 +1962,13 @@ UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleCountWords(unsigned srcLe
     UnicodeString const processed(src, srcLen);
     UnicodeString const processed(src, srcLen);
     return countDelimitedWords(processed, delimLen, delim, allowBlankItems);
     return countDelimitedWords(processed, delimLen, delim, allowBlankItems);
 }
 }
+
+UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleSplitWords(bool & isAllResult,size32_t & lenResult,void * & result, unsigned srcLen, UChar const * src, unsigned delimLen, UChar const * delim, bool allowBlankItems)
+{   /* Entry point behind the ECL UnicodeLocaleSplitWords declaration: splits src on delim with
+     * splitWords() and returns the words through isAllResult, lenResult and result.
+     */
+    const UnicodeString source(src, srcLen);
+    MemoryBuffer out;
+    splitWords(out, source, delimLen, delim, allowBlankItems);
+    isAllResult = false;
+    lenResult = out.length();  // size of the buffer built by splitWords()
+    result = out.detach();  // NOTE(review): presumably hands ownership of the buffer to the caller - confirm against MemoryBuffer::detach
+}

+ 1 - 0
plugins/unicodelib/unicodelib.hpp

@@ -111,6 +111,7 @@ UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleRemoveSuffix(unsigned & tgtLe
 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleRepeat(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned n);
 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleRepeat(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned n);
 UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleFindCount(unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned formLen, char const * form);
 UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleFindCount(unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned formLen, char const * form);
 UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleCountWords(unsigned srcLen, UChar const * src, unsigned delimLen, UChar const * delim, bool allowBlankItems);
 UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleCountWords(unsigned srcLen, UChar const * src, unsigned delimLen, UChar const * delim, bool allowBlankItems);
+UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleSplitWords(bool & isAllResult,size32_t & lenResult,void * & result, unsigned srcLen, UChar const * src, unsigned delimLen, UChar const * delim, bool allowBlankItems);
 }
 }
 
 
 #endif
 #endif