Browse Source

HPCC-18040 Provides unicode implementations for excludeNthWord

- Adds interface definition to ecllibrary/std/Uni.ecl
- Adds test cases to mirror string version but with Unicode capability
- Adds code that executes excludeNthWord with unicode capabilities

Signed-off-by: David Skaff <David.Skaff@lexisnexisrisk.com>
David Skaff 8 years ago
parent
commit
5089651b2d

+ 17 - 1
ecllibrary/std/Uni.ecl

@@ -1,4 +1,4 @@
-/*##############################################################################
+/*##############################################################################
 ## HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.  All rights reserved.
 ############################################################################## */
 
@@ -382,4 +382,20 @@ EXPORT unsigned4 WordCount(unicode text, varstring localename = '') :=
 EXPORT unicode GetNthWord(unicode text, unsigned4 n, varstring localename = '') :=
     lib_unicodelib.UnicodeLib.UnicodeLocaleGetNthWord(text, n, localename);
 
+/**
+ * Returns the source string with its nth word removed. Words are delimited using the unicode break semantics.
+ * Words are counted from 1; passing 0 never matches a word.
+ * Trailing whitespace (and other non-word characters such as punctuation) that follows the word is always
+ * removed with it.
+ * Leading whitespace is only removed with the word if the nth word is the first word.
+ * Returns a blank string if there are no words in the source string.
+ * Returns the source string unchanged if the number of words in the string is less than n.
+ *
+ * @param text          The string to be broken into words.
+ * @param n             Which word should be removed from the string.
+ * @param localename    The locale to use for the break semantics.  Defaults to ''.
+ * @return              The string excluding the nth word.
+ */
+EXPORT unicode ExcludeNthWord(unicode text, unsigned4 n, varstring localename = '') :=
+    lib_unicodelib.UnicodeLib.UnicodeLocaleExcludeNthWord(text, n, localename);
+
 END;

+ 90 - 0
ecllibrary/teststd/uni/TestExcludeNthWord.ecl

@@ -0,0 +1,90 @@
+/*##############################################################################
+     HPCC SYSTEMS software Copyright (C) 2017 HPCC Systems®.
+ 
+     Licensed under the Apache License, Version 2.0 (the "License");
+     you may not use this file except in compliance with the License.
+     You may obtain a copy of the License at
+ 
+        http://www.apache.org/licenses/LICENSE-2.0
+ 
+     Unless required by applicable law or agreed to in writing, software
+     distributed under the License is distributed on an "AS IS" BASIS,
+     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     See the License for the specific language governing permissions and
+     limitations under the License.
+ ##############################################################################*/
+ 
+ IMPORT Std.Uni;
+ 
+ // Tests for Uni.ExcludeNthWord.
+ // Every assertion appends '!' to the result before comparing it with the expected value,
+ // presumably so that trailing whitespace left in the result takes part in the comparison.
+ EXPORT TestExcludeNthWord := MODULE
+ 
+   EXPORT TestConst := MODULE
+    //Check action on a string with no entries.
+    //NOTE(review): n is declared unsigned4, so the -1 cases presumably exercise a very large n - confirm.
+    EXPORT Test01 := ASSERT(Uni.ExcludeNthWord('',0)+'!' = '!', CONST);
+    EXPORT Test02 := ASSERT(Uni.ExcludeNthWord('',1)+'!' = '!', CONST);
+    EXPORT Test03 := ASSERT(Uni.ExcludeNthWord('',-1)+'!' = '!', CONST);
+    EXPORT Test04 := ASSERT(Uni.ExcludeNthWord('             ',0)+'!' = '!', CONST);
+    EXPORT Test05 := ASSERT(Uni.ExcludeNthWord('             ',1)+'!' = '!', CONST);
+    EXPORT Test06 := ASSERT(Uni.ExcludeNthWord('             ',-1)+'!' = '!', CONST);
+    //Check action on a string containing a single word - with various whitespace
+    EXPORT Test07 := ASSERT(Uni.ExcludeNthWord('x',0)+'!' = 'x!');
+    EXPORT Test08 := ASSERT(Uni.ExcludeNthWord('x',1)+'!' = '!');
+    EXPORT Test09 := ASSERT(Uni.ExcludeNthWord('x',2)+'!' = 'x!');
+    EXPORT Test10 := ASSERT(Uni.ExcludeNthWord('x',3)+'!' = 'x!');
+    EXPORT Test11 := ASSERT(Uni.ExcludeNthWord(' x',1)+'!' = '!');
+    EXPORT Test12 := ASSERT(Uni.ExcludeNthWord('x ',1)+'!' = '!');
+    EXPORT Test13 := ASSERT(Uni.ExcludeNthWord(' x',2)+'!' = ' x!');
+    EXPORT Test14 := ASSERT(Uni.ExcludeNthWord(' x ',1)+'!' = '!');
+    //Check action on a string containing multiple words - with various whitespace combinations.
+    EXPORT Test15 := ASSERT(Uni.ExcludeNthWord(' abc def ', 1)+'!' = 'def !');
+    EXPORT Test16 := ASSERT(Uni.ExcludeNthWord(' abc def ', 2)+'!' = ' abc !');
+    EXPORT Test17 := ASSERT(Uni.ExcludeNthWord('  a b c   def    ',0)+'!' = '  a b c   def    !');
+    EXPORT Test18 := ASSERT(Uni.ExcludeNthWord('  a b c   def    ',1)+'!' = 'b c   def    !');
+    EXPORT Test19 := ASSERT(Uni.ExcludeNthWord('  a b c   def    ',2)+'!' = '  a c   def    !');
+    EXPORT Test20 := ASSERT(Uni.ExcludeNthWord('  a b c   def    ',3)+'!' = '  a b def    !');
+    EXPORT Test21 := ASSERT(Uni.ExcludeNthWord('  a b c   def    ',4)+'!' = '  a b c   !');
+    EXPORT Test22 := ASSERT(Uni.ExcludeNthWord('  a b c   def    ',5)+'!' = '  a b c   def    !');
+    //Check action on a string containing multiple commas as part of a list initiated by a colon.
+    //Punctuation following the removed word is expected to be removed along with it.
+    EXPORT Test23 := ASSERT(Uni.ExcludeNthWord(' ,,,, ',1)+'!' = '!');
+    EXPORT Test24 := ASSERT(Uni.ExcludeNthWord('List: abc, def, ghi,   jhi    ',0)+'!' = 'List: abc, def, ghi,   jhi    !');
+    EXPORT Test25 := ASSERT(Uni.ExcludeNthWord('List: abc, def, ghi,   jhi    ',1)+'!' = 'abc, def, ghi,   jhi    !');
+    EXPORT Test26 := ASSERT(Uni.ExcludeNthWord('List: abc, def, ghi,   jhi    ',2)+'!' = 'List: def, ghi,   jhi    !');
+    EXPORT Test27 := ASSERT(Uni.ExcludeNthWord('List: abc, def, ghi,   jhi    ',4)+'!' = 'List: abc, def, jhi    !');
+    //Check action on a string containing an apostrophe
+    EXPORT Test28 := ASSERT(Uni.ExcludeNthWord('I couldn\'t hear you!',4)+'!' = 'I couldn\'t hear !');
+    EXPORT Test29 := ASSERT(Uni.ExcludeNthWord('I couldn\'t hear you!',2)+'!' = 'I hear you!!');
+    //Check action on a string containing other Symbols
+    EXPORT Test30 := ASSERT(Uni.ExcludeNthWord('abc := name',1)+'!' = 'name!');
+    EXPORT Test31 := ASSERT(Uni.ExcludeNthWord('abc := name',2)+'!' = 'abc := !');
+    //Check action on a string containing different variations/combinations of numbers and other characters
+    EXPORT Test32 := ASSERT(Uni.ExcludeNthWord('1 234 123abc 23.6 abc123',1)+'!' = '234 123abc 23.6 abc123!');
+    EXPORT Test33 := ASSERT(Uni.ExcludeNthWord('1 234 123abc 23.6 abc123',2)+'!' = '1 123abc 23.6 abc123!');
+    EXPORT Test34 := ASSERT(Uni.ExcludeNthWord('1 234 123abc 23.6 abc123',3)+'!' = '1 234 23.6 abc123!');
+    EXPORT Test35 := ASSERT(Uni.ExcludeNthWord('1 234 123abc 23.6 abc123',4)+'!' = '1 234 123abc abc123!');
+    EXPORT Test36 := ASSERT(Uni.ExcludeNthWord('1 234 123abc 23.6 abc123',5)+'!' = '1 234 123abc 23.6 !');
+    //Test other space characters (< 0x20)
+    EXPORT Test37 := ASSERT(Uni.ExcludeNthWord('  a b\nc \t  def    ',2)+'!' = '  a c \t  def    !');
+    EXPORT Test38 := ASSERT(Uni.ExcludeNthWord('  a b\nc \t  def    ',3)+'!' = '  a b\ndef    !');
+    //Check action on a string containing Latin diacritical marks
+    EXPORT Test39 := ASSERT(Uni.ExcludeNthWord(U'À à',2)+U'!' = U'À !');
+    EXPORT Test40 := ASSERT(Uni.ExcludeNthWord(U'ä̰́ Ä̰́',2)+U'!' = U'ä̰́ !');
+    //Check action on a string containing Spanish words with Latin accents.
+    //Translation: "The deceased changed the girls" --> "The deceased the girls" & --> "The deceased changed the"
+    EXPORT Test41 := ASSERT(Uni.ExcludeNthWord(U'El difunto cambió las niñas',3)+U'!' = U'El difunto las niñas!');
+    EXPORT Test42 := ASSERT(Uni.ExcludeNthWord(U'El difunto cambió las niñas',5)+U'!' = U'El difunto cambió las !');
+    //Check action on a string containing Chinese characters.
+    //Translation: "I am a computer" --> "I am"
+    EXPORT Test43 := ASSERT(Uni.ExcludeNthWord(U'我是電腦',2)+U'!' = U'我是!');
+    //Check action on a string containing Modern Greek characters.
+    //Translation: "Do you come here often?" --> "come here often?"
+    EXPORT Test44 := ASSERT(Uni.ExcludeNthWord(U' Έρχεσαι συχνά εδώ; ',1)+U'!' = U'συχνά εδώ; !');
+    //Testcases 45 and 46 test for bidirectional capabilities with scripts in Arabic and Hebrew.
+    //Check action on Arabic lettering with accent marks. Bidirectional.
+    //Translation: "Good morning" --> "morning"
+    EXPORT Test45 := ASSERT(Uni.ExcludeNthWord(U'صباح الخير',2)+U'!' = U'صباح !');
+    //Check action on Hebrew lettering with accent marks (called pointing). Bidirectional.
+    //Translation: (not a phrase, 2 different words separated by a space)
+    EXPORT Test46 := ASSERT(Uni.ExcludeNthWord(U'קָמָץ שִׁי״ן',2)+U'!' = U'קָמָץ !');
+   END;
+ 
+ END;

+ 60 - 0
plugins/unicodelib/unicodelib.cpp

@@ -82,6 +82,7 @@ static const char * EclDefinition =
 "  boolean UnicodeLocaleEditDistanceWithinRadius(const unicode left, const unicode right, unsigned4 radius,  const varstring localename) : c,time,pure,entrypoint='ulUnicodeLocaleEditDistanceWithinRadius', hole; \n"
 "  unsigned4 UnicodeLocaleWordCount(const unicode text, const varstring localename) : c, pure,entrypoint='ulUnicodeLocaleWordCount', hole; \n"
 "  unicode UnicodeLocaleGetNthWord(const unicode text, unsigned4 n, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleGetNthWord';\n"
+"  unicode UnicodeLocaleExcludeNthWord(const unicode text, unsigned4 n, const varstring localename) :c,pure,entrypoint='ulUnicodeLocaleExcludeNthWord';\n"
 "END;\n";
 
 static const char * compatibleVersions[] = {
@@ -718,6 +719,45 @@ unsigned unicodeEditDistanceV4(UnicodeString & left, UnicodeString & right, unsi
     return da[mask(leftLen-1)][rightLen-1];
 }
 
+// Removes the nth word (counting from 1) from source, in place.
+//
+// The text is walked boundary by boundary with the supplied break iterator.  A boundary
+// whose rule status is anything other than UBRK_WORD_NONE is counted as a word;
+// UBRK_WORD_NONE marks spaces, punctuation, and the like.
+//  - The removed range starts at the boundary preceding the word, or at offset 0 when
+//    n is 1, so anything in front of the first word is removed with it.
+//  - The removed range is extended over every UBRK_WORD_NONE segment that follows the
+//    word, so trailing whitespace/punctuation is removed with it.
+//  - If the text contains no words at all, it is emptied (whatever the value of n).
+//  - Otherwise, if n is 0 or exceeds the number of words, the text is left untouched.
+//
+// bi      Word break iterator used for the scan; its text is set to source.
+// source  The text to edit; modified in place.
+// n       1-based index of the word to remove.
+void excludeNthWord(RuleBasedBreakIterator& bi, UnicodeString & source, unsigned n)
+{
+    bi.setText(source);
+    unsigned wordsSeen = 0;     // unsigned so the comparison with n is not mixed-sign
+    int32_t prevBoundary = 0;   // boundary visited immediately before the current one
+    for (int32_t boundary = bi.first(); boundary != BreakIterator::DONE; boundary = bi.next())
+    {
+        // Only boundaries that are not UBRK_WORD_NONE advance the word counter.
+        if (bi.getRuleStatus() != UBRK_WORD_NONE && ++wordsSeen == n)
+        {
+            // The first word also takes everything in front of it.
+            const int32_t removeFrom = (n == 1) ? 0 : prevBoundary;
+            int32_t removeTo;
+            // Swallow the non-word segments that follow the word.  The iterator is
+            // checked for exhaustion before its rule status is queried.
+            do
+            {
+                removeTo = boundary;
+                boundary = bi.next();
+            } while (boundary != BreakIterator::DONE && bi.getRuleStatus() == UBRK_WORD_NONE);
+            source.removeBetween(removeFrom, removeTo);
+            return;
+        }
+        prevBoundary = boundary;
+    }
+    if (wordsSeen == 0)
+    {
+        // No words at all: the result is an empty string.
+        source.remove();
+    }
+}
+
 UnicodeString getNthWord(RuleBasedBreakIterator& bi, UnicodeString const & source, unsigned n)
 {
     UnicodeString word;
@@ -1416,3 +1456,23 @@ UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleGetNthWord(unsigned & tgtLen,
     }
 }
 
+// Plugin entry point: returns a copy of text with its nth word removed, using the word
+// break rules of the given locale (see excludeNthWord for the exact removal rules).
+//
+// tgtLen      [out] Length of the result in UChar units; 0 when the result is empty.
+// tgt         [out] Buffer obtained from CTXMALLOC holding the result, or 0 when the
+//                   result is empty.  NOTE(review): presumably released by the caller - confirm.
+// textLen     Length of text in UChar units.
+// text        The source text.
+// n           1-based index of the word to remove.
+// localename  Name of the locale whose break rules are used.
+UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleExcludeNthWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned n, char const * localename)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    Locale locale(localename);
+    RuleBasedBreakIterator* bi = static_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createWordInstance(locale, status));
+    UnicodeString processed(text, textLen);
+    // createWordInstance can fail; never dereference a missing iterator.  If no
+    // iterator is available the text is returned unchanged.  U_SUCCESS also accepts
+    // warning codes such as a fallback to the default locale data.
+    if (bi && U_SUCCESS(status))
+    {
+        excludeNthWord(*bi, processed, n);
+    }
+    delete bi;
+    if (processed.length()>0)
+    {
+        tgtLen = processed.length();
+        tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*sizeof(UChar));
+        processed.extract(0, tgtLen, tgt);
+    }
+    else
+    {
+        tgtLen = 0;
+        tgt = 0;
+    }
+}

+ 1 - 0
plugins/unicodelib/unicodelib.hpp

@@ -101,6 +101,7 @@ UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleEditDistance(unsigned lef
 UNICODELIB_API bool UNICODELIB_CALL ulUnicodeLocaleEditDistanceWithinRadius(unsigned leftLen, UChar const * left, unsigned rightLen, UChar const * right, unsigned radius,char const * localename);
 UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleWordCount(unsigned textLen, UChar const * text,char const * localename);
 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleGetNthWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned n, char const * localename);
+UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleExcludeNthWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned n, char const * localename);
 }
 
 #endif