Przeglądaj źródła

Merge pull request #10321 from dskaff/hpcc-18041-excludeLastWord

HPCC-18041 Unicode Implementation for ExcludeLastWord

Reviewed-by: Gavin Halliday <ghalliday@hpccsystems.com>
Gavin Halliday 7 lat temu
rodzic
commit
84facad87c

+ 23 - 11
ecllibrary/std/Uni.ecl

@@ -399,6 +399,29 @@ EXPORT ExcludeNthWord(unicode text, unsigned4 n, varstring localename = '') :=
     lib_unicodelib.UnicodeLib.UnicodeLocaleExcludeNthWord(text, n, localename);
 
 /**
+ * Returns everything except the first word from the string.  Words are marked by the unicode break semantics.
+ * Whitespace before and after the first word is also removed.
+ *
+ * @param text          The string to be broken into words.
+ * @param localename    The locale name passed to the word-break library call. Defaults to ''.
+ * @return              The string excluding the first word.
+ */
+
+EXPORT ExcludeFirstWord(unicode text, varstring localename = '') :=
+    lib_unicodelib.UnicodeLib.UnicodeLocaleExcludeNthWord(text, 1, localename);
+
+/**
+ * Returns everything except the last word from the string.  Word boundaries are marked by the unicode break semantics.
+ * Whitespace after a word is removed with the word and leading whitespace is removed with the first word.
+ *
+ * @param text          The string to be broken into words.
+ * @param localename    The locale to use for the break semantics. Defaults to ''.
+ * @return              The string excluding the last word.
+ */
+
+EXPORT unicode ExcludeLastWord(unicode text, varstring localename = '') :=
+    lib_unicodelib.UnicodeLib.UnicodeLocaleExcludeLastWord(text, localename);
+
+/**
  * Returns the source string with all the characters that match characters in the search string replaced
  * with the character at the corresponding position in the replacement string.
  * The isEmpty() tests in the beginning of the function check for invalid sequences in addition to blank strings.
@@ -413,15 +436,4 @@ EXPORT ExcludeNthWord(unicode text, unsigned4 n, varstring localename = '') :=
 EXPORT Translate(unicode text, unicode sear, unicode repl) :=
     lib_unicodelib.UnicodeLib.UnicodeLocaleTranslate(text, sear, repl);
 
-/*
- * Returns everything except the first word from the string.  Words are marked by the unicode break semantics.
- * Whitespace before and after the first word is also removed.
- *
- * @param text          The string to be broken into words.
- * @return              The string excluding the first word.
- */
-
-EXPORT ExcludeFirstWord(unicode text, varstring localename = '') :=
-    lib_unicodelib.UnicodeLib.UnicodeLocaleExcludeNthWord(text, 1, localename);
-
 END;

+ 52 - 0
ecllibrary/teststd/uni/TestExcludeLastWord.ecl

@@ -0,0 +1,52 @@
+/*##############################################################################
+## HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.  All rights reserved.
+############################################################################## */
+
+IMPORT Std.Uni;
+
+//Assertions on Uni.ExcludeLastWord applied to constant unicode strings.
+//U'!' is appended to every result before it is compared, presumably so that any
+//trailing whitespace left in the result is significant in the comparison.
+EXPORT TestExcludeLastWord := MODULE
+
+  EXPORT TestConst := MODULE
+    //Check action on a string with no entries.
+    EXPORT Test01 := ASSERT(Uni.ExcludeLastWord(U'')+U'!' = U'!');
+    EXPORT Test02 := ASSERT(Uni.ExcludeLastWord(U'             ')+U'!' = U'!');
+    //Check action on a string containing a single word - with various whitespace
+    EXPORT Test03 := ASSERT(Uni.ExcludeLastWord(U'x')+U'!' = U'!');
+    EXPORT Test04 := ASSERT(Uni.ExcludeLastWord(U' x')+U'!' = U'!');
+    EXPORT Test05 := ASSERT(Uni.ExcludeLastWord(U'x ')+U'!' = U'!');
+    EXPORT Test06 := ASSERT(Uni.ExcludeLastWord(U' x ')+U'!' = U'!');
+    //Check action on a string containing multiple words - with various whitespace combinations.
+    EXPORT Test07 := ASSERT(Uni.ExcludeLastWord(U' abc def ')+U'!' = U' abc !');
+    EXPORT Test08 := ASSERT(Uni.ExcludeLastWord(U'  a b c   def    ')+U'!' = U'  a b c   !');
+    //Check action on a string containing multiple commas as part of a list initiated by a colon.
+    EXPORT Test09 := ASSERT(Uni.ExcludeLastWord(U' ,,,, ')+U'!' = U'!');
+    EXPORT Test10 := ASSERT(Uni.ExcludeLastWord(U'List: abc, def, ghi,')+U'!' = U'List: abc, def, !');
+    //Check action on a string containing an apostrophe
+    EXPORT Test11 := ASSERT(Uni.ExcludeLastWord(U'I couldn\'t')+U'!' = U'I !');
+    //Check action on a string containing other Symbols
+    EXPORT Test12 := ASSERT(Uni.ExcludeLastWord(U'abc := name')+U'!' = U'abc := !');
+    //Check action on a string containing different variations/combinations of numbers and other characters
+    EXPORT Test13 := ASSERT(Uni.ExcludeLastWord(U'1 234 123abc 23.6 abc123')+U'!' = U'1 234 123abc 23.6 !');
+    //Test other space characters (< 0x20)
+    EXPORT Test14 := ASSERT(Uni.ExcludeLastWord(U'  a b\nc \t   ')+U'!' = U'  a b\n!');
+    //Check action on a string containing latin diacritical marks
+    EXPORT Test15 := ASSERT(Uni.ExcludeLastWord(U'À à')+U'!' = U'À !');
+    //Check action on a string containing Spanish words with latin accents.
+    //Translation: "The deceased changed the girls"
+    EXPORT Test16 := ASSERT(Uni.ExcludeLastWord(U'El difunto cambió las niñas')+U'!' = U'El difunto cambió las !');
+    //Check action on a string containing Chinese characters.
+    //Translation: "I am a computer"
+    EXPORT Test17 := ASSERT(Uni.ExcludeLastWord(U'我是電腦')+U'!' = U'我是!');
+    //Check action on a string containing Modern Greek characters.
+    //Translation: "Do you come here often?"
+    EXPORT Test18 := ASSERT(Uni.ExcludeLastWord(U' Έρχεσαι συχνά εδώ; ')+U'!' = U' Έρχεσαι συχνά !');
+    //Testcases 19 and 20 test for bidirectional capabilities with scripts in arabic and hebrew.
+    //Check action on arabic lettering with accent marks. Bidirectional.
+    //Translation: "Good morning"
+    EXPORT Test19 := ASSERT(Uni.ExcludeLastWord(U'صباح الخير')+U'!' = U'صباح !');
+    //Check action on hebrew lettering with accent marks (called pointing). Bidirectional.
+    //Translation: (not a phrase, 2 different words separated by a space)
+    EXPORT Test20 := ASSERT(Uni.ExcludeLastWord(U'קָמָץ שִׁי״ן')+U'!' = U'קָמָץ !');
+  END;
+
+END;

+ 69 - 0
plugins/unicodelib/unicodelib.cpp

@@ -83,6 +83,7 @@ static const char * EclDefinition =
 "  unsigned4 UnicodeLocaleWordCount(const unicode text, const varstring localename) : c, pure,entrypoint='ulUnicodeLocaleWordCount', hole; \n"
 "  unicode UnicodeLocaleGetNthWord(const unicode text, unsigned4 n, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleGetNthWord';\n"
 "  unicode UnicodeLocaleExcludeNthWord(const unicode text, unsigned4 n, const varstring localename) :c,pure,entrypoint='ulUnicodeLocaleExcludeNthWord';\n"
+"  unicode UnicodeLocaleExcludeLastWord(const unicode text, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleExcludeLastWord';\n"
 "  unicode UnicodeLocaleTranslate(const unicode text, unicode sear, unicode repl) :c,pure,entrypoint='ulUnicodeLocaleTranslate';\n"
 "END;\n";
 
@@ -747,6 +748,53 @@ void translate(UnicodeString & toProcess, UChar const * sear, unsigned searLen,
     }
 }
 
+// Removes the last word of toProcess in place, together with everything that follows it.
+//
+// bi         Word break iterator; its text is replaced with toProcess and its position is moved.
+// toProcess  The string to edit. It becomes empty when it contains no word at all.
+//
+// Boundaries are visited from the end of the text towards the start. getRuleStatus()
+// describes the segment that ends at the current boundary, so the first boundary whose
+// status is not UBRK_WORD_NONE marks the end of the last word.
+// When no other word precedes that word, everything in front of it is removed as well,
+// however many whitespace/punctuation segments that leading text is split into.
+void excludeLastWord(RuleBasedBreakIterator& bi, UnicodeString & toProcess)
+{
+    bi.setText(toProcess);
+    const int32_t textEnd = bi.last();
+    for (int32_t boundary = textEnd; boundary > 0; boundary = bi.previous())
+    {
+        // Skip spaces, punctuation, and the like: a status of UBRK_WORD_NONE means the
+        // segment ending here is not a word or number.
+        if (bi.getRuleStatus() == UBRK_WORD_NONE)
+            continue;
+
+        // The iterator sits at the end of the last word; step back to where it starts.
+        int32_t wordBeginning = bi.previous();
+
+        // Keep walking backwards looking for an earlier word. This also covers languages
+        // that do not separate words with spaces, where the previous segment is itself a word.
+        bool hasEarlierWord = false;
+        for (int32_t pos = wordBeginning; pos > 0; pos = bi.previous())
+        {
+            if (bi.getRuleStatus() != UBRK_WORD_NONE)
+            {
+                hasEarlierWord = true;
+                break;
+            }
+        }
+
+        // Single word string: the leading non-word text goes with the word.
+        if (!hasEarlierWord)
+            wordBeginning = 0;
+
+        // Trailing non-word text is always removed with the word, hence textEnd.
+        toProcess.removeBetween(wordBeginning, textEnd);
+        return;
+    }
+    //Reached if the string has no words.
+    toProcess.removeBetween(0, textEnd);
+}
+
 void excludeNthWord(RuleBasedBreakIterator& bi, UnicodeString & source, unsigned n)
 {
     bi.setText(source);
@@ -1505,6 +1553,27 @@ UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleExcludeNthWord(unsigned & tgt
     }
 }
 
+// Entry point behind the ECL declaration UnicodeLocaleExcludeLastWord.
+//
+// tgtLen / tgt   Receive the result: tgt is a buffer obtained from CTXMALLOC holding tgtLen
+//                UChars, or tgtLen == 0 and tgt == 0 when the result is empty.
+// textLen / text The input string, textLen UChars long.
+// localename     Name used to construct the ICU Locale for the word break iterator.
+UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleExcludeLastWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, char const * localename)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    Locale locale(localename);
+    RuleBasedBreakIterator* bi = static_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createWordInstance(locale, status));
+    UnicodeString processed(text, textLen);
+    // createWordInstance() reports failure through status and can return a null pointer.
+    // In that case the text is passed through unchanged instead of dereferencing null.
+    if (U_SUCCESS(status) && bi)
+        excludeLastWord(*bi, processed);
+    delete bi;
+    if (processed.length()>0)
+    {
+        tgtLen = processed.length();
+        tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen * sizeof(UChar));
+        processed.extract(0, tgtLen, tgt);
+    }
+    else
+    {
+        tgtLen = 0;
+        tgt = 0;
+    }
+}
+
 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleTranslate(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned searLen, UChar const * sear, unsigned replLen, UChar * repl)
 {
     UnicodeString processed(text, textLen);

+ 1 - 0
plugins/unicodelib/unicodelib.hpp

@@ -102,6 +102,7 @@ UNICODELIB_API bool UNICODELIB_CALL ulUnicodeLocaleEditDistanceWithinRadius(unsi
 UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleWordCount(unsigned textLen, UChar const * text,char const * localename);
 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleGetNthWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned n, char const * localename);
 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleExcludeNthWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned n, char const * localename);
+UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleExcludeLastWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, char const * localename);
 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleTranslate(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned searLen, UChar const * sear, unsigned replLen, UChar * repl);
 }