8 years ago · 5089651b2d
--- a/ecllibrary/std/Uni.ecl
+++ b/ecllibrary/std/Uni.ecl
@@ -1,4 +1,4 @@
 
				-/*##############################################################################
			
 
				+/*##############################################################################
			
 
				 ## HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.  All rights reserved.
			
 
				 ############################################################################## */
			
 
				 
			
@@ -382,4 +382,20 @@ EXPORT unsigned4 WordCount(unicode text, varstring localename = '') :=
 
				 EXPORT unicode GetNthWord(unicode text, unsigned4 n, varstring localename = '') :=
			
 
				     lib_unicodelib.UnicodeLib.UnicodeLocaleGetNthWord(text, n, localename);
			
 
				 
			
 
				+/**
				+ * Returns the source string with its nth word removed. Words are delimited using the unicode break semantics.
				+ * Trailing whitespace (and any other non-word characters up to the next word) is always removed with the word.
				+ * Leading whitespace is only removed with the word if the nth word is the first word.
				+ * Returns a blank string if there are no words in the source string.
				+ * Returns the source string unchanged if n is zero or greater than the number of words in the string.
				+ *
				+ * @param text          The string to be broken into words.
				+ * @param n             Which word should be removed from the string (the first word is 1).
				+ * @param localename    The locale to use for the break semantics.  Defaults to ''.
				+ * @return              The string excluding the nth word.
				+ */
				+
				+EXPORT unicode ExcludeNthWord(unicode text, unsigned4 n, varstring localename = '') :=
				+    lib_unicodelib.UnicodeLib.UnicodeLocaleExcludeNthWord(text, n, localename);
			
 
				+
			
 
				 END;
			
--- a/ecllibrary/teststd/uni/TestExcludeNthWord.ecl
+++ b/ecllibrary/teststd/uni/TestExcludeNthWord.ecl
@@ -0,0 +1,90 @@
 
				+/*##############################################################################
			
 
				+     HPCC SYSTEMS software Copyright (C) 2017 HPCC Systems®.
			
 
				+ 
			
 
				+     Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+     you may not use this file except in compliance with the License.
			
 
				+     You may obtain a copy of the License at
			
 
				+ 
			
 
				+        http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ 
			
 
				+     Unless required by applicable law or agreed to in writing, software
			
 
				+     distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+     See the License for the specific language governing permissions and
			
 
				+     limitations under the License.
			
 
				+ ##############################################################################*/
			
 
				+ 
			
 
				+ IMPORT Std.Uni;
			
 
				+ 
			
 
				+ EXPORT TestExcludeNthWord := MODULE
			
 
				+ 
			
 
				+   EXPORT TestConst := MODULE
			
 
				+    //Check action on a string with no entries.
			
 
				+    EXPORT Test01 := ASSERT(Uni.ExcludeNthWord('',0)+'!' = '!', CONST);
			
 
				+    EXPORT Test02 := ASSERT(Uni.ExcludeNthWord('',1)+'!' = '!', CONST);
			
 
				+    EXPORT Test03 := ASSERT(Uni.ExcludeNthWord('',-1)+'!' = '!', CONST);
			
 
				+    EXPORT Test04 := ASSERT(Uni.ExcludeNthWord('             ',0)+'!' = '!', CONST);
			
 
				+    EXPORT Test05 := ASSERT(Uni.ExcludeNthWord('             ',1)+'!' = '!', CONST);
			
 
				+    EXPORT Test06 := ASSERT(Uni.ExcludeNthWord('             ',-1)+'!' = '!', CONST);
			
 
				+    //Check action on a string containing a single word - with various whitespace
			
 
				+    EXPORT Test07 := ASSERT(Uni.ExcludeNthWord('x',0)+'!' = 'x!');
			
 
				+    EXPORT Test08 := ASSERT(Uni.ExcludeNthWord('x',1)+'!' = '!');
			
 
				+    EXPORT Test09 := ASSERT(Uni.ExcludeNthWord('x',2)+'!' = 'x!');
			
 
				+    EXPORT Test10 := ASSERT(Uni.ExcludeNthWord('x',3)+'!' = 'x!');
			
 
				+    EXPORT Test11 := ASSERT(Uni.ExcludeNthWord(' x',1)+'!' = '!');
			
 
				+    EXPORT Test12 := ASSERT(Uni.ExcludeNthWord('x ',1)+'!' = '!');
			
 
				+    EXPORT Test13 := ASSERT(Uni.ExcludeNthWord(' x',2)+'!' = ' x!');
			
 
				+    EXPORT Test14 := ASSERT(Uni.ExcludeNthWord(' x ',1)+'!' = '!');
			
 
				+    //Check action on a string containing multiple words - with various whitespace combinations.
			
 
				+    EXPORT Test15 := ASSERT(Uni.ExcludeNthWord(' abc def ', 1)+'!' = 'def !');
			
 
				+    EXPORT Test16 := ASSERT(Uni.ExcludeNthWord(' abc def ', 2)+'!' = ' abc !');
			
 
				+    EXPORT Test17 := ASSERT(Uni.ExcludeNthWord('  a b c   def    ',0)+'!' = '  a b c   def    !');
			
 
				+    EXPORT Test18 := ASSERT(Uni.ExcludeNthWord('  a b c   def    ',1)+'!' = 'b c   def    !');
			
 
				+    EXPORT Test19 := ASSERT(Uni.ExcludeNthWord('  a b c   def    ',2)+'!' = '  a c   def    !');
			
 
				+    EXPORT Test20 := ASSERT(Uni.ExcludeNthWord('  a b c   def    ',3)+'!' = '  a b def    !');
			
 
				+    EXPORT Test21 := ASSERT(Uni.ExcludeNthWord('  a b c   def    ',4)+'!' = '  a b c   !');
			
 
				+    EXPORT Test22 := ASSERT(Uni.ExcludeNthWord('  a b c   def    ',5)+'!' = '  a b c   def    !');
			
 
				+    //Check action on a string containing multiple commas as part of a list initiated by a colon.
			
 
				+    EXPORT Test23 := ASSERT(Uni.ExcludeNthWord(' ,,,, ',1)+'!' = '!');
			
 
				+    EXPORT Test24 := ASSERT(Uni.ExcludeNthWord('List: abc, def, ghi,   jhi    ',0)+'!' = 'List: abc, def, ghi,   jhi    !');
			
 
				+    EXPORT Test25 := ASSERT(Uni.ExcludeNthWord('List: abc, def, ghi,   jhi    ',1)+'!' = 'abc, def, ghi,   jhi    !');
			
 
				+    EXPORT Test26 := ASSERT(Uni.ExcludeNthWord('List: abc, def, ghi,   jhi    ',2)+'!' = 'List: def, ghi,   jhi    !');
			
 
				+    EXPORT Test27 := ASSERT(Uni.ExcludeNthWord('List: abc, def, ghi,   jhi    ',4)+'!' = 'List: abc, def, jhi    !');
			
 
				+    //Check action on a string containing an apostrophe
			
 
				+    EXPORT Test28 := ASSERT(Uni.ExcludeNthWord('I couldn\'t hear you!',4)+'!' = 'I couldn\'t hear !');
			
 
				+    EXPORT Test29 := ASSERT(Uni.ExcludeNthWord('I couldn\'t hear you!',2)+'!' = 'I hear you!!');
			
 
				+    //Check action on a string containing other Symbols
			
 
				+    EXPORT Test30 := ASSERT(Uni.ExcludeNthWord('abc := name',1)+'!' = 'name!');
			
 
				+    EXPORT Test31 := ASSERT(Uni.ExcludeNthWord('abc := name',2)+'!' = 'abc := !');
			
 
				+    //Check action on a string containing different variations/combinations of numbers and other characters
			
 
				+    EXPORT Test32 := ASSERT(Uni.ExcludeNthWord('1 234 123abc 23.6 abc123',1)+'!' = '234 123abc 23.6 abc123!');
			
 
				+    EXPORT Test33 := ASSERT(Uni.ExcludeNthWord('1 234 123abc 23.6 abc123',2)+'!' = '1 123abc 23.6 abc123!');
			
 
				+    EXPORT Test34 := ASSERT(Uni.ExcludeNthWord('1 234 123abc 23.6 abc123',3)+'!' = '1 234 23.6 abc123!');
			
 
				+    EXPORT Test35 := ASSERT(Uni.ExcludeNthWord('1 234 123abc 23.6 abc123',4)+'!' = '1 234 123abc abc123!');
			
 
				+    EXPORT Test36 := ASSERT(Uni.ExcludeNthWord('1 234 123abc 23.6 abc123',5)+'!' = '1 234 123abc 23.6 !');
			
 
				+    //Test other space characters (< 0x20)
			
 
				+    EXPORT Test37 := ASSERT(Uni.ExcludeNthWord('  a b\nc \t  def    ',2)+'!' = '  a c \t  def    !');
			
 
				+    EXPORT Test38 := ASSERT(Uni.ExcludeNthWord('  a b\nc \t  def    ',3)+'!' = '  a b\ndef    !');
			
 
				+    //Check action on a string containing latin diacritical marks
			
 
				+    EXPORT Test39 := ASSERT(Uni.ExcludeNthWord(U'À à',2)+U'!' = U'À !');
			
 
				+    EXPORT Test40 := ASSERT(Uni.ExcludeNthWord(U'ä̰́ Ä̰́',2)+U'!' = U'ä̰́ !');
			
 
				+    //Check action on a string containing Spanish words with latin accents.
			
 
				+    //Translation: "The deceased changed the girls" --> "The deceased the girls" & --> "The deceased changed the"
			
 
				+    EXPORT Test41 := ASSERT(Uni.ExcludeNthWord(U'El difunto cambió las niñas',3)+U'!' = U'El difunto las niñas!');
			
 
				+    EXPORT Test42 := ASSERT(Uni.ExcludeNthWord(U'El difunto cambió las niñas',5)+U'!' = U'El difunto cambió las !');
			
 
				+    //Check action on a string containing Chinese characters.
			
 
				+    //Translation: "I am a computer" --> "I am"
			
 
				+    EXPORT Test43 := ASSERT(Uni.ExcludeNthWord(U'我是電腦',2)+U'!' = U'我是!');
			
 
				+    //Check action on a string containing Modern Greek characters.
			
 
				+    //Translation: "Do you come here often?" --> "come here often?"
			
 
				+    EXPORT Test44 := ASSERT(Uni.ExcludeNthWord(U' Έρχεσαι συχνά εδώ; ',1)+U'!' = U'συχνά εδώ; !');
			
 
				+    //Testcases 45 and 46 test for bidirectional capabilities with scripts in Arabic and Hebrew.
			
 
				+    //Check action on Arabic lettering with accent marks. Bidirectional.
			
 
				+    //Translation: "Good morning" --> "morning"
			
 
				+    EXPORT Test45 := ASSERT(Uni.ExcludeNthWord(U'صباح الخير',2)+U'!' = U'صباح !');
			
 
				+    //Check action on Hebrew lettering with accent marks (called pointing). Bidirectional.
			
 
				+    //Translation: (not a phrase, 2 different words separated by a space)
			
 
				+    EXPORT Test46 := ASSERT(Uni.ExcludeNthWord(U'קָמָץ שִׁי״ן',2)+U'!' = U'קָמָץ !');
			
 
				+   END;
			
 
				+ 
			
 
				+ END;
			
--- a/plugins/unicodelib/unicodelib.cpp
+++ b/plugins/unicodelib/unicodelib.cpp
@@ -82,6 +82,7 @@ static const char * EclDefinition =
 
				 "  boolean UnicodeLocaleEditDistanceWithinRadius(const unicode left, const unicode right, unsigned4 radius,  const varstring localename) : c,time,pure,entrypoint='ulUnicodeLocaleEditDistanceWithinRadius', hole; \n"
			
 
				 "  unsigned4 UnicodeLocaleWordCount(const unicode text, const varstring localename) : c, pure,entrypoint='ulUnicodeLocaleWordCount', hole; \n"
			
 
				 "  unicode UnicodeLocaleGetNthWord(const unicode text, unsigned4 n, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleGetNthWord';\n"
			
 
				+"  unicode UnicodeLocaleExcludeNthWord(const unicode text, unsigned4 n, const varstring localename) :c,pure,entrypoint='ulUnicodeLocaleExcludeNthWord';\n"
			
 
				 "END;\n";
			
 
				 
			
 
				 static const char * compatibleVersions[] = {
			
@@ -718,6 +719,45 @@ unsigned unicodeEditDistanceV4(UnicodeString & left, UnicodeString & right, unsi
 
				     return da[mask(leftLen-1)][rightLen-1];
			
 
				 }
			
 
				 
			
 
				+/**
				+ * Removes the nth word (1-based) from source, editing the string in place.
				+ *
				+ * Everything between the end of the removed word and the start of the next
				+ * word (whitespace, punctuation, ...) is removed together with it.  Text that
				+ * precedes the word is kept, except when the first word is removed, in which
				+ * case the removal starts at the beginning of the string.
				+ * If the text contains no words at all it is cleared; if n is 0 or exceeds the
				+ * number of words, the text is left untouched.
				+ *
				+ * @param bi      Word break iterator used to segment the text; it is re-targeted
				+ *                at source by this call.
				+ * @param source  The text to edit.
				+ * @param n       Which word to remove (the first word is 1).
				+ */
				+void excludeNthWord(RuleBasedBreakIterator& bi, UnicodeString & source, unsigned n)
				+{
				+    bi.setText(source);
				+    // ICU reports boundaries as int32_t, so keep every index in that type.
				+    int32_t idx = bi.first();
				+    int32_t wordBeginning = 0;
				+    // Unsigned so that the comparison against n does not mix signedness.
				+    unsigned wordidx = 0;
				+    while (idx != BreakIterator::DONE)
				+    {
				+        // Exclude spaces, punctuation, and the like.
				+        //   A status value UBRK_WORD_NONE indicates that the boundary does
				+        //   not start a word or number.
				+        if (bi.getRuleStatus() != UBRK_WORD_NONE)
				+        {
				+            if (++wordidx == n)
				+            {
				+                // Removing the first word also drops whatever precedes it.
				+                if (n == 1)
				+                    wordBeginning = 0;
				+                // Extend the removal over the non-word segments that follow,
				+                // stopping at the next word or at the end of the text.
				+                int32_t wordEnd;
				+                do
				+                {
				+                    wordEnd = idx;
				+                    idx = bi.next();
				+                } while (idx != BreakIterator::DONE && bi.getRuleStatus() == UBRK_WORD_NONE);
				+                // NOTE(review): bi was set up on the pre-edit text; do not keep
				+                // iterating with it after this removal.
				+                source.removeBetween(wordBeginning, wordEnd);
				+                return;
				+            }
				+        }
				+        wordBeginning = idx;
				+        idx = bi.next();
				+    }
				+    // No word was found anywhere: the result is an empty string.
				+    if (wordidx == 0)
				+        source.remove();
				+}
			
 
				+
			
 
				 UnicodeString getNthWord(RuleBasedBreakIterator& bi, UnicodeString const & source, unsigned n)
			
 
				 {
			
 
				     UnicodeString word;
			
@@ -1416,3 +1456,23 @@ UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleGetNthWord(unsigned & tgtLen,
 
				     }
			
 
				 }
			
 
				 
			
 
				+/**
				+ * Plugin entry point for UnicodeLocaleExcludeNthWord: returns a copy of text
				+ * with its nth word removed, using the word break rules of localename.
				+ *
				+ * @param tgtLen      Receives the length of the result in UChars (0 if empty).
				+ * @param tgt         Receives a buffer obtained from CTXMALLOC holding the
				+ *                    result, or 0 when the result is empty.
				+ * @param textLen     Length of text in UChars.
				+ * @param text        The source text.
				+ * @param n           Which word to remove (the first word is 1).
				+ * @param localename  Name of the locale whose break semantics are used.
				+ */
				+UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleExcludeNthWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned n, char const * localename)
				+{
				+    UErrorCode status = U_ZERO_ERROR;
				+    Locale locale(localename);
				+    RuleBasedBreakIterator* bi = static_cast<RuleBasedBreakIterator*>(RuleBasedBreakIterator::createWordInstance(locale, status));
				+    UnicodeString processed(text, textLen);
				+    // createWordInstance reports failure through status and may return no
				+    // iterator; in that case leave the text unchanged rather than
				+    // dereferencing a null pointer.
				+    if (U_SUCCESS(status) && bi)
				+        excludeNthWord(*bi, processed, n);
				+    delete bi;
				+    if (processed.length()>0)
				+    {
				+        tgtLen = processed.length();
				+        // Size the buffer from the element type instead of a hard-coded 2.
				+        tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen * sizeof(UChar));
				+        processed.extract(0, tgtLen, tgt);
				+    }
				+    else
				+    {
				+        tgtLen = 0;
				+        tgt = 0;
				+    }
				+}
			
--- a/plugins/unicodelib/unicodelib.hpp
+++ b/plugins/unicodelib/unicodelib.hpp
@@ -101,6 +101,7 @@ UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleEditDistance(unsigned lef
 
				 UNICODELIB_API bool UNICODELIB_CALL ulUnicodeLocaleEditDistanceWithinRadius(unsigned leftLen, UChar const * left, unsigned rightLen, UChar const * right, unsigned radius,char const * localename);
			
 
				 UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleWordCount(unsigned textLen, UChar const * text,char const * localename);
			
 
				 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleGetNthWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned n, char const * localename);
			
 
				+UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleExcludeNthWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned n, char const * localename);
			
 
				 }
			
 
				 
			
 
				 #endif