7 lat temu · 84facad87c
--- a/ecllibrary/std/Uni.ecl
+++ b/ecllibrary/std/Uni.ecl
@@ -399,6 +399,29 @@ EXPORT ExcludeNthWord(unicode text, unsigned4 n, varstring localename = '') :=
 
				     lib_unicodelib.UnicodeLib.UnicodeLocaleExcludeNthWord(text, n, localename);
			
 
				 
			
 
				 /**
			
 
				+ * Returns everything except the first word from the string.  Words are marked by the unicode break semantics.
			
 
				+ * Whitespace before and after the first word is also removed.
			
 
				+ *
			
 
				+ * @param text          The string to be broken into words.
			
 
				+ * @param localename    The locale to use for the break semantics. Defaults to ''.
			
 
				+ * @return              The string excluding the first word.
			
 
				+ */
			
 
				+
			
 
				+EXPORT ExcludeFirstWord(unicode text, varstring localename = '') :=
			
 
				+    lib_unicodelib.UnicodeLib.UnicodeLocaleExcludeNthWord(text, 1, localename);
			
 
				+
			
 
				+/**
			
 
				+ * Returns everything except the last word from the string.  Word boundaries are marked by the unicode break semantics.
			
 
				+ * Whitespace after a word is removed with the word and leading whitespace is removed with the first word.
			
 
				+ *
			
 
				+ * @param text          The string to be broken into words.
			
 
				+ * @param localename    The locale to use for the break semantics. Defaults to ''.
			
 
				+ * @return              The string excluding the last word.
			
 
				+ */
			
 
				+
			
 
				+EXPORT unicode ExcludeLastWord(unicode text, varstring localename = '') :=
			
 
				+    lib_unicodelib.UnicodeLib.UnicodeLocaleExcludeLastWord(text, localename);
			
 
				+
			
 
				+/**
			
 
				  * Returns the source string with all the characters that match characters in the search string replaced
			
 
				  * with the character at the corresponding position in the replacement string.
			
 
				  * The isEmpty() tests in the beginning of the function check for invalid sequences in addition to blank strings.
			
@@ -413,15 +436,4 @@ EXPORT ExcludeNthWord(unicode text, unsigned4 n, varstring localename = '') :=
 
				 EXPORT Translate(unicode text, unicode sear, unicode repl) :=
			
 
				     lib_unicodelib.UnicodeLib.UnicodeLocaleTranslate(text, sear, repl);
			
 
				 
			
 
				-/*
			
 
				- * Returns everything except the first word from the string.  Words are marked by the unicode break semantics.
			
 
				- * Whitespace before and after the first word is also removed.
			
 
				- *
			
 
				- * @param text          The string to be broken into words.
			
 
				- * @return              The string excluding the first word.
			
 
				- */
			
 
				-
			
 
				-EXPORT ExcludeFirstWord(unicode text, varstring localename = '') :=
			
 
				-    lib_unicodelib.UnicodeLib.UnicodeLocaleExcludeNthWord(text, 1, localename);
			
 
				-
			
 
				 END;
			
--- a/ecllibrary/teststd/uni/TestExcludeLastWord.ecl
+++ b/ecllibrary/teststd/uni/TestExcludeLastWord.ecl
@@ -0,0 +1,52 @@
 
				+/*##############################################################################
			
 
				+## HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.  All rights reserved.
			
 
				+############################################################################## */
			
 
				+
			
 
				+IMPORT Std.Uni;
			
 
				+
			
 
				+EXPORT TestExcludeLastWord := MODULE
			
 
				+
			
 
				+  EXPORT TestConst := MODULE
			
 
				+    //Check action on a string with no entries.
			
 
				+    EXPORT Test01 := ASSERT(Uni.ExcludeLastWord(U'')+U'!' = U'!');
			
 
				+    EXPORT Test02 := ASSERT(Uni.ExcludeLastWord(U'             ')+U'!' = U'!');
			
 
				+    //Check action on a string containing a single word - with various whitespace
			
 
				+    EXPORT Test03 := ASSERT(Uni.ExcludeLastWord(U'x')+U'!' = U'!');
			
 
				+    EXPORT Test04 := ASSERT(Uni.ExcludeLastWord(U' x')+U'!' = U'!');
			
 
				+    EXPORT Test05 := ASSERT(Uni.ExcludeLastWord(U'x ')+U'!' = U'!');
			
 
				+    EXPORT Test06 := ASSERT(Uni.ExcludeLastWord(U' x ')+U'!' = U'!');
			
 
				+    //Check action on a string containing multiple words - with various whitespace combinations.
			
 
				+    EXPORT Test07 := ASSERT(Uni.ExcludeLastWord(U' abc def ')+U'!' = U' abc !');
			
 
				+    EXPORT Test08 := ASSERT(Uni.ExcludeLastWord(U'  a b c   def    ')+U'!' = U'  a b c   !');
			
 
				+    //Check action on a string containing multiple commas as part of a list initiated by a colon.
			
 
				+    EXPORT Test09 := ASSERT(Uni.ExcludeLastWord(U' ,,,, ')+U'!' = U'!');
			
 
				+    EXPORT Test10 := ASSERT(Uni.ExcludeLastWord(U'List: abc, def, ghi,')+U'!' = U'List: abc, def, !');
			
 
				+    //Check action on a string containing an apostrophe
			
 
				+    EXPORT Test11 := ASSERT(Uni.ExcludeLastWord(U'I couldn\'t')+U'!' = U'I !');
			
 
				+    //Check action on a string containing other symbols
			
 
				+    EXPORT Test12 := ASSERT(Uni.ExcludeLastWord(U'abc := name')+U'!' = U'abc := !');
			
 
				+    //Check action on a string containing different variations/combinations of numbers and other characters
			
 
				+    EXPORT Test13 := ASSERT(Uni.ExcludeLastWord(U'1 234 123abc 23.6 abc123')+U'!' = U'1 234 123abc 23.6 !');
			
 
				+    //Test other space characters (< 0x20)
			
 
				+    EXPORT Test14 := ASSERT(Uni.ExcludeLastWord(U'  a b\nc \t   ')+U'!' = U'  a b\n!');
			
 
				+    //Check action on a string containing Latin diacritical marks
			
 
				+    EXPORT Test15 := ASSERT(Uni.ExcludeLastWord(U'À à')+U'!' = U'À !');
			
 
				+    //Check action on a string containing Spanish words with Latin accents.
			
 
				+    //Translation: "The deceased changed the girls"
			
 
				+    EXPORT Test16 := ASSERT(Uni.ExcludeLastWord(U'El difunto cambió las niñas')+U'!' = U'El difunto cambió las !');
			
 
				+    //Check action on a string containing Chinese characters.
			
 
				+    //Translation: "I am a computer"
			
 
				+    EXPORT Test17 := ASSERT(Uni.ExcludeLastWord(U'我是電腦')+U'!' = U'我是!');
			
 
				+    //Check action on a string containing Modern Greek characters.
			
 
				+    //Translation: "Do you come here often?"
			
 
				+    EXPORT Test18 := ASSERT(Uni.ExcludeLastWord(U' Έρχεσαι συχνά εδώ; ')+U'!' = U' Έρχεσαι συχνά !');
			
 
				+    //Testcases 19 and 20 test for bidirectional capabilities with scripts in Arabic and Hebrew.
			
 
				+    //Check action on Arabic lettering with accent marks. Bidirectional.
			
 
				+    //Translation: "Good morning"
			
 
				+    EXPORT Test19 := ASSERT(Uni.ExcludeLastWord(U'صباح الخير')+U'!' = U'صباح !');
			
 
				+    //Check action on Hebrew lettering with accent marks (called pointing). Bidirectional.
			
 
				+    //Translation: (not a phrase, 2 different words separated by a space)
			
 
				+    EXPORT Test20 := ASSERT(Uni.ExcludeLastWord(U'קָמָץ שִׁי״ן')+U'!' = U'קָמָץ !');
			
 
				+  END;
			
 
				+
			
 
				+END;
			
--- a/plugins/unicodelib/unicodelib.cpp
+++ b/plugins/unicodelib/unicodelib.cpp
@@ -83,6 +83,7 @@ static const char * EclDefinition =
 
				 "  unsigned4 UnicodeLocaleWordCount(const unicode text, const varstring localename) : c, pure,entrypoint='ulUnicodeLocaleWordCount', hole; \n"
			
 
				 "  unicode UnicodeLocaleGetNthWord(const unicode text, unsigned4 n, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleGetNthWord';\n"
			
 
				 "  unicode UnicodeLocaleExcludeNthWord(const unicode text, unsigned4 n, const varstring localename) :c,pure,entrypoint='ulUnicodeLocaleExcludeNthWord';\n"
			
 
				+"  unicode UnicodeLocaleExcludeLastWord(const unicode text, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleExcludeLastWord';\n"
			
 
				 "  unicode UnicodeLocaleTranslate(const unicode text, unicode sear, unicode repl) :c,pure,entrypoint='ulUnicodeLocaleTranslate';\n"
			
 
				 "END;\n";
			
 
				 
			
@@ -747,6 +748,53 @@ void translate(UnicodeString & toProcess, UChar const * sear, unsigned searLen,
 
				     }
			
 
				 }
			
 
				 
			
 
				+void excludeLastWord(RuleBasedBreakIterator& bi, UnicodeString & toProcess)
			
 
				+{
			
 
				+    bi.setText(toProcess);
			
 
				+    int32_t idx = bi.last();
			
 
				+    int32_t wordidx = 0;
			
 
				+    int32_t wordBeginning = 0;
			
 
				+    int32_t wordEnd = idx;
			
 
				+    while (idx != 0)
			
 
				+    {
			
 
				+        //Walk the break iterator backwards from bi.last() until a word is found or position 0 is reached.
			
 
				+        int breakType = bi.getRuleStatus();
			
 
				+        if (breakType != UBRK_WORD_NONE)
			
 
				+        {
			
 
				+            // Reaching here means the current boundary belongs to a word or number.
			
 
				+            //   A status value UBRK_WORD_NONE would instead indicate spaces,
			
 
				+            //   punctuation, and the like, which are skipped by the loop.
			
 
				+            ++wordidx;
			
 
				+            wordBeginning = bi.previous();
			
 
				+            //The word has been counted and the iterator moved backwards past it;
			
 
				+            //wordBeginning is now the boundary just before the start of the last word.
			
 
				+            if (bi.getRuleStatus() != UBRK_WORD_NONE)
			
 
				+            {
			
 
				+                //Check for languages that do not use space characters to separate words.
			
 
				+                //If a word lies before the current location of the iterator,
			
 
				+                //increment the wordidx so the single-word check below does not fire.
			
 
				+                ++wordidx;
			
 
				+            }
			
 
				+            if (bi.previous() == 0 && wordidx == 1)
			
 
				+            {
			
 
				+                //Check for single word string. In place to remove leading whitespaces if so.
			
 
				+                //Moves iterator backwards to the next boundary: either the beginning or end of a word.
			
 
				+                //If at the beginning of a word, wordidx should be 2,
			
 
				+                //and the condition should fail regardless of the iterator being the first position.
			
 
				+                //If at the end of a word, wordidx should be 1,
			
 
				+                //and the condition should fail because the iterator is not the first position.
			
 
				+                wordBeginning = 0;
			
 
				+            }
			
 
				+            toProcess.removeBetween(wordBeginning, wordEnd);
			
 
				+            return;
			
 
				+        }
			
 
				+        //Step back over a trailing non-word segment; the loop repeats until a word or the start of the string is reached.
			
 
				+        idx = bi.previous();
			
 
				+    }
			
 
				+    //Reached only if the string has no words: the whole text is removed.
			
 
				+    toProcess.removeBetween(0, bi.last());
			
 
				+}
			
 
				+
			
 
				 void excludeNthWord(RuleBasedBreakIterator& bi, UnicodeString & source, unsigned n)
			
 
				 {
			
 
				     bi.setText(source);
			
@@ -1505,6 +1553,27 @@ UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleExcludeNthWord(unsigned & tgt
 
				     }
			
 
				 }
			
 
				 
			
 
				+/**
			
 
				+ * ECL entry point for UnicodeLocaleExcludeLastWord: copies text, removes its last word
			
 
				+ * with excludeLastWord() and returns the result through tgt/tgtLen.
			
 
				+ *
			
 
				+ * @param tgtLen      Receives the length of the result in UChars (0 when the result is empty).
			
 
				+ * @param tgt         Receives a CTXMALLOC'd buffer holding the result, or 0 when it is empty.
			
 
				+ * @param textLen     Length of text in UChars.
			
 
				+ * @param text        The source text.
			
 
				+ * @param localename  Locale name used to create the ICU word break iterator.
			
 
				+ */
			
 
				+UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleExcludeLastWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, char const * localename)
			
 
				+{
			
 
				+    UErrorCode status = U_ZERO_ERROR;
			
 
				+    Locale locale(localename);
			
 
				+    RuleBasedBreakIterator* bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(locale, status);
			
 
				+    UnicodeString processed(text, textLen);
			
 
				+    // createWordInstance reports failure through status and may return no iterator;
			
 
				+    // in that case the text is passed through unchanged instead of dereferencing null.
			
 
				+    if (bi && U_SUCCESS(status))
			
 
				+        excludeLastWord(*bi, processed);
			
 
				+    delete bi;
			
 
				+    if (processed.length()>0)
			
 
				+    {
			
 
				+        tgtLen = processed.length();
			
 
				+        // Size the buffer in bytes from the element type rather than a literal 2.
			
 
				+        tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen * sizeof(UChar));
			
 
				+        processed.extract(0, tgtLen, tgt);
			
 
				+    }
			
 
				+    else
			
 
				+    {
			
 
				+        tgtLen = 0;
			
 
				+        tgt = 0;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleTranslate(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned searLen, UChar const * sear, unsigned replLen, UChar * repl)
			
 
				 {
			
 
				     UnicodeString processed(text, textLen);
			
--- a/plugins/unicodelib/unicodelib.hpp
+++ b/plugins/unicodelib/unicodelib.hpp
@@ -102,6 +102,7 @@ UNICODELIB_API bool UNICODELIB_CALL ulUnicodeLocaleEditDistanceWithinRadius(unsi
 
				 UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleWordCount(unsigned textLen, UChar const * text,char const * localename);
			
 
				 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleGetNthWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned n, char const * localename);
			
 
				 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleExcludeNthWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned n, char const * localename);
			
 
				+UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleExcludeLastWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, char const * localename);
			
 
				 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleTranslate(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned searLen, UChar const * sear, unsigned replLen, UChar * repl);
			
 
				 }