Bläddra i källkod

Merge pull request #10953 from ghalliday/issue18187

HPCC-18187 Unicode Implementation of Repeat

Reviewed-By: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 7 år sedan
förälder
incheckning
21b654b042

+ 11 - 0
ecllibrary/std/Uni.ecl

@@ -481,4 +481,15 @@ EXPORT STRING Version() := lib_unicodelib.UnicodeLib.UnicodeVersion();
 EXPORT RemoveSuffix(unicode src, unicode suff, string form) :=
     lib_unicodelib.UnicodeLib.UnicodeLocaleRemoveSuffix(src, suff, form);
 
+/*
+ * Returns a unicode string containing src repeated n times.
+ *
+ * @param src           The unicode string to be repeated.
+ * @param n             Number of repetitions.
+ * @return              A unicode string containing n concatenations of src.
+ */
+
+EXPORT Repeat(unicode src, unsigned4 n) :=
+    lib_unicodelib.UnicodeLib.UnicodeLocaleRepeat(src, n);
+
 END;

+ 60 - 0
ecllibrary/teststd/uni/TestRepeat.ecl

@@ -0,0 +1,60 @@
+/*##############################################################################
+## HPCC SYSTEMS software Copyright (C) 2018 HPCC Systems®.  All rights reserved.
+############################################################################## */
+
+IMPORT Std.Uni;
+
+EXPORT TestRepeat := MODULE // Tests for Uni.Repeat
+
+   EXPORT TestConst := MODULE
+
+    angstrom := U'A\u030A';         // A followed by a combining ring; a single character (LENGTH(angstrom) = 1 below)
+    angstrom2d := x'41000A03';      // Raw bytes: 16-bit little-endian code units 0041 (A) and 030A (combining ring)
+    angstrom2 := (>unicode<)angstrom2d; // Type transfer of the bytes to unicode, so the value is not normalized
+    revangstrom := U'\u030AA';      // Combining ring followed by an A
+
+    EXPORT Tests := [
+        ASSERT(Uni.Repeat('Repeat this string ', 0) = '');
+        ASSERT(Uni.Repeat('Repeat this string ', 1) = 'Repeat this string ');
+        ASSERT(Uni.Repeat('Repeat this string ', 2) = 'Repeat this string Repeat this string'); // NOTE(review): expected value has no trailing space - presumably relies on trailing spaces being ignored by the comparison; confirm
+
+        ASSERT(Uni.Repeat(U'', 0) = '');
+        ASSERT(Uni.Repeat(U'', 1) = '');
+        ASSERT(Uni.Repeat(U'', 2) = '');
+        ASSERT(Uni.Repeat(U'', 10) = '');
+        ASSERT(Uni.Repeat(U'', -2) = ''); // A negative count gives an empty result
+
+        ASSERT(Uni.Repeat(U'r', 0) = '');
+        ASSERT(Uni.Repeat(U'r', 1) = 'r');
+        ASSERT(Uni.Repeat(U'r', 2) = 'rr');
+        ASSERT(Uni.Repeat(U'r', 10) = 'rrrrrrrrrr');
+        ASSERT(Uni.Repeat(U'r', -2) = '');
+
+        ASSERT(Uni.Repeat(U'abc', 0) = '');
+        ASSERT(Uni.Repeat(U'abc', 1) = 'abc');
+        ASSERT(Uni.Repeat(U'abc', 2) = 'abcabc');
+        ASSERT(Uni.Repeat(U'abc', 10) = 'abcabcabcabcabcabcabcabcabcabc');
+        ASSERT(Uni.Repeat(U'abc', -2) = '');
+
+        //Various checks to ensure that strings are correctly normalized after duplicating
+        ASSERT(Uni.Repeat(angstrom, 1) = U'\u212B');
+        ASSERT(LENGTH(angstrom) = 1);
+        ASSERT(LENGTH(angstrom2) = 2);
+        ASSERT(LENGTH(TRIM(angstrom2)) = 2);
+        ASSERT(LENGTH(Uni.Repeat(angstrom, 1)) = 1);
+        ASSERT(LENGTH(Uni.Repeat(angstrom2, 1)) = 2);
+        ASSERT(LENGTH(TRIM(Uni.Repeat(angstrom2, 1))) = 1); // Normalized to one character; the rest of the result is space padding
+        ASSERT(LENGTH(Uni.Repeat(angstrom2, 2)) = 4);
+        ASSERT(LENGTH(TRIM(Uni.Repeat(angstrom2, 2))) = 2);
+        ASSERT(Uni.Repeat(angstrom2, 1) = U'\u212B');
+        ASSERT(revangstrom[2] = 'A');
+
+        ASSERT(LENGTH(Uni.Repeat(revangstrom, 1)) = 2);
+        ASSERT(LENGTH(TRIM(Uni.Repeat(revangstrom   , 1))) = 2);
+        ASSERT(LENGTH(Uni.Repeat(revangstrom, 2)) = 4);
+        ASSERT(LENGTH(TRIM(Uni.Repeat(revangstrom   , 2))) = 3); // The A ending the first copy combines with the ring starting the second
+        ASSERT(Uni.Repeat(revangstrom, 2) = U'\u030A\u212bA');
+
+        ASSERT(TRUE)];
+   END;
+END;

+ 42 - 1
plugins/unicodelib/unicodelib.cpp

@@ -92,6 +92,7 @@ static const char * EclDefinition =
 "  boolean UnicodeLocaleEndsWith(const unicode src, const unicode suff, const string form) :c,pure,entrypoint='ulUnicodeLocaleEndsWith';\n"
 "  string UnicodeVersion():c,pure,entrypoint='ulUnicodeVersion';\n"
 "  unicode UnicodeLocaleRemoveSuffix(const unicode src, const unicode suff, const string form) :c,pure,entrypoint='ulUnicodeLocaleRemoveSuffix';\n"
+"  unicode UnicodeLocaleRepeat(const unicode src, unsigned4 n) : c, pure,entrypoint='ulUnicodeLocaleRepeat'; \n"
 "END;\n";
 
 static const char * compatibleVersions[] = {
@@ -121,6 +122,21 @@ UNICODELIB_API bool getECLPluginDefinition(ECLPluginDefinitionBlock *pb)
     return true;
 }
 
+/*
+ * Ensures the inLen UChars at in are in NFC form, normalizing them in place if necessary.
+ * A normalized result that is shorter than inLen is padded with trailing spaces, so the
+ * length of the text never changes.  The text is left unmodified if ICU reports an error,
+ * if the scratch buffer cannot be allocated, or if the normalized text would not fit in
+ * inLen UChars.
+ *
+ * @param inLen         Number of UChars in the buffer.
+ * @param in            The text to normalize; updated in place.
+ */
+static void unicodeEnsureIsNormalized(unsigned inLen, UChar * in)
+{
+    UErrorCode err = U_ZERO_ERROR;
+    //If the check itself fails, a following unorm_normalize() would return immediately with a
+    //length of 0 and the whole text would be replaced by spaces - so stop here instead
+    if (unorm_isNormalized(in, inLen, UNORM_NFC, &err) || U_FAILURE(err))
+        return;
+
+    UChar * buff = (UChar *)malloc(inLen * sizeof(UChar));
+    if (!buff)
+        return;
+
+    unsigned len = unorm_normalize(in, inLen, UNORM_NFC, 0, buff, inLen, &err);
+    //Only trust buff when ICU succeeded and the result fits.  On overflow the returned length is
+    //the size that would be required, and the contents of buff are not guaranteed.
+    if (U_SUCCESS(err) && (len <= inLen))
+    {
+        memcpy(in, buff, len * sizeof(UChar));
+        while (len < inLen) in[len++] = 0x0020;
+    }
+    free(buff);
+}
+
 
 namespace nsUnicodelib {
 
@@ -330,7 +346,7 @@ private:
         next_ = new uint32_t[capacity_+1]; // the number of characters is always less or equal to the string length
         unsigned index=0;
         next_[index] = 0;
-        int32_t end = 0;
+        uint32_t end = 0;
         while (end < capacity_)
         {
             end = end+ucpLength(ustring_[end]);
@@ -1749,3 +1765,28 @@ UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleRemoveSuffix(unsigned & tgtLe
     tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen * 2);
     pro.extract(0, tgtLen, tgt);
 }
+
+/*
+ * Plugin entry point behind UnicodeLocaleRepeat: returns src concatenated n times.
+ *
+ * An empty result (tgtLen 0, tgt null) is returned when src is empty, when n is zero or
+ * negative once viewed as a signed int, or when srcLen * n overflows the result length.
+ * Otherwise the result is allocated with CTXMALLOC and normalized in place.
+ *
+ * @param tgtLen        Receives the length of the result in UChars.
+ * @param tgt           Receives the newly allocated result.
+ * @param srcLen        Length of src in UChars.
+ * @param src           The text to repeat.
+ * @param n             Number of repetitions.
+ */
+UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleRepeat(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned n)
+{
+    size32_t totalLen = 0;
+    bool usable = ((int)n > 0) && (srcLen != 0);
+    if (usable)
+    {
+        //n is known to be non-zero here, so the division used to detect overflow is safe
+        totalLen = srcLen * n;
+        usable = (totalLen / n == srcLen);
+    }
+
+    if (!usable)
+    {
+        tgtLen = 0;
+        tgt = nullptr;
+        return;
+    }
+
+    UChar * const out = (UChar *)CTXMALLOC(parentCtx, totalLen * sizeof(UChar));
+    assertex(out);
+
+    //Lay the copies down one after another
+    UChar * cursor = out;
+    for (unsigned remaining = n; remaining != 0; --remaining)
+    {
+        memcpy(cursor, src, srcLen * sizeof(UChar));
+        cursor += srcLen;
+    }
+
+    //Characters at the end of one copy may combine with the start of the next, so re-normalize
+    unicodeEnsureIsNormalized(totalLen, out);
+
+    tgtLen = totalLen;
+    tgt = out;
+}

+ 1 - 0
plugins/unicodelib/unicodelib.hpp

@@ -108,6 +108,7 @@ UNICODELIB_API bool UNICODELIB_CALL ulUnicodeLocaleStartsWith(unsigned srcLen, U
 UNICODELIB_API bool UNICODELIB_CALL ulUnicodeLocaleEndsWith(unsigned srcLen, UChar const * src, unsigned suffLen, UChar const * suff, unsigned formLen, char const * form);
 UNICODELIB_API void UNICODELIB_CALL ulUnicodeVersion(unsigned & tgtLen, char * & tgt);
 UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleRemoveSuffix(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned suffLen, UChar const * suff, unsigned formLen, char const * form);
+UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleRepeat(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned n);
 }
 
 #endif

+ 4 - 4
rtl/eclrtl/eclrtl.cpp

@@ -357,19 +357,19 @@ bool vunicodeNeedsNormalize(UChar * in, UErrorCode * err)
 
 void unicodeReplaceNormalized(unsigned inlen, UChar * in, UErrorCode * err) // Replaces the inlen UChars at in with their NFC form, space padded to inlen
 {
-    UChar * buff = (UChar *)rtlMalloc(inlen*2);
+    UChar * buff = (UChar *)rtlMalloc(inlen*sizeof(UChar)); // scratch buffer holding inlen UChars
     unsigned len = unorm_normalize(in, inlen, UNORM_NFC, 0, buff, inlen, err);
     while(len<inlen) buff[len++] = 0x0020; // pad a shorter result with spaces
-    memcpy(in, buff, inlen);
+    memcpy(in, buff, inlen * sizeof(UChar)); // copy inlen UChars back (the removed line copied only inlen bytes)
     free(buff);
 }
 
 void vunicodeReplaceNormalized(unsigned inlen, UChar * in, UErrorCode * err) // Replaces the zero-terminated text at in (a buffer of inlen UChars) with its NFC form
 {
-    UChar * buff = (UChar *)rtlMalloc(inlen*2);
+    UChar * buff = (UChar *)rtlMalloc(inlen*sizeof(UChar)); // scratch buffer holding inlen UChars
     unsigned len = unorm_normalize(in, -1, UNORM_NFC, 0, buff, inlen-1, err); // capacity of inlen-1 leaves room for the terminator
     buff[len] = 0x0000; // NOTE(review): len is not clamped - if ICU returns more than inlen-1 this writes outside buff; confirm
-    memcpy(in, buff, inlen);
+    memcpy(in, buff, inlen * sizeof(UChar)); // copy inlen UChars back (the removed line copied only inlen bytes)
     free(buff);
 }