浏览代码

Merge pull request #11409 from ghalliday/issue20083

HPCC-20083 Fix problems with utf8 ordering for chars < 0x80

Reviewed-By: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 7 年之前
父节点
当前提交
ee66550921
共有 3 个文件被更改,包括 163 次插入 和 32 次删除
  1. 19 32
      rtl/eclrtl/eclrtl.cpp
  2. 99 0
      testing/regress/ecl/key/utf8order.xml
  3. 45 0
      testing/regress/ecl/utf8order.ecl

+ 19 - 32
rtl/eclrtl/eclrtl.cpp

@@ -2766,8 +2766,8 @@ const static UChar nullUStr = 0;
 #ifdef _USE_ICU
 int rtlCompareUnicodeUnicode(unsigned l1, UChar const * p1, unsigned l2, UChar const * p2, char const * locale)
 {
-    while(l1 && u_isUWhiteSpace(p1[l1-1])) l1--;
-    while(l2 && u_isUWhiteSpace(p2[l2-1])) l2--;
+    while(l1 && (p1[l1-1] == ' ')) l1--;
+    while(l2 && (p2[l2-1] == ' ')) l2--;
     if (!p1) p1 = &nullUStr;
     if (!p2) p2 = &nullUStr;
     return ucol_strcoll(queryRTLLocale(locale)->queryCollator(), p1, l1, p2, l2);
@@ -2775,8 +2775,8 @@ int rtlCompareUnicodeUnicode(unsigned l1, UChar const * p1, unsigned l2, UChar c
 
 int rtlCompareUnicodeUnicodeStrength(unsigned l1, UChar const * p1, unsigned l2, UChar const * p2, char const * locale, unsigned strength)
 {
-    while(l1 && u_isUWhiteSpace(p1[l1-1])) l1--;
-    while(l2 && u_isUWhiteSpace(p2[l2-1])) l2--;
+    while(l1 && (p1[l1-1] == ' ')) l1--;
+    while(l2 && (p2[l2-1] == ' ')) l2--;
     if (!p1) p1 = &nullUStr;
     if (!p2) p2 = &nullUStr;
     return ucol_strcoll(queryRTLLocale(locale)->queryCollator(strength), p1, l1, p2, l2);
@@ -4912,34 +4912,21 @@ static int rtlCompareUtf8Utf8ViaUnicode(size32_t llen, const char * left, size32
 #ifdef _USE_ICU
 int rtlCompareUtf8Utf8(size32_t llen, const char * left, size32_t rlen, const char * right, const char * locale)
 {
-    //MORE: Do a simple comparison as long as there are no non->0x80 characters around
-    //      fall back to a full unicode comparison if we hit one - or in the next character to allow for accents etc.
-    const byte * bleft = (const byte *)left;
-    const byte * bright = (const byte *)right;
-    unsigned len = llen > rlen ? rlen : llen;
-    for (unsigned i = 0; i < len; i++)
-    {
-        byte nextLeft = bleft[i];
-        byte nextRight = bright[i];
-        if (nextLeft >= 0x80 || nextRight >= 0x80)
-            return rtlCompareUtf8Utf8ViaUnicode(llen-i, left+i, rlen-i, right+i, locale);
-        if ((i+1 != len) && ((bleft[i+1] >= 0x80) || bright[i+1] >= 0x80))
-            return rtlCompareUtf8Utf8ViaUnicode(llen-i, left+i, rlen-i, right+i, locale);
-        if (nextLeft != nextRight)
-            return nextLeft - nextRight;
-    }
-    int diff = 0;
-    if (len != llen)
-    {
-        for (;(diff == 0) && (len != llen);len++)
-            diff = bleft[len] - ' ';
-    }
-    else if (len != rlen)
-    {
-        for (;(diff == 0) && (len != rlen);len++)
-            diff = ' ' - bright[len];
-    }
-    return diff;
+#if U_ICU_VERSION_MAJOR_NUM>=50
+    size_t lSize = rtlUtf8Size(llen, left);
+    while (lSize && (left[lSize-1] == ' '))
+        lSize--;
+
+    size_t rSize = rtlUtf8Size(rlen, right);
+    while (rSize && (right[rSize-1] == ' '))
+        rSize--;
+
+    UCollator * collator = queryRTLLocale(locale)->queryCollator();
+    UErrorCode status = U_ZERO_ERROR; // Not documented, but this needs to be cleared otherwise the function can fail
+    return ucol_strcollUTF8(collator, left, lSize, right, rSize, &status);
+#else
+    return rtlCompareUtf8Utf8ViaUnicode(llen, left, rlen, right, locale);
+#endif
 }
 
 int rtlCompareUtf8Utf8Strength(size32_t llen, const char * left, size32_t rlen, const char * right, const char * locale, unsigned strength)

+ 99 - 0
testing/regress/ecl/key/utf8order.xml

@@ -0,0 +1,99 @@
+<Dataset name='Result 1'>
+ <Row><Result_1>Cycles - which should never happen!</Result_1></Row>
+</Dataset>
+<Dataset name='Result 2'>
+ <Row><Result_2>true</Result_2></Row>
+</Dataset>
+<Dataset name='Result 3'>
+ <Row><Result_3>true</Result_3></Row>
+</Dataset>
+<Dataset name='Result 4'>
+ <Row><Result_4>false</Result_4></Row>
+</Dataset>
+<Dataset name='Result 5'>
+ <Row><Result_5>true</Result_5></Row>
+</Dataset>
+<Dataset name='Result 6'>
+ <Row><Result_6>true</Result_6></Row>
+</Dataset>
+<Dataset name='Result 7'>
+ <Row><Result_7>false</Result_7></Row>
+</Dataset>
+<Dataset name='Result 8'>
+ <Row><Result_8>Unicode:</Result_8></Row>
+</Dataset>
+<Dataset name='Result 9'>
+ <Row><Result_9>true</Result_9></Row>
+</Dataset>
+<Dataset name='Result 10'>
+ <Row><Result_10>true</Result_10></Row>
+</Dataset>
+<Dataset name='Result 11'>
+ <Row><Result_11>true</Result_11></Row>
+</Dataset>
+<Dataset name='Result 12'>
+ <Row><Result_12>true</Result_12></Row>
+</Dataset>
+<Dataset name='Result 13'>
+ <Row><Result_13>true</Result_13></Row>
+</Dataset>
+<Dataset name='Result 14'>
+ <Row><Result_14>Utf8:</Result_14></Row>
+</Dataset>
+<Dataset name='Result 15'>
+ <Row><Result_15>true</Result_15></Row>
+</Dataset>
+<Dataset name='Result 16'>
+ <Row><Result_16>true</Result_16></Row>
+</Dataset>
+<Dataset name='Result 17'>
+ <Row><Result_17>true</Result_17></Row>
+</Dataset>
+<Dataset name='Result 18'>
+ <Row><Result_18>true</Result_18></Row>
+</Dataset>
+<Dataset name='Result 19'>
+ <Row><Result_19>true</Result_19></Row>
+</Dataset>
+<Dataset name='Result 20'>
+ <Row><Result_20>true</Result_20></Row>
+</Dataset>
+<Dataset name='Result 21'>
+ <Row><Result_21>true</Result_21></Row>
+</Dataset>
+<Dataset name='Result 22'>
+ <Row><Result_22>true</Result_22></Row>
+</Dataset>
+<Dataset name='Result 23'>
+ <Row><Result_23>true</Result_23></Row>
+</Dataset>
+<Dataset name='Result 24'>
+ <Row><Result_24>true</Result_24></Row>
+</Dataset>
+<Dataset name='Result 25'>
+ <Row><Result_25>true</Result_25></Row>
+</Dataset>
+<Dataset name='Result 26'>
+ <Row><text>abc </text></Row>
+ <Row><text>abC </text></Row>
+ <Row><text>ABc </text></Row>
+ <Row><text>ABC </text></Row>
+ <Row><text>abcE</text></Row>
+ <Row><text>abcÈ</text></Row>
+</Dataset>
+<Dataset name='Result 27'>
+ <Row><text>ABC </text></Row>
+ <Row><text>ABc </text></Row>
+ <Row><text>abC </text></Row>
+ <Row><text>abc </text></Row>
+ <Row><text>abcE</text></Row>
+ <Row><text>abc&#200;</text></Row>
+</Dataset>
+<Dataset name='Result 28'>
+ <Row><text>abc</text></Row>
+ <Row><text>abC</text></Row>
+ <Row><text>ABc</text></Row>
+ <Row><text>ABC</text></Row>
+ <Row><text>abcE</text></Row>
+ <Row><text>abcÈ</text></Row>
+</Dataset>

+ 45 - 0
testing/regress/ecl/utf8order.ecl

@@ -0,0 +1,45 @@
+u8null := u8'' : stored('u8null');
+unull := u'' : stored('unull');
+
+output('Cycles - which should never happen!');
+output(u8'BC' > u8'BB\u20AC');
+output(u8'BB\u20AC' > u8'Ba');
+output(u8'Ba' > u8'BC');
+
+output(u'BC' > u'BB\u20AC');
+output(u'BB\u20AC' > u'Ba');
+output(u'Ba' > u'BC');
+
+output('Unicode:');
+output(u'abcÈ' > u'abcE');
+output(u'abcÈ'+unull > u'abcE');
+output(u'abcÈ'+unull != u'abcE');
+
+//Check correct length is used rather than size.
+output(u'AB\u20ACX'+unull < u'AB\u20ACY');
+output(u'AB\u20ACX'+unull != u'AB\u20ACY');
+
+Output('Utf8:');
+output(u8'abcÈ' > u8'abcE');
+output(u8'abcÈ'+u8null > u8'abcE');
+output(u8'abcÈ'+u8null != u8'abcE');
+
+output(u8'AB\u20ACX'+u8null < u8'AB\u20ACY');
+output(u8'AB\u20ACX'+u8null != u8'AB\u20ACY');
+
+output(U'AB ' = U'AB');
+output(U'AB ' != U'AB\t');
+output(U8'AB ' = U8'AB');
+output(U8'AB ' != U8'AB\t');
+output('AB ' = 'AB');
+output('AB ' != 'AB\t');
+
+//Illustrate the different ordering of string v unicode v utf8
+d1 := dataset([U'ABC',U'ABc',U'abc',U'abC',U'abcÈ',U'abcE'], { unicode4 text });
+output(sort(d1, text));
+
+d2 := dataset(['ABC','ABc','abc','abC','abcÈ','abcE'], { string4 text });
+output(sort(d2, text));
+
+d3 := dataset([u8'ABC',u8'ABc',u8'abc',u8'abC',u8'abcÈ',u8'abcE'], { utf8 text });
+output(sort(d3, text));