Sfoglia il codice sorgente

HPCC-10475 Avoid u8'string' going via a unicode type

Currently the Unicode type (incorrectly) counts lengths in UTF-16 code units
rather than in characters (code points).  Not going via the Unicode type allows
utf8 constants to be treated correctly - without the upheaval of changing
the utf16 handling.

Signed-off-by: Gavin Halliday <gavin.halliday@lexisnexis.com>
Gavin Halliday 11 anni fa
parent
commit
50408e4c17

+ 19 - 0
common/deftype/defvalue.cpp

@@ -739,6 +739,25 @@ IValue *createUnicodeValue(char const * value, unsigned size, char const * local
     return ret;
 }
 
+IValue *createUtf8Value(size32_t len, char const * value, char const * locale, bool unescape) // Creates a utf8 value from value/len, building the utf8 type from the resulting length and the lower-cased locale
+{
+    if (unescape)
+    {
+        rtlDataAttr temp; // passed below as the 'out' buffer of the unescape call
+        size32_t newlen = 0;
+        size32_t size = rtlUtf8Size(len, value); // the unescape call takes a size rather than len (presumably length in characters -> size in bytes; confirm)
+        rtlCodepageToUtf8XUnescape(newlen, temp.refstr(), size, value, "UTF-8");
+        // The type is built from newlen (the length reported after unescaping), not from the incoming len.
+        ITypeInfo * type = makeUtf8Type(newlen, createLowerCaseAtom(locale));
+        return createUtf8Value(temp.getstr(), type);
+    }
+    else
+    {
+        ITypeInfo * type = makeUtf8Type(len, createLowerCaseAtom(locale)); // no unescaping: value is used unchanged with the caller's len
+        return createUtf8Value(value, type);
+    }
+}
+
 IValue *createUnicodeValue(char const * value, ITypeInfo * type)
 {
     if(type->getSize() == UNKNOWN_LENGTH)

+ 1 - 0
common/deftype/defvalue.hpp

@@ -82,6 +82,7 @@ extern DEFTYPE_API IValue * createVarUnicodeValue(char const * value, unsigned s
 extern DEFTYPE_API IValue * createVarUnicodeValue(size32_t len, const void * text, ITypeInfo * type);
 extern DEFTYPE_API IValue * createUtf8Value(char const * value, ITypeInfo *type);
 extern DEFTYPE_API IValue * createUtf8Value(unsigned srclen, char const * value, ITypeInfo *type);
+extern DEFTYPE_API IValue * createUtf8Value(size32_t len, char const * value, char const * locale, bool unescape); // builds the utf8 type itself from len and locale; unescape=true unescapes value first
 extern DEFTYPE_API IValue * createDataValue(const char * value, unsigned size);
 extern DEFTYPE_API IValue * createDataValue(const char * value, ITypeInfo *type);
 extern DEFTYPE_API IValue * createQStringValue(unsigned len, const char * value, ITypeInfo *type);

+ 10 - 6
ecl/hql/hqllex.l

@@ -27,14 +27,12 @@
 #include "hqlerrors.hpp"
 #include "hql.hpp"
 #include "hqlgram.hpp"
+#include "eclrtl.hpp"
 
 #include "hqlgram.h"
 
 #define YY_DECL int HqlLex::doyyFlex(YYSTYPE & returnToken, yyscan_t yyscanner, HqlLex * lexer, bool lookup, const short * activeState)
 
-//declare this rather than including the header because VC++ seems to get very upset running out of heap if you do.
-extern int rtlSingleUtf8ToCodepage(char * out, unsigned inlen, char const * in, char const * outcodepage);
-
 //#define DEBUG_TOKEN 
 
 #ifdef DEBUG_TOKEN
@@ -1632,12 +1630,18 @@ FUNCTIONMACRO|MACRO {
                             lexer->reportError(returnToken, ERR_ESCAPE_UNKNOWN, "%s", msg.str());
                             returnToken.setPosition(lexer->yyLineNo, oldColumn, lexer->yyPosition, lexer->querySourcePath());
                         }
-                        Owned<IValue> unicodeValue = createUnicodeValue(CUR_TOKEN_TEXT + start, CUR_TOKEN_LENGTH - (start+1), "", true, true);
+
+                        size32_t size = CUR_TOKEN_LENGTH - (start+1);
+                        const char * value = CUR_TOKEN_TEXT + start;
+                        Owned<IValue> unicodeValue;
                         if (utf8)
                         {
-                            Owned<ITypeInfo> castType = makeUtf8Type(UNKNOWN_LENGTH, NULL);
-                            unicodeValue.setown(unicodeValue->castTo(castType));
+                            size32_t length = rtlUtf8Length(size, value);
+                            unicodeValue.setown(createUtf8Value(length, value, "", true));
                         }
+                        else
+                            unicodeValue.setown(createUnicodeValue(value, size, "", true, true));
+                        
                         returnToken.setExpr(createConstant(unicodeValue.getClear()));
                         return (UNICODE_CONST);
                     }

+ 18 - 0
rtl/eclrtl/eclrtl.cpp

@@ -3181,6 +3181,24 @@ void rtlCodepageToUnicodeXUnescape(unsigned & outlen, UChar * & out, unsigned in
     normalized.extract(0, outlen, out);
 }
 
+void rtlCodepageToUtf8XUnescape(unsigned & outlen, char * & out, unsigned inlen, char const * in, char const * codepage) // Unescapes and normalizes text in the given codepage; returns it as UTF-8 in a buffer from rtlMalloc
+{
+    //If the input contains a character which doesn't exist in its claimed codepage, this will
+    //generate U+FFFD (substitution character). This most likely won't be displayed.
+    UnicodeString raw(in, inlen, codepage);
+    UnicodeString unescaped = raw.unescape();
+    UnicodeString normalized;
+    normalizeUnicodeString(unescaped, normalized);
+
+    UConverter * utf8Conv = queryRTLUnicodeConverter(UTF8_CODEPAGE)->query();
+    UErrorCode err = U_ZERO_ERROR;
+    size32_t outsize = normalized.extract(NULL, 0, utf8Conv, err); // sizing pass: no destination buffer, the result is used as the allocation size
+    err = U_ZERO_ERROR; // clear whatever status the sizing pass left before the real conversion
+    out = (char *)rtlMalloc(outsize); // NOTE(review): nothing here releases this buffer - presumably the caller owns it; confirm
+    outsize = normalized.extract(out, outsize, utf8Conv, err); // NOTE(review): err is not inspected after this call
+    outlen = rtlUtf8Length(outsize, out); // outlen is derived via rtlUtf8Length, it is not the raw outsize
+}
+
 void rtlUnicodeToCodepageX(unsigned & outlen, char * & out, unsigned inlen, UChar const * in, char const * codepage)
 {
     //If the unicode contains a character which doesn't exist in the destination codepage,

+ 2 - 0
rtl/eclrtl/eclrtl.hpp

@@ -322,6 +322,8 @@ ECLRTL_API void rtlCodepageToVUnicode(unsigned outlen, UChar * out, unsigned inl
 ECLRTL_API void rtlVCodepageToUnicode(unsigned outlen, UChar * out, char const * in, char const * codepage);
 ECLRTL_API void rtlVCodepageToVUnicode(unsigned outlen, UChar * out, char const * in, char const * codepage);
 ECLRTL_API void rtlCodepageToUnicodeUnescape(unsigned outlen, UChar * out, unsigned inlen, char const * in, char const * codepage);
+ECLRTL_API void rtlCodepageToUtf8XUnescape(unsigned & outlen, char * & out, unsigned inlen, char const * in, char const * codepage); // out is allocated with rtlMalloc; outlen is the rtlUtf8Length of the result
+
 ECLRTL_API void rtlUnicodeToCodepage(unsigned outlen, char * out, unsigned inlen, UChar const * in, char const * codepage);
 ECLRTL_API void rtlUnicodeToData(unsigned outlen, void * out, unsigned inlen, UChar const * in);
 ECLRTL_API void rtlUnicodeToVCodepage(unsigned outlen, char * out, unsigned inlen, UChar const * in, char const * codepage);

BIN
testing/regress/ecl/issue10475.ecl


+ 36 - 0
testing/regress/ecl/key/issue10475.xml

@@ -0,0 +1,36 @@
+<Dataset name='Result 1'>
+ <Row><Result_1>noël</Result_1></Row>
+</Dataset>
+<Dataset name='Result 2'>
+ <Row><Result_2>lëon</Result_2></Row>
+</Dataset>
+<Dataset name='Result 3'>
+ <Row><Result_3>noë</Result_3></Row>
+</Dataset>
+<Dataset name='Result 4'>
+ <Row><Result_4>4</Result_4></Row>
+</Dataset>
+<Dataset name='Result 5'>
+ <Row><Result_5>😸😾</Result_5></Row>
+</Dataset>
+<Dataset name='Result 6'>
+ <Row><Result_6>2</Result_6></Row>
+</Dataset>
+<Dataset name='Result 7'>
+ <Row><Result_7>😾</Result_7></Row>
+</Dataset>
+<Dataset name='Result 8'>
+ <Row><Result_8>😾😸</Result_8></Row>
+</Dataset>
+<Dataset name='Result 9'>
+ <Row><Result_9>baffle</Result_9></Row>
+</Dataset>
+<Dataset name='Result 10'>
+ <Row><Result_10>4</Result_10></Row>
+</Dataset>
+<Dataset name='Result 11'>
+ <Row><Result_11>BAFFLE</Result_11></Row>
+</Dataset>
+<Dataset name='Result 12'>
+ <Row><Result_12>true</Result_12></Row>
+</Dataset>