11 anni fa · 50408e4c17
--- a/common/deftype/defvalue.cpp
+++ b/common/deftype/defvalue.cpp
@@ -739,6 +739,25 @@ IValue *createUnicodeValue(char const * value, unsigned size, char const * local
 
				     return ret;
			
 
				 }
			
 
				 
			
 
				+IValue *createUtf8Value(size32_t len, char const * value, char const * locale, bool unescape) // Builds a UTF-8 constant from `value`, optionally unescaping it first. `len` is presumably a count of UTF-8 characters, not bytes (it is fed to rtlUtf8Size below) - TODO confirm.
			
 
				+{
			
 
				+    if (unescape)
			
 
				+    {
			
 
				+        rtlDataAttr temp; // receives the buffer allocated by rtlCodepageToUtf8XUnescape; presumably releases it at scope exit - confirm
			
 
				+        size32_t newlen = 0; // filled in by the unescape call below
			
 
				+        size32_t size = rtlUtf8Size(len, value); // value passed on as the input length of the unescape call
			
 
				+        rtlCodepageToUtf8XUnescape(newlen, temp.refstr(), size, value, "UTF-8"); // input is declared to be UTF-8 already; the call unescapes and normalizes it
			
 
				+
			
 
				+        ITypeInfo * type = makeUtf8Type(newlen, createLowerCaseAtom(locale)); // type uses the post-unescape length and a lower-cased locale atom
			
 
				+        return createUtf8Value(temp.getstr(), type); // NOTE(review): temp goes out of scope here, so this overload is assumed to copy the data - verify
			
 
				+    }
			
 
				+    else
			
 
				+    {
			
 
				+        ITypeInfo * type = makeUtf8Type(len, createLowerCaseAtom(locale)); // no unescaping: the caller-supplied length is used as-is
			
 
				+        return createUtf8Value(value, type);
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				 IValue *createUnicodeValue(char const * value, ITypeInfo * type)
			
 
				 {
			
 
				     if(type->getSize() == UNKNOWN_LENGTH)
			
--- a/common/deftype/defvalue.hpp
+++ b/common/deftype/defvalue.hpp
@@ -82,6 +82,7 @@ extern DEFTYPE_API IValue * createVarUnicodeValue(char const * value, unsigned s
 
				 extern DEFTYPE_API IValue * createVarUnicodeValue(size32_t len, const void * text, ITypeInfo * type);
			
 
				 extern DEFTYPE_API IValue * createUtf8Value(char const * value, ITypeInfo *type);
			
 
				 extern DEFTYPE_API IValue * createUtf8Value(unsigned srclen, char const * value, ITypeInfo *type);
			
 
				+extern DEFTYPE_API IValue * createUtf8Value(size32_t len, char const * value, char const * locale, bool unescape); // overload taking a locale name; when unescape is true the text goes through rtlCodepageToUtf8XUnescape before the value is created
			
 
				 extern DEFTYPE_API IValue * createDataValue(const char * value, unsigned size);
			
 
				 extern DEFTYPE_API IValue * createDataValue(const char * value, ITypeInfo *type);
			
 
				 extern DEFTYPE_API IValue * createQStringValue(unsigned len, const char * value, ITypeInfo *type);
			
--- a/ecl/hql/hqllex.l
+++ b/ecl/hql/hqllex.l
@@ -27,14 +27,12 @@
 
				 #include "hqlerrors.hpp"
			
 
				 #include "hql.hpp"
			
 
				 #include "hqlgram.hpp"
			
 
				+#include "eclrtl.hpp"
			
 
				 
			
 
				 #include "hqlgram.h"
			
 
				 
			
 
				 #define YY_DECL int HqlLex::doyyFlex(YYSTYPE & returnToken, yyscan_t yyscanner, HqlLex * lexer, bool lookup, const short * activeState)
			
 
				 
			
 
				-//declare this rather than including the header because VC++ seems to get very upset running out of heap if you do.
			
 
				-extern int rtlSingleUtf8ToCodepage(char * out, unsigned inlen, char const * in, char const * outcodepage);
			
 
				-
			
 
				 //#define DEBUG_TOKEN 
			
 
				 
			
 
				 #ifdef DEBUG_TOKEN
			
@@ -1632,12 +1630,18 @@ FUNCTIONMACRO|MACRO {
 
				                             lexer->reportError(returnToken, ERR_ESCAPE_UNKNOWN, "%s", msg.str());
			
 
				                             returnToken.setPosition(lexer->yyLineNo, oldColumn, lexer->yyPosition, lexer->querySourcePath());
			
 
				                         }
			
 
				-                        Owned<IValue> unicodeValue = createUnicodeValue(CUR_TOKEN_TEXT + start, CUR_TOKEN_LENGTH - (start+1), "", true, true);
			
 
				+
			
 
				+                        size32_t size = CUR_TOKEN_LENGTH - (start+1);
			
 
				+                        const char * value = CUR_TOKEN_TEXT + start;
			
 
				+                        Owned<IValue> unicodeValue;
			
 
				                         if (utf8)
			
 
				                         {
			
 
				-                            Owned<ITypeInfo> castType = makeUtf8Type(UNKNOWN_LENGTH, NULL);
			
 
				-                            unicodeValue.setown(unicodeValue->castTo(castType));
			
 
				+                            size32_t length = rtlUtf8Length(size, value);
			
 
				+                            unicodeValue.setown(createUtf8Value(length, value, "", true));
			
 
				                         }
			
 
				+                        else
			
 
				+                            unicodeValue.setown(createUnicodeValue(value, size, "", true, true));
			
 
				+                        
			
 
				                         returnToken.setExpr(createConstant(unicodeValue.getClear()));
			
 
				                         return (UNICODE_CONST);
			
 
				                     }
			
--- a/rtl/eclrtl/eclrtl.cpp
+++ b/rtl/eclrtl/eclrtl.cpp
@@ -3181,6 +3181,24 @@ void rtlCodepageToUnicodeXUnescape(unsigned & outlen, UChar * & out, unsigned in
 
				     normalized.extract(0, outlen, out);
			
 
				 }
			
 
				 
			
 
				+void rtlCodepageToUtf8XUnescape(unsigned & outlen, char * & out, unsigned inlen, char const * in, char const * codepage) // Converts `in` from `codepage` to unescaped, normalized UTF-8; `out` is allocated with rtlMalloc (presumably owned by the caller - confirm) and `outlen` is set from rtlUtf8Length.
			
 
				+{
			
 
				+    //If the input contains a character which doesn't exist in its claimed codepage, this will
			
 
				+    //generate U+FFFD (substitution character). This most likely won't be displayed.
			
 
				+    UnicodeString raw(in, inlen, codepage); // decode the codepage text into an ICU UnicodeString
			
 
				+    UnicodeString unescaped = raw.unescape(); // resolve backslash escape sequences in the decoded text
			
 
				+    UnicodeString normalized;
			
 
				+    normalizeUnicodeString(unescaped, normalized); // normalization form is decided by normalizeUnicodeString (not visible here)
			
 
				+
			
 
				+    UConverter * utf8Conv = queryRTLUnicodeConverter(UTF8_CODEPAGE)->query();
			
 
				+    UErrorCode err = U_ZERO_ERROR;
			
 
				+    size32_t outsize = normalized.extract(NULL, 0, utf8Conv, err); // preflight with an empty buffer to obtain the required size
			
 
				+    err = U_ZERO_ERROR; // reset: presumably the preflight reports a buffer-overflow status by design
			
 
				+    out = (char *)rtlMalloc(outsize); // exactly outsize bytes; no terminator byte is reserved
			
 
				+    outsize = normalized.extract(out, outsize, utf8Conv, err); // NOTE(review): err is never inspected after the real conversion
			
 
				+    outlen = rtlUtf8Length(outsize, out); // outlen is derived from the byte count via rtlUtf8Length, not the byte count itself
			
 
				+}
			
 
				+
			
 
				 void rtlUnicodeToCodepageX(unsigned & outlen, char * & out, unsigned inlen, UChar const * in, char const * codepage)
			
 
				 {
			
 
				     //If the unicode contains a character which doesn't exist in the destination codepage,
			
--- a/rtl/eclrtl/eclrtl.hpp
+++ b/rtl/eclrtl/eclrtl.hpp
@@ -322,6 +322,8 @@ ECLRTL_API void rtlCodepageToVUnicode(unsigned outlen, UChar * out, unsigned inl
 
				 ECLRTL_API void rtlVCodepageToUnicode(unsigned outlen, UChar * out, char const * in, char const * codepage);
			
 
				 ECLRTL_API void rtlVCodepageToVUnicode(unsigned outlen, UChar * out, char const * in, char const * codepage);
			
 
				 ECLRTL_API void rtlCodepageToUnicodeUnescape(unsigned outlen, UChar * out, unsigned inlen, char const * in, char const * codepage);
			
 
				+ECLRTL_API void rtlCodepageToUtf8XUnescape(unsigned & outlen, char * & out, unsigned inlen, char const * in, char const * codepage); // unescapes and normalizes `in`, returning UTF-8 in an rtlMalloc'd `out`; `outlen` comes from rtlUtf8Length of the result
			
 
				+
			
 
				 ECLRTL_API void rtlUnicodeToCodepage(unsigned outlen, char * out, unsigned inlen, UChar const * in, char const * codepage);
			
 
				 ECLRTL_API void rtlUnicodeToData(unsigned outlen, void * out, unsigned inlen, UChar const * in);
			
 
				 ECLRTL_API void rtlUnicodeToVCodepage(unsigned outlen, char * out, unsigned inlen, UChar const * in, char const * codepage);
			
--- a/testing/regress/ecl/issue10475.ecl
+++ b/testing/regress/ecl/issue10475.ecl
--- a/testing/regress/ecl/key/issue10475.xml
+++ b/testing/regress/ecl/key/issue10475.xml
@@ -0,0 +1,36 @@
 
				+<Dataset name='Result 1'>
			
 
				+ <Row><Result_1>noël</Result_1></Row>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 2'>
			
 
				+ <Row><Result_2>lëon</Result_2></Row>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 3'>
			
 
				+ <Row><Result_3>noë</Result_3></Row>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 4'>
			
 
				+ <Row><Result_4>4</Result_4></Row>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 5'>
			
 
				+ <Row><Result_5>😸😾</Result_5></Row>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 6'>
			
 
				+ <Row><Result_6>2</Result_6></Row>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 7'>
			
 
				+ <Row><Result_7>😾</Result_7></Row>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 8'>
			
 
				+ <Row><Result_8>😾😸</Result_8></Row>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 9'>
			
 
				+ <Row><Result_9>baﬄe</Result_9></Row>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 10'>
			
 
				+ <Row><Result_10>4</Result_10></Row>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 11'>
			
 
				+ <Row><Result_11>BAFFLE</Result_11></Row>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 12'>
			
 
				+ <Row><Result_12>true</Result_12></Row>
			
 
				+</Dataset>