瀏覽代碼

Merge pull request #9111 from ghalliday/issue16038

HPCC-16038 Ensure headers and separators are utf8 for unicode csv

Reviewed-By: Shamser Ahmed <shamser.ahmed@lexisnexis.co.uk>
Reviewed-By: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 8 年之前
父節點
當前提交
c9de838fa9

+ 1 - 0
common/deftype/defvalue.hpp

@@ -60,6 +60,7 @@ public:
     virtual void toMem(void * ptr) = 0;
 
     virtual unsigned getHash(unsigned initval)=0;
+    virtual bool isValid() const = 0;   // reports whether this value is valid; the constant folder in this change only keeps a type-transferred constant when this returns true
 };
 
 typedef Owned<IValue> OwnedIValue;

+ 2 - 0
common/deftype/defvalue.ipp

@@ -46,6 +46,7 @@ public:
     // serializable
     virtual void serialize(MemoryBuffer &tgt) { UNIMPLEMENTED; }
     virtual void deserialize(MemoryBuffer &src) { UNIMPLEMENTED; }
+    virtual bool isValid() const { return true; }   // default implementation: always reports the value as valid
 
 protected:
     IValue * doCastTo(unsigned osize, const char * text, ITypeInfo *t);     // common code for string casting
@@ -382,6 +383,7 @@ public:
 // serializable
     virtual void serialize(MemoryBuffer &tgt);
     virtual void deserialize(MemoryBuffer &src);
+    virtual bool isValid() const { return rtlIsValidReal(sizeof(val), &val); }   // validity is delegated to rtlIsValidReal, passing the size and address of the stored val
 
 }; 
 

+ 25 - 22
ecl/hql/hqlfold.cpp

@@ -3295,35 +3295,38 @@ IHqlExpression * foldConstantOperator(IHqlExpression * expr, unsigned foldOption
             IValue * childValue = child->queryValue();
             if (childValue)
             {
-                ITypeInfo * exprType = expr->queryType();
+                Linked<ITypeInfo> exprType = expr->queryType();
                 ITypeInfo * childType = child->queryType();
-                if (exprType->getSize() <= childType->getSize())
+                size32_t childSize = childValue->getSize();
+                const void * rawvalue = childValue->queryValue();
+                unsigned newSize = exprType->getSize();
+                if (newSize == UNKNOWN_LENGTH)
                 {
-                    switch (childType->getTypeCode())
+                    unsigned newLen = UNKNOWN_LENGTH;
+                    switch (exprType->getTypeCode())
                     {
                     case type_string:
-                    case type_unicode:
                     case type_varstring:
+                        newLen = childSize;
+                        break;
+                    case type_unicode:
+                        newLen = childSize / sizeof(UChar);
+                        break;
                     case type_utf8:
-                        {
-                            //MORE: Should probably have more protection .....
-                            IValue * transferred = createValueFromMem(expr->getType(), childValue->queryValue());
-                            if (transferred)
-                                return createConstant(transferred);
-                            break;
-                        }
-                    case type_int:
-                        {
-                            __int64 value = childValue->getIntValue();
-                            const byte * ptr = (const byte *)&value;
-                            if (__BYTE_ORDER != __LITTLE_ENDIAN)
-                                ptr += sizeof(value) - childType->getSize();
-                            IValue * transferred = createValueFromMem(expr->getType(), ptr);
-                            if (transferred)
-                                return createConstant(transferred);
-                            break;
-                        }
+                        newLen = rtlUtf8Length(childSize, rawvalue);
+                        break;
                     }
+                    if (newLen != UNKNOWN_LENGTH)
+                    {
+                        newSize = childSize;
+                        exprType.setown(getStretchedType(newLen, exprType));
+                    }
+                }
+                if (newSize <= childSize)
+                {
+                    IValue * transferred = createValueFromMem(LINK(exprType), rawvalue);
+                    if (transferred && transferred->isValid())
+                        return createConstant(transferred);
                 }
             }
             break;

+ 66 - 19
ecl/hqlcpp/hqlhtcpp.cpp

@@ -9500,18 +9500,16 @@ void HqlCppTranslator::buildReturnCsvValue(BuildCtx & ctx, IHqlExpression * _exp
     buildReturn(ctx, expr, constUnknownVarStringType);
 }
 
-void HqlCppTranslator::buildCsvListFunc(BuildCtx & classctx, const char * func, IHqlExpression * attr, const char * defaultValue)
+void HqlCppTranslator::buildCsvListFunc(BuildCtx & classctx, const char * func, IHqlExpression * value, const char * defaultValue)
 {
     BuildCtx funcctx(classctx);
     StringBuffer s;
 
     s.clear().append("virtual const char * ").append(func).append("(unsigned idx)");
     funcctx.addQuotedCompound(s);
-    if (attr || defaultValue)
+    if (value || defaultValue)
     {
         OwnedHqlExpr idxVar = createVariable("idx", LINK(unsignedType));
-        IHqlExpression * value = attr ? attr->queryChild(0) : NULL;
-
         if (!value || !isEmptyList(value))
         {
             IHqlStmt * caseStmt = funcctx.addSwitch(idxVar);
@@ -9556,15 +9554,51 @@ void HqlCppTranslator::buildCsvListFunc(BuildCtx & classctx, const char * func,
     funcctx.addReturn(queryQuotedNullExpr());
 }
 
-static void expandDefaultString(StringBuffer & out, IHqlExpression * property, const char * defaultValue)
+static void expandDefaultString(StringBuffer & out, IHqlExpression * value, const char * defaultValue, IAtom * encoding) // Appends the constant text of value to out (or defaultValue when value has no constant); value is now the option expression itself rather than the attribute wrapping it
 {
-    IHqlExpression * value = property ? property->queryChild(0) : NULL;
+    //If there are multiple alternatives use the first in the list as the default
+    if (value && value->getOperator() == no_list)
+        value = value->queryChild(0); // may yield NULL; the check below then falls through to defaultValue
     if (value && value->queryValue())
-        value->queryValue()->getStringValue(out);
+    {
+        if (encoding == unicodeAtom)
+            getUTF8Value(out, value); // unicode encoding: append the value via getUTF8Value
+        else
+            value->queryValue()->getStringValue(out); // any other encoding: append the plain string value, as before
+    }
     else
         out.append(defaultValue);
 }
 
+static IHqlExpression * forceToCorrectEncoding(IHqlExpression * expr, IAtom * encoding) // Converts a CSV option expression (or every element of a no_list) when encoding is unicodeAtom; otherwise returns LINK(expr)
+{
+    //Workaround: ideally this would cast straight to a varutf8 type, but that type is not implemented.
+    //Instead the value is cast to utf8, type transferred (no_typetransfer) to a string, and then cast to a varstring.
+    //Reimplement if varutf8 is ever implemented.
+    if (expr && (encoding == unicodeAtom))
+    {
+        if (expr->isList())
+        {
+            assertex(expr->getOperator() == no_list); // the only list form handled here is a literal no_list
+            HqlExprArray args;
+            ForEachChild(i, expr)
+            {
+                IHqlExpression * value = expr->queryChild(i);
+                args.append(*forceToCorrectEncoding(value, encoding)); // recurse so each list element is converted individually
+            }
+            return expr->clone(args); // rebuild the list expression from the converted elements
+        }
+        else
+        {
+            OwnedHqlExpr cast = ensureExprType(expr, unknownUtf8Type); // step 1: cast to utf8
+            OwnedHqlExpr transfer = createValue(no_typetransfer, LINK(unknownStringType), LINK(cast)); // step 2: type transfer the utf8 value to a string
+            OwnedHqlExpr recast = ensureExprType(transfer, unknownVarStringType); // step 3: cast the string to a varstring
+            return foldHqlExpression(recast); // fold the cast/transfer chain before returning it
+        }
+    }
+    return LINK(expr); // no expression, or an encoding other than unicodeAtom: hand back expr via LINK
+}
+
 void HqlCppTranslator::buildCsvParameters(BuildCtx & subctx, IHqlExpression * csvAttr, IHqlExpression * record, bool isReading)
 {
     HqlExprArray attrs;
@@ -9580,14 +9614,22 @@ void HqlCppTranslator::buildCsvParameters(BuildCtx & subctx, IHqlExpression * cs
     bool singleHeader = false;
     bool manyHeader = false;
     IHqlExpression * headerAttr = queryAttribute(headingAtom, attrs);
-    IHqlExpression * terminator = queryAttribute(terminatorAtom, attrs);
-    IHqlExpression * separator = queryAttribute(separatorAtom, attrs);
-    IHqlExpression * escape = queryAttribute(escapeAtom, attrs);
+    IHqlExpression * terminatorAttr = queryAttribute(terminatorAtom, attrs);
+    IHqlExpression * separatorAttr = queryAttribute(separatorAtom, attrs);
+    IHqlExpression * escapeAttr = queryAttribute(escapeAtom, attrs);
+    IHqlExpression * quoteAttr = queryAttribute(quoteAtom, attrs);
+    LinkedHqlExpr terminator = terminatorAttr ? terminatorAttr->queryChild(0) : nullptr;
+    LinkedHqlExpr separator = separatorAttr ? separatorAttr->queryChild(0) : nullptr;
+    LinkedHqlExpr escape = escapeAttr ? escapeAttr->queryChild(0) : nullptr;
+    LinkedHqlExpr quote = quoteAttr ? quoteAttr->queryChild(0) : nullptr;
+
+    IAtom * encoding = queryCsvEncoding(csvAttr);
     if (headerAttr)
     {
-        IHqlExpression * header = queryRealChild(headerAttr, 0);
+        LinkedHqlExpr header = queryRealChild(headerAttr, 0);
         if (header)
         {
+            header.setown(forceToCorrectEncoding(header, encoding));
             if (header->queryType()->isInteger())
             {
                 classctx.addQuotedLiteral("virtual const char * getHeader() { return NULL; }");
@@ -9605,9 +9647,9 @@ void HqlCppTranslator::buildCsvParameters(BuildCtx & subctx, IHqlExpression * cs
             if (!isReading)
             {
                 StringBuffer comma;
-                expandDefaultString(comma, separator, ",");
+                expandDefaultString(comma, separator, ",", encoding);
                 expandFieldNames(queryErrorProcessor(), names, record, comma.str(), queryAttributeChild(headerAttr, formatAtom, 0));
-                expandDefaultString(names, terminator, "\n");
+                expandDefaultString(names, terminator, "\n", encoding);
             }
             OwnedHqlExpr namesExpr = createConstant(names.str());
             doBuildVarStringFunction(classctx, "getHeader", namesExpr);
@@ -9638,19 +9680,24 @@ void HqlCppTranslator::buildCsvParameters(BuildCtx & subctx, IHqlExpression * cs
 
     doBuildSizetFunction(classctx, "queryMaxSize", getCsvMaxLength(csvAttr));
 
-    buildCsvListFunc(classctx, "getQuote", queryAttribute(quoteAtom, attrs), isReading ? "\"" : NULL);
+    quote.setown(forceToCorrectEncoding(quote, encoding));
+    separator.setown(forceToCorrectEncoding(separator, encoding));
+    terminator.setown(forceToCorrectEncoding(terminator, encoding));
+    escape.setown(forceToCorrectEncoding(escape, encoding));
+
+    buildCsvListFunc(classctx, "getQuote", quote, isReading ? "\"" : NULL);
     buildCsvListFunc(classctx, "getSeparator", separator, ",");
     buildCsvListFunc(classctx, "getTerminator", terminator, isReading ? "\r\n|\n" : "\n");
     buildCsvListFunc(classctx, "getEscape", escape, NULL);
 
     StringBuffer flags;
-    if (!queryAttribute(quoteAtom, attrs))       flags.append("|defaultQuote");
-    if (!queryAttribute(separatorAtom, attrs))   flags.append("|defaultSeparate");
-    if (!queryAttribute(terminatorAtom, attrs))  flags.append("|defaultTerminate");
-    if (!queryAttribute(escapeAtom, attrs))      flags.append("|defaultEscape");
+    if (!quoteAttr)                             flags.append("|defaultQuote");
+    if (!separatorAttr)                         flags.append("|defaultSeparate");
+    if (!terminatorAttr)                        flags.append("|defaultTerminate");
+    if (!escapeAttr)                            flags.append("|defaultEscape");
     if (singleHeader)                           flags.append("|singleHeaderFooter");
     if (manyHeader)                             flags.append("|manyHeaderFooter");
-    if (queryAttribute(noTrimAtom, attrs))       flags.append("|preserveWhitespace");
+    if (queryAttribute(noTrimAtom, attrs))      flags.append("|preserveWhitespace");
     if (flags.length() == 0)                    flags.append("|0");
 
     doBuildUnsignedFunction(classctx, "getFlags", flags.str()+1);

+ 7 - 0
ecl/regress/bug1002.ecl

@@ -29,6 +29,13 @@ cast(y);
 (string6)(TRANSFER(x'12345e', DECIMAL5));
 (string6)(TRANSFER(x'12345f', DECIMAL5));
 
+(string6)(TRANSFER(NOFOLD(x'12345a'), DECIMAL5));
+(string6)(TRANSFER(NOFOLD(x'12345b'), DECIMAL5));
+(string6)(TRANSFER(NOFOLD(x'12345c'), DECIMAL5));
+(string6)(TRANSFER(NOFOLD(x'12345d'), DECIMAL5));
+(string6)(TRANSFER(NOFOLD(x'12345e'), DECIMAL5));
+(string6)(TRANSFER(NOFOLD(x'12345f'), DECIMAL5));
+
 (string6)(TRANSFER(x'12345a', DECIMAL5)) = ' 12345';
 (string6)(TRANSFER(x'12345b', DECIMAL5)) = '-12345';
 (string6)(TRANSFER(x'12345c', DECIMAL5)) = ' 12345';

+ 3 - 0
ecl/regress/decstr.ecl

@@ -19,3 +19,6 @@ decimal11_3 a := transfer(x'10000000696d',decimal11_3);
 
 (string)a;
 
+decimal11_3 a2 := transfer(nofold(x'10000000696d'),decimal11_3);
+
+(string)a2;

+ 23 - 0
ecl/regress/issue16038.ecl

@@ -0,0 +1,23 @@
+rec := RECORD // row layout shared by the inline dataset and the csv read at the end
+    STRING field1;
+    STRING field2;
+    INTEGER field3;
+    INTEGER field4;
+    STRING field5;
+END;
+
+ds1 := DATASET([{'0000000000','abcdefghijklmnopqr',159,666,'abcdefghijklmnopq' },
+                 {'0000000001','abcdefg',134,217,'abcdefghijklmnopqrstuvwxyz012' },
+                 {'0000000002','abcdefghijk',255,779,'abcdefgh' },
+                 {'0000000003','abcdefghijklmnop',33,94,'abcdefghijklPQRST' },
+                 {'0000000004','abcdefghijklmnopqrstuv',253,519,'abcdefghijkl' }
+                 ], rec);
+output(ds1, NAMED('Original'));
+output(ds1,,'~data-with-exotic-sep_a1.csv', CSV(SEPARATOR('Ý'), HEADING('field1Ýfield2Ýfield3Ýfield4Ýfield5\n')), OVERWRITE); // a1: no UTF8 option, explicit heading text that itself contains the separator
+output(ds1,,'~data-with-exotic-sep_a2.csv', CSV(SEPARATOR('Ý')), OVERWRITE); // a2: no UTF8 option, no HEADING
+output(ds1,,'~data-with-exotic-sep_u1.csv', CSV(UTF8, SEPARATOR('Ý'), HEADING('field1Ýfield2Ýfield3Ýfield4Ýfield5\n')), OVERWRITE); // u1: UTF8 csv with plain string literals for separator and heading
+output(ds1,,'~data-with-exotic-sep_u2.csv', CSV(UTF8, SEPARATOR(U'Ý'), HEADING), OVERWRITE); // u2: UTF8 csv with a unicode literal separator and HEADING given without text
+output(ds1,,'~data-with-exotic-sep_u3.csv', CSV(UTF8, SEPARATOR([U'Ý',U'-Ý',U'Ý-']), HEADING), OVERWRITE); // u3: UTF8 csv with a list of alternative unicode separators
+
+ds2 := DATASET('~data-with-exotic-sep.csv', rec, csv( SEPARATOR('Ý')  ) ); // NOTE(review): none of the outputs above writes '~data-with-exotic-sep.csv' (they use _a1/_a2/_u1/_u2/_u3 suffixes) - confirm this name is intended
+output(ds2, NAMED('Created_from_exotic'));

+ 2 - 1
ecl/regress/transfer3.ecl

@@ -15,4 +15,5 @@
     limitations under the License.
 ############################################################################## */
 
-(integer) transfer(x'12345678', unsigned decimal8)
+(integer) transfer(x'12345678', unsigned decimal8);
+(integer) transfer(nofold(x'12345678'), unsigned decimal8);