12 gadi atpakaļ · 917ad53c03
--- a/common/thorhelper/csvsplitter.cpp
+++ b/common/thorhelper/csvsplitter.cpp
@@ -32,7 +32,7 @@ CSVSplitter::CSVSplitter()
 
				     lengths = NULL;
			
 
				     data = NULL;
			
 
				     numQuotes = 0;
			
 
				-    unquotedBuffer = NULL;
			
 
				+    internalBuffer = NULL;
			
 
				     maxColumns = 0;
			
 
				     curUnquoted = NULL;
			
 
				 }
			
@@ -41,7 +41,7 @@ CSVSplitter::~CSVSplitter()
 
				 {
			
 
				     delete [] lengths;
			
 
				     delete [] data;
			
 
				-    free(unquotedBuffer);
			
 
				+    free(internalBuffer);
			
 
				 }
			
 
				 
			
 
				 void CSVSplitter::addQuote(const char * text)
			
@@ -72,11 +72,11 @@ void CSVSplitter::reset()
 
				     matcher.reset();
			
 
				     delete [] lengths;
			
 
				     delete [] data;
			
 
				-    free(unquotedBuffer);
			
 
				+    free(internalBuffer);
			
 
				     lengths = NULL;
			
 
				     data = NULL;
			
 
				     numQuotes = 0;
			
 
				-    unquotedBuffer = NULL;
			
 
				+    internalBuffer = NULL;
			
 
				     maxCsvSize = 0;
			
 
				 }
			
 
				 
			
@@ -84,7 +84,7 @@ void CSVSplitter::init(unsigned _maxColumns, ICsvParameters * csvInfo, const cha
 
				 {
			
 
				     reset();
			
 
				     maxCsvSize = csvInfo->queryMaxSize();
			
 
				-    unquotedBuffer = (byte *)malloc(maxCsvSize);
			
 
				+    internalBuffer = (byte *)malloc(maxCsvSize);
			
 
				 
			
 
				     maxColumns = _maxColumns;
			
 
				     lengths = new unsigned [maxColumns+1];      // NB: One larger to remove some tests in main loop...
			
@@ -131,16 +131,20 @@ void CSVSplitter::init(unsigned _maxColumns, ICsvParameters * csvInfo, const cha
 
				         }
			
 
				     }
			
 
				 
			
 
				-    if (dfsEscapes && (flags & ICsvParameters::defaultEscape))
			
 
				-        addActionList(matcher, dfsEscapes, ESCAPE);
			
 
				-    else
			
 
				+    // Old workunits won't have queryEscape. MORE: deprecate on the next major version
			
 
				+    if (flags & ICsvParameters::supportsEscape)
			
 
				     {
			
 
				-        for (idx=0;;idx++)
			
 
				+        if (dfsEscapes && (flags & ICsvParameters::defaultEscape))
			
 
				+            addActionList(matcher, dfsEscapes, ESCAPE);
			
 
				+        else
			
 
				         {
			
 
				-            const char * text = csvInfo->queryEscape(idx);
			
 
				-            if (!text)
			
 
				-                break;
			
 
				-            addEscape(text);
			
 
				+            for (idx=0;;idx++)
			
 
				+            {
			
 
				+                const char * text = csvInfo->queryEscape(idx);
			
 
				+                if (!text)
			
 
				+                    break;
			
 
				+                addEscape(text);
			
 
				+            }
			
 
				         }
			
 
				     }
			
 
				 
			
@@ -154,11 +158,15 @@ void CSVSplitter::init(unsigned _maxColumns, ICsvParameters * csvInfo, const cha
 
				 
			
 
				 void CSVSplitter::setFieldRange(const byte * start, const byte * end, unsigned curColumn, unsigned quoteToStrip, bool unescape)
			
 
				 {
			
 
				+    // Either quoting or escaping will use the local buffer
			
 
				+    if ((quoteToStrip || unescape) &&
			
 
				+        (unsigned)(curUnquoted - internalBuffer) + (unsigned)(end - start) > maxCsvSize)
			
 
				+        throw MakeStringException(99, "MAXLENGTH for CSV file is not large enough");
			
 
				+
			
 
				+    // point to the beginning of the local (possibly changed) buffer, for escaping later
			
 
				+    byte * curUnescaped = curUnquoted;
			
 
				     if (quoteToStrip)
			
 
				     {
			
 
				-        if ((unsigned)(curUnquoted - unquotedBuffer) + (unsigned)(end - start) > maxCsvSize)
			
 
				-            throw MakeStringException(99, "MAXLENGTH for CSV file is not large enough");
			
 
				-
			
 
				         data[curColumn] = curUnquoted;
			
 
				         const byte * lastCopied = start;
			
 
				         const byte *cur;
			
@@ -203,25 +211,40 @@ done:
 
				     }
			
 
				     else
			
 
				     {
			
 
				-        data[curColumn] = start;
			
 
				         lengths[curColumn] = (size32_t)(end-start);
			
 
				+        // Only if ESCAPEs were detected in the input
			
 
				+        if (unescape)
			
 
				+        {
			
 
				+            // Need to copy original to a local string (using allocated buffer)
			
 
				+            memcpy(curUnescaped, start, lengths[curColumn]);
			
 
				+            data[curColumn] = curUnescaped;
			
 
				+            // and update the buffer pointer, to re-use on next iteration
			
 
				+            curUnquoted = curUnescaped + lengths[curColumn];
			
 
				+        }
			
 
				+        else
			
 
				+        {
			
 
				+            data[curColumn] = start;
			
 
				+            return;
			
 
				+        }
			
 
				     }
			
 
				     // Un-escape string, if necessary.
			
 
				     if (unescape)
			
 
				     {
			
 
				-        byte * cur = const_cast<byte*>(data[curColumn]);
			
 
				+        byte * cur = curUnescaped; // data[curColumn] is already pointing here one way or another
			
 
				         byte * end = cur + lengths[curColumn];
			
 
				-        for (; cur <= end; cur++)
			
 
				+        for (; cur < end; cur++)
			
 
				         {
			
 
				             unsigned matchLen;
			
 
				             unsigned match = matcher.getMatch((size32_t)(end-cur), (const char *)cur, matchLen);
			
 
				             if ((match & 255) == ESCAPE)
			
 
				             {
			
 
				-                unsigned long restLen = end-cur+matchLen;
			
 
				+                ptrdiff_t restLen = end-cur+matchLen;
			
 
				                 memmove(cur, cur+matchLen, restLen);
			
 
				-                cur += matchLen;
			
 
				                 end -= matchLen;
			
 
				                 lengths[curColumn] -= matchLen;
			
 
				+                // Avoid having cur past end
			
 
				+                if (cur == end)
			
 
				+                    break;
			
 
				             }
			
 
				         }
			
 
				     }
			
@@ -236,8 +259,8 @@ size32_t CSVSplitter::splitLine(size32_t maxLength, const byte * start)
 
				     const byte * end = start + maxLength;
			
 
				     const byte * firstGood = start;
			
 
				     const byte * lastGood = start;
			
 
				-    bool unescape = false;
			
 
				-    curUnquoted = unquotedBuffer;
			
 
				+    bool lastEscape = false;
			
 
				+    curUnquoted = internalBuffer;
			
 
				 
			
 
				     while (cur != end)
			
 
				     {
			
@@ -250,7 +273,7 @@ size32_t CSVSplitter::splitLine(size32_t maxLength, const byte * start)
 
				             lastGood = cur;
			
 
				             break;
			
 
				         case WHITESPACE:
			
 
				-            //Skip leading whitepace
			
 
				+            //Skip leading whitespace
			
 
				             if (quote)
			
 
				                 lastGood = cur+matchLen;
			
 
				             else if (cur == firstGood)
			
@@ -260,10 +283,11 @@ size32_t CSVSplitter::splitLine(size32_t maxLength, const byte * start)
 
				             }
			
 
				             break;
			
 
				         case SEPARATOR:
			
 
				+            // Quoted separator
			
 
				             if ((curColumn < maxColumns) && (quote == 0))
			
 
				             {
			
 
				-                setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, unescape);
			
 
				-                unescape = false;
			
 
				+                setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, lastEscape);
			
 
				+                lastEscape = false;
			
 
				                 quoteToStrip = 0;
			
 
				                 curColumn++;
			
 
				                 firstGood = cur + matchLen;
			
@@ -273,8 +297,8 @@ size32_t CSVSplitter::splitLine(size32_t maxLength, const byte * start)
 
				         case TERMINATOR:
			
 
				             if (quote == 0) // Is this a good idea? Means a mismatched quote is not fixed by EOL
			
 
				             {
			
 
				-                setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, unescape);
			
 
				-                unescape = false;
			
 
				+                setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, lastEscape);
			
 
				+                lastEscape = false;
			
 
				                 while (++curColumn < maxColumns)
			
 
				                     lengths[curColumn] = 0;
			
 
				                 return (size32_t)(cur + matchLen - start);
			
@@ -282,6 +306,7 @@ size32_t CSVSplitter::splitLine(size32_t maxLength, const byte * start)
 
				             lastGood = cur+matchLen;
			
 
				             break;
			
 
				         case QUOTE:
			
 
				+            // Quoted quote
			
 
				             if (quote == 0)
			
 
				             {
			
 
				                 if (cur == firstGood)
			
@@ -318,14 +343,24 @@ size32_t CSVSplitter::splitLine(size32_t maxLength, const byte * start)
 
				             }
			
 
				             break;
			
 
				         case ESCAPE:
			
 
				-            unescape = true;
			
 
				-            lastGood = cur+matchLen+1;
			
 
				-            cur++; // MORE: This assumes next char is ASCii
			
 
				+            lastEscape = true;
			
 
				+            lastGood = cur+matchLen;
			
 
				+            // If this escape is at the end, proceed to field range
			
 
				+            if (lastGood == end)
			
 
				+                break;
			
 
				+
			
 
				+            // Skip escape and ignore the next match
			
 
				+            cur += matchLen;
			
 
				+            match = matcher.getMatch((size32_t)(end-cur), (const char *)cur, matchLen);
			
 
				+            if ((match & 255) == NONE)
			
 
				+                matchLen = 1;
			
 
				+            lastGood += matchLen;
			
 
				+            break;
			
 
				         }
			
 
				         cur += matchLen;
			
 
				     }
			
 
				 
			
 
				-    setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, unescape);
			
 
				+    setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, lastEscape);
			
 
				     while (++curColumn < maxColumns)
			
 
				         lengths[curColumn] = 0;
			
 
				     return (size32_t)(end - start);
			
--- a/common/thorhelper/csvsplitter.hpp
+++ b/common/thorhelper/csvsplitter.hpp
@@ -32,6 +32,35 @@
 
				 #include "eclhelper.hpp"
			
 
				 #include "unicode/utf.h"
			
 
				 
			
 
				+/**
			
 
				+ * CSVSplitter - splits CSV files into fields and rows.
			
 
				+ *
			
 
				+ * CSV files are text based records that can have user defined syntax for quoting,
			
 
				+ * escaping, separating fields and rows. According to RFC-4180, there isn't a
			
 
				+ * standard way of building CSV files, however, there is a set of general rules
			
 
				+ * that most implementations seem to follow. This makes it hard to implement a CSV
			
 
				+ * parser, since even if you follow the RFC, you might not read some files as the
			
 
				+ * producer intended.
			
 
				+ *
			
 
				+ * The general rules are:
			
 
				+ *  * rows are separated by EOL
			
 
				+ *  * fields are separated by comma
			
 
				+ *  * special text must be enclosed by quotes
			
 
				+ *  * there must be a form of escaping quotes
			
 
				+ *
			
 
				+ * However, this implementation allows for user-specified quotes, (field) separators,
			
 
				+ * terminators (row separators), whitespace and (multi-char) escaping sequences, so
			
 
				+ * it should be possible to accommodate most files that deviate from the norm, while
			
 
				+ * still reading the files correctly by default.
			
 
				+ *
			
 
				+ * One important rule is that any special behaviour should be enclosed by quotes, so
			
 
				+ * you don't need to account for escaping separators or terminators when they're not
			
 
				+ * themselves quoted. This, and non-matching quotes should be considered syntax error
			
 
				+ * and the producer should, then, fix their output.
			
 
				+ *
			
 
				+ * Also, many CSV producers (including commercial databases) use slash (\) as escaping
			
 
				+ * char, while the RFC mentions re-using quotes (""). We implement both.
			
 
				+ */
			
 
				 class THORHELPER_API CSVSplitter
			
 
				 {
			
 
				 public:
			
@@ -60,7 +89,7 @@ protected:
 
				     unsigned            numQuotes;
			
 
				     unsigned *          lengths;
			
 
				     const byte * *      data;
			
 
				-    byte *              unquotedBuffer;
			
 
				+    byte *              internalBuffer;
			
 
				     byte *              curUnquoted;
			
 
				     unsigned            maxCsvSize;
			
 
				 };
			
--- a/ecl/hqlcpp/hqlhtcpp.cpp
+++ b/ecl/hqlcpp/hqlhtcpp.cpp
@@ -9191,6 +9191,9 @@ void HqlCppTranslator::buildCsvParameters(BuildCtx & subctx, IHqlExpression * cs
 
				     buildCsvListFunc(classctx, "queryEscape", escape, NULL);
			
 
				 
			
 
				     StringBuffer flags;
			
 
				+    // Backward compatible option hasEscape should be deprecated in next major version
			
 
				+    flags.append("|supportsEscape");
			
 
				+    // Proper flags
			
 
				     if (!queryProperty(quoteAtom, attrs))       flags.append("|defaultQuote");
			
 
				     if (!queryProperty(separatorAtom, attrs))   flags.append("|defaultSeparate");
			
 
				     if (!queryProperty(terminatorAtom, attrs))  flags.append("|defaultTerminate");
			
--- a/rtl/include/eclhelper.hpp
+++ b/rtl/include/eclhelper.hpp
@@ -1914,14 +1914,15 @@ struct ICsvParameters
 
				 {
			
 
				     enum
			
 
				     {
			
 
				-        defaultQuote = 1,
			
 
				-        defaultSeparate = 2,
			
 
				-        defaultTerminate = 4,
			
 
				-        hasUnicode = 8,
			
 
				-        singleHeaderFooter = 16,
			
 
				-        preserveWhitespace = 32,
			
 
				-        manyHeaderFooter = 64,
			
 
				-        defaultEscape = 128
			
 
				+        defaultQuote =        0x0001,
			
 
				+        defaultSeparate =     0x0002,
			
 
				+        defaultTerminate =    0x0004,
			
 
				+        hasUnicode =          0x0008,
			
 
				+        singleHeaderFooter =  0x0010,
			
 
				+        preserveWhitespace =  0x0020,
			
 
				+        manyHeaderFooter =    0x0040,
			
 
				+        defaultEscape =       0x0080,
			
 
				+        supportsEscape =      0x0100, // MORE: deprecate on next major version
			
 
				     }; // flags values
			
 
				     virtual unsigned     getFlags() = 0;
			
 
				     virtual bool         queryEBCDIC() = 0;
			
--- a/testing/ecl/csv-escaped.ecl
+++ b/testing/ecl/csv-escaped.ecl
@@ -23,8 +23,8 @@ END;
 
				 
			
 
				 // Default is no escape
			
 
				 orig := DATASET([{'this is an \\\'escaped\\\' string', 10, 'while this is not'}], rec);
			
 
				-OUTPUT(orig, , 'regress::csv-escaped', OVERWRITE, CSV);
			
 
				-escaped := DATASET('regress::csv-escaped', rec, CSV);
			
 
				+OUTPUT(orig, , 'regress::csv-orig', OVERWRITE, CSV);
			
 
				+escaped := DATASET('regress::csv-orig', rec, CSV);
			
 
				 OUTPUT(escaped);
			
 
				 
			
 
				 // Standard escape
			
@@ -35,6 +35,42 @@ OUTPUT(escaped2);
 
				 
			
 
				 // Multi-char escape
			
 
				 orig3 := DATASET([{'this is an -=-\'escaped-=-\' string', 10, 'while this is not'}], rec);
			
 
				-OUTPUT(orig3, , 'regress::csv-escaped', OVERWRITE, CSV);
			
 
				-escaped3 := DATASET('regress::csv-escaped', rec, CSV(ESCAPE('-=-')));
			
 
				+OUTPUT(orig3, , 'regress::csv-escaped-multi', OVERWRITE, CSV);
			
 
				+escaped3 := DATASET('regress::csv-escaped-multi', rec, CSV(ESCAPE('-=-')));
			
 
				 OUTPUT(escaped3);
			
 
				+
			
 
				+// Escape the escape
			
 
				+orig4 := DATASET([{'escape the \\\\ escape', 10, 'escape at the end \\\\'}], rec);
			
 
				+OUTPUT(orig4, , 'regress::csv-escaped-escape', OVERWRITE, CSV);
			
 
				+escaped4 := DATASET('regress::csv-escaped-escape', rec, CSV(ESCAPE('\\')));
			
 
				+OUTPUT(escaped4);
			
 
				+
			
 
				+// Multi-escapes in a row
			
 
				+orig5 := DATASET([{'multiple escapes \\\\\\\\ in a row', 10, 'multiple at end \\\\\\\\'}], rec);
			
 
				+OUTPUT(orig5, , 'regress::csv-escaped-many', OVERWRITE, CSV);
			
 
				+escaped5 := DATASET('regress::csv-escaped-many', rec, CSV(ESCAPE('\\')));
			
 
				+OUTPUT(escaped5);
			
 
				+
			
 
				+// Many escapes
			
 
				+orig6 := DATASET([{'many escapes like \\\'\\\' \\\'  \\\' and \\\\\\\\ \\\\ \\\\  \\\\  \\\\ escape', 10, 'escape at the end \\\''}], rec);
			
 
				+OUTPUT(orig6, , 'regress::csv-escaped-many-more', OVERWRITE, CSV);
			
 
				+escaped6 := DATASET('regress::csv-escaped-many-more', rec, CSV(ESCAPE('\\')));
			
 
				+OUTPUT(escaped6);
			
 
				+
			
 
				+// Escape separator
			
 
				+orig7 := DATASET([{'escaping \\, the \\,\\, \\, \\, separator', 10, 'escape at the end \\,'}], rec);
			
 
				+OUTPUT(orig7, , 'regress::csv-escaped-separator', OVERWRITE, CSV);
			
 
				+escaped7 := DATASET('regress::csv-escaped-separator', rec, CSV(ESCAPE('\\')));
			
 
				+OUTPUT(escaped7);
			
 
				+
			
 
				+// Escape with quotes
			
 
				+orig8 := DATASET([{'\'escaping\'\'the quote\'', 10, 'au naturel'}], rec);
			
 
				+OUTPUT(orig8, , 'regress::csv-escaped-escaped', OVERWRITE, CSV);
			
 
				+escaped8 := DATASET('regress::csv-escaped-escaped', rec, CSV);
			
 
				+OUTPUT(escaped8);
			
 
				+
			
 
				+// Escape with quotes with ESCAPE()
			
 
				+orig9 := DATASET([{'\'escaping\'\'the quote\'', 10, 'with user defined escape'}], rec);
			
 
				+OUTPUT(orig9, , 'regress::csv-escaped-escaped2', OVERWRITE, CSV);
			
 
				+escaped9 := DATASET('regress::csv-escaped-escaped2', rec, CSV(ESCAPE('\\')));
			
 
				+OUTPUT(escaped9);
			
--- a/testing/ecl/key/csv-escaped.xml
+++ b/testing/ecl/key/csv-escaped.xml
@@ -11,5 +11,35 @@
 
				 <Dataset name='Result 5'>
			
 
				 </Dataset>
			
 
				 <Dataset name='Result 6'>
			
 
				- <Row><foo>this is an &apos;escaped&apos; string</foo><id>0</id><bar>while this is not</bar></Row>
			
 
				+ <Row><foo>this is an &apos;escaped&apos; string</foo><id>10</id><bar>while this is not</bar></Row>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 7'>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 8'>
			
 
				+ <Row><foo>escape the \ escape</foo><id>10</id><bar>escape at the end \</bar></Row>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 9'>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 10'>
			
 
				+ <Row><foo>multiple escapes \\ in a row</foo><id>10</id><bar>multiple at end \\</bar></Row>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 11'>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 12'>
			
 
				+ <Row><foo>many escapes like &apos;&apos; &apos;  &apos; and \\ \ \  \  \ escape</foo><id>10</id><bar>escape at the end &apos;</bar></Row>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 13'>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 14'>
			
 
				+ <Row><foo>escaping , the ,, , , separator</foo><id>10</id><bar>escape at the end ,</bar></Row>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 15'>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 16'>
			
 
				+ <Row><foo>escaping&apos;the quote</foo><id>10</id><bar>au naturel</bar></Row>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 17'>
			
 
				+</Dataset>
			
 
				+<Dataset name='Result 18'>
			
 
				+ <Row><foo>escaping&apos;the quote</foo><id>10</id><bar>with user defined escape</bar></Row>
			
 
				 </Dataset>
			
--- a/thorlcr/activities/csvread/thcsvread.cpp
+++ b/thorlcr/activities/csvread/thcsvread.cpp
@@ -46,6 +46,8 @@ public:
 
				             else dst.append(false);
			
 
				             if (fileDesc->queryProperties().hasProp("@csvTerminate")) dst.append(true).append(fileDesc->queryProperties().queryProp("@csvTerminate"));
			
 
				             else dst.append(false);
			
 
				+            if (fileDesc->queryProperties().hasProp("@csvEscape")) dst.append(true).append(fileDesc->queryProperties().queryProp("@csvEscape"));
			
 
				+            else dst.append(false);
			
 
				         }
			
 
				         if (headerLines)
			
 
				         {