Pārlūkot izejas kodu

HPCC-3092 Fixes suggestions from review

Better dealing with escape detection, backward compatible supportsEscape,
more tests.

Signed-off-by: Renato Golin <rengolin@hpccsystems.com>
Renato Golin 12 gadi atpakaļ
vecāks
revīzija
917ad53c03

+ 67 - 32
common/thorhelper/csvsplitter.cpp

@@ -32,7 +32,7 @@ CSVSplitter::CSVSplitter()
     lengths = NULL;
     data = NULL;
     numQuotes = 0;
-    unquotedBuffer = NULL;
+    internalBuffer = NULL;
     maxColumns = 0;
     curUnquoted = NULL;
 }
@@ -41,7 +41,7 @@ CSVSplitter::~CSVSplitter()
 {
     delete [] lengths;
     delete [] data;
-    free(unquotedBuffer);
+    free(internalBuffer);
 }
 
 void CSVSplitter::addQuote(const char * text)
@@ -72,11 +72,11 @@ void CSVSplitter::reset()
     matcher.reset();
     delete [] lengths;
     delete [] data;
-    free(unquotedBuffer);
+    free(internalBuffer);
     lengths = NULL;
     data = NULL;
     numQuotes = 0;
-    unquotedBuffer = NULL;
+    internalBuffer = NULL;
     maxCsvSize = 0;
 }
 
@@ -84,7 +84,7 @@ void CSVSplitter::init(unsigned _maxColumns, ICsvParameters * csvInfo, const cha
 {
     reset();
     maxCsvSize = csvInfo->queryMaxSize();
-    unquotedBuffer = (byte *)malloc(maxCsvSize);
+    internalBuffer = (byte *)malloc(maxCsvSize);
 
     maxColumns = _maxColumns;
     lengths = new unsigned [maxColumns+1];      // NB: One larger to remove some tests in main loop...
@@ -131,16 +131,20 @@ void CSVSplitter::init(unsigned _maxColumns, ICsvParameters * csvInfo, const cha
         }
     }
 
-    if (dfsEscapes && (flags & ICsvParameters::defaultEscape))
-        addActionList(matcher, dfsEscapes, ESCAPE);
-    else
+    // Old workunits won't have queryEscape. MORE: deprecate on the next major version
+    if (flags & ICsvParameters::supportsEscape)
     {
-        for (idx=0;;idx++)
+        if (dfsEscapes && (flags & ICsvParameters::defaultEscape))
+            addActionList(matcher, dfsEscapes, ESCAPE);
+        else
         {
-            const char * text = csvInfo->queryEscape(idx);
-            if (!text)
-                break;
-            addEscape(text);
+            for (idx=0;;idx++)
+            {
+                const char * text = csvInfo->queryEscape(idx);
+                if (!text)
+                    break;
+                addEscape(text);
+            }
         }
     }
 
@@ -154,11 +158,15 @@ void CSVSplitter::init(unsigned _maxColumns, ICsvParameters * csvInfo, const cha
 
 void CSVSplitter::setFieldRange(const byte * start, const byte * end, unsigned curColumn, unsigned quoteToStrip, bool unescape)
 {
+    // Either quoting or escaping will use the local buffer
+    if ((quoteToStrip || unescape) &&
+        (unsigned)(curUnquoted - internalBuffer) + (unsigned)(end - start) > maxCsvSize)
+        throw MakeStringException(99, "MAXLENGTH for CSV file is not large enough");
+
+    // point to the beginning of the local (possibly changed) buffer, for escaping later
+    byte * curUnescaped = curUnquoted;
     if (quoteToStrip)
     {
-        if ((unsigned)(curUnquoted - unquotedBuffer) + (unsigned)(end - start) > maxCsvSize)
-            throw MakeStringException(99, "MAXLENGTH for CSV file is not large enough");
-
         data[curColumn] = curUnquoted;
         const byte * lastCopied = start;
         const byte *cur;
@@ -203,25 +211,40 @@ done:
     }
     else
     {
-        data[curColumn] = start;
         lengths[curColumn] = (size32_t)(end-start);
+        // Only if ESCAPEs were detected in the input
+        if (unescape)
+        {
+            // Need to copy original to a local string (using allocated buffer)
+            memcpy(curUnescaped, start, lengths[curColumn]);
+            data[curColumn] = curUnescaped;
+            // and update the buffer pointer, to re-use on next iteration
+            curUnquoted = curUnescaped + lengths[curColumn];
+        }
+        else
+        {
+            data[curColumn] = start;
+            return;
+        }
     }
     // Un-escape string, if necessary.
     if (unescape)
     {
-        byte * cur = const_cast<byte*>(data[curColumn]);
+        byte * cur = curUnescaped; // data[curColumn] is already pointing here one way or another
         byte * end = cur + lengths[curColumn];
-        for (; cur <= end; cur++)
+        for (; cur < end; cur++)
         {
             unsigned matchLen;
             unsigned match = matcher.getMatch((size32_t)(end-cur), (const char *)cur, matchLen);
             if ((match & 255) == ESCAPE)
             {
-                unsigned long restLen = end-cur+matchLen;
+                ptrdiff_t restLen = end-cur+matchLen;
                 memmove(cur, cur+matchLen, restLen);
-                cur += matchLen;
                 end -= matchLen;
                 lengths[curColumn] -= matchLen;
+                // Avoid having cur past end
+                if (cur == end)
+                    break;
             }
         }
     }
@@ -236,8 +259,8 @@ size32_t CSVSplitter::splitLine(size32_t maxLength, const byte * start)
     const byte * end = start + maxLength;
     const byte * firstGood = start;
     const byte * lastGood = start;
-    bool unescape = false;
-    curUnquoted = unquotedBuffer;
+    bool lastEscape = false;
+    curUnquoted = internalBuffer;
 
     while (cur != end)
     {
@@ -250,7 +273,7 @@ size32_t CSVSplitter::splitLine(size32_t maxLength, const byte * start)
             lastGood = cur;
             break;
         case WHITESPACE:
-            //Skip leading whitepace
+            //Skip leading whitespace
             if (quote)
                 lastGood = cur+matchLen;
             else if (cur == firstGood)
@@ -260,10 +283,11 @@ size32_t CSVSplitter::splitLine(size32_t maxLength, const byte * start)
             }
             break;
         case SEPARATOR:
+            // Quoted separator
             if ((curColumn < maxColumns) && (quote == 0))
             {
-                setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, unescape);
-                unescape = false;
+                setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, lastEscape);
+                lastEscape = false;
                 quoteToStrip = 0;
                 curColumn++;
                 firstGood = cur + matchLen;
@@ -273,8 +297,8 @@ size32_t CSVSplitter::splitLine(size32_t maxLength, const byte * start)
         case TERMINATOR:
             if (quote == 0) // Is this a good idea? Means a mismatched quote is not fixed by EOL
             {
-                setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, unescape);
-                unescape = false;
+                setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, lastEscape);
+                lastEscape = false;
                 while (++curColumn < maxColumns)
                     lengths[curColumn] = 0;
                 return (size32_t)(cur + matchLen - start);
@@ -282,6 +306,7 @@ size32_t CSVSplitter::splitLine(size32_t maxLength, const byte * start)
             lastGood = cur+matchLen;
             break;
         case QUOTE:
+            // Quoted quote
             if (quote == 0)
             {
                 if (cur == firstGood)
@@ -318,14 +343,24 @@ size32_t CSVSplitter::splitLine(size32_t maxLength, const byte * start)
             }
             break;
         case ESCAPE:
-            unescape = true;
-            lastGood = cur+matchLen+1;
-            cur++; // MORE: This assumes next char is ASCii
+            lastEscape = true;
+            lastGood = cur+matchLen;
+            // If this escape is at the end, proceed to field range
+            if (lastGood == end)
+                break;
+
+            // Skip escape and ignore the next match
+            cur += matchLen;
+            match = matcher.getMatch((size32_t)(end-cur), (const char *)cur, matchLen);
+            if ((match & 255) == NONE)
+                matchLen = 1;
+            lastGood += matchLen;
+            break;
         }
         cur += matchLen;
     }
 
-    setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, unescape);
+    setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, lastEscape);
     while (++curColumn < maxColumns)
         lengths[curColumn] = 0;
     return (size32_t)(end - start);

+ 30 - 1
common/thorhelper/csvsplitter.hpp

@@ -32,6 +32,35 @@
 #include "eclhelper.hpp"
 #include "unicode/utf.h"
 
+/**
+ * CSVSplitter - splits CSV files into fields and rows.
+ *
+ * CSV files are text based records that can have user defined syntax for quoting,
+ * escaping, separating fields and rows. According to RFC-4180, there isn't a
+ * standard way of building CSV files, however, there is a set of general rules
+ * that most implementations seem to follow. This makes it hard to implement a CSV
+ * parser, since even if you follow the RFC, you might not read some files as the
+ * producer intended.
+ *
+ * The general rules are:
+ *  * rows are separated by EOL
+ *  * fields are separated by comma
+ *  * special text must be enclosed by quotes
+ *  * there must be a form of escaping quotes
+ *
+ * However, this implementation allows for user-specified quotes, (field) separators,
+ * terminators (row separators), whitespace and (multi-char) escaping sequences, so
+ * it should be possible to accommodate most files that deviate from the norm, while
+ * still reading the files correctly by default.
+ *
+ * One important rule is that any special behaviour should be enclosed by quotes, so
+ * you don't need to account for escaping separators or terminators when they're not
+ * themselves quoted. This, and non-matching quotes should be considered syntax error
+ * and the producer should, then, fix their output.
+ *
+ * Also, many CSV producers (including commercial databases) use slash (\) as escaping
+ * char, while the RFC mentions re-using quotes (""). We implement both.
+ */
 class THORHELPER_API CSVSplitter
 {
 public:
@@ -60,7 +89,7 @@ protected:
     unsigned            numQuotes;
     unsigned *          lengths;
     const byte * *      data;
-    byte *              unquotedBuffer;
+    byte *              internalBuffer;
     byte *              curUnquoted;
     unsigned            maxCsvSize;
 };

+ 3 - 0
ecl/hqlcpp/hqlhtcpp.cpp

@@ -9191,6 +9191,9 @@ void HqlCppTranslator::buildCsvParameters(BuildCtx & subctx, IHqlExpression * cs
     buildCsvListFunc(classctx, "queryEscape", escape, NULL);
 
     StringBuffer flags;
+    // Backward compatible option hasEscape should be deprecated in next major version
+    flags.append("|supportsEscape");
+    // Proper flags
     if (!queryProperty(quoteAtom, attrs))       flags.append("|defaultQuote");
     if (!queryProperty(separatorAtom, attrs))   flags.append("|defaultSeparate");
     if (!queryProperty(terminatorAtom, attrs))  flags.append("|defaultTerminate");

+ 9 - 8
rtl/include/eclhelper.hpp

@@ -1914,14 +1914,15 @@ struct ICsvParameters
 {
     enum
     {
-        defaultQuote = 1,
-        defaultSeparate = 2,
-        defaultTerminate = 4,
-        hasUnicode = 8,
-        singleHeaderFooter = 16,
-        preserveWhitespace = 32,
-        manyHeaderFooter = 64,
-        defaultEscape = 128
+        defaultQuote =        0x0001,
+        defaultSeparate =     0x0002,
+        defaultTerminate =    0x0004,
+        hasUnicode =          0x0008,
+        singleHeaderFooter =  0x0010,
+        preserveWhitespace =  0x0020,
+        manyHeaderFooter =    0x0040,
+        defaultEscape =       0x0080,
+        supportsEscape =      0x0100, // MORE: deprecate on next major version
     }; // flags values
     virtual unsigned     getFlags() = 0;
     virtual bool         queryEBCDIC() = 0;

+ 40 - 4
testing/ecl/csv-escaped.ecl

@@ -23,8 +23,8 @@ END;
 
 // Default is no escape
 orig := DATASET([{'this is an \\\'escaped\\\' string', 10, 'while this is not'}], rec);
-OUTPUT(orig, , 'regress::csv-escaped', OVERWRITE, CSV);
-escaped := DATASET('regress::csv-escaped', rec, CSV);
+OUTPUT(orig, , 'regress::csv-orig', OVERWRITE, CSV);
+escaped := DATASET('regress::csv-orig', rec, CSV);
 OUTPUT(escaped);
 
 // Standard escape
@@ -35,6 +35,42 @@ OUTPUT(escaped2);
 
 // Multi-char escape
 orig3 := DATASET([{'this is an -=-\'escaped-=-\' string', 10, 'while this is not'}], rec);
-OUTPUT(orig3, , 'regress::csv-escaped', OVERWRITE, CSV);
-escaped3 := DATASET('regress::csv-escaped', rec, CSV(ESCAPE('-=-')));
+OUTPUT(orig3, , 'regress::csv-escaped-multi', OVERWRITE, CSV);
+escaped3 := DATASET('regress::csv-escaped-multi', rec, CSV(ESCAPE('-=-')));
 OUTPUT(escaped3);
+
+// Escape the escape
+orig4 := DATASET([{'escape the \\\\ escape', 10, 'escape at the end \\\\'}], rec);
+OUTPUT(orig4, , 'regress::csv-escaped-escape', OVERWRITE, CSV);
+escaped4 := DATASET('regress::csv-escaped-escape', rec, CSV(ESCAPE('\\')));
+OUTPUT(escaped4);
+
+// Multi-escapes in a row
+orig5 := DATASET([{'multiple escapes \\\\\\\\ in a row', 10, 'multiple at end \\\\\\\\'}], rec);
+OUTPUT(orig5, , 'regress::csv-escaped-many', OVERWRITE, CSV);
+escaped5 := DATASET('regress::csv-escaped-many', rec, CSV(ESCAPE('\\')));
+OUTPUT(escaped5);
+
+// Many escapes
+orig6 := DATASET([{'many escapes like \\\'\\\' \\\'  \\\' and \\\\\\\\ \\\\ \\\\  \\\\  \\\\ escape', 10, 'escape at the end \\\''}], rec);
+OUTPUT(orig6, , 'regress::csv-escaped-many-more', OVERWRITE, CSV);
+escaped6 := DATASET('regress::csv-escaped-many-more', rec, CSV(ESCAPE('\\')));
+OUTPUT(escaped6);
+
+// Escape separator
+orig7 := DATASET([{'escaping \\, the \\,\\, \\, \\, separator', 10, 'escape at the end \\,'}], rec);
+OUTPUT(orig7, , 'regress::csv-escaped-separator', OVERWRITE, CSV);
+escaped7 := DATASET('regress::csv-escaped-separator', rec, CSV(ESCAPE('\\')));
+OUTPUT(escaped7);
+
+// Escape with quotes
+orig8 := DATASET([{'\'escaping\'\'the quote\'', 10, 'au naturel'}], rec);
+OUTPUT(orig8, , 'regress::csv-escaped-escaped', OVERWRITE, CSV);
+escaped8 := DATASET('regress::csv-escaped-escaped', rec, CSV);
+OUTPUT(escaped8);
+
+// Escape with quotes with ESCAPE()
+orig9 := DATASET([{'\'escaping\'\'the quote\'', 10, 'with user defined escape'}], rec);
+OUTPUT(orig9, , 'regress::csv-escaped-escaped2', OVERWRITE, CSV);
+escaped9 := DATASET('regress::csv-escaped-escaped2', rec, CSV(ESCAPE('\\')));
+OUTPUT(escaped9);

+ 31 - 1
testing/ecl/key/csv-escaped.xml

@@ -11,5 +11,35 @@
 <Dataset name='Result 5'>
 </Dataset>
 <Dataset name='Result 6'>
- <Row><foo>this is an &apos;escaped&apos; string</foo><id>0</id><bar>while this is not</bar></Row>
+ <Row><foo>this is an &apos;escaped&apos; string</foo><id>10</id><bar>while this is not</bar></Row>
+</Dataset>
+<Dataset name='Result 7'>
+</Dataset>
+<Dataset name='Result 8'>
+ <Row><foo>escape the \ escape</foo><id>10</id><bar>escape at the end \</bar></Row>
+</Dataset>
+<Dataset name='Result 9'>
+</Dataset>
+<Dataset name='Result 10'>
+ <Row><foo>multiple escapes \\ in a row</foo><id>10</id><bar>multiple at end \\</bar></Row>
+</Dataset>
+<Dataset name='Result 11'>
+</Dataset>
+<Dataset name='Result 12'>
+ <Row><foo>many escapes like &apos;&apos; &apos;  &apos; and \\ \ \  \  \ escape</foo><id>10</id><bar>escape at the end &apos;</bar></Row>
+</Dataset>
+<Dataset name='Result 13'>
+</Dataset>
+<Dataset name='Result 14'>
+ <Row><foo>escaping , the ,, , , separator</foo><id>10</id><bar>escape at the end ,</bar></Row>
+</Dataset>
+<Dataset name='Result 15'>
+</Dataset>
+<Dataset name='Result 16'>
+ <Row><foo>escaping&apos;the quote</foo><id>10</id><bar>au naturel</bar></Row>
+</Dataset>
+<Dataset name='Result 17'>
+</Dataset>
+<Dataset name='Result 18'>
+ <Row><foo>escaping&apos;the quote</foo><id>10</id><bar>with user defined escape</bar></Row>
 </Dataset>

+ 2 - 0
thorlcr/activities/csvread/thcsvread.cpp

@@ -46,6 +46,8 @@ public:
             else dst.append(false);
             if (fileDesc->queryProperties().hasProp("@csvTerminate")) dst.append(true).append(fileDesc->queryProperties().queryProp("@csvTerminate"));
             else dst.append(false);
+            if (fileDesc->queryProperties().hasProp("@csvEscape")) dst.append(true).append(fileDesc->queryProperties().queryProp("@csvEscape"));
+            else dst.append(false);
         }
         if (headerLines)
         {