RONCC
/
Big-Data-HPC-Platform


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
							/*##############################################################################

    HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
############################################################################## */

#ifndef CSVSPLITTER_INCL
#define CSVSPLITTER_INCL

#ifdef THORHELPER_EXPORTS
#define THORHELPER_API DECL_EXPORT
#else
#define THORHELPER_API DECL_IMPORT
#endif

#include "jregexp.hpp"
#include "eclhelper.hpp"
#include "unicode/utf.h"

/**
 * CSVSplitter - splits CSV files into fields and rows.
 *
 * CSV files are text based records that can have user defined syntax for quoting,
 * escaping, separating fields and rows. According to RFC-4180, there isn't a
 * standard way of building CSV files, however, there is a set of general rules
 * that most implementations seem to follow. This makes it hard to implement a CSV
 * parser, since even if you follow the RFC, you might not read some files as the
 * producer intended.
 *
 * The general rules are:
 *  * rows are separated by EOL
 *  * fields are separated by comma
 *  * special text must be enclosed by quotes
 *  * there must be a form of escaping quotes
 *
 * However, this implementation allows for user-specified quotes, (field) separators,
 * terminators (row separators), whitespace and (multi-char) escaping sequences, so
 * it should be possible to accommodate most files that deviate from the norm, while
 * still reading the files correctly by default.
 *
 * One important rule is that any special behaviour should be enclosed by quotes, so
 * you don't need to account for escaping separators or terminators when they're not
 * themselves quoted. This, and non-matching quotes should be considered syntax error
 * and the producer should, then, fix their output.
 *
 * Also, many CSV producers (including commercial databases) use slash (\) as escaping
 * char, while the RFC mentions re-using quotes (""). We implement both.
 */
class THORHELPER_API CSVSplitter
{
public:
    CSVSplitter();
    ~CSVSplitter();

    void addQuote(const char * text);
    void addSeparator(const char * text);
    void addTerminator(const char * text);
    void addEscape(const char * text);

    void init(unsigned maxColumns, ICsvParameters * csvInfo, const char * dfsQuotes, const char * dfsSeparators, const char * dfsTerminators, const char * dfsEscapes);
    void reset();
    size32_t splitLine(size32_t maxLen, const byte * start);

    inline unsigned * queryLengths() { return lengths; }
    inline const byte * * queryData() { return data; }

protected:
    void setFieldRange(const byte * start, const byte * end, unsigned curColumn, unsigned quoteToStrip, bool unescape);

protected:
    enum { NONE=0, SEPARATOR=1, TERMINATOR=2, WHITESPACE=3, QUOTE=4, ESCAPE=5 };
    unsigned            maxColumns;
    StringMatcher       matcher;
    unsigned            numQuotes;
    unsigned *          lengths;
    const byte * *      data;
    byte *              internalBuffer;
    size32_t            internalOffset;
    size32_t            sizeInternal;
    size32_t            maxCsvSize;
};

class THORHELPER_API CSVOutputStream : public StringBuffer, implements ITypedOutputStream
{
public:
    void beginLine();
    void writeHeaderLn(size32_t len, const char * data); // no need for endLine
    void endLine();
    void init(ICsvParameters * args, bool _oldOutputFormat);

    virtual void writeReal(double value)                    { append(prefix).append(value); prefix = separator; }
    virtual void writeSigned(__int64 value)                 { append(prefix).append(value); prefix = separator; }
    virtual void writeString(size32_t len, const char * data);
    virtual void writeUnicode(size32_t len, const UChar * data);
    virtual void writeUnsigned(unsigned __int64 value)      { append(prefix).append(value); prefix = separator; }
    virtual void writeUtf8(size32_t len, const char * data);

protected:
    StringAttr separator;
    StringAttr terminator;
    StringAttr quote;
    StringAttr escape;
    const char * prefix;
    bool oldOutputFormat;
};

#endif // CSVSPLITTER_INCL