123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548 |
/*##############################################################################
    HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems.
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
############################################################################## */
- #include "platform.h"
- #include "jregexp.hpp"
- #include "jlib.hpp"
- #include "jexcept.hpp"
- #include "junicode.hpp"
- #include "eclhelper.hpp"
- #include "unicode/uchar.h"
- #include "csvsplitter.hpp"
- #include "eclrtl.hpp"
- CSVSplitter::CSVSplitter()
- {
- lengths = NULL;
- data = NULL;
- numQuotes = 0;
- internalBuffer = NULL;
- maxColumns = 0;
- curUnquoted = NULL;
- }
- CSVSplitter::~CSVSplitter()
- {
- delete [] lengths;
- delete [] data;
- free(internalBuffer);
- }
- void CSVSplitter::addQuote(const char * text)
- {
- //Allow '' to remove quoting.
- if (text && *text)
- matcher.addEntry(text, QUOTE+(numQuotes++<<8));
- }
- void CSVSplitter::addSeparator(const char * text)
- {
- if (text && *text)
- matcher.addEntry(text, SEPARATOR);
- }
- void CSVSplitter::addTerminator(const char * text)
- {
- matcher.addEntry(text, TERMINATOR);
- }
- void CSVSplitter::addEscape(const char * text)
- {
- matcher.addEntry(text, ESCAPE);
- }
- void CSVSplitter::reset()
- {
- matcher.reset();
- delete [] lengths;
- delete [] data;
- free(internalBuffer);
- lengths = NULL;
- data = NULL;
- numQuotes = 0;
- internalBuffer = NULL;
- maxCsvSize = 0;
- }
- void CSVSplitter::init(unsigned _maxColumns, ICsvParameters * csvInfo, const char * dfsQuotes, const char * dfsSeparators, const char * dfsTerminators, const char * dfsEscapes)
- {
- reset();
- maxCsvSize = csvInfo->queryMaxSize();
- internalBuffer = (byte *)malloc(maxCsvSize);
- maxColumns = _maxColumns;
- lengths = new unsigned [maxColumns+1]; // NB: One larger to remove some tests in main loop...
- data = new const byte * [maxColumns+1];
- unsigned idx;
- unsigned flags = csvInfo->getFlags();
- if (dfsQuotes && (flags & ICsvParameters::defaultQuote))
- addActionList(matcher, dfsQuotes, QUOTE);
- else
- {
- for (idx=0;;idx++)
- {
- const char * text = csvInfo->queryQuote(idx);
- if (!text)
- break;
- addQuote(text);
- }
- }
- if (dfsSeparators && (flags & ICsvParameters::defaultSeparate))
- addActionList(matcher, dfsSeparators, SEPARATOR);
- else
- {
- for (idx=0;;idx++)
- {
- const char * text = csvInfo->querySeparator(idx);
- if (!text)
- break;
- addSeparator(text);
- }
- }
- if (dfsTerminators && (flags & ICsvParameters::defaultTerminate))
- addActionList(matcher, dfsTerminators, TERMINATOR);
- else
- {
- for (idx=0;;idx++)
- {
- const char * text = csvInfo->queryTerminator(idx);
- if (!text)
- break;
- addTerminator(text);
- }
- }
- // Old workunits won't have queryEscape. MORE: deprecate on the next major version
- if (flags & ICsvParameters::supportsEscape)
- {
- if (dfsEscapes && (flags & ICsvParameters::defaultEscape))
- addActionList(matcher, dfsEscapes, ESCAPE);
- else
- {
- for (idx=0;;idx++)
- {
- const char * text = csvInfo->queryEscape(idx);
- if (!text)
- break;
- addEscape(text);
- }
- }
- }
- //MORE Should this be configurable??
- if (!(flags & ICsvParameters::preserveWhitespace))
- {
- matcher.queryAddEntry(1, " ", WHITESPACE);
- matcher.queryAddEntry(1, "\t", WHITESPACE);
- }
- }
- void CSVSplitter::setFieldRange(const byte * start, const byte * end, unsigned curColumn, unsigned quoteToStrip, bool unescape)
- {
- // Either quoting or escaping will use the local buffer
- if ((quoteToStrip || unescape) &&
- (unsigned)(curUnquoted - internalBuffer) + (unsigned)(end - start) > maxCsvSize)
- throw MakeStringException(99, "MAXLENGTH for CSV file is not large enough");
- // point to the beginning of the local (possibly changed) buffer, for escaping later
- byte * curUnescaped = curUnquoted;
- if (quoteToStrip)
- {
- data[curColumn] = curUnquoted;
- const byte * lastCopied = start;
- const byte *cur;
- for (cur = start; cur != end; )
- {
- unsigned matchLen;
- unsigned match = matcher.getMatch((size32_t)(end-cur), (const char *)cur, matchLen);
- switch (match & 255)
- {
- case NONE:
- matchLen = 1;
- break;
- case WHITESPACE:
- case SEPARATOR:
- break;
- case TERMINATOR:
- goto done;
- case QUOTE:
- {
- const byte * next = cur + matchLen;
- if ((match == quoteToStrip) && (next != end))
- {
- unsigned nextMatchLen;
- unsigned nextMatch = matcher.getMatch((size32_t)(end-next), (const char *)next, nextMatchLen);
- if (nextMatch == match)
- {
- memcpy(curUnquoted, lastCopied, next-lastCopied);
- curUnquoted += (next-lastCopied);
- matchLen += nextMatchLen;
- lastCopied = cur+matchLen;
- }
- }
- break;
- }
- }
- cur += matchLen;
- }
- done:
- memcpy(curUnquoted, lastCopied, cur-lastCopied);
- curUnquoted += (cur-lastCopied);
- lengths[curColumn] = (size32_t)(curUnquoted - data[curColumn]);
- }
- else
- {
- lengths[curColumn] = (size32_t)(end-start);
- // Only if ESCAPEs were detected in the input
- if (unescape)
- {
- // Need to copy original to a local string (using allocated buffer)
- memcpy(curUnescaped, start, lengths[curColumn]);
- data[curColumn] = curUnescaped;
- // and update the buffer pointer, to re-use on next iteration
- curUnquoted = curUnescaped + lengths[curColumn];
- }
- else
- {
- data[curColumn] = start;
- return;
- }
- }
- // Un-escape string, if necessary.
- if (unescape)
- {
- byte * cur = curUnescaped; // data[curColumn] is already pointing here one way or another
- byte * end = cur + lengths[curColumn];
- for (; cur < end; cur++)
- {
- unsigned matchLen;
- unsigned match = matcher.getMatch((size32_t)(end-cur), (const char *)cur, matchLen);
- if ((match & 255) == ESCAPE)
- {
- ptrdiff_t restLen = end-(cur+matchLen);
- memmove(cur, cur+matchLen, restLen);
- end -= matchLen;
- lengths[curColumn] -= matchLen;
- // Avoid having cur past end
- if (cur == end)
- break;
- }
- }
- }
- }
- size32_t CSVSplitter::splitLine(size32_t maxLength, const byte * start)
- {
- unsigned curColumn = 0;
- unsigned quote = 0;
- unsigned quoteToStrip = 0;
- const byte * cur = start;
- const byte * end = start + maxLength;
- const byte * firstGood = start;
- const byte * lastGood = start;
- bool lastEscape = false;
- curUnquoted = internalBuffer;
- while (cur != end)
- {
- unsigned matchLen;
- unsigned match = matcher.getMatch((size32_t)(end-cur), (const char *)cur, matchLen);
- switch (match & 255)
- {
- case NONE:
- cur++; // matchLen == 0;
- lastGood = cur;
- break;
- case WHITESPACE:
- //Skip leading whitespace
- if (quote)
- lastGood = cur+matchLen;
- else if (cur == firstGood)
- {
- firstGood = cur+matchLen;
- lastGood = cur+matchLen;
- }
- break;
- case SEPARATOR:
- // Quoted separator
- if ((curColumn < maxColumns) && (quote == 0))
- {
- setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, lastEscape);
- lastEscape = false;
- quoteToStrip = 0;
- curColumn++;
- firstGood = cur + matchLen;
- }
- lastGood = cur+matchLen;
- break;
- case TERMINATOR:
- if (quote == 0) // Is this a good idea? Means a mismatched quote is not fixed by EOL
- {
- setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, lastEscape);
- lastEscape = false;
- while (++curColumn < maxColumns)
- lengths[curColumn] = 0;
- return (size32_t)(cur + matchLen - start);
- }
- lastGood = cur+matchLen;
- break;
- case QUOTE:
- // Quoted quote
- if (quote == 0)
- {
- if (cur == firstGood)
- {
- quote = match;
- firstGood = cur+matchLen;
- }
- lastGood = cur+matchLen;
- }
- else
- {
- if (quote == match)
- {
- const byte * next = cur + matchLen;
- //Check for double quotes
- if ((next != end))
- {
- unsigned nextMatchLen;
- unsigned nextMatch = matcher.getMatch((size32_t)(end-next), (const char *)next, nextMatchLen);
- if (nextMatch == quote)
- {
- quoteToStrip = quote;
- matchLen += nextMatchLen;
- lastGood = cur+matchLen;
- }
- else
- quote = 0;
- }
- else
- quote = 0;
- }
- else
- lastGood = cur+matchLen;
- }
- break;
- case ESCAPE:
- lastEscape = true;
- lastGood = cur+matchLen;
- // If this escape is at the end, proceed to field range
- if (lastGood == end)
- break;
- // Skip escape and ignore the next match
- cur += matchLen;
- match = matcher.getMatch((size32_t)(end-cur), (const char *)cur, matchLen);
- if ((match & 255) == NONE)
- matchLen = 1;
- lastGood += matchLen;
- break;
- }
- cur += matchLen;
- }
- setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, lastEscape);
- while (++curColumn < maxColumns)
- lengths[curColumn] = 0;
- return (size32_t)(end - start);
- }
//=====================================================================================================
- void CSVOutputStream::beginLine()
- {
- clear();
- prefix = NULL;
- }
- void CSVOutputStream::endLine()
- {
- append(terminator);
- }
- void CSVOutputStream::init(ICsvParameters * args, bool _oldOutputFormat)
- {
- if (args->queryEBCDIC())
- throw MakeStringException(99, "EBCDIC CSV output not yet implemented");
- quote.set(args->queryQuote(0));
- separator.set(args->querySeparator(0));
- terminator.set(args->queryTerminator(0));
- escape.set(args->queryEscape(0));
- oldOutputFormat = _oldOutputFormat||!quote.length();
- }
- void CSVOutputStream::writeUnicode(size32_t len, const UChar * data)
- {
- unsigned utf8Length;
- char * utf8Data = NULL;
- rtlUnicodeToCodepageX(utf8Length, utf8Data, len, data, "utf-8");
- writeString(utf8Length, utf8Data);
- rtlFree(utf8Data);
- }
- void CSVOutputStream::writeUtf8(size32_t len, const char * data)
- {
- append(prefix);
- if (oldOutputFormat) {
- append(quote).append(rtlUtf8Size(len, data), data).append(quote);
- }
- else if (len) {
- // is this OTT?
- // not sure if best way but generate an array of utf8 sizes
- MemoryAttr ma;
- size32_t * cl;
- if (len>256)
- cl = (size32_t *)ma.allocate(sizeof(size32_t)*len);
- else
- cl = (size32_t *)alloca(sizeof(size32_t)*len);
- unsigned start=(unsigned)-1;
- unsigned end=0;
- const byte * s = (const byte *)data;
- unsigned i;
- for (i=0;i<len;i++) {
- const byte *p=s;
- UChar next = readUtf8Character(sizeof(UChar), s);
- cl[i] = (size32_t)(s-p);
- if (!u_isspace(next)) {
- end = i;
- if (start==(unsigned)-1)
- start = i;
- }
- }
- const byte *e=s;
- // do trim
- if (start!=(unsigned)-1) {
- for (i=0;i<start;i++)
- data += *(cl++);
- len -= start;
- end -= start;
- end++;
- while (end<len)
- e -= cl[--len];
- }
- // now see if need quoting by looking for separator, terminator or quote
- // I *think* this can be done with memcmps as has to be exact
- size32_t sl = separator.length();
- size32_t tl = terminator.length();
- size32_t ql = quote.length();
- bool needquote=false;
- s = (const byte *)data;
- for (i=0;i<len;i++) {
- size32_t l = (size32_t)(e-s);
- if (sl&&(l>=sl)&&(memcmp(separator.get(),s,sl)==0)) {
- needquote = true;
- break;
- }
- if (tl&&(l>=tl)&&(memcmp(terminator.get(),s,tl)==0)) {
- needquote = true;
- break;
- }
- if ((l>=ql)&&(memcmp(quote.get(),s,ql)==0)) {
- needquote = true;
- break;
- }
- s+=cl[i];
- }
- if (needquote) {
- append(quote);
- s = (const byte *)data;
- for (i=0;i<len;i++) {
- size32_t l = (size32_t)(e-s);
- if ((l>=ql)&&(memcmp(quote.get(),s,ql)==0))
- append(quote);
- append(cl[i],(const char *)s);
- s+=cl[i];
- }
- append(quote);
- }
- else
- append((size32_t)(e-(const byte *)data),data);
- }
- prefix = separator;
- }
- void CSVOutputStream::writeString(size32_t len, const char * data)
- {
-
- append(prefix);
- if (oldOutputFormat) {
- append(quote).append(len, data).append(quote);
- }
- else if (len) {
- // New format (as per GS)
- // first trim
- while (len&&(*data==' ')) {
- len--;
- data++;
- }
- while (len&&(data[len-1]==' '))
- len--;
- // now see if need quoting by looking for separator, terminator or quote
- size32_t sl = separator.length();
- size32_t tl = terminator.length();
- size32_t ql = quote.length();
- bool needquote=false;
- const char *s = data;
- for (unsigned l=len;l>0;l--) {
- if (sl&&(l>=sl)&&(memcmp(separator.get(),s,sl)==0)) {
- needquote = true;
- break;
- }
- if (tl&&(l>=tl)&&(memcmp(terminator.get(),s,tl)==0)) {
- needquote = true;
- break;
- }
- if ((l>=ql)&&(memcmp(quote.get(),s,ql)==0)) {
- needquote = true;
- break;
- }
- s++;
- }
- if (needquote) {
- append(quote);
- const char *s = data;
- for (unsigned l=len;l>0;l--) {
- if ((l>=ql)&&(memcmp(quote.get(),s,ql)==0))
- append(quote);
- append(*(s++));
- }
- append(quote);
- }
- else
- append(len,data);
- }
- prefix = separator;
- }
- void CSVOutputStream::writeHeaderLn(size32_t len, const char * data)
- {
- append(len,data);
- if (!oldOutputFormat&&len) {
- size32_t tl = terminator.length();
- if ((tl>len)||(memcmp(data+len-tl,terminator.get(),tl)!=0))
- endLine();
- }
- }
|