csvsplitter.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #include "platform.h"
  14. #include "jregexp.hpp"
  15. #include "jlib.hpp"
  16. #include "jexcept.hpp"
  17. #include "junicode.hpp"
  18. #include "eclhelper.hpp"
  19. #include "unicode/uchar.h"
  20. #include "csvsplitter.hpp"
  21. #include "eclrtl.hpp"
  22. CSVSplitter::CSVSplitter()
  23. {
  24. lengths = NULL;
  25. data = NULL;
  26. numQuotes = 0;
  27. internalBuffer = NULL;
  28. maxColumns = 0;
  29. curUnquoted = NULL;
  30. }
  31. CSVSplitter::~CSVSplitter()
  32. {
  33. delete [] lengths;
  34. delete [] data;
  35. free(internalBuffer);
  36. }
  37. void CSVSplitter::addQuote(const char * text)
  38. {
  39. //Allow '' to remove quoting.
  40. if (text && *text)
  41. matcher.addEntry(text, QUOTE+(numQuotes++<<8));
  42. }
  43. void CSVSplitter::addSeparator(const char * text)
  44. {
  45. if (text && *text)
  46. matcher.addEntry(text, SEPARATOR);
  47. }
  48. void CSVSplitter::addTerminator(const char * text)
  49. {
  50. matcher.addEntry(text, TERMINATOR);
  51. }
  52. void CSVSplitter::addEscape(const char * text)
  53. {
  54. matcher.addEntry(text, ESCAPE);
  55. }
  56. void CSVSplitter::reset()
  57. {
  58. matcher.reset();
  59. delete [] lengths;
  60. delete [] data;
  61. free(internalBuffer);
  62. lengths = NULL;
  63. data = NULL;
  64. numQuotes = 0;
  65. internalBuffer = NULL;
  66. maxCsvSize = 0;
  67. }
  68. void CSVSplitter::init(unsigned _maxColumns, ICsvParameters * csvInfo, const char * dfsQuotes, const char * dfsSeparators, const char * dfsTerminators, const char * dfsEscapes)
  69. {
  70. reset();
  71. maxCsvSize = csvInfo->queryMaxSize();
  72. internalBuffer = (byte *)malloc(maxCsvSize);
  73. maxColumns = _maxColumns;
  74. lengths = new unsigned [maxColumns+1]; // NB: One larger to remove some tests in main loop...
  75. data = new const byte * [maxColumns+1];
  76. unsigned idx;
  77. unsigned flags = csvInfo->getFlags();
  78. if (dfsQuotes && (flags & ICsvParameters::defaultQuote))
  79. addActionList(matcher, dfsQuotes, QUOTE);
  80. else
  81. {
  82. for (idx=0;;idx++)
  83. {
  84. const char * text = csvInfo->queryQuote(idx);
  85. if (!text)
  86. break;
  87. addQuote(text);
  88. }
  89. }
  90. if (dfsSeparators && (flags & ICsvParameters::defaultSeparate))
  91. addActionList(matcher, dfsSeparators, SEPARATOR);
  92. else
  93. {
  94. for (idx=0;;idx++)
  95. {
  96. const char * text = csvInfo->querySeparator(idx);
  97. if (!text)
  98. break;
  99. addSeparator(text);
  100. }
  101. }
  102. if (dfsTerminators && (flags & ICsvParameters::defaultTerminate))
  103. addActionList(matcher, dfsTerminators, TERMINATOR);
  104. else
  105. {
  106. for (idx=0;;idx++)
  107. {
  108. const char * text = csvInfo->queryTerminator(idx);
  109. if (!text)
  110. break;
  111. addTerminator(text);
  112. }
  113. }
  114. // Old workunits won't have queryEscape. MORE: deprecate on the next major version
  115. if (flags & ICsvParameters::supportsEscape)
  116. {
  117. if (dfsEscapes && (flags & ICsvParameters::defaultEscape))
  118. addActionList(matcher, dfsEscapes, ESCAPE);
  119. else
  120. {
  121. for (idx=0;;idx++)
  122. {
  123. const char * text = csvInfo->queryEscape(idx);
  124. if (!text)
  125. break;
  126. addEscape(text);
  127. }
  128. }
  129. }
  130. //MORE Should this be configurable??
  131. if (!(flags & ICsvParameters::preserveWhitespace))
  132. {
  133. matcher.queryAddEntry(1, " ", WHITESPACE);
  134. matcher.queryAddEntry(1, "\t", WHITESPACE);
  135. }
  136. }
  137. void CSVSplitter::setFieldRange(const byte * start, const byte * end, unsigned curColumn, unsigned quoteToStrip, bool unescape)
  138. {
  139. // Either quoting or escaping will use the local buffer
  140. if ((quoteToStrip || unescape) &&
  141. (unsigned)(curUnquoted - internalBuffer) + (unsigned)(end - start) > maxCsvSize)
  142. throw MakeStringException(99, "MAXLENGTH for CSV file is not large enough");
  143. // point to the beginning of the local (possibly changed) buffer, for escaping later
  144. byte * curUnescaped = curUnquoted;
  145. if (quoteToStrip)
  146. {
  147. data[curColumn] = curUnquoted;
  148. const byte * lastCopied = start;
  149. const byte *cur;
  150. for (cur = start; cur != end; )
  151. {
  152. unsigned matchLen;
  153. unsigned match = matcher.getMatch((size32_t)(end-cur), (const char *)cur, matchLen);
  154. switch (match & 255)
  155. {
  156. case NONE:
  157. matchLen = 1;
  158. break;
  159. case WHITESPACE:
  160. case SEPARATOR:
  161. break;
  162. case TERMINATOR:
  163. goto done;
  164. case QUOTE:
  165. {
  166. const byte * next = cur + matchLen;
  167. if ((match == quoteToStrip) && (next != end))
  168. {
  169. unsigned nextMatchLen;
  170. unsigned nextMatch = matcher.getMatch((size32_t)(end-next), (const char *)next, nextMatchLen);
  171. if (nextMatch == match)
  172. {
  173. memcpy(curUnquoted, lastCopied, next-lastCopied);
  174. curUnquoted += (next-lastCopied);
  175. matchLen += nextMatchLen;
  176. lastCopied = cur+matchLen;
  177. }
  178. }
  179. break;
  180. }
  181. }
  182. cur += matchLen;
  183. }
  184. done:
  185. memcpy(curUnquoted, lastCopied, cur-lastCopied);
  186. curUnquoted += (cur-lastCopied);
  187. lengths[curColumn] = (size32_t)(curUnquoted - data[curColumn]);
  188. }
  189. else
  190. {
  191. lengths[curColumn] = (size32_t)(end-start);
  192. // Only if ESCAPEs were detected in the input
  193. if (unescape)
  194. {
  195. // Need to copy original to a local string (using allocated buffer)
  196. memcpy(curUnescaped, start, lengths[curColumn]);
  197. data[curColumn] = curUnescaped;
  198. // and update the buffer pointer, to re-use on next iteration
  199. curUnquoted = curUnescaped + lengths[curColumn];
  200. }
  201. else
  202. {
  203. data[curColumn] = start;
  204. return;
  205. }
  206. }
  207. // Un-escape string, if necessary.
  208. if (unescape)
  209. {
  210. byte * cur = curUnescaped; // data[curColumn] is already pointing here one way or another
  211. byte * end = cur + lengths[curColumn];
  212. for (; cur < end; cur++)
  213. {
  214. unsigned matchLen;
  215. unsigned match = matcher.getMatch((size32_t)(end-cur), (const char *)cur, matchLen);
  216. if ((match & 255) == ESCAPE)
  217. {
  218. ptrdiff_t restLen = end-(cur+matchLen);
  219. memmove(cur, cur+matchLen, restLen);
  220. end -= matchLen;
  221. lengths[curColumn] -= matchLen;
  222. // Avoid having cur past end
  223. if (cur == end)
  224. break;
  225. }
  226. }
  227. }
  228. }
  229. size32_t CSVSplitter::splitLine(size32_t maxLength, const byte * start)
  230. {
  231. unsigned curColumn = 0;
  232. unsigned quote = 0;
  233. unsigned quoteToStrip = 0;
  234. const byte * cur = start;
  235. const byte * end = start + maxLength;
  236. const byte * firstGood = start;
  237. const byte * lastGood = start;
  238. bool lastEscape = false;
  239. curUnquoted = internalBuffer;
  240. while (cur != end)
  241. {
  242. unsigned matchLen;
  243. unsigned match = matcher.getMatch((size32_t)(end-cur), (const char *)cur, matchLen);
  244. switch (match & 255)
  245. {
  246. case NONE:
  247. cur++; // matchLen == 0;
  248. lastGood = cur;
  249. break;
  250. case WHITESPACE:
  251. //Skip leading whitespace
  252. if (quote)
  253. lastGood = cur+matchLen;
  254. else if (cur == firstGood)
  255. {
  256. firstGood = cur+matchLen;
  257. lastGood = cur+matchLen;
  258. }
  259. break;
  260. case SEPARATOR:
  261. // Quoted separator
  262. if ((curColumn < maxColumns) && (quote == 0))
  263. {
  264. setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, lastEscape);
  265. lastEscape = false;
  266. quoteToStrip = 0;
  267. curColumn++;
  268. firstGood = cur + matchLen;
  269. }
  270. lastGood = cur+matchLen;
  271. break;
  272. case TERMINATOR:
  273. if (quote == 0) // Is this a good idea? Means a mismatched quote is not fixed by EOL
  274. {
  275. setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, lastEscape);
  276. lastEscape = false;
  277. while (++curColumn < maxColumns)
  278. lengths[curColumn] = 0;
  279. return (size32_t)(cur + matchLen - start);
  280. }
  281. lastGood = cur+matchLen;
  282. break;
  283. case QUOTE:
  284. // Quoted quote
  285. if (quote == 0)
  286. {
  287. if (cur == firstGood)
  288. {
  289. quote = match;
  290. firstGood = cur+matchLen;
  291. }
  292. lastGood = cur+matchLen;
  293. }
  294. else
  295. {
  296. if (quote == match)
  297. {
  298. const byte * next = cur + matchLen;
  299. //Check for double quotes
  300. if ((next != end))
  301. {
  302. unsigned nextMatchLen;
  303. unsigned nextMatch = matcher.getMatch((size32_t)(end-next), (const char *)next, nextMatchLen);
  304. if (nextMatch == quote)
  305. {
  306. quoteToStrip = quote;
  307. matchLen += nextMatchLen;
  308. lastGood = cur+matchLen;
  309. }
  310. else
  311. quote = 0;
  312. }
  313. else
  314. quote = 0;
  315. }
  316. else
  317. lastGood = cur+matchLen;
  318. }
  319. break;
  320. case ESCAPE:
  321. lastEscape = true;
  322. lastGood = cur+matchLen;
  323. // If this escape is at the end, proceed to field range
  324. if (lastGood == end)
  325. break;
  326. // Skip escape and ignore the next match
  327. cur += matchLen;
  328. match = matcher.getMatch((size32_t)(end-cur), (const char *)cur, matchLen);
  329. if ((match & 255) == NONE)
  330. matchLen = 1;
  331. lastGood += matchLen;
  332. break;
  333. }
  334. cur += matchLen;
  335. }
  336. setFieldRange(firstGood, lastGood, curColumn, quoteToStrip, lastEscape);
  337. while (++curColumn < maxColumns)
  338. lengths[curColumn] = 0;
  339. return (size32_t)(end - start);
  340. }
  341. //=====================================================================================================
  342. void CSVOutputStream::beginLine()
  343. {
  344. clear();
  345. prefix = NULL;
  346. }
  347. void CSVOutputStream::endLine()
  348. {
  349. append(terminator);
  350. }
  351. void CSVOutputStream::init(ICsvParameters * args, bool _oldOutputFormat)
  352. {
  353. if (args->queryEBCDIC())
  354. throw MakeStringException(99, "EBCDIC CSV output not yet implemented");
  355. quote.set(args->queryQuote(0));
  356. separator.set(args->querySeparator(0));
  357. terminator.set(args->queryTerminator(0));
  358. escape.set(args->queryEscape(0));
  359. oldOutputFormat = _oldOutputFormat||!quote.length();
  360. }
  361. void CSVOutputStream::writeUnicode(size32_t len, const UChar * data)
  362. {
  363. unsigned utf8Length;
  364. char * utf8Data = NULL;
  365. rtlUnicodeToCodepageX(utf8Length, utf8Data, len, data, "utf-8");
  366. writeString(utf8Length, utf8Data);
  367. rtlFree(utf8Data);
  368. }
  369. void CSVOutputStream::writeUtf8(size32_t len, const char * data)
  370. {
  371. append(prefix);
  372. if (oldOutputFormat) {
  373. append(quote).append(rtlUtf8Size(len, data), data).append(quote);
  374. }
  375. else if (len) {
  376. // is this OTT?
  377. // not sure if best way but generate an array of utf8 sizes
  378. MemoryAttr ma;
  379. size32_t * cl;
  380. if (len>256)
  381. cl = (size32_t *)ma.allocate(sizeof(size32_t)*len);
  382. else
  383. cl = (size32_t *)alloca(sizeof(size32_t)*len);
  384. unsigned start=(unsigned)-1;
  385. unsigned end=0;
  386. const byte * s = (const byte *)data;
  387. unsigned i;
  388. for (i=0;i<len;i++) {
  389. const byte *p=s;
  390. UChar next = readUtf8Character(sizeof(UChar), s);
  391. cl[i] = (size32_t)(s-p);
  392. if (!u_isspace(next)) {
  393. end = i;
  394. if (start==(unsigned)-1)
  395. start = i;
  396. }
  397. }
  398. const byte *e=s;
  399. // do trim
  400. if (start!=(unsigned)-1) {
  401. for (i=0;i<start;i++)
  402. data += *(cl++);
  403. len -= start;
  404. end -= start;
  405. end++;
  406. while (end<len)
  407. e -= cl[--len];
  408. }
  409. // now see if need quoting by looking for separator, terminator or quote
  410. // I *think* this can be done with memcmps as has to be exact
  411. size32_t sl = separator.length();
  412. size32_t tl = terminator.length();
  413. size32_t ql = quote.length();
  414. bool needquote=false;
  415. s = (const byte *)data;
  416. for (i=0;i<len;i++) {
  417. size32_t l = (size32_t)(e-s);
  418. if (sl&&(l>=sl)&&(memcmp(separator.get(),s,sl)==0)) {
  419. needquote = true;
  420. break;
  421. }
  422. if (tl&&(l>=tl)&&(memcmp(terminator.get(),s,tl)==0)) {
  423. needquote = true;
  424. break;
  425. }
  426. if ((l>=ql)&&(memcmp(quote.get(),s,ql)==0)) {
  427. needquote = true;
  428. break;
  429. }
  430. s+=cl[i];
  431. }
  432. if (needquote) {
  433. append(quote);
  434. s = (const byte *)data;
  435. for (i=0;i<len;i++) {
  436. size32_t l = (size32_t)(e-s);
  437. if ((l>=ql)&&(memcmp(quote.get(),s,ql)==0))
  438. append(quote);
  439. append(cl[i],(const char *)s);
  440. s+=cl[i];
  441. }
  442. append(quote);
  443. }
  444. else
  445. append((size32_t)(e-(const byte *)data),data);
  446. }
  447. prefix = separator;
  448. }
  449. void CSVOutputStream::writeString(size32_t len, const char * data)
  450. {
  451. append(prefix);
  452. if (oldOutputFormat) {
  453. append(quote).append(len, data).append(quote);
  454. }
  455. else if (len) {
  456. // New format (as per GS)
  457. // first trim
  458. while (len&&(*data==' ')) {
  459. len--;
  460. data++;
  461. }
  462. while (len&&(data[len-1]==' '))
  463. len--;
  464. // now see if need quoting by looking for separator, terminator or quote
  465. size32_t sl = separator.length();
  466. size32_t tl = terminator.length();
  467. size32_t ql = quote.length();
  468. bool needquote=false;
  469. const char *s = data;
  470. for (unsigned l=len;l>0;l--) {
  471. if (sl&&(l>=sl)&&(memcmp(separator.get(),s,sl)==0)) {
  472. needquote = true;
  473. break;
  474. }
  475. if (tl&&(l>=tl)&&(memcmp(terminator.get(),s,tl)==0)) {
  476. needquote = true;
  477. break;
  478. }
  479. if ((l>=ql)&&(memcmp(quote.get(),s,ql)==0)) {
  480. needquote = true;
  481. break;
  482. }
  483. s++;
  484. }
  485. if (needquote) {
  486. append(quote);
  487. const char *s = data;
  488. for (unsigned l=len;l>0;l--) {
  489. if ((l>=ql)&&(memcmp(quote.get(),s,ql)==0))
  490. append(quote);
  491. append(*(s++));
  492. }
  493. append(quote);
  494. }
  495. else
  496. append(len,data);
  497. }
  498. prefix = separator;
  499. }
  500. void CSVOutputStream::writeHeaderLn(size32_t len, const char * data)
  501. {
  502. append(len,data);
  503. if (!oldOutputFormat&&len) {
  504. size32_t tl = terminator.length();
  505. if ((tl>len)||(memcmp(data+len-tl,terminator.get(),tl)!=0))
  506. endLine();
  507. }
  508. }