csvsplitter.hpp 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #ifndef CSVSPLITTER_INCL
  14. #define CSVSPLITTER_INCL
  // Linkage decoration applied to the classes declared in this header.
  // When THORHELPER_EXPORTS is defined the exporting form is selected;
  // every other translation unit gets the importing form.
  // NOTE(review): DECL_EXPORT / DECL_IMPORT are not defined in this file -
  // presumably a platform header supplies them before first use; confirm.
  #if defined(THORHELPER_EXPORTS)
  # define THORHELPER_API DECL_EXPORT
  #else
  # define THORHELPER_API DECL_IMPORT
  #endif
  20. #include "jregexp.hpp"
  21. #include "eclhelper.hpp"
  22. #include "unicode/utf.h"
  23. /**
  24. * CSVSplitter - splits CSV files into fields and rows.
  25. *
  26. * CSV files are text based records that can have user defined syntax for quoting,
  27. * escaping, separating fields and rows. According to RFC-4180, there isn't a
  28. * standard way of building CSV files; however, there is a set of general rules
  29. * that most implementations seem to follow. This makes it hard to implement a CSV
  30. * parser, since even if you follow the RFC, you might not read some files as the
  31. * producer intended.
  32. *
  33. * The general rules are:
  34. * * rows are separated by EOL
  35. * * fields are separated by comma
  36. * * special text must be enclosed by quotes
  37. * * there must be a form of escaping quotes
  38. *
  39. * However, this implementation allows for user-specified quotes, (field) separators,
  40. * terminators (row separators), whitespace and (multi-char) escaping sequences, so
  41. * it should be possible to accommodate most files that deviate from the norm, while
  42. * still reading the files correctly by default.
  43. *
  44. * One important rule is that any special behaviour should be enclosed by quotes, so
  45. * you don't need to account for escaping separators or terminators when they're not
  46. * themselves quoted. This, and non-matching quotes, should be considered a syntax
  47. * error, and the producer should then fix their output.
  48. *
  49. * Also, many CSV producers (including commercial databases) use backslash (\) as the
  50. * escape character, while the RFC mentions doubling quotes (""). We implement both.
  51. */
  52. class THORHELPER_API CSVSplitter
  53. {
  54. public:
  55. CSVSplitter();
  56. ~CSVSplitter();
  57. void addQuote(const char * text);
  58. void addSeparator(const char * text);
  59. void addTerminator(const char * text);
  60. void addEscape(const char * text);
  61. void init(unsigned maxColumns, ICsvParameters * csvInfo, const char * dfsQuotes, const char * dfsSeparators, const char * dfsTerminators, const char * dfsEscapes);
  62. void reset();
  63. size32_t splitLine(size32_t maxLen, const byte * start);
  64. inline unsigned * queryLengths() { return lengths; }
  65. inline const byte * * queryData() { return data; }
  66. protected:
  67. void setFieldRange(const byte * start, const byte * end, unsigned curColumn, unsigned quoteToStrip, bool unescape);
  68. protected:
  69. enum { NONE=0, SEPARATOR=1, TERMINATOR=2, WHITESPACE=3, QUOTE=4, ESCAPE=5 };
  70. unsigned maxColumns;
  71. StringMatcher matcher;
  72. unsigned numQuotes;
  73. unsigned * lengths;
  74. const byte * * data;
  75. byte * internalBuffer;
  76. size32_t internalOffset;
  77. size32_t sizeInternal;
  78. size32_t maxCsvSize;
  79. };
  80. class THORHELPER_API CSVOutputStream : public StringBuffer, implements ITypedOutputStream
  81. {
  82. public:
  83. void beginLine();
  84. void writeHeaderLn(size32_t len, const char * data); // no need for endLine
  85. void endLine();
  86. void init(ICsvParameters * args, bool _oldOutputFormat);
  87. virtual void writeReal(double value) { append(prefix).append(value); prefix = separator; }
  88. virtual void writeSigned(__int64 value) { append(prefix).append(value); prefix = separator; }
  89. virtual void writeString(size32_t len, const char * data);
  90. virtual void writeUnicode(size32_t len, const UChar * data);
  91. virtual void writeUnsigned(unsigned __int64 value) { append(prefix).append(value); prefix = separator; }
  92. virtual void writeUtf8(size32_t len, const char * data);
  93. protected:
  94. StringAttr separator;
  95. StringAttr terminator;
  96. StringAttr quote;
  97. StringAttr escape;
  98. const char * prefix;
  99. bool oldOutputFormat;
  100. };
  101. #endif // CSVSPLITTER_INCL