eclregex.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #include "limits.h"
  14. #if defined(_USE_BOOST_REGEX)
  15. #include "boost/regex.hpp" // must precede platform.h ; n.b. this uses a #pragma comment(lib, ...) to link the appropriate .lib in MSVC
  16. #elif defined(_USE_C11_REGEX)
  17. #include <regex>
  18. #endif
  19. #include "platform.h"
  20. #include "eclrtl.hpp"
  21. #include "eclrtl_imp.hpp"
  22. #include "unicode/regex.h"
  23. #define UTF8_CODEPAGE "UTF-8"
  24. #define UTF8_MAXSIZE 4
  25. #if defined(_USE_BOOST_REGEX) || defined(_USE_C11_REGEX)
// Pull the regex vocabulary of the selected backend into this file so the
// string regex classes below can be written once against unqualified names
// (regex, cmatch, regex_search, ...).
#if defined(_USE_BOOST_REGEX)
// Boost.Regex backend.
using boost::regex;
using boost::regex_search;
using boost::regex_replace;
using boost::regex_iterator;
using boost::cmatch;
using boost::match_results;
#else
// Standard library <regex> backend (the enclosing #if guarantees that
// _USE_C11_REGEX is defined when _USE_BOOST_REGEX is not).
using std::regex;
using std::regex_search;
using std::regex_replace;
using std::regex_iterator;
using std::cmatch;
using std::match_results;
#endif
  41. class CStrRegExprFindInstance : implements IStrRegExprFindInstance
  42. {
  43. private:
  44. bool matched;
  45. const regex * regEx;
  46. cmatch subs;
  47. char * sample; //only required if findstr/findvstr will be called
  48. public:
  49. CStrRegExprFindInstance(const regex * _regEx, const char * _str, size32_t _from, size32_t _len, bool _keep)
  50. : regEx(_regEx)
  51. {
  52. matched = false;
  53. sample = NULL;
  54. try
  55. {
  56. if (_keep)
  57. {
  58. sample = (char *)rtlMalloc(_len + 1); //required for findstr
  59. memcpy(sample, _str + _from, _len);
  60. sample[_len] = (char)NULL;
  61. matched = regex_search(sample, subs, *regEx);
  62. }
  63. else
  64. {
  65. matched = regex_search(_str + _from, _str + _len, subs, *regEx);
  66. }
  67. }
  68. catch (const std::runtime_error & e)
  69. {
  70. std::string msg = "Error in regex search: ";
  71. msg += e.what();
  72. #if defined(_USE_BOOST_REGEX)
  73. msg += "(regex: ";
  74. msg += regEx->str();
  75. msg += ")";
  76. #endif
  77. rtlFail(0, msg.c_str());
  78. }
  79. }
  80. ~CStrRegExprFindInstance() //CAVEAT non-virtual destructor !
  81. {
  82. free(sample);
  83. }
  84. //IStrRegExprFindInstance
  85. bool found() const { return matched; }
  86. void getMatchX(unsigned & outlen, char * & out, unsigned n = 0) const
  87. {
  88. if (matched && (n < subs.size()))
  89. {
  90. outlen = subs[n].second - subs[n].first;
  91. out = (char *)rtlMalloc(outlen);
  92. memcpy(out, subs[n].first, outlen);
  93. }
  94. else
  95. {
  96. outlen = 0;
  97. out = NULL;
  98. }
  99. }
  100. char const * findvstr(unsigned outlen, char * out, unsigned n = 0)
  101. {
  102. if (matched && (n < subs.size()))
  103. {
  104. unsigned sublen = subs[n].second - subs[n].first;
  105. if (sublen >= outlen)
  106. sublen = outlen - 1;
  107. memcpy(out, subs[n].first, sublen);
  108. out[sublen] = 0;
  109. }
  110. else
  111. {
  112. out[0] = 0;
  113. }
  114. return out;
  115. }
  116. };
  117. //---------------------------------------------------------------------------
  118. class CCompiledStrRegExpr : implements ICompiledStrRegExpr
  119. {
  120. private:
  121. regex regEx;
  122. public:
  123. CCompiledStrRegExpr(const char * _regExp, bool _isCaseSensitive = false)
  124. {
  125. try
  126. {
  127. #if defined(_USE_BOOST_REGEX)
  128. if (_isCaseSensitive)
  129. regEx.assign(_regExp, regex::perl);
  130. else
  131. regEx.assign(_regExp, regex::perl | regex::icase);
  132. #else
  133. if (_isCaseSensitive)
  134. regEx.assign(_regExp, regex::ECMAScript);
  135. else
  136. regEx.assign(_regExp, regex::ECMAScript | regex::icase);
  137. #endif
  138. }
  139. #if defined(_USE_BOOST_REGEX)
  140. catch(const boost::bad_expression & e)
  141. #else
  142. catch(const std::regex_error & e)
  143. #endif
  144. {
  145. std::string msg = "Bad regular expression: ";
  146. msg += e.what();
  147. msg += ": ";
  148. msg += _regExp;
  149. rtlFail(0, msg.c_str()); //throws
  150. }
  151. }
  152. //ICompiledStrRegExpr
  153. void replace(size32_t & outlen, char * & out, size32_t slen, char const * str, size32_t rlen, char const * replace) const
  154. {
  155. std::string src(str, str + slen);
  156. std::string fmt(replace, replace + rlen);
  157. std::string tgt;
  158. try
  159. {
  160. // tgt = boost::regex_merge(src, cre->regEx, fmt, boost::format_perl); //Algorithm regex_merge has been renamed regex_replace, existing code will continue to compile, but new code should use regex_replace instead.
  161. #if defined(_USE_BOOST_REGEX)
  162. tgt = regex_replace(src, regEx, fmt, boost::format_perl);
  163. #else
  164. tgt = regex_replace(src, regEx, fmt);
  165. #endif
  166. }
  167. catch(const std::runtime_error & e)
  168. {
  169. std::string msg = "Error in regex replace: ";
  170. msg += e.what();
  171. #if defined(_USE_BOOST_REGEX)
  172. msg += "(regex: ";
  173. msg += regEx.str();
  174. msg += ")";
  175. #endif
  176. rtlFail(0, msg.c_str());
  177. }
  178. outlen = tgt.length();
  179. out = (char *)rtlMalloc(outlen);
  180. memcpy(out, tgt.data(), outlen);
  181. }
  182. IStrRegExprFindInstance * find(const char * str, size32_t from, size32_t len, bool needToKeepSearchString) const
  183. {
  184. CStrRegExprFindInstance * findInst = new CStrRegExprFindInstance(&regEx, str, from, len, needToKeepSearchString);
  185. return findInst;
  186. }
  187. void getMatchSet(bool & __isAllResult, size32_t & __resultBytes, void * & __result, size32_t _srcLen, const char * _search)
  188. {
  189. rtlRowBuilder out;
  190. size32_t outBytes = 0;
  191. const char * search_end = _search+_srcLen;
  192. regex_iterator<const char *> cur(_search, search_end, regEx);
  193. regex_iterator<const char *> end; // Default contructor creates an end of list marker
  194. for (; cur != end; ++cur)
  195. {
  196. const match_results<const char *> &match = *cur;
  197. if (match[0].first==search_end) break;
  198. const size32_t lenBytes = match[0].second - match[0].first;
  199. out.ensureAvailable(outBytes+lenBytes+sizeof(size32_t));
  200. byte *outData = out.getbytes()+outBytes;
  201. * (size32_t *) outData = lenBytes;
  202. rtlStrToStr(lenBytes, outData+sizeof(size32_t), lenBytes, match[0].first);
  203. outBytes += lenBytes+sizeof(size32_t);
  204. }
  205. __isAllResult = false;
  206. __resultBytes = outBytes;
  207. __result = out.detachdata();
  208. };
  209. };
  210. //---------------------------------------------------------------------------
  211. ECLRTL_API ICompiledStrRegExpr * rtlCreateCompiledStrRegExpr(const char * regExpr, bool isCaseSensitive)
  212. {
  213. CCompiledStrRegExpr * expr = new CCompiledStrRegExpr(regExpr, isCaseSensitive);
  214. return expr;
  215. }
  216. ECLRTL_API void rtlDestroyCompiledStrRegExpr(ICompiledStrRegExpr * compiledExpr)
  217. {
  218. if (compiledExpr)
  219. delete (CCompiledStrRegExpr*)compiledExpr;
  220. }
  221. ECLRTL_API void rtlDestroyStrRegExprFindInstance(IStrRegExprFindInstance * findInst)
  222. {
  223. if (findInst)
  224. delete (CStrRegExprFindInstance*)findInst;
  225. }
  226. //---------------------------------------------------------------------------
  227. // RegEx Compiler for unicode strings
  228. class CUStrRegExprFindInstance : implements IUStrRegExprFindInstance
  229. {
  230. private:
  231. bool matched;
  232. RegexMatcher * matcher;
  233. UnicodeString sample;
  234. unsigned matchedSize;
  235. public:
  236. CUStrRegExprFindInstance(RegexMatcher * _matcher, const UChar * _str, size32_t _from, size32_t _len)
  237. : matcher(_matcher)
  238. {
  239. matched = false;
  240. sample.setTo(_str + _from, _len);
  241. matcher->reset(sample);
  242. matched = matcher->find() != FALSE;
  243. if (matched)
  244. matchedSize = (unsigned)matcher->groupCount() + 1;
  245. }
  246. //IUStrRegExprFindInstance
  247. bool found() const { return matched; }
  248. void getMatchX(unsigned & outlen, UChar * & out, unsigned n = 0) const
  249. {
  250. if(matched && (n < matchedSize))
  251. {
  252. UErrorCode uerr = U_ZERO_ERROR;
  253. int32_t start = n ? matcher->start(n, uerr) : matcher->start(uerr);
  254. int32_t end = n ? matcher->end(n, uerr) : matcher->end(uerr);
  255. outlen = end - start;
  256. out = (UChar *)rtlMalloc(outlen*2);
  257. sample.extract(start, outlen, out);
  258. }
  259. else
  260. {
  261. outlen = 0;
  262. out = NULL;
  263. }
  264. }
  265. UChar const * findvstr(unsigned outlen, UChar * out, unsigned n = 0)
  266. {
  267. if(matched && (n < matchedSize))
  268. {
  269. UErrorCode uerr = U_ZERO_ERROR;
  270. int32_t start = n ? matcher->start(n, uerr) : matcher->start(uerr);
  271. int32_t end = n ? matcher->end(n, uerr) : matcher->end(uerr);
  272. unsigned sublen = end - start;
  273. if(sublen >= outlen)
  274. sublen = outlen - 1;
  275. sample.extract(start, sublen, out);
  276. out[sublen] = 0;
  277. }
  278. else
  279. {
  280. out[0] = 0;
  281. }
  282. return out;
  283. }
  284. };
  285. //---------------------------------------------------------------------------
  286. class CCompiledUStrRegExpr : implements ICompiledUStrRegExpr
  287. {
  288. private:
  289. RegexPattern * pattern;
  290. RegexMatcher * matcher;
  291. public:
  292. CCompiledUStrRegExpr(const UChar * _UregExp, bool _isCaseSensitive = false)
  293. {
  294. UErrorCode uerr = U_ZERO_ERROR;
  295. UParseError uperr;
  296. if (_isCaseSensitive)
  297. pattern = RegexPattern::compile(_UregExp, uperr, uerr);
  298. else
  299. pattern = RegexPattern::compile(_UregExp, UREGEX_CASE_INSENSITIVE, uperr, uerr);
  300. matcher = pattern->matcher(uerr);
  301. if (U_FAILURE(uerr))
  302. {
  303. char * expAscii;
  304. unsigned expAsciiLen;
  305. rtlUnicodeToEscapedStrX(expAsciiLen, expAscii, rtlUnicodeStrlen(_UregExp), _UregExp);
  306. std::string msg = "Bad regular expression: ";
  307. msg += u_errorName(uerr);
  308. msg += ": ";
  309. msg.append(expAscii, expAsciiLen);
  310. rtlFree(expAscii);
  311. delete matcher;
  312. delete pattern;
  313. matcher = 0;
  314. pattern = 0;
  315. rtlFail(0, msg.c_str()); //throws
  316. }
  317. }
  318. ~CCompiledUStrRegExpr()
  319. {
  320. if (matcher)
  321. delete matcher;
  322. if (pattern)
  323. delete pattern;
  324. }
  325. void replace(size32_t & outlen, UChar * & out, size32_t slen, const UChar * str, size32_t rlen, UChar const * replace) const
  326. {
  327. UnicodeString const src(str, slen);
  328. UErrorCode err = U_ZERO_ERROR;
  329. RegexMatcher * replacer = pattern->matcher(src, err);
  330. UnicodeString const fmt(replace, rlen);
  331. UnicodeString const tgt = replacer->replaceAll(fmt, err);
  332. outlen = tgt.length();
  333. out = (UChar *)rtlMalloc(outlen*2);
  334. tgt.extract(0, outlen, out);
  335. delete replacer;
  336. }
  337. IUStrRegExprFindInstance * find(const UChar * str, size32_t from, size32_t len) const
  338. {
  339. CUStrRegExprFindInstance * findInst = new CUStrRegExprFindInstance(matcher, str, from, len);
  340. return findInst;
  341. }
  342. void getMatchSet(bool & __isAllResult, size32_t & __resultBytes, void * & __result, size32_t _srcLen, const UChar * _search)
  343. {
  344. rtlRowBuilder out;
  345. size32_t outBytes = 0;
  346. UErrorCode uerr = U_ZERO_ERROR;
  347. UnicodeString uStrSearch;
  348. uStrSearch.setTo(_search, _srcLen);
  349. matcher->reset(uStrSearch);
  350. while (matcher->find())
  351. {
  352. uerr = U_ZERO_ERROR;
  353. int32_t start = matcher->start(uerr);
  354. if ((size32_t) start==_srcLen) break;
  355. int32_t end = matcher->end(uerr);
  356. int32_t numUChars = end - start;
  357. out.ensureAvailable(outBytes+numUChars*sizeof(UChar)+sizeof(size32_t));
  358. byte *outData = out.getbytes()+outBytes;
  359. * (size32_t *) outData = numUChars;
  360. uStrSearch.extract(start,numUChars,(UChar *) (outData+sizeof(size32_t)));
  361. outBytes += numUChars*sizeof(UChar) + sizeof(size32_t);
  362. }
  363. __isAllResult = false;
  364. __resultBytes = outBytes;
  365. __result = out.detachdata();
  366. }
  367. };
  368. //---------------------------------------------------------------------------
  369. ECLRTL_API ICompiledUStrRegExpr * rtlCreateCompiledUStrRegExpr(const UChar * regExpr, bool isCaseSensitive)
  370. {
  371. CCompiledUStrRegExpr * expr = new CCompiledUStrRegExpr(regExpr, isCaseSensitive);
  372. return expr;
  373. }
  374. ECLRTL_API void rtlDestroyCompiledUStrRegExpr(ICompiledUStrRegExpr * compiledExpr)
  375. {
  376. if (compiledExpr)
  377. delete (CCompiledUStrRegExpr*)compiledExpr;
  378. }
  379. ECLRTL_API void rtlDestroyUStrRegExprFindInstance(IUStrRegExprFindInstance * findInst)
  380. {
  381. if (findInst)
  382. delete (CUStrRegExprFindInstance*)findInst;
  383. }
  384. #else // _USE_BOOST_REGEX or _USE_C11_REGEX not set
  385. ECLRTL_API ICompiledStrRegExpr * rtlCreateCompiledStrRegExpr(const char * regExpr, bool isCaseSensitive)
  386. {
  387. rtlFail(0, "Boost/C++11 regex disabled");
  388. }
  389. ECLRTL_API void rtlDestroyCompiledStrRegExpr(ICompiledStrRegExpr * compiledExpr)
  390. {
  391. }
  392. ECLRTL_API void rtlDestroyStrRegExprFindInstance(IStrRegExprFindInstance * findInst)
  393. {
  394. }
  395. ECLRTL_API ICompiledUStrRegExpr * rtlCreateCompiledUStrRegExpr(const UChar * regExpr, bool isCaseSensitive)
  396. {
  397. rtlFail(0, "Boost/C++11 regex disabled");
  398. }
  399. ECLRTL_API void rtlDestroyCompiledUStrRegExpr(ICompiledUStrRegExpr * compiledExpr)
  400. {
  401. }
  402. ECLRTL_API void rtlDestroyUStrRegExprFindInstance(IUStrRegExprFindInstance * findInst)
  403. {
  404. }
  405. #endif // _USE_BOOST_REGEX or _USE_C11_REGEX