eclregex.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #include "limits.h"
  14. #if defined(_USE_BOOST_REGEX)
  15. #include "boost/regex.hpp" // must precede platform.h ; n.b. this uses a #pragma comment(lib, ...) to link the appropriate .lib in MSVC
  16. #elif defined(_USE_C11_REGEX)
  17. #include <regex>
  18. #endif
  19. #include "platform.h"
  20. #include "eclrtl.hpp"
  21. #include "eclrtl_imp.hpp"
  22. #ifdef _USE_ICU
  23. #include "unicode/regex.h"
  24. #endif
  25. #define UTF8_CODEPAGE "UTF-8"
  26. #define UTF8_MAXSIZE 4
  27. #if defined(_USE_BOOST_REGEX) || defined(_USE_C11_REGEX)
  28. #if defined(_USE_BOOST_REGEX)
  29. using boost::regex;
  30. using boost::regex_search;
  31. using boost::regex_replace;
  32. using boost::regex_iterator;
  33. using boost::cmatch;
  34. using boost::match_results;
  35. #else
  36. using std::regex;
  37. using std::regex_search;
  38. using std::regex_replace;
  39. using std::regex_iterator;
  40. using std::cmatch;
  41. using std::match_results;
  42. #endif
  43. class CStrRegExprFindInstance : implements IStrRegExprFindInstance
  44. {
  45. private:
  46. bool matched;
  47. const regex * regEx;
  48. cmatch subs;
  49. char * sample; //only required if findstr/findvstr will be called
  50. public:
  51. CStrRegExprFindInstance(const regex * _regEx, const char * _str, size32_t _from, size32_t _len, bool _keep)
  52. : regEx(_regEx)
  53. {
  54. matched = false;
  55. sample = NULL;
  56. try
  57. {
  58. if (_keep)
  59. {
  60. sample = (char *)rtlMalloc(_len + 1); //required for findstr
  61. memcpy(sample, _str + _from, _len);
  62. sample[_len] = (char)NULL;
  63. matched = regex_search(sample, subs, *regEx);
  64. }
  65. else
  66. {
  67. matched = regex_search(_str + _from, _str + _len, subs, *regEx);
  68. }
  69. }
  70. catch (const std::runtime_error & e)
  71. {
  72. std::string msg = "Error in regex search: ";
  73. msg += e.what();
  74. #if defined(_USE_BOOST_REGEX)
  75. msg += "(regex: ";
  76. msg += regEx->str();
  77. msg += ")";
  78. #endif
  79. rtlFail(0, msg.c_str());
  80. }
  81. }
  82. ~CStrRegExprFindInstance() //CAVEAT non-virtual destructor !
  83. {
  84. free(sample);
  85. }
  86. //IStrRegExprFindInstance
  87. bool found() const { return matched; }
  88. void getMatchX(unsigned & outlen, char * & out, unsigned n = 0) const
  89. {
  90. if (matched && (n < subs.size()))
  91. {
  92. outlen = subs[n].second - subs[n].first;
  93. out = (char *)rtlMalloc(outlen);
  94. memcpy(out, subs[n].first, outlen);
  95. }
  96. else
  97. {
  98. outlen = 0;
  99. out = NULL;
  100. }
  101. }
  102. char const * findvstr(unsigned outlen, char * out, unsigned n = 0)
  103. {
  104. if (matched && (n < subs.size()))
  105. {
  106. unsigned sublen = subs[n].second - subs[n].first;
  107. if (sublen >= outlen)
  108. sublen = outlen - 1;
  109. memcpy(out, subs[n].first, sublen);
  110. out[sublen] = 0;
  111. }
  112. else
  113. {
  114. out[0] = 0;
  115. }
  116. return out;
  117. }
  118. };
  119. //---------------------------------------------------------------------------
  120. class CCompiledStrRegExpr : implements ICompiledStrRegExpr
  121. {
  122. private:
  123. regex regEx;
  124. public:
  125. CCompiledStrRegExpr(const char * _regExp, bool _isCaseSensitive = false)
  126. {
  127. try
  128. {
  129. #if defined(_USE_BOOST_REGEX)
  130. if (_isCaseSensitive)
  131. regEx.assign(_regExp, regex::perl);
  132. else
  133. regEx.assign(_regExp, regex::perl | regex::icase);
  134. #else
  135. if (_isCaseSensitive)
  136. regEx.assign(_regExp, regex::ECMAScript);
  137. else
  138. regEx.assign(_regExp, regex::ECMAScript | regex::icase);
  139. #endif
  140. }
  141. #if defined(_USE_BOOST_REGEX)
  142. catch(const boost::bad_expression & e)
  143. #else
  144. catch(const std::regex_error & e)
  145. #endif
  146. {
  147. std::string msg = "Bad regular expression: ";
  148. msg += e.what();
  149. msg += ": ";
  150. msg += _regExp;
  151. rtlFail(0, msg.c_str()); //throws
  152. }
  153. }
  154. //ICompiledStrRegExpr
  155. void replace(size32_t & outlen, char * & out, size32_t slen, char const * str, size32_t rlen, char const * replace) const
  156. {
  157. std::string src(str, str + slen);
  158. std::string fmt(replace, replace + rlen);
  159. std::string tgt;
  160. try
  161. {
  162. // tgt = boost::regex_merge(src, cre->regEx, fmt, boost::format_perl); //Algorithm regex_merge has been renamed regex_replace, existing code will continue to compile, but new code should use regex_replace instead.
  163. #if defined(_USE_BOOST_REGEX)
  164. tgt = regex_replace(src, regEx, fmt, boost::format_perl);
  165. #else
  166. tgt = regex_replace(src, regEx, fmt);
  167. #endif
  168. }
  169. catch(const std::runtime_error & e)
  170. {
  171. std::string msg = "Error in regex replace: ";
  172. msg += e.what();
  173. #if defined(_USE_BOOST_REGEX)
  174. msg += "(regex: ";
  175. msg += regEx.str();
  176. msg += ")";
  177. #endif
  178. rtlFail(0, msg.c_str());
  179. }
  180. outlen = tgt.length();
  181. out = (char *)rtlMalloc(outlen);
  182. memcpy(out, tgt.data(), outlen);
  183. }
  184. IStrRegExprFindInstance * find(const char * str, size32_t from, size32_t len, bool needToKeepSearchString) const
  185. {
  186. CStrRegExprFindInstance * findInst = new CStrRegExprFindInstance(&regEx, str, from, len, needToKeepSearchString);
  187. return findInst;
  188. }
  189. void getMatchSet(bool & __isAllResult, size32_t & __resultBytes, void * & __result, size32_t _srcLen, const char * _search)
  190. {
  191. rtlRowBuilder out;
  192. size32_t outBytes = 0;
  193. const char * search_end = _search+_srcLen;
  194. regex_iterator<const char *> cur(_search, search_end, regEx);
  195. regex_iterator<const char *> end; // Default contructor creates an end of list marker
  196. for (; cur != end; ++cur)
  197. {
  198. const match_results<const char *> &match = *cur;
  199. if (match[0].first==search_end) break;
  200. const size32_t lenBytes = match[0].second - match[0].first;
  201. out.ensureAvailable(outBytes+lenBytes+sizeof(size32_t));
  202. byte *outData = out.getbytes()+outBytes;
  203. * (size32_t *) outData = lenBytes;
  204. rtlStrToStr(lenBytes, outData+sizeof(size32_t), lenBytes, match[0].first);
  205. outBytes += lenBytes+sizeof(size32_t);
  206. }
  207. __isAllResult = false;
  208. __resultBytes = outBytes;
  209. __result = out.detachdata();
  210. };
  211. };
  212. //---------------------------------------------------------------------------
  213. ECLRTL_API ICompiledStrRegExpr * rtlCreateCompiledStrRegExpr(const char * regExpr, bool isCaseSensitive)
  214. {
  215. CCompiledStrRegExpr * expr = new CCompiledStrRegExpr(regExpr, isCaseSensitive);
  216. return expr;
  217. }
  218. ECLRTL_API void rtlDestroyCompiledStrRegExpr(ICompiledStrRegExpr * compiledExpr)
  219. {
  220. if (compiledExpr)
  221. delete (CCompiledStrRegExpr*)compiledExpr;
  222. }
  223. ECLRTL_API void rtlDestroyStrRegExprFindInstance(IStrRegExprFindInstance * findInst)
  224. {
  225. if (findInst)
  226. delete (CStrRegExprFindInstance*)findInst;
  227. }
  228. //---------------------------------------------------------------------------
  229. // RegEx Compiler for unicode strings
  230. #ifdef _USE_ICU
  231. class CUStrRegExprFindInstance : implements IUStrRegExprFindInstance
  232. {
  233. private:
  234. bool matched;
  235. RegexMatcher * matcher;
  236. UnicodeString sample;
  237. unsigned matchedSize = 0;
  238. public:
  239. CUStrRegExprFindInstance(RegexMatcher * _matcher, const UChar * _str, size32_t _from, size32_t _len)
  240. : matcher(_matcher)
  241. {
  242. matched = false;
  243. sample.setTo(_str + _from, _len);
  244. matcher->reset(sample);
  245. matched = matcher->find() != FALSE;
  246. if (matched)
  247. matchedSize = (unsigned)matcher->groupCount() + 1;
  248. }
  249. //IUStrRegExprFindInstance
  250. bool found() const { return matched; }
  251. void getMatchX(unsigned & outlen, UChar * & out, unsigned n = 0) const
  252. {
  253. if(matched && (n < matchedSize))
  254. {
  255. UErrorCode uerr = U_ZERO_ERROR;
  256. int32_t start = n ? matcher->start(n, uerr) : matcher->start(uerr);
  257. int32_t end = n ? matcher->end(n, uerr) : matcher->end(uerr);
  258. outlen = end - start;
  259. out = (UChar *)rtlMalloc(outlen*2);
  260. sample.extract(start, outlen, out);
  261. }
  262. else
  263. {
  264. outlen = 0;
  265. out = NULL;
  266. }
  267. }
  268. UChar const * findvstr(unsigned outlen, UChar * out, unsigned n = 0)
  269. {
  270. if(matched && (n < matchedSize))
  271. {
  272. UErrorCode uerr = U_ZERO_ERROR;
  273. int32_t start = n ? matcher->start(n, uerr) : matcher->start(uerr);
  274. int32_t end = n ? matcher->end(n, uerr) : matcher->end(uerr);
  275. unsigned sublen = end - start;
  276. if(sublen >= outlen)
  277. sublen = outlen - 1;
  278. sample.extract(start, sublen, out);
  279. out[sublen] = 0;
  280. }
  281. else
  282. {
  283. out[0] = 0;
  284. }
  285. return out;
  286. }
  287. };
  288. //---------------------------------------------------------------------------
  289. class CCompiledUStrRegExpr : implements ICompiledUStrRegExpr
  290. {
  291. private:
  292. RegexPattern * pattern;
  293. RegexMatcher * matcher;
  294. public:
  295. CCompiledUStrRegExpr(const UChar * _UregExp, bool _isCaseSensitive = false)
  296. {
  297. UErrorCode uerr = U_ZERO_ERROR;
  298. UParseError uperr;
  299. if (_isCaseSensitive)
  300. pattern = RegexPattern::compile(_UregExp, uperr, uerr);
  301. else
  302. pattern = RegexPattern::compile(_UregExp, UREGEX_CASE_INSENSITIVE, uperr, uerr);
  303. matcher = pattern ? pattern->matcher(uerr) : NULL;
  304. if (U_FAILURE(uerr))
  305. {
  306. char * expAscii;
  307. unsigned expAsciiLen;
  308. rtlUnicodeToEscapedStrX(expAsciiLen, expAscii, rtlUnicodeStrlen(_UregExp), _UregExp);
  309. std::string msg = "Bad regular expression: ";
  310. msg += u_errorName(uerr);
  311. msg += ": ";
  312. msg.append(expAscii, expAsciiLen);
  313. rtlFree(expAscii);
  314. delete matcher;
  315. delete pattern;
  316. matcher = NULL;
  317. pattern = NULL;
  318. rtlFail(0, msg.c_str()); //throws
  319. }
  320. }
  321. ~CCompiledUStrRegExpr()
  322. {
  323. if (matcher)
  324. delete matcher;
  325. if (pattern)
  326. delete pattern;
  327. }
  328. void replace(size32_t & outlen, UChar * & out, size32_t slen, const UChar * str, size32_t rlen, UChar const * replace) const
  329. {
  330. UnicodeString const src(str, slen);
  331. UErrorCode err = U_ZERO_ERROR;
  332. RegexMatcher * replacer = pattern->matcher(src, err);
  333. UnicodeString const fmt(replace, rlen);
  334. UnicodeString const tgt = replacer->replaceAll(fmt, err);
  335. outlen = tgt.length();
  336. out = (UChar *)rtlMalloc(outlen*2);
  337. tgt.extract(0, outlen, out);
  338. delete replacer;
  339. }
  340. IUStrRegExprFindInstance * find(const UChar * str, size32_t from, size32_t len) const
  341. {
  342. CUStrRegExprFindInstance * findInst = new CUStrRegExprFindInstance(matcher, str, from, len);
  343. return findInst;
  344. }
  345. void getMatchSet(bool & __isAllResult, size32_t & __resultBytes, void * & __result, size32_t _srcLen, const UChar * _search)
  346. {
  347. rtlRowBuilder out;
  348. size32_t outBytes = 0;
  349. UErrorCode uerr = U_ZERO_ERROR;
  350. UnicodeString uStrSearch;
  351. uStrSearch.setTo(_search, _srcLen);
  352. matcher->reset(uStrSearch);
  353. while (matcher->find())
  354. {
  355. uerr = U_ZERO_ERROR;
  356. int32_t start = matcher->start(uerr);
  357. if ((size32_t) start==_srcLen) break;
  358. int32_t end = matcher->end(uerr);
  359. int32_t numUChars = end - start;
  360. out.ensureAvailable(outBytes+numUChars*sizeof(UChar)+sizeof(size32_t));
  361. byte *outData = out.getbytes()+outBytes;
  362. * (size32_t *) outData = numUChars;
  363. uStrSearch.extract(start,numUChars,(UChar *) (outData+sizeof(size32_t)));
  364. outBytes += numUChars*sizeof(UChar) + sizeof(size32_t);
  365. }
  366. __isAllResult = false;
  367. __resultBytes = outBytes;
  368. __result = out.detachdata();
  369. }
  370. };
  371. //---------------------------------------------------------------------------
  372. ECLRTL_API ICompiledUStrRegExpr * rtlCreateCompiledUStrRegExpr(const UChar * regExpr, bool isCaseSensitive)
  373. {
  374. CCompiledUStrRegExpr * expr = new CCompiledUStrRegExpr(regExpr, isCaseSensitive);
  375. return expr;
  376. }
  377. ECLRTL_API void rtlDestroyCompiledUStrRegExpr(ICompiledUStrRegExpr * compiledExpr)
  378. {
  379. if (compiledExpr)
  380. delete (CCompiledUStrRegExpr*)compiledExpr;
  381. }
  382. ECLRTL_API void rtlDestroyUStrRegExprFindInstance(IUStrRegExprFindInstance * findInst)
  383. {
  384. if (findInst)
  385. delete (CUStrRegExprFindInstance*)findInst;
  386. }
  387. #else
  388. ECLRTL_API ICompiledUStrRegExpr * rtlCreateCompiledUStrRegExpr(const UChar * regExpr, bool isCaseSensitive)
  389. {
  390. rtlFail(0, "ICU regex disabled");
  391. }
  392. ECLRTL_API void rtlDestroyCompiledUStrRegExpr(ICompiledUStrRegExpr * compiledExpr)
  393. {
  394. }
  395. ECLRTL_API void rtlDestroyUStrRegExprFindInstance(IUStrRegExprFindInstance * findInst)
  396. {
  397. }
  398. #endif
  399. #else // _USE_BOOST_REGEX or _USE_C11_REGEX not set
  400. ECLRTL_API ICompiledStrRegExpr * rtlCreateCompiledStrRegExpr(const char * regExpr, bool isCaseSensitive)
  401. {
  402. rtlFail(0, "Boost/C++11 regex disabled");
  403. }
  404. ECLRTL_API void rtlDestroyCompiledStrRegExpr(ICompiledStrRegExpr * compiledExpr)
  405. {
  406. }
  407. ECLRTL_API void rtlDestroyStrRegExprFindInstance(IStrRegExprFindInstance * findInst)
  408. {
  409. }
  410. ECLRTL_API ICompiledUStrRegExpr * rtlCreateCompiledUStrRegExpr(const UChar * regExpr, bool isCaseSensitive)
  411. {
  412. rtlFail(0, "Boost/C++11 regex disabled");
  413. }
  414. ECLRTL_API void rtlDestroyCompiledUStrRegExpr(ICompiledUStrRegExpr * compiledExpr)
  415. {
  416. }
  417. ECLRTL_API void rtlDestroyUStrRegExprFindInstance(IUStrRegExprFindInstance * findInst)
  418. {
  419. }
  420. #endif // _USE_BOOST_REGEX or _USE_C11_REGEX