rtlqstr.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #include "limits.h"
  14. #include "platform.h"
  15. #include <math.h>
  16. #include <stdio.h>
  17. #include "jexcept.hpp"
  18. #include "jmisc.hpp"
  19. #include "jutil.hpp"
  20. #include "jlib.hpp"
  21. #include "jptree.hpp"
  22. #include "eclrtl.hpp"
  23. #include "rtlbcd.hpp"
  24. #include "unicode/uchar.h"
  25. #include "unicode/ucol.h"
  26. #include "unicode/ustring.h"
  27. #include "unicode/ucnv.h"
  28. #include "unicode/schriter.h"
  29. #include "unicode/regex.h"
  30. #include "unicode/normlzr.h"
  31. #include "unicode/locid.h"
  32. #include "jlog.hpp"
  33. #include "jmd5.hpp"
  34. //=============================================================================
  35. // Miscellaneous string functions...
  36. inline unsigned QStrLength(unsigned size) { return (size * 4) / 3; }
  37. inline unsigned QStrSize(unsigned length) { return (length + 1) * 3 / 4; }
  38. byte lastQStrByteMask(unsigned tlen)
  39. {
  40. switch (tlen & 3)
  41. {
  42. case 1:
  43. return 0xfc;
  44. case 2:
  45. return 0xf0;
  46. case 3:
  47. return 0xc0;
  48. }
  49. return 0xff;
  50. }
  51. inline byte expandQChar(byte c)
  52. {
  53. return ' ' + c;
  54. }
  55. #if 1
  56. static const char compressXlat[256] =
  57. {
  58. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x00
  59. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  60. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x10
  61. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  62. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, // 0x20
  63. 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  64. 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, // 0x30
  65. 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  66. 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, // 0x40
  67. 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
  68. 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, // 0x50
  69. 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
  70. 0x00, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, // 0x60
  71. 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
  72. 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, // 0x70
  73. 0x38, 0x39, 0x3a, 0x00, 0x00, 0x00, 0x00, 0x00,
  74. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x80
  75. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  76. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x90
  77. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  78. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xA0
  79. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  80. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xB0
  81. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  82. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xC0
  83. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  84. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xD0
  85. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  86. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xE0
  87. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  88. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xF0
  89. 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
  90. };
  91. #define compressQChar(c) compressXlat[(byte)c]
  92. #else
  93. inline byte compressQChar(byte c)
  94. {
  95. if (c > 0x20)
  96. {
  97. if (c < 0x60)
  98. return c - 0x20;
  99. if ((c >= 'a') && (c <= 'z'))
  100. return c - 0x40;
  101. }
  102. return 0;
  103. }
  104. #endif
  105. //---------------------------------------------------------------------------
  106. class QStrReader
  107. {
  108. public:
  109. QStrReader(const byte * _buffer) { buffer = _buffer; curLen = 0; offset = 0; }
  110. byte curQChar()
  111. {
  112. switch (curLen & 3)
  113. {
  114. case 0:
  115. return buffer[offset] >> 2;
  116. case 1:
  117. return ((buffer[offset] & 0x3) << 4) | (buffer[offset+1] >> 4);
  118. case 2:
  119. return ((buffer[offset] & 0xf) << 2) | (buffer[offset+1] >> 6);
  120. case 3:
  121. return (buffer[offset] & 0x3f);
  122. }
  123. return 0;
  124. }
  125. byte nextQChar()
  126. {
  127. byte c = curQChar();
  128. if ((curLen & 3) != 0)
  129. offset++;
  130. curLen++;
  131. return c;
  132. }
  133. byte prevQChar()
  134. {
  135. curLen--;
  136. if ((curLen & 3) != 0)
  137. offset--;
  138. return curQChar();
  139. }
  140. char nextChar()
  141. {
  142. return expandQChar(nextQChar());
  143. }
  144. inline void seek(unsigned pos)
  145. {
  146. curLen = pos;
  147. offset = (pos* 3)/4;
  148. }
  149. protected:
  150. const byte * buffer;
  151. unsigned curLen;
  152. unsigned offset;
  153. };
  154. //---------------------------------------------------------------------------
  155. class QStrBuilder
  156. {
  157. public:
  158. QStrBuilder(void * _buffer) { buffer = (byte *)_buffer; curLen = 0; pending = 0; }
  159. void appendChar(char next)
  160. {
  161. appendQChar(compressQChar(next));
  162. }
  163. void appendCharN(unsigned len, char next)
  164. {
  165. byte c = compressQChar(next);
  166. while (len--)
  167. appendQChar(c);
  168. }
  169. void appendQStr(unsigned len, const char * text)
  170. {
  171. QStrReader reader((const byte *)text);
  172. while (len--)
  173. appendQChar(reader.nextQChar());
  174. }
  175. void appendStr(unsigned len, const char * text)
  176. {
  177. while (len--)
  178. appendChar(*text++);
  179. }
  180. void appendQChar(byte c)
  181. {
  182. switch (curLen & 3)
  183. {
  184. case 0:
  185. pending = c << 2;
  186. break;
  187. case 1:
  188. *buffer++ = pending | (c >> 4);
  189. pending = c << 4;
  190. break;
  191. case 2:
  192. *buffer++ = pending | (c >> 2);
  193. pending = c << 6;
  194. break;
  195. case 3:
  196. *buffer++ = pending | c;
  197. pending = 0;
  198. break;
  199. }
  200. curLen++;
  201. }
  202. void finish(unsigned max, byte fill)
  203. {
  204. while (curLen < max)
  205. appendQChar(fill & 0x3F);
  206. //force a final character to be output, but never writes too many.
  207. appendQChar(fill & 0x3F);
  208. //curLen is now undefined.
  209. }
  210. protected:
  211. byte * buffer;
  212. unsigned curLen;
  213. byte pending;
  214. };
  215. //=============================================================================
  216. void copyQStrRange(unsigned tlen, char * tgt, const char * src, unsigned from, unsigned to)
  217. {
  218. unsigned copylen = to - from;
  219. if ((from & 3) == 0)
  220. {
  221. //can index the qstring directly...
  222. rtlQStrToQStr(tlen, tgt, copylen, src+QStrSize(from));
  223. //make sure the contents are in canonical format
  224. if ((copylen & 3) != 0)
  225. {
  226. unsigned copysize = QStrSize(copylen);
  227. tgt[copysize-1] &= lastQStrByteMask(copylen);
  228. }
  229. }
  230. else if (copylen == 0)
  231. {
  232. memset(tgt, 0, QStrSize(tlen));
  233. }
  234. else
  235. {
  236. //More: Could implement this cleverly by shifting and copying, but not worth it at the moment
  237. unsigned tempSrcLen;
  238. char * tempSrcPtr;
  239. rtlQStrToStrX(tempSrcLen, tempSrcPtr, from+copylen, src);
  240. rtlStrToQStr(tlen, tgt, copylen, tempSrcPtr+from);
  241. rtlFree(tempSrcPtr);
  242. }
  243. }
  244. //-----------------------------------------------------------------------------
  245. unsigned rtlQStrLength(unsigned size) { return QStrLength(size); }
  246. unsigned rtlQStrSize(unsigned length) { return QStrSize(length); }
  247. unsigned rtlTrimQStrLen(size32_t l, const char * t)
  248. {
  249. QStrReader reader((const byte *)t);
  250. reader.seek(l);
  251. while (l && (reader.prevQChar() == 0))
  252. l--;
  253. return l;
  254. }
  255. void rtlStrToQStr(size32_t outlen, char * out, size32_t inlen, const void *in)
  256. {
  257. unsigned outSize = QStrSize(outlen);
  258. if (inlen >= outlen)
  259. inlen = outlen;
  260. else
  261. {
  262. size32_t size = QStrSize(inlen);
  263. memset(out+size, 0, outSize-size);
  264. }
  265. byte * curIn = (byte *)in;
  266. byte * endIn = curIn + inlen;
  267. byte * curOut = (byte *)out;
  268. while ((endIn-curIn)>=4)
  269. {
  270. byte c0 = compressQChar(curIn[0]);
  271. byte c1 = compressQChar(curIn[1]);
  272. byte c2 = compressQChar(curIn[2]);
  273. byte c3 = compressQChar(curIn[3]);
  274. curOut[0] = (c0 << 2) | (c1 >> 4);
  275. curOut[1] = (c1 << 4) | (c2 >> 2);
  276. curOut[2] = (c2 << 6) | c3;
  277. curIn += 4;
  278. curOut += 3;
  279. }
  280. byte c0;
  281. byte c1 = 0;
  282. byte c2 = 0;
  283. switch (endIn - curIn)
  284. {
  285. case 3:
  286. c2 = compressQChar(curIn[2]);
  287. curOut[2] = (c2 << 6);
  288. //fallthrough
  289. case 2:
  290. c1 = compressQChar(curIn[1]);
  291. curOut[1] = (c1 << 4) | (c2 >> 2);
  292. //fall through
  293. case 1:
  294. c0 = compressQChar(curIn[0]);
  295. curOut[0] = (c0 << 2) | (c1 >> 4);
  296. break;
  297. case 0:
  298. break;
  299. default:
  300. UNIMPLEMENTED;
  301. }
  302. }
  303. void rtlStrToQStrX(size32_t & outlen, char * & out, size32_t inlen, const void *in)
  304. {
  305. outlen = inlen;
  306. out = (char *)malloc(QStrSize(inlen));
  307. rtlStrToQStr(inlen, out, inlen, in);
  308. }
  309. void rtlStrToQStrNX(size32_t & outlen, char * & out, size32_t inlen, const void * in, size32_t logicalLength)
  310. {
  311. outlen = logicalLength;
  312. out = (char *)malloc(QStrSize(logicalLength));
  313. rtlStrToQStr(logicalLength, out, inlen, in);
  314. }
  315. void rtlQStrToData(size32_t outlen, void * out, size32_t inlen, const char *in)
  316. {
  317. if (inlen >= outlen)
  318. inlen = outlen;
  319. else
  320. memset((char *)out+inlen, 0, outlen-inlen);
  321. rtlQStrToStr(inlen, (char *)out, inlen, in);
  322. }
  323. void rtlQStrToDataX(size32_t & outlen, void * & out, size32_t inlen, const char *in)
  324. {
  325. outlen = inlen;
  326. out = (char *)malloc(inlen);
  327. rtlQStrToStr(inlen, (char *)out, inlen, in);
  328. }
  329. void rtlQStrToVStr(size32_t outlen, char * out, size32_t inlen, const char *in)
  330. {
  331. out[--outlen] = 0;
  332. if (inlen >= outlen)
  333. inlen = outlen;
  334. else
  335. memset((char *)out+inlen, 0, outlen-inlen);
  336. rtlQStrToStr(inlen, out, inlen, in);
  337. }
  338. //NB: Need to be careful when expanding qstring3 to string3, that 4 bytes aren't written.
  339. void rtlQStrToStr(size32_t outlen, char * out, size32_t inlen, const char * in)
  340. {
  341. if (inlen < outlen)
  342. {
  343. memset(out+inlen, ' ', outlen-inlen);
  344. outlen = inlen;
  345. }
  346. const byte * curIn = (const byte *)in;
  347. byte * curOut = (byte *)out;
  348. byte * endOut = curOut + outlen;
  349. while ((endOut-curOut)>=4)
  350. {
  351. byte c0 = curIn[0];
  352. byte c1 = curIn[1];
  353. byte c2 = curIn[2];
  354. curOut[0] = expandQChar(c0 >> 2);
  355. curOut[1] = expandQChar(((c0 & 0x3) << 4) | (c1 >> 4));
  356. curOut[2] = expandQChar(((c1 & 0xF) << 2) | (c2 >> 6));
  357. curOut[3] = expandQChar(c2 & 0x3F);
  358. curIn += 3;
  359. curOut += 4;
  360. }
  361. switch (endOut - curOut)
  362. {
  363. case 3:
  364. curOut[2] = expandQChar(((curIn[1] & 0xF) << 2) | (curIn[2] >> 6));
  365. //fallthrough
  366. case 2:
  367. curOut[1] = expandQChar(((curIn[0] & 0x3) << 4) | (curIn[1] >> 4));
  368. //fallthrough
  369. case 1:
  370. curOut[0] = expandQChar(curIn[0] >> 2);
  371. break;
  372. case 0:
  373. break;
  374. default:
  375. UNIMPLEMENTED;
  376. }
  377. }
  378. void rtlQStrToStrX(size32_t & outlen, char * & out, size32_t inlen, const char *in)
  379. {
  380. outlen = inlen;
  381. out = (char *)malloc(inlen);
  382. rtlQStrToStr(inlen, out, inlen, in);
  383. }
  384. void rtlQStrToQStr(size32_t outlen, char * out, size32_t inlen, const char * in)
  385. {
  386. size32_t inSize = QStrSize(inlen);
  387. size32_t outSize = QStrSize(outlen);
  388. if (inSize >= outSize)
  389. memcpy(out, in, outSize);
  390. else
  391. {
  392. memcpy(out, in, inSize);
  393. memset(out+inSize, 0, outSize-inSize);
  394. }
  395. }
  396. void rtlQStrToQStrX(unsigned & outlen, char * & out, unsigned inlen, const char * in)
  397. {
  398. size32_t inSize = QStrSize(inlen);
  399. char * data = (char *)malloc(inSize);
  400. memcpy(data, in, inSize);
  401. outlen = inlen;
  402. out = data;
  403. }
  404. int rtlCompareQStrQStr(size32_t llen, const void * left, size32_t rlen, const void * right)
  405. {
  406. size32_t lsize = QStrSize(llen);
  407. size32_t rsize = QStrSize(rlen);
  408. if (lsize < rsize)
  409. {
  410. int ret = memcmp(left, right, lsize);
  411. if (ret == 0)
  412. {
  413. const byte * r = (const byte *)right;
  414. while (lsize < rsize)
  415. {
  416. if (r[lsize])
  417. return -1;
  418. lsize++;
  419. }
  420. }
  421. return ret;
  422. }
  423. int ret = memcmp(left, right, rsize);
  424. if (ret == 0)
  425. {
  426. const byte * l = (const byte *)left;
  427. while (lsize > rsize)
  428. {
  429. if (l[rsize])
  430. return +1;
  431. rsize++;
  432. }
  433. }
  434. return ret;
  435. }
  436. void rtlDecPushQStr(size32_t len, const void * data)
  437. {
  438. char * strData = (char *)alloca(len);
  439. rtlQStrToStr(len, strData, len, (const char *)data);
  440. DecPushString(len, strData);
  441. }
  442. bool rtlQStrToBool(size32_t inlen, const char *in)
  443. {
  444. unsigned size = QStrSize(inlen);
  445. while (size--)
  446. if (in[size])
  447. return true;
  448. return false;
  449. }
  450. //---------------------------------------------------------------------------
  451. ECLRTL_API void rtlCreateQStrRange(size32_t & outlen, char * & out, unsigned fieldLen, unsigned compareLen, size32_t len, const char * qstr, byte fill)
  452. {
  453. //NB: Keep in sync with rtlCreateRange()
  454. if (compareLen > fieldLen)
  455. {
  456. if ((int)compareLen >= 0)
  457. {
  458. //x[1..m] = y, m is larger than fieldLen, so truncate to fieldLen
  459. compareLen = fieldLen;
  460. }
  461. else
  462. compareLen = 0; // probably m[1..-1] or something silly
  463. }
  464. //y has been trimmed when this function is called. If y is longer than field length, then it is never going to match
  465. //so change the search range to FF,FF,FF .. 00.00.00 which will then never match.
  466. if (len > fieldLen)
  467. {
  468. compareLen = 0;
  469. fill = (fill == 0) ? 255 : 0;
  470. }
  471. outlen = fieldLen;
  472. out = (char *)malloc(QStrSize(fieldLen));
  473. QStrBuilder builder(out);
  474. if (len >= compareLen)
  475. builder.appendQStr(compareLen, qstr);
  476. else
  477. {
  478. builder.appendQStr(len, qstr);
  479. builder.appendCharN(compareLen-len, ' ');
  480. }
  481. builder.finish(fieldLen, fill);
  482. }
  483. ECLRTL_API void rtlCreateQStrRangeLow(size32_t & outlen, char * & out, unsigned fieldLen, unsigned compareLen, size32_t len, const char * qstr)
  484. {
  485. len = rtlTrimQStrLen(len, qstr);
  486. rtlCreateQStrRange(outlen, out, fieldLen, compareLen, len, qstr, 0);
  487. }
  488. ECLRTL_API void rtlCreateQStrRangeHigh(size32_t & outlen, char * & out, unsigned fieldLen, unsigned compareLen, size32_t len, const char * qstr)
  489. {
  490. len = rtlTrimQStrLen(len, qstr);
  491. rtlCreateQStrRange(outlen, out, fieldLen, compareLen, len, qstr, 255);
  492. }
  493. void serializeQStrX(size32_t len, const char * data, MemoryBuffer &out)
  494. {
  495. out.append(len).append(QStrSize(len), data);
  496. }
  497. void deserializeQStrX(size32_t & len, char * & data, MemoryBuffer &in)
  498. {
  499. free(data);
  500. in.read(sizeof(len), &len);
  501. unsigned size = QStrSize(len);
  502. data = (char *)malloc(size);
  503. in.read(size, data);
  504. }