junicode.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625
  1. /*##############################################################################
  2. Copyright (C) 2011 HPCC Systems.
  3. All rights reserved. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU Affero General Public License as
  5. published by the Free Software Foundation, either version 3 of the
  6. License, or (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU Affero General Public License for more details.
  11. You should have received a copy of the GNU Affero General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. ############################################################################## */
  14. #include "platform.h"
  15. #include "jliball.hpp"
  16. #include "jerror.hpp"
  17. #include "junicode.hpp"
/* Based on code extracted from the following source... Changed quite significantly */
  19. /*
  20. * Copyright 2001 Unicode, Inc.
  21. *
  22. * Disclaimer
  23. *
  24. * This source code is provided as is by Unicode, Inc. No claims are
  25. * made as to fitness for any particular purpose. No warranties of any
  26. * kind are expressed or implied. The recipient agrees to determine
  27. * applicability of information provided. If this file has been
  28. * purchased on magnetic or optical media from Unicode, Inc., the
  29. * sole remedy for any claim will be exchange of defective media
  30. * within 90 days of receipt.
  31. *
  32. * Limitations on Rights to Redistribute This Code
  33. *
  34. * Unicode, Inc. hereby grants the right to freely use the information
  35. * supplied in this file in the creation of products supporting the
  36. * Unicode Standard, and to make copies of this file in any form
  37. * for internal or external distribution as long as this notice
  38. * remains attached.
  39. */
  40. //----------------------------------------------------------------------------
  41. static const int halfShift = 10; /* used for shifting by 10 bits */
  42. static const UTF32 halfBase = 0x0010000UL;
  43. static const UTF32 halfMask = 0x3FFUL;
  44. #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
  45. #define UNI_MAX_BMP (UTF32)0x0000FFFF
  46. #define UNI_MAX_UTF16 (UTF32)0x0010FFFF
  47. #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
  48. #define UNI_SUR_HIGH_START (UTF32)0xD800
  49. #define UNI_SUR_HIGH_END (UTF32)0xDBFF
  50. #define UNI_SUR_LOW_START (UTF32)0xDC00
  51. #define UNI_SUR_LOW_END (UTF32)0xDFFF
  52. UTF32 UtfReader::next()
  53. {
  54. switch (type)
  55. {
  56. case Utf8: return next8();
  57. case Utf16le: return next16le();
  58. case Utf16be: return next16be();
  59. case Utf32le: return next32le();
  60. case Utf32be: return next32be();
  61. }
  62. UNIMPLEMENTED;
  63. }
  64. size32_t UtfReader::getLegalLength()
  65. {
  66. const byte * saved = cur;
  67. while (next() < errorLowerLimit)
  68. {
  69. }
  70. size32_t ret = (size32_t)(cur-saved);
  71. cur = saved;
  72. return ret;
  73. }
  74. //---------------------------------------------------------------------------
  75. UTF32 UtfReader::next32le()
  76. {
  77. if (end - cur < 4) return sourceExhausted;
  78. UTF32 ch = *(UTF32 *)cur;
  79. if (strictConversion && ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_LOW_END)))
  80. return sourceIllegal;
  81. cur += sizeof(UTF32);
  82. return ch;
  83. }
  84. //---------------------------------------------------------------------------
  85. UTF32 UtfReader::next32be()
  86. {
  87. if (end - cur < 4) return sourceExhausted;
  88. UTF32 ch;
  89. _cpyrev4(&ch, cur);
  90. if (strictConversion && ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_LOW_END)))
  91. return sourceIllegal;
  92. cur += sizeof(UTF32);
  93. return ch;
  94. }
  95. //---------------------------------------------------------------------------
  96. UTF32 UtfReader::next16le()
  97. {
  98. if (end - cur < 2) return sourceExhausted;
  99. const byte * source = cur;
  100. UTF32 ch = source[0] | (source[1] << 8);
  101. source += 2;
  102. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
  103. {
  104. if (end - cur < 2)
  105. return sourceExhausted;
  106. UTF32 ch2 = source[0] | (source[1] << 8);
  107. source += 2;
  108. if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
  109. {
  110. ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
  111. + (ch2 - UNI_SUR_LOW_START) + halfBase;
  112. }
  113. else if (strictConversion) /* it's an unpaired high surrogate */
  114. return sourceIllegal;
  115. }
  116. else if ((strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END))
  117. return sourceIllegal;
  118. cur = (const byte *)source;
  119. return ch;
  120. }
  121. //---------------------------------------------------------------------------
  122. UTF32 UtfReader::next16be()
  123. {
  124. if (end - cur < 2) return sourceExhausted;
  125. const byte * source = cur;
  126. UTF32 ch = (source[0] << 8) | source[1];
  127. source += 2;
  128. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
  129. {
  130. if (end - cur < 2)
  131. return sourceExhausted;
  132. UTF32 ch2 = (source[0] << 8) | source[1];
  133. source += 2;
  134. if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
  135. {
  136. ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
  137. + (ch2 - UNI_SUR_LOW_START) + halfBase;
  138. }
  139. else if (strictConversion) /* it's an unpaired high surrogate */
  140. return sourceIllegal;
  141. }
  142. else if ((strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END))
  143. return sourceIllegal;
  144. cur = source;
  145. return ch;
  146. }
  147. //---------------------------------------------------------------------------
  148. //This is probably faster than a table lookup on modern processors since it would avoid a cache hit.
  149. //Especially because first branch is the most common.
  150. inline unsigned getTrailingBytesForUTF8(byte value)
  151. {
  152. if (value < 0xc0)
  153. return 0;
  154. if (value < 0xe0)
  155. return 1;
  156. if (value < 0xf0)
  157. return 2;
  158. if (value < 0xf8)
  159. return 3;
  160. if (value < 0xfc)
  161. return 4;
  162. return 5;
  163. }
  164. /*
  165. * Magic values subtracted from a buffer value during UTF8 conversion.
  166. * This table contains as many values as there might be trailing bytes
  167. * in a UTF-8 sequence.
  168. */
  169. static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
  170. 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
/*
 * Note on isLegalUTF8() (defined further down in this file):
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 * It must be called with the length pre-determined by the first byte.
 * If not calling it from one of the UTF-8 readers, the length can be set by:
 *     length = getTrailingBytesForUTF8(*source)+1;
 * and the sequence is illegal right away if there aren't that many bytes
 * available.
 * If presented with a length > 4, it returns false. The Unicode
 * definition of UTF-8 goes up to 4-byte sequences.
 */
  181. unsigned readUtf8Size(const void * _data)
  182. {
  183. const byte * ptr = (const byte *)_data;
  184. return getTrailingBytesForUTF8(*ptr)+1;
  185. }
  186. UTF32 readUtf8Char(const void * _data)
  187. {
  188. const byte * ptr = (const byte *)_data;
  189. unsigned short extraBytesToRead = getTrailingBytesForUTF8(*ptr);
  190. UTF32 ch = 0;
  191. switch (extraBytesToRead) {
  192. case 3: ch += *ptr++; ch <<= 6;
  193. case 2: ch += *ptr++; ch <<= 6;
  194. case 1: ch += *ptr++; ch <<= 6;
  195. case 0: ch += *ptr++;
  196. }
  197. return ch - offsetsFromUTF8[extraBytesToRead];
  198. }
  199. inline bool isLegalUTF8(const UTF8 *source, unsigned length)
  200. {
  201. UTF8 a;
  202. const UTF8 *srcptr = source+length;
  203. switch (length)
  204. {
  205. default: return false;
  206. /* Everything else falls through when "true"... */
  207. case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  208. case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  209. case 2: if ((a = (*--srcptr)) > 0xBF) return false;
  210. switch (*source)
  211. {
  212. /* no fall-through in this inner switch */
  213. case 0xE0: if (a < 0xA0) return false; break;
  214. case 0xF0: if (a < 0x90) return false; break;
  215. case 0xF4: if (a > 0x8F) return false; break;
  216. default: if (a < 0x80) return false;
  217. }
  218. case 1: if (*source >= 0x80 && *source < 0xC2) return false;
  219. if (*source > 0xF4) return false;
  220. }
  221. return true;
  222. }
  223. /* --------------------------------------------------------------------- */
  224. UTF32 UtfReader::next8()
  225. {
  226. const UTF8* source = (const UTF8*)cur;
  227. if (source >= end) return sourceExhausted;
  228. unsigned short extraBytesToRead = getTrailingBytesForUTF8(*source);
  229. if (source + extraBytesToRead >= end)
  230. return sourceExhausted;
  231. /* Do this check whether lenient or strict */
  232. if (! isLegalUTF8(source, extraBytesToRead+1))
  233. return sourceIllegal;
  234. /*
  235. * The cases all fall through. See "Note A" below.
  236. */
  237. UTF32 ch = 0;
  238. switch (extraBytesToRead) {
  239. case 3: ch += *source++; ch <<= 6;
  240. case 2: ch += *source++; ch <<= 6;
  241. case 1: ch += *source++; ch <<= 6;
  242. case 0: ch += *source++;
  243. }
  244. cur = (const byte *)source;
  245. return ch - offsetsFromUTF8[extraBytesToRead];
  246. }
  247. //---------------------------------------------------------------------------
  248. UTF32 readUtf8Character(unsigned len, const byte * & cur)
  249. {
  250. const UTF8* source = (const UTF8*)cur;
  251. if (len == 0) return sourceExhausted;
  252. unsigned short extraBytesToRead = getTrailingBytesForUTF8(*source);
  253. if (extraBytesToRead >= len)
  254. return sourceExhausted;
  255. /* Do this check whether lenient or strict */
  256. if (! isLegalUTF8(source, extraBytesToRead+1))
  257. return sourceIllegal;
  258. /*
  259. * The cases all fall through. See "Note A" below.
  260. */
  261. UTF32 ch = 0;
  262. switch (extraBytesToRead) {
  263. case 3: ch += *source++; ch <<= 6;
  264. case 2: ch += *source++; ch <<= 6;
  265. case 1: ch += *source++; ch <<= 6;
  266. case 0: ch += *source++;
  267. }
  268. cur = (const byte *)source;
  269. return ch - offsetsFromUTF8[extraBytesToRead];
  270. }
  271. /*
  272. * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
  273. * into the first byte, depending on how many bytes follow. There are
  274. * as many entries in this table as there are UTF-8 sequence types.
  275. * (I.e., one byte sequence, two byte... six byte sequence.)
  276. */
  277. static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  278. static const UTF32 byteMask = 0xBF;
  279. static const UTF32 byteMark = 0x80;
  280. unsigned writeUtf8(void * vtarget, unsigned maxLength, UTF32 ch)
  281. {
  282. unsigned short bytesToWrite;
  283. /* Figure out how many bytes the result will require */
  284. if (ch < (UTF32)0x80)
  285. bytesToWrite = 1;
  286. else if (ch < (UTF32)0x800)
  287. bytesToWrite = 2;
  288. else if (ch < (UTF32)0x10000)
  289. bytesToWrite = 3;
  290. else if (ch < (UTF32)0x200000)
  291. bytesToWrite = 4;
  292. else {
  293. bytesToWrite = 2;
  294. ch = UNI_REPLACEMENT_CHAR;
  295. }
  296. if (bytesToWrite > maxLength)
  297. return 0;
  298. UTF8 * target = (UTF8 *)vtarget + bytesToWrite;
  299. switch (bytesToWrite) { /* note: everything falls through. */
  300. case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6;
  301. case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6;
  302. case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6;
  303. case 1: *--target = ch | firstByteMark[bytesToWrite];
  304. }
  305. return bytesToWrite;
  306. }
  307. unsigned writeUtf16le(void * vtarget, unsigned maxLength, UTF32 ch)
  308. {
  309. if (maxLength < 2)
  310. return 0;
  311. UTF16 * target = (UTF16 *)vtarget;
  312. if (ch <= UNI_MAX_BMP)
  313. {
  314. /* Target is a character <= 0xFFFF */
  315. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
  316. ch = UNI_REPLACEMENT_CHAR;
  317. *target = ch; /* normal case */
  318. return 2;
  319. }
  320. if (ch > UNI_MAX_UTF16)
  321. {
  322. *target = UNI_REPLACEMENT_CHAR;
  323. return 2;
  324. }
  325. /* target is a character in range 0xFFFF - 0x10FFFF. */
  326. if (maxLength < 4)
  327. return 0;
  328. ch -= halfBase;
  329. target[0] = (ch >> halfShift) + UNI_SUR_HIGH_START;
  330. target[1] = (ch & halfMask) + UNI_SUR_LOW_START;
  331. return 4;
  332. }
  333. unsigned writeUtf16be(void * vtarget, unsigned maxLength, UTF32 ch)
  334. {
  335. if (maxLength < 2)
  336. return 0;
  337. UTF16 * target = (UTF16 *)vtarget;
  338. UTF16 temp;
  339. if (ch <= UNI_MAX_BMP)
  340. {
  341. /* Target is a character <= 0xFFFF */
  342. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
  343. ch = UNI_REPLACEMENT_CHAR;
  344. temp = ch;
  345. _cpyrev2(target, &temp); /* normal case */
  346. return 2;
  347. }
  348. if (ch > UNI_MAX_UTF16)
  349. {
  350. temp = UNI_REPLACEMENT_CHAR;
  351. _cpyrev2(target, &temp);
  352. return 2;
  353. }
  354. /* target is a character in range 0xFFFF - 0x10FFFF. */
  355. if (maxLength < 4)
  356. return 0;
  357. ch -= halfBase;
  358. temp = (ch >> halfShift) + UNI_SUR_HIGH_START;
  359. _cpyrev2(target, &temp);
  360. temp = (ch & halfMask) + UNI_SUR_LOW_START;
  361. _cpyrev2(target+1, &temp);
  362. return 4;
  363. }
  364. unsigned writeUtf32le(void * vtarget, unsigned maxLength, UTF32 ch)
  365. {
  366. if (maxLength < 4) return 0;
  367. *(UTF32 *)vtarget = ch;
  368. return 4;
  369. }
  370. unsigned writeUtf32be(void * vtarget, unsigned maxLength, UTF32 ch)
  371. {
  372. if (maxLength < 4) return 0;
  373. _cpyrev4(vtarget, &ch);
  374. return 4;
  375. }
  376. //---------------------------------------------------------------------------
  377. MemoryBuffer & appendUtf8(MemoryBuffer & out, UTF32 value)
  378. {
  379. char temp[4];
  380. return out.append(writeUtf8(temp, sizeof(temp), value), temp);
  381. }
  382. MemoryBuffer & appendUtf16le(MemoryBuffer & out, UTF32 value)
  383. {
  384. char temp[4];
  385. return out.append(writeUtf16le(temp, sizeof(temp), value), temp);
  386. }
  387. MemoryBuffer & appendUtf16be(MemoryBuffer & out, UTF32 value)
  388. {
  389. char temp[4];
  390. return out.append(writeUtf16be(temp, sizeof(temp), value), temp);
  391. }
  392. MemoryBuffer & appendUtf32le(MemoryBuffer & out, UTF32 value)
  393. {
  394. char temp[4];
  395. return out.append(writeUtf32le(temp, sizeof(temp), value), temp);
  396. }
  397. MemoryBuffer & appendUtf32be(MemoryBuffer & out, UTF32 value)
  398. {
  399. char temp[4];
  400. return out.append(writeUtf32be(temp, sizeof(temp), value), temp);
  401. }
  402. /* ---------------------------------------------------------------------
  403. Note A.
  404. The fall-through switches in UTF-8 reading code save a
  405. temp variable, some decrements & conditionals. The switches
  406. are equivalent to the following loop:
  407. {
  408. int tmpBytesToRead = extraBytesToRead+1;
  409. do {
  410. ch += *source++;
  411. --tmpBytesToRead;
  412. if (tmpBytesToRead) ch <<= 6;
  413. } while (tmpBytesToRead > 0);
  414. }
  415. In UTF-8 writing code, the switches on "bytesToWrite" are
  416. similarly unrolled loops.
  417. --------------------------------------------------------------------- */
  418. bool convertUtf(MemoryBuffer & target, UtfReader::UtfFormat targetType, unsigned sourceLength, const void * source, UtfReader::UtfFormat sourceType)
  419. {
  420. UtfReader input(sourceType, false);
  421. input.set(sourceLength, source);
  422. unsigned originalLength = target.length();
  423. loop
  424. {
  425. UTF32 next = input.next();
  426. if (next == sourceExhausted)
  427. return true;
  428. if (next == sourceIllegal)
  429. {
  430. target.setLength(originalLength);
  431. return false;
  432. }
  433. switch (targetType)
  434. {
  435. case UtfReader::Utf8: appendUtf8(target, next); break;
  436. case UtfReader::Utf16le: appendUtf16le(target, next); break;
  437. case UtfReader::Utf16be: appendUtf16be(target, next); break;
  438. case UtfReader::Utf32le: appendUtf32le(target, next); break;
  439. case UtfReader::Utf32be: appendUtf32be(target, next); break;
  440. }
  441. }
  442. }
  443. bool convertToUtf8(MemoryBuffer & target, unsigned sourceLength, const void * source)
  444. {
  445. if (sourceLength < 2)
  446. return false;
  447. const byte * text = (const byte *)source;
  448. //check for leading BOM of 0xfeff in the appropriate encoding
  449. if ((text[0] == 0xfe) && (text[1] == 0xff))
  450. return convertUtf(target, UtfReader::Utf8, sourceLength-2, text+2, UtfReader::Utf16be);
  451. if ((text[0] == 0xff) && (text[1] == 0xfe))
  452. {
  453. if (sourceLength >= 4 && (text[2] == 0) && (text[3] == 0))
  454. return convertUtf(target, UtfReader::Utf8, sourceLength-4, text+4, UtfReader::Utf32le);
  455. return convertUtf(target, UtfReader::Utf8, sourceLength-2, text+2, UtfReader::Utf16le);
  456. }
  457. if (sourceLength > 4 && (text[0] == 0) && (text[1] == 0) && (text[2] == 0xfe) && (text[3] == 0xff))
  458. return convertUtf(target, UtfReader::Utf8, sourceLength-4, text+4, UtfReader::Utf32be);
  459. //Try and guess the format
  460. if (text[0] && !text[1])
  461. {
  462. if (text[2])
  463. {
  464. if (convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf16le))
  465. return true;
  466. }
  467. else
  468. {
  469. if (convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf32le))
  470. return true;
  471. }
  472. }
  473. else if (!text[0])
  474. {
  475. if (text[1])
  476. {
  477. if (convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf16be))
  478. return true;
  479. }
  480. else
  481. {
  482. if (convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf32be))
  483. return true;
  484. }
  485. }
  486. //No idea first one that matches wins!
  487. return
  488. convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf16le) ||
  489. convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf16be) ||
  490. convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf32le) ||
  491. convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf32be);
  492. }
  493. //---------------------------------------------------------------------------
  494. void addUtfActionList(StringMatcher & matcher, const char * text, unsigned action, unsigned * maxElementLength, UtfReader::UtfFormat utfFormat)
  495. {
  496. if (!text)
  497. return;
  498. unsigned idx=0;
  499. while (*text)
  500. {
  501. StringBuffer str;
  502. while (*text)
  503. {
  504. char next = *text++;
  505. if (next == ',')
  506. break;
  507. if (next == '\\' && *text)
  508. {
  509. next = *text++;
  510. switch (next)
  511. {
  512. case 'r': next = '\r'; break;
  513. case 'n': next = '\n'; break;
  514. case 't': next = '\t'; break;
  515. case 'x':
  516. //hex constant - at least we can define spaces then...
  517. if (text[0] && text[1])
  518. {
  519. next = (hex2num(*text) << 4) | hex2num(text[1]);
  520. text+=2;
  521. }
  522. break;
  523. default:
  524. break; //otherwise \ just quotes the character e.g. \,
  525. }
  526. }
  527. str.append(next);
  528. }
  529. if (str.length())
  530. {
  531. MemoryBuffer converted;
  532. if (!convertUtf(converted, utfFormat, str.length(), str.str(), UtfReader::Utf8))
  533. throwError(JLIBERR_BadUtf8InArguments);
  534. matcher.queryAddEntry(converted.length(), converted.toByteArray(), action+(idx++<<8));
  535. if (maxElementLength && (converted.length() > *maxElementLength))
  536. *maxElementLength = converted.length();
  537. }
  538. }
  539. }