junicode.cpp 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711
  1. /*##############################################################################
  2. Copyright (C) 2011 HPCC Systems.
  3. All rights reserved. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU Affero General Public License as
  5. published by the Free Software Foundation, either version 3 of the
  6. License, or (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU Affero General Public License for more details.
  11. You should have received a copy of the GNU Affero General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. ############################################################################## */
  14. #include "platform.h"
  15. #include "jliball.hpp"
  16. #include "jerror.hpp"
  17. #include "junicode.hpp"
  /* Based on code extracted from the following source... Changed quite significantly */
  19. /*
  20. * Copyright 2001 Unicode, Inc.
  21. *
  22. * Disclaimer
  23. *
  24. * This source code is provided as is by Unicode, Inc. No claims are
  25. * made as to fitness for any particular purpose. No warranties of any
  26. * kind are expressed or implied. The recipient agrees to determine
  27. * applicability of information provided. If this file has been
  28. * purchased on magnetic or optical media from Unicode, Inc., the
  29. * sole remedy for any claim will be exchange of defective media
  30. * within 90 days of receipt.
  31. *
  32. * Limitations on Rights to Redistribute This Code
  33. *
  34. * Unicode, Inc. hereby grants the right to freely use the information
  35. * supplied in this file in the creation of products supporting the
  36. * Unicode Standard, and to make copies of this file in any form
  37. * for internal or external distribution as long as this notice
  38. * remains attached.
  39. */
  40. //----------------------------------------------------------------------------
  41. static const int halfShift = 10; /* used for shifting by 10 bits */
  42. static const UTF32 halfBase = 0x0010000UL;
  43. static const UTF32 halfMask = 0x3FFUL;
  44. #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
  45. #define UNI_MAX_BMP (UTF32)0x0000FFFF
  46. #define UNI_MAX_UTF16 (UTF32)0x0010FFFF
  47. #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
  48. #define UNI_SUR_HIGH_START (UTF32)0xD800
  49. #define UNI_SUR_HIGH_END (UTF32)0xDBFF
  50. #define UNI_SUR_LOW_START (UTF32)0xDC00
  51. #define UNI_SUR_LOW_END (UTF32)0xDFFF
  52. UTF32 UtfReader::next()
  53. {
  54. switch (type)
  55. {
  56. case Utf8: return next8();
  57. case Utf16le: return next16le();
  58. case Utf16be: return next16be();
  59. case Utf32le: return next32le();
  60. case Utf32be: return next32be();
  61. }
  62. UNIMPLEMENTED;
  63. }
  64. size32_t UtfReader::getLegalLength()
  65. {
  66. const byte * saved = cur;
  67. while (next() < errorLowerLimit)
  68. {
  69. }
  70. size32_t ret = (size32_t)(cur-saved);
  71. cur = saved;
  72. return ret;
  73. }
  74. //---------------------------------------------------------------------------
  75. UTF32 UtfReader::next32le()
  76. {
  77. if (end - cur < 4) return sourceExhausted;
  78. UTF32 ch = *(UTF32 *)cur;
  79. if (strictConversion && ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_LOW_END)))
  80. return sourceIllegal;
  81. cur += sizeof(UTF32);
  82. return ch;
  83. }
  84. //---------------------------------------------------------------------------
  85. UTF32 UtfReader::next32be()
  86. {
  87. if (end - cur < 4) return sourceExhausted;
  88. UTF32 ch;
  89. _cpyrev4(&ch, cur);
  90. if (strictConversion && ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_LOW_END)))
  91. return sourceIllegal;
  92. cur += sizeof(UTF32);
  93. return ch;
  94. }
  95. //---------------------------------------------------------------------------
  96. UTF32 UtfReader::next16le()
  97. {
  98. if (end - cur < 2) return sourceExhausted;
  99. const byte * source = cur;
  100. UTF32 ch = source[0] | (source[1] << 8);
  101. source += 2;
  102. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
  103. {
  104. if (end - cur < 2)
  105. return sourceExhausted;
  106. UTF32 ch2 = source[0] | (source[1] << 8);
  107. source += 2;
  108. if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
  109. {
  110. ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
  111. + (ch2 - UNI_SUR_LOW_START) + halfBase;
  112. }
  113. else if (strictConversion) /* it's an unpaired high surrogate */
  114. return sourceIllegal;
  115. }
  116. else if ((strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END))
  117. return sourceIllegal;
  118. cur = (const byte *)source;
  119. return ch;
  120. }
  121. //---------------------------------------------------------------------------
  122. UTF32 UtfReader::next16be()
  123. {
  124. if (end - cur < 2) return sourceExhausted;
  125. const byte * source = cur;
  126. UTF32 ch = (source[0] << 8) | source[1];
  127. source += 2;
  128. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
  129. {
  130. if (end - cur < 2)
  131. return sourceExhausted;
  132. UTF32 ch2 = (source[0] << 8) | source[1];
  133. source += 2;
  134. if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
  135. {
  136. ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
  137. + (ch2 - UNI_SUR_LOW_START) + halfBase;
  138. }
  139. else if (strictConversion) /* it's an unpaired high surrogate */
  140. return sourceIllegal;
  141. }
  142. else if ((strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END))
  143. return sourceIllegal;
  144. cur = source;
  145. return ch;
  146. }
  147. //---------------------------------------------------------------------------
  148. //This is probably faster than a table lookup on modern processors since it would avoid a cache hit.
  149. //Especially because first branch is the most common.
  150. inline unsigned getTrailingBytesForUTF8(byte value)
  151. {
  152. if (value < 0xc0)
  153. return 0;
  154. if (value < 0xe0)
  155. return 1;
  156. if (value < 0xf0)
  157. return 2;
  158. if (value < 0xf8)
  159. return 3;
  160. if (value < 0xfc)
  161. return 4;
  162. return 5;
  163. }
  164. /*
  165. * Magic values subtracted from a buffer value during UTF8 conversion.
  166. * This table contains as many values as there might be trailing bytes
  167. * in a UTF-8 sequence.
  168. */
  169. static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
  170. 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
  /*
   * Note on isLegalUTF8() (defined further down in this file): it is a utility
   * routine to tell whether a sequence of bytes is legal UTF-8.
   * It must be called with the length pre-determined by the first byte.
   * If not calling it from one of the UTF-8 decoders in this file, the length
   * can be set by:
   *   length = getTrailingBytesForUTF8(*source)+1;
   * and the sequence is illegal right away if there aren't that many bytes
   * available.
   * If presented with a length > 4, it returns false. The Unicode
   * definition of UTF-8 goes up to 4-byte sequences.
   */
  181. unsigned readUtf8Size(const void * _data)
  182. {
  183. const byte * ptr = (const byte *)_data;
  184. return getTrailingBytesForUTF8(*ptr)+1;
  185. }
  186. UTF32 readUtf8Char(const void * _data)
  187. {
  188. const byte * ptr = (const byte *)_data;
  189. unsigned short extraBytesToRead = getTrailingBytesForUTF8(*ptr);
  190. UTF32 ch = 0;
  191. switch (extraBytesToRead) {
  192. case 3: ch += *ptr++; ch <<= 6;
  193. case 2: ch += *ptr++; ch <<= 6;
  194. case 1: ch += *ptr++; ch <<= 6;
  195. case 0: ch += *ptr++;
  196. }
  197. return ch - offsetsFromUTF8[extraBytesToRead];
  198. }
  199. inline bool isLegalUTF8(const UTF8 *source, unsigned length)
  200. {
  201. UTF8 a;
  202. const UTF8 *srcptr = source+length;
  203. switch (length)
  204. {
  205. default: return false;
  206. /* Everything else falls through when "true"... */
  207. case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  208. case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  209. case 2: if ((a = (*--srcptr)) > 0xBF) return false;
  210. switch (*source)
  211. {
  212. /* no fall-through in this inner switch */
  213. case 0xE0: if (a < 0xA0) return false; break;
  214. case 0xF0: if (a < 0x90) return false; break;
  215. case 0xF4: if (a > 0x8F) return false; break;
  216. default: if (a < 0x80) return false;
  217. }
  218. case 1: if (*source >= 0x80 && *source < 0xC2) return false;
  219. if (*source > 0xF4) return false;
  220. }
  221. return true;
  222. }
  223. /* --------------------------------------------------------------------- */
  224. UTF32 UtfReader::next8()
  225. {
  226. const UTF8* source = (const UTF8*)cur;
  227. if (source >= end) return sourceExhausted;
  228. unsigned short extraBytesToRead = getTrailingBytesForUTF8(*source);
  229. if (source + extraBytesToRead >= end)
  230. return sourceExhausted;
  231. /* Do this check whether lenient or strict */
  232. if (! isLegalUTF8(source, extraBytesToRead+1))
  233. return sourceIllegal;
  234. /*
  235. * The cases all fall through. See "Note A" below.
  236. */
  237. UTF32 ch = 0;
  238. switch (extraBytesToRead) {
  239. case 3: ch += *source++; ch <<= 6;
  240. case 2: ch += *source++; ch <<= 6;
  241. case 1: ch += *source++; ch <<= 6;
  242. case 0: ch += *source++;
  243. }
  244. cur = (const byte *)source;
  245. return ch - offsetsFromUTF8[extraBytesToRead];
  246. }
  247. //---------------------------------------------------------------------------
  248. UTF32 readUtf8Character(unsigned len, const byte * & cur)
  249. {
  250. const UTF8* source = (const UTF8*)cur;
  251. if (len == 0) return sourceExhausted;
  252. unsigned short extraBytesToRead = getTrailingBytesForUTF8(*source);
  253. if (extraBytesToRead >= len)
  254. return sourceExhausted;
  255. /* Do this check whether lenient or strict */
  256. if (! isLegalUTF8(source, extraBytesToRead+1))
  257. return sourceIllegal;
  258. /*
  259. * The cases all fall through. See "Note A" below.
  260. */
  261. UTF32 ch = 0;
  262. switch (extraBytesToRead) {
  263. case 3: ch += *source++; ch <<= 6;
  264. case 2: ch += *source++; ch <<= 6;
  265. case 1: ch += *source++; ch <<= 6;
  266. case 0: ch += *source++;
  267. }
  268. cur = (const byte *)source;
  269. return ch - offsetsFromUTF8[extraBytesToRead];
  270. }
  271. /*
  272. * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
  273. * into the first byte, depending on how many bytes follow. There are
  274. * as many entries in this table as there are UTF-8 sequence types.
  275. * (I.e., one byte sequence, two byte... six byte sequence.)
  276. */
  277. static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  278. static const UTF32 byteMask = 0xBF;
  279. static const UTF32 byteMark = 0x80;
  280. unsigned writeUtf8(void * vtarget, unsigned maxLength, UTF32 ch)
  281. {
  282. unsigned short bytesToWrite;
  283. /* Figure out how many bytes the result will require */
  284. if (ch < (UTF32)0x80)
  285. bytesToWrite = 1;
  286. else if (ch < (UTF32)0x800)
  287. bytesToWrite = 2;
  288. else if (ch < (UTF32)0x10000)
  289. bytesToWrite = 3;
  290. else if (ch < (UTF32)0x200000)
  291. bytesToWrite = 4;
  292. else {
  293. bytesToWrite = 2;
  294. ch = UNI_REPLACEMENT_CHAR;
  295. }
  296. if (bytesToWrite > maxLength)
  297. return 0;
  298. UTF8 * target = (UTF8 *)vtarget + bytesToWrite;
  299. switch (bytesToWrite) { /* note: everything falls through. */
  300. case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6;
  301. case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6;
  302. case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6;
  303. case 1: *--target = ch | firstByteMark[bytesToWrite];
  304. }
  305. return bytesToWrite;
  306. }
  307. unsigned writeUtf16le(void * vtarget, unsigned maxLength, UTF32 ch)
  308. {
  309. if (maxLength < 2)
  310. return 0;
  311. UTF16 * target = (UTF16 *)vtarget;
  312. if (ch <= UNI_MAX_BMP)
  313. {
  314. /* Target is a character <= 0xFFFF */
  315. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
  316. ch = UNI_REPLACEMENT_CHAR;
  317. *target = ch; /* normal case */
  318. return 2;
  319. }
  320. if (ch > UNI_MAX_UTF16)
  321. {
  322. *target = UNI_REPLACEMENT_CHAR;
  323. return 2;
  324. }
  325. /* target is a character in range 0xFFFF - 0x10FFFF. */
  326. if (maxLength < 4)
  327. return 0;
  328. ch -= halfBase;
  329. target[0] = (ch >> halfShift) + UNI_SUR_HIGH_START;
  330. target[1] = (ch & halfMask) + UNI_SUR_LOW_START;
  331. return 4;
  332. }
  333. unsigned writeUtf16be(void * vtarget, unsigned maxLength, UTF32 ch)
  334. {
  335. if (maxLength < 2)
  336. return 0;
  337. UTF16 * target = (UTF16 *)vtarget;
  338. UTF16 temp;
  339. if (ch <= UNI_MAX_BMP)
  340. {
  341. /* Target is a character <= 0xFFFF */
  342. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
  343. ch = UNI_REPLACEMENT_CHAR;
  344. temp = ch;
  345. _cpyrev2(target, &temp); /* normal case */
  346. return 2;
  347. }
  348. if (ch > UNI_MAX_UTF16)
  349. {
  350. temp = UNI_REPLACEMENT_CHAR;
  351. _cpyrev2(target, &temp);
  352. return 2;
  353. }
  354. /* target is a character in range 0xFFFF - 0x10FFFF. */
  355. if (maxLength < 4)
  356. return 0;
  357. ch -= halfBase;
  358. temp = (ch >> halfShift) + UNI_SUR_HIGH_START;
  359. _cpyrev2(target, &temp);
  360. temp = (ch & halfMask) + UNI_SUR_LOW_START;
  361. _cpyrev2(target+1, &temp);
  362. return 4;
  363. }
  364. unsigned writeUtf32le(void * vtarget, unsigned maxLength, UTF32 ch)
  365. {
  366. if (maxLength < 4) return 0;
  367. *(UTF32 *)vtarget = ch;
  368. return 4;
  369. }
  370. unsigned writeUtf32be(void * vtarget, unsigned maxLength, UTF32 ch)
  371. {
  372. if (maxLength < 4) return 0;
  373. _cpyrev4(vtarget, &ch);
  374. return 4;
  375. }
  376. //---------------------------------------------------------------------------
  377. MemoryBuffer & appendUtf8(MemoryBuffer & out, UTF32 value)
  378. {
  379. char temp[4];
  380. return out.append(writeUtf8(temp, sizeof(temp), value), temp);
  381. }
  382. MemoryBuffer & appendUtf16le(MemoryBuffer & out, UTF32 value)
  383. {
  384. char temp[4];
  385. return out.append(writeUtf16le(temp, sizeof(temp), value), temp);
  386. }
  387. MemoryBuffer & appendUtf16be(MemoryBuffer & out, UTF32 value)
  388. {
  389. char temp[4];
  390. return out.append(writeUtf16be(temp, sizeof(temp), value), temp);
  391. }
  392. MemoryBuffer & appendUtf32le(MemoryBuffer & out, UTF32 value)
  393. {
  394. char temp[4];
  395. return out.append(writeUtf32le(temp, sizeof(temp), value), temp);
  396. }
  397. MemoryBuffer & appendUtf32be(MemoryBuffer & out, UTF32 value)
  398. {
  399. char temp[4];
  400. return out.append(writeUtf32be(temp, sizeof(temp), value), temp);
  401. }
  402. MemoryBuffer & appendUtf(MemoryBuffer & out, UtfReader::UtfFormat targetType, UTF32 value)
  403. {
  404. switch (targetType)
  405. {
  406. case UtfReader::Utf8: appendUtf8(out, value); break;
  407. case UtfReader::Utf16le: appendUtf16le(out, value); break;
  408. case UtfReader::Utf16be: appendUtf16be(out, value); break;
  409. case UtfReader::Utf32le: appendUtf32le(out, value); break;
  410. case UtfReader::Utf32be: appendUtf32be(out, value); break;
  411. }
  412. return out;
  413. }
  414. /* ---------------------------------------------------------------------
  415. Note A.
  416. The fall-through switches in UTF-8 reading code save a
  417. temp variable, some decrements & conditionals. The switches
  418. are equivalent to the following loop:
  419. {
  420. int tmpBytesToRead = extraBytesToRead+1;
  421. do {
  422. ch += *source++;
  423. --tmpBytesToRead;
  424. if (tmpBytesToRead) ch <<= 6;
  425. } while (tmpBytesToRead > 0);
  426. }
  427. In UTF-8 writing code, the switches on "bytesToWrite" are
  428. similarly unrolled loops.
  429. --------------------------------------------------------------------- */
  430. bool convertUtf(MemoryBuffer & target, UtfReader::UtfFormat targetType, unsigned sourceLength, const void * source, UtfReader::UtfFormat sourceType)
  431. {
  432. UtfReader input(sourceType, false);
  433. input.set(sourceLength, source);
  434. unsigned originalLength = target.length();
  435. loop
  436. {
  437. UTF32 next = input.next();
  438. if (next == sourceExhausted)
  439. return true;
  440. if (next == sourceIllegal)
  441. {
  442. target.setLength(originalLength);
  443. return false;
  444. }
  445. appendUtf(target, targetType, next);
  446. }
  447. }
  448. bool convertToUtf8(MemoryBuffer & target, unsigned sourceLength, const void * source)
  449. {
  450. if (sourceLength < 2)
  451. return false;
  452. const byte * text = (const byte *)source;
  453. //check for leading BOM of 0xfeff in the appropriate encoding
  454. if ((text[0] == 0xfe) && (text[1] == 0xff))
  455. return convertUtf(target, UtfReader::Utf8, sourceLength-2, text+2, UtfReader::Utf16be);
  456. if ((text[0] == 0xff) && (text[1] == 0xfe))
  457. {
  458. if (sourceLength >= 4 && (text[2] == 0) && (text[3] == 0))
  459. return convertUtf(target, UtfReader::Utf8, sourceLength-4, text+4, UtfReader::Utf32le);
  460. return convertUtf(target, UtfReader::Utf8, sourceLength-2, text+2, UtfReader::Utf16le);
  461. }
  462. if (sourceLength > 4 && (text[0] == 0) && (text[1] == 0) && (text[2] == 0xfe) && (text[3] == 0xff))
  463. return convertUtf(target, UtfReader::Utf8, sourceLength-4, text+4, UtfReader::Utf32be);
  464. //Try and guess the format
  465. if (text[0] && !text[1])
  466. {
  467. if (text[2])
  468. {
  469. if (convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf16le))
  470. return true;
  471. }
  472. else
  473. {
  474. if (convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf32le))
  475. return true;
  476. }
  477. }
  478. else if (!text[0])
  479. {
  480. if (text[1])
  481. {
  482. if (convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf16be))
  483. return true;
  484. }
  485. else
  486. {
  487. if (convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf32be))
  488. return true;
  489. }
  490. }
  491. //No idea first one that matches wins!
  492. return
  493. convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf16le) ||
  494. convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf16be) ||
  495. convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf32le) ||
  496. convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf32be);
  497. }
  498. //---------------------------------------------------------------------------
  499. void addUtfActionList(StringMatcher & matcher, const char * text, unsigned action, unsigned * maxElementLength, UtfReader::UtfFormat utfFormat)
  500. {
  501. if (!text)
  502. return;
  503. unsigned idx=0;
  504. while (*text)
  505. {
  506. StringBuffer str;
  507. while (*text)
  508. {
  509. char next = *text++;
  510. if (next == ',')
  511. break;
  512. if (next == '\\' && *text)
  513. {
  514. next = *text++;
  515. switch (next)
  516. {
  517. case 'r': next = '\r'; break;
  518. case 'n': next = '\n'; break;
  519. case 't': next = '\t'; break;
  520. case 'x':
  521. //hex constant - at least we can define spaces then...
  522. if (text[0] && text[1])
  523. {
  524. next = (hex2num(*text) << 4) | hex2num(text[1]);
  525. text+=2;
  526. }
  527. break;
  528. default:
  529. break; //otherwise \ just quotes the character e.g. \,
  530. }
  531. }
  532. str.append(next);
  533. }
  534. if (str.length())
  535. {
  536. MemoryBuffer converted;
  537. if (!convertUtf(converted, utfFormat, str.length(), str.str(), UtfReader::Utf8))
  538. throwError(JLIBERR_BadUtf8InArguments);
  539. matcher.queryAddEntry(converted.length(), converted.toByteArray(), action+(idx++<<8));
  540. if (maxElementLength && (converted.length() > *maxElementLength))
  541. *maxElementLength = converted.length();
  542. }
  543. }
  544. }
  545. extern jlib_decl bool replaceUtf(utfReplacementFunc func, MemoryBuffer & target, UtfReader::UtfFormat type, unsigned sourceLength, const void * source)
  546. {
  547. UtfReader input(type, false);
  548. input.set(sourceLength, source);
  549. unsigned originalLength = target.length();
  550. loop
  551. {
  552. const byte * cur = input.cur;
  553. UTF32 next = input.next();
  554. if (next == sourceExhausted)
  555. return true;
  556. if (next == sourceIllegal)
  557. {
  558. target.setLength(originalLength);
  559. return false;
  560. }
  561. func(target, next, type, cur, input.cur-cur, cur==source);
  562. }
  563. }
  564. struct utf32ValidXmlCharRange
  565. {
  566. UTF32 min;
  567. UTF32 max;
  568. bool start;
  569. };
  570. utf32ValidXmlCharRange utf32ValidXmlCharRanges[] = {
  571. {'0', '9', false},
  572. {'A', 'Z', true},
  573. {'a', 'z', true},
  574. {0xC0, 0xD6, true},
  575. {0xD8, 0xF6, true},
  576. {0xF8, 0x2FF, true},
  577. {0x300, 0x36F, false},
  578. {0x370, 0x37D, true},
  579. {0x37F, 0x1FFF, true},
  580. {0x200C, 0x200D, true},
  581. {0x203F, 0x2040, false},
  582. {0x2070, 0x218F, true},
  583. {0x2C00, 0x2FEF, true},
  584. {0x3001, 0xD7FF, true},
  585. {0xF900, 0xFDCF, true},
  586. {0xFDF0, 0xFFFD, true},
  587. {0x10000, 0xEFFFF, true},
  588. {0, 0, false}
  589. };
  590. inline bool replaceBelowRange(UTF32 match, UTF32 replace, int id, MemoryBuffer & target, UtfReader::UtfFormat type, const void * source, int len, bool start)
  591. {
  592. utf32ValidXmlCharRange &r = utf32ValidXmlCharRanges[id];
  593. if (r.min==0)
  594. return true;
  595. if (match>r.max)
  596. return false;
  597. if (match<r.min)
  598. {
  599. appendUtf(target, type, replace);
  600. return true;
  601. }
  602. if (!r.start && start)
  603. appendUtf(target, type, replace);
  604. else
  605. target.append(len, source); //src and target are same, no need to reconvert
  606. return true;
  607. }
  608. MemoryBuffer & utfXmlNameReplacementFunc(MemoryBuffer & target, UTF32 match, UtfReader::UtfFormat type, const void * source, int len, bool start)
  609. {
  610. if (match==':' || match=='_' || (!start && (match=='-' || match=='.' || match==0xB7)))
  611. return target.append(len, source);
  612. for (int i=0; !replaceBelowRange(match, '_', i, target, type, source, len, start); i++);
  613. return target;
  614. }
  615. extern jlib_decl bool appendUtfXmlName(MemoryBuffer & target, UtfReader::UtfFormat type, unsigned sourceLength, const void * source)
  616. {
  617. return replaceUtf(utfXmlNameReplacementFunc, target, type, sourceLength, source);
  618. }