junicode.cpp 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #include "platform.h"
  14. #include "jliball.hpp"
  15. #include "jerror.hpp"
  16. #include "junicode.hpp"
  17. /* Based on code extracted from the following source... Changed quite significantly */
  18. /*
  19. * Copyright 2001 Unicode, Inc.
  20. *
  21. * Disclaimer
  22. *
  23. * This source code is provided as is by Unicode, Inc. No claims are
  24. * made as to fitness for any particular purpose. No warranties of any
  25. * kind are expressed or implied. The recipient agrees to determine
  26. * applicability of information provided. If this file has been
  27. * purchased on magnetic or optical media from Unicode, Inc., the
  28. * sole remedy for any claim will be exchange of defective media
  29. * within 90 days of receipt.
  30. *
  31. * Limitations on Rights to Redistribute This Code
  32. *
  33. * Unicode, Inc. hereby grants the right to freely use the information
  34. * supplied in this file in the creation of products supporting the
  35. * Unicode Standard, and to make copies of this file in any form
  36. * for internal or external distribution as long as this notice
  37. * remains attached.
  38. */
  39. //----------------------------------------------------------------------------
  40. static const int halfShift = 10; /* used for shifting by 10 bits */
  41. static const UTF32 halfBase = 0x0010000UL;
  42. static const UTF32 halfMask = 0x3FFUL;
  43. #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
  44. #define UNI_MAX_BMP (UTF32)0x0000FFFF
  45. #define UNI_MAX_UTF16 (UTF32)0x0010FFFF
  46. #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
  47. #define UNI_SUR_HIGH_START (UTF32)0xD800
  48. #define UNI_SUR_HIGH_END (UTF32)0xDBFF
  49. #define UNI_SUR_LOW_START (UTF32)0xDC00
  50. #define UNI_SUR_LOW_END (UTF32)0xDFFF
  51. UTF32 UtfReader::next()
  52. {
  53. switch (type)
  54. {
  55. case Utf8: return next8();
  56. case Utf16le: return next16le();
  57. case Utf16be: return next16be();
  58. case Utf32le: return next32le();
  59. case Utf32be: return next32be();
  60. }
  61. UNIMPLEMENTED;
  62. }
  63. size32_t UtfReader::getLegalLength()
  64. {
  65. const byte * saved = cur;
  66. while (next() < errorLowerLimit)
  67. {
  68. }
  69. size32_t ret = (size32_t)(cur-saved);
  70. cur = saved;
  71. return ret;
  72. }
  73. //---------------------------------------------------------------------------
  74. UTF32 UtfReader::next32le()
  75. {
  76. if (end - cur < 4) return sourceExhausted;
  77. UTF32 ch = *(UTF32 *)cur;
  78. if (strictConversion && ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_LOW_END)))
  79. return sourceIllegal;
  80. cur += sizeof(UTF32);
  81. return ch;
  82. }
  83. //---------------------------------------------------------------------------
  84. UTF32 UtfReader::next32be()
  85. {
  86. if (end - cur < 4) return sourceExhausted;
  87. UTF32 ch;
  88. _cpyrev4(&ch, cur);
  89. if (strictConversion && ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_LOW_END)))
  90. return sourceIllegal;
  91. cur += sizeof(UTF32);
  92. return ch;
  93. }
  94. //---------------------------------------------------------------------------
  95. UTF32 UtfReader::next16le()
  96. {
  97. if (end - cur < 2) return sourceExhausted;
  98. const byte * source = cur;
  99. UTF32 ch = source[0] | (source[1] << 8);
  100. source += 2;
  101. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
  102. {
  103. if (end - cur < 2)
  104. return sourceExhausted;
  105. UTF32 ch2 = source[0] | (source[1] << 8);
  106. source += 2;
  107. if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
  108. {
  109. ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
  110. + (ch2 - UNI_SUR_LOW_START) + halfBase;
  111. }
  112. else if (strictConversion) /* it's an unpaired high surrogate */
  113. return sourceIllegal;
  114. }
  115. else if ((strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END))
  116. return sourceIllegal;
  117. cur = (const byte *)source;
  118. return ch;
  119. }
  120. //---------------------------------------------------------------------------
  121. UTF32 UtfReader::next16be()
  122. {
  123. if (end - cur < 2) return sourceExhausted;
  124. const byte * source = cur;
  125. UTF32 ch = (source[0] << 8) | source[1];
  126. source += 2;
  127. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
  128. {
  129. if (end - cur < 2)
  130. return sourceExhausted;
  131. UTF32 ch2 = (source[0] << 8) | source[1];
  132. source += 2;
  133. if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
  134. {
  135. ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
  136. + (ch2 - UNI_SUR_LOW_START) + halfBase;
  137. }
  138. else if (strictConversion) /* it's an unpaired high surrogate */
  139. return sourceIllegal;
  140. }
  141. else if ((strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END))
  142. return sourceIllegal;
  143. cur = source;
  144. return ch;
  145. }
  146. //---------------------------------------------------------------------------
  147. //This is probably faster than a table lookup on modern processors since it would avoid a cache hit.
  148. //Especially because first branch is the most common.
  149. inline unsigned getTrailingBytesForUTF8(byte value)
  150. {
  151. if (value < 0xc0)
  152. return 0;
  153. if (value < 0xe0)
  154. return 1;
  155. if (value < 0xf0)
  156. return 2;
  157. if (value < 0xf8)
  158. return 3;
  159. if (value < 0xfc)
  160. return 4;
  161. return 5;
  162. }
  163. /*
  164. * Magic values subtracted from a buffer value during UTF8 conversion.
  165. * This table contains as many values as there might be trailing bytes
  166. * in a UTF-8 sequence.
  167. */
  168. static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
  169. 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
  170. /*
  171. * Utility routine (isLegalUTF8, defined further below) to tell whether a sequence of bytes is legal UTF-8.
  172. * This must be called with the length pre-determined by the first byte.
  173. * If not calling this from ConvertUTF8to*, then the length can be set by:
  174. * length = getTrailingBytesForUTF8(*source)+1;
  175. * and the sequence is illegal right away if there aren't that many bytes
  176. * available.
  177. * If presented with a length > 4, this returns false. The Unicode
  178. * definition of UTF-8 goes up to 4-byte sequences.
  179. */
  180. unsigned readUtf8Size(const void * _data)
  181. {
  182. const byte * ptr = (const byte *)_data;
  183. return getTrailingBytesForUTF8(*ptr)+1;
  184. }
  185. UTF32 readUtf8Char(const void * _data)
  186. {
  187. const byte * ptr = (const byte *)_data;
  188. unsigned short extraBytesToRead = getTrailingBytesForUTF8(*ptr);
  189. UTF32 ch = 0;
  190. switch (extraBytesToRead) {
  191. case 3: ch += *ptr++; ch <<= 6; // fallthrough
  192. case 2: ch += *ptr++; ch <<= 6; // fallthrough
  193. case 1: ch += *ptr++; ch <<= 6; // fallthrough
  194. case 0: ch += *ptr++;
  195. }
  196. return ch - offsetsFromUTF8[extraBytesToRead];
  197. }
  198. inline bool isLegalUTF8(const UTF8 *source, unsigned length)
  199. {
  200. UTF8 a;
  201. const UTF8 *srcptr = source+length;
  202. switch (length)
  203. {
  204. default: return false;
  205. /* Everything else falls through when "true"... */
  206. case 4:
  207. if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  208. // fallthrough
  209. case 3:
  210. if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  211. // fallthrough
  212. case 2: if ((a = (*--srcptr)) > 0xBF) return false;
  213. switch (*source)
  214. {
  215. /* no fall-through in this inner switch */
  216. case 0xE0: if (a < 0xA0) return false; break;
  217. case 0xF0: if (a < 0x90) return false; break;
  218. case 0xF4: if (a > 0x8F) return false; break;
  219. default: if (a < 0x80) return false;
  220. }
  221. // fallthrough
  222. case 1:
  223. if (*source >= 0x80 && *source < 0xC2) return false;
  224. if (*source > 0xF4) return false;
  225. }
  226. return true;
  227. }
  228. /* --------------------------------------------------------------------- */
  229. UTF32 UtfReader::next8()
  230. {
  231. const UTF8* source = (const UTF8*)cur;
  232. if (source >= end) return sourceExhausted;
  233. unsigned short extraBytesToRead = getTrailingBytesForUTF8(*source);
  234. if (source + extraBytesToRead >= end)
  235. return sourceExhausted;
  236. /* Do this check whether lenient or strict */
  237. if (! isLegalUTF8(source, extraBytesToRead+1))
  238. return sourceIllegal;
  239. /*
  240. * The cases all fall through. See "Note A" below.
  241. */
  242. UTF32 ch = 0;
  243. switch (extraBytesToRead) {
  244. case 3: ch += *source++; ch <<= 6; // fallthrough
  245. case 2: ch += *source++; ch <<= 6; // fallthrough
  246. case 1: ch += *source++; ch <<= 6; // fallthrough
  247. case 0: ch += *source++;
  248. }
  249. cur = (const byte *)source;
  250. return ch - offsetsFromUTF8[extraBytesToRead];
  251. }
  252. //---------------------------------------------------------------------------
  253. UTF32 readUtf8Character(unsigned len, const byte * & cur)
  254. {
  255. const UTF8* source = (const UTF8*)cur;
  256. if (len == 0) return sourceExhausted;
  257. unsigned short extraBytesToRead = getTrailingBytesForUTF8(*source);
  258. if (extraBytesToRead >= len)
  259. return sourceExhausted;
  260. /* Do this check whether lenient or strict */
  261. if (! isLegalUTF8(source, extraBytesToRead+1))
  262. return sourceIllegal;
  263. /*
  264. * The cases all fall through. See "Note A" below.
  265. */
  266. UTF32 ch = 0;
  267. switch (extraBytesToRead) {
  268. case 3: ch += *source++; ch <<= 6; // fallthrough
  269. case 2: ch += *source++; ch <<= 6; // fallthrough
  270. case 1: ch += *source++; ch <<= 6; // fallthrough
  271. case 0: ch += *source++;
  272. }
  273. cur = (const byte *)source;
  274. return ch - offsetsFromUTF8[extraBytesToRead];
  275. }
  276. /*
  277. * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
  278. * into the first byte, depending on how many bytes follow. There are
  279. * as many entries in this table as there are UTF-8 sequence types.
  280. * (I.e., one byte sequence, two byte... six byte sequence.)
  281. */
  282. static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  283. static const UTF32 byteMask = 0xBF;
  284. static const UTF32 byteMark = 0x80;
  285. unsigned writeUtf8(void * vtarget, unsigned maxLength, UTF32 ch)
  286. {
  287. unsigned short bytesToWrite;
  288. /* Figure out how many bytes the result will require */
  289. if (ch < (UTF32)0x80)
  290. bytesToWrite = 1;
  291. else if (ch < (UTF32)0x800)
  292. bytesToWrite = 2;
  293. else if (ch < (UTF32)0x10000)
  294. bytesToWrite = 3;
  295. else if (ch < (UTF32)0x200000)
  296. bytesToWrite = 4;
  297. else {
  298. bytesToWrite = 2;
  299. ch = UNI_REPLACEMENT_CHAR;
  300. }
  301. if (bytesToWrite > maxLength)
  302. return 0;
  303. UTF8 * target = (UTF8 *)vtarget + bytesToWrite;
  304. switch (bytesToWrite) { /* note: everything falls through. */
  305. case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6; // fallthrough
  306. case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6; // fallthrough
  307. case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6; // fallthrough
  308. case 1: *--target = ch | firstByteMark[bytesToWrite];
  309. }
  310. return bytesToWrite;
  311. }
  312. unsigned writeUtf16le(void * vtarget, unsigned maxLength, UTF32 ch)
  313. {
  314. if (maxLength < 2)
  315. return 0;
  316. UTF16 * target = (UTF16 *)vtarget;
  317. if (ch <= UNI_MAX_BMP)
  318. {
  319. /* Target is a character <= 0xFFFF */
  320. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
  321. ch = UNI_REPLACEMENT_CHAR;
  322. *target = ch; /* normal case */
  323. return 2;
  324. }
  325. if (ch > UNI_MAX_UTF16)
  326. {
  327. *target = UNI_REPLACEMENT_CHAR;
  328. return 2;
  329. }
  330. /* target is a character in range 0xFFFF - 0x10FFFF. */
  331. if (maxLength < 4)
  332. return 0;
  333. ch -= halfBase;
  334. target[0] = (ch >> halfShift) + UNI_SUR_HIGH_START;
  335. target[1] = (ch & halfMask) + UNI_SUR_LOW_START;
  336. return 4;
  337. }
  338. unsigned writeUtf16be(void * vtarget, unsigned maxLength, UTF32 ch)
  339. {
  340. if (maxLength < 2)
  341. return 0;
  342. UTF16 * target = (UTF16 *)vtarget;
  343. UTF16 temp;
  344. if (ch <= UNI_MAX_BMP)
  345. {
  346. /* Target is a character <= 0xFFFF */
  347. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
  348. ch = UNI_REPLACEMENT_CHAR;
  349. temp = ch;
  350. _cpyrev2(target, &temp); /* normal case */
  351. return 2;
  352. }
  353. if (ch > UNI_MAX_UTF16)
  354. {
  355. temp = UNI_REPLACEMENT_CHAR;
  356. _cpyrev2(target, &temp);
  357. return 2;
  358. }
  359. /* target is a character in range 0xFFFF - 0x10FFFF. */
  360. if (maxLength < 4)
  361. return 0;
  362. ch -= halfBase;
  363. temp = (ch >> halfShift) + UNI_SUR_HIGH_START;
  364. _cpyrev2(target, &temp);
  365. temp = (ch & halfMask) + UNI_SUR_LOW_START;
  366. _cpyrev2(target+1, &temp);
  367. return 4;
  368. }
  369. unsigned writeUtf32le(void * vtarget, unsigned maxLength, UTF32 ch)
  370. {
  371. if (maxLength < 4) return 0;
  372. *(UTF32 *)vtarget = ch;
  373. return 4;
  374. }
  375. unsigned writeUtf32be(void * vtarget, unsigned maxLength, UTF32 ch)
  376. {
  377. if (maxLength < 4) return 0;
  378. _cpyrev4(vtarget, &ch);
  379. return 4;
  380. }
  381. //---------------------------------------------------------------------------
  382. MemoryBuffer & appendUtf8(MemoryBuffer & out, UTF32 value)
  383. {
  384. char temp[4];
  385. return out.append(writeUtf8(temp, sizeof(temp), value), temp);
  386. }
  387. MemoryBuffer & appendUtf16le(MemoryBuffer & out, UTF32 value)
  388. {
  389. char temp[4];
  390. return out.append(writeUtf16le(temp, sizeof(temp), value), temp);
  391. }
  392. MemoryBuffer & appendUtf16be(MemoryBuffer & out, UTF32 value)
  393. {
  394. char temp[4];
  395. return out.append(writeUtf16be(temp, sizeof(temp), value), temp);
  396. }
  397. MemoryBuffer & appendUtf32le(MemoryBuffer & out, UTF32 value)
  398. {
  399. char temp[4];
  400. return out.append(writeUtf32le(temp, sizeof(temp), value), temp);
  401. }
  402. MemoryBuffer & appendUtf32be(MemoryBuffer & out, UTF32 value)
  403. {
  404. char temp[4];
  405. return out.append(writeUtf32be(temp, sizeof(temp), value), temp);
  406. }
  407. MemoryBuffer & appendUtf(MemoryBuffer & out, UtfReader::UtfFormat targetType, UTF32 value)
  408. {
  409. switch (targetType)
  410. {
  411. case UtfReader::Utf8: appendUtf8(out, value); break;
  412. case UtfReader::Utf16le: appendUtf16le(out, value); break;
  413. case UtfReader::Utf16be: appendUtf16be(out, value); break;
  414. case UtfReader::Utf32le: appendUtf32le(out, value); break;
  415. case UtfReader::Utf32be: appendUtf32be(out, value); break;
  416. }
  417. return out;
  418. }
  419. /* ---------------------------------------------------------------------
  420. Note A.
  421. The fall-through switches in UTF-8 reading code save a
  422. temp variable, some decrements & conditionals. The switches
  423. are equivalent to the following loop:
  424. {
  425. int tmpBytesToRead = extraBytesToRead+1;
  426. do {
  427. ch += *source++;
  428. --tmpBytesToRead;
  429. if (tmpBytesToRead) ch <<= 6;
  430. } while (tmpBytesToRead > 0);
  431. }
  432. In UTF-8 writing code, the switches on "bytesToWrite" are
  433. similarly unrolled loops.
  434. --------------------------------------------------------------------- */
  435. bool convertUtf(MemoryBuffer & target, UtfReader::UtfFormat targetType, unsigned sourceLength, const void * source, UtfReader::UtfFormat sourceType)
  436. {
  437. UtfReader input(sourceType, false);
  438. input.set(sourceLength, source);
  439. unsigned originalLength = target.length();
  440. for (;;)
  441. {
  442. UTF32 next = input.next();
  443. if (next == sourceExhausted)
  444. return true;
  445. if (next == sourceIllegal)
  446. {
  447. target.setLength(originalLength);
  448. return false;
  449. }
  450. appendUtf(target, targetType, next);
  451. }
  452. }
  453. bool convertToUtf8(MemoryBuffer & target, unsigned sourceLength, const void * source)
  454. {
  455. if (sourceLength < 2)
  456. return false;
  457. const byte * text = (const byte *)source;
  458. //check for leading BOM of 0xfeff in the appropriate encoding
  459. if ((text[0] == 0xfe) && (text[1] == 0xff))
  460. return convertUtf(target, UtfReader::Utf8, sourceLength-2, text+2, UtfReader::Utf16be);
  461. if ((text[0] == 0xff) && (text[1] == 0xfe))
  462. {
  463. if (sourceLength >= 4 && (text[2] == 0) && (text[3] == 0))
  464. return convertUtf(target, UtfReader::Utf8, sourceLength-4, text+4, UtfReader::Utf32le);
  465. return convertUtf(target, UtfReader::Utf8, sourceLength-2, text+2, UtfReader::Utf16le);
  466. }
  467. if (sourceLength > 4 && (text[0] == 0) && (text[1] == 0) && (text[2] == 0xfe) && (text[3] == 0xff))
  468. return convertUtf(target, UtfReader::Utf8, sourceLength-4, text+4, UtfReader::Utf32be);
  469. //Try and guess the format
  470. if (text[0] && !text[1])
  471. {
  472. if (text[2])
  473. {
  474. if (convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf16le))
  475. return true;
  476. }
  477. else
  478. {
  479. if (convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf32le))
  480. return true;
  481. }
  482. }
  483. else if (!text[0])
  484. {
  485. if (text[1])
  486. {
  487. if (convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf16be))
  488. return true;
  489. }
  490. else
  491. {
  492. if (convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf32be))
  493. return true;
  494. }
  495. }
  496. //No idea first one that matches wins!
  497. return
  498. convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf16le) ||
  499. convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf16be) ||
  500. convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf32le) ||
  501. convertUtf(target, UtfReader::Utf8, sourceLength, source, UtfReader::Utf32be);
  502. }
  503. //---------------------------------------------------------------------------
  504. void addUtfActionList(StringMatcher & matcher, const char * text, unsigned action, unsigned * maxElementLength, UtfReader::UtfFormat utfFormat)
  505. {
  506. if (!text)
  507. return;
  508. unsigned idx=0;
  509. while (*text)
  510. {
  511. StringBuffer str;
  512. while (*text)
  513. {
  514. char next = *text++;
  515. if (next == ',')
  516. break;
  517. if (next == '\\' && *text)
  518. {
  519. next = *text++;
  520. switch (next)
  521. {
  522. case 'r': next = '\r'; break;
  523. case 'n': next = '\n'; break;
  524. case 't': next = '\t'; break;
  525. case 'x':
  526. //hex constant - at least we can define spaces then...
  527. if (text[0] && text[1])
  528. {
  529. next = (hex2num(*text) << 4) | hex2num(text[1]);
  530. text+=2;
  531. }
  532. break;
  533. default:
  534. break; //otherwise \ just quotes the character e.g. \,
  535. }
  536. }
  537. str.append(next);
  538. }
  539. if (str.length())
  540. {
  541. MemoryBuffer converted;
  542. if (!convertUtf(converted, utfFormat, str.length(), str.str(), UtfReader::Utf8))
  543. throwError(JLIBERR_BadUtf8InArguments);
  544. matcher.queryAddEntry(converted.length(), converted.toByteArray(), action+(idx++<<8));
  545. if (maxElementLength && (converted.length() > *maxElementLength))
  546. *maxElementLength = converted.length();
  547. }
  548. }
  549. }
  550. extern jlib_decl bool replaceUtf(utfReplacementFunc func, MemoryBuffer & target, UtfReader::UtfFormat type, unsigned sourceLength, const void * source)
  551. {
  552. UtfReader input(type, false);
  553. input.set(sourceLength, source);
  554. unsigned originalLength = target.length();
  555. for (;;)
  556. {
  557. const byte * cur = input.cur;
  558. UTF32 next = input.next();
  559. if (next == sourceExhausted)
  560. return true;
  561. if (next == sourceIllegal)
  562. {
  563. target.setLength(originalLength);
  564. return false;
  565. }
  566. func(target, next, type, cur, input.cur-cur, cur==source);
  567. }
  568. }
  569. struct utf32ValidXmlCharRange
  570. {
  571. UTF32 min;
  572. UTF32 max;
  573. bool start;
  574. };
  575. utf32ValidXmlCharRange utf32ValidXmlCharRanges[] = {
  576. {'0', '9', false},
  577. {'A', 'Z', true},
  578. {'a', 'z', true},
  579. {0xC0, 0xD6, true},
  580. {0xD8, 0xF6, true},
  581. {0xF8, 0x2FF, true},
  582. {0x300, 0x36F, false},
  583. {0x370, 0x37D, true},
  584. {0x37F, 0x1FFF, true},
  585. {0x200C, 0x200D, true},
  586. {0x203F, 0x2040, false},
  587. {0x2070, 0x218F, true},
  588. {0x2C00, 0x2FEF, true},
  589. {0x3001, 0xD7FF, true},
  590. {0xF900, 0xFDCF, true},
  591. {0xFDF0, 0xFFFD, true},
  592. {0x10000, 0xEFFFF, true},
  593. {0, 0, false}
  594. };
  595. inline bool replaceBelowRange(UTF32 match, UTF32 replace, int id, MemoryBuffer & target, UtfReader::UtfFormat type, const void * source, int len, bool start)
  596. {
  597. utf32ValidXmlCharRange &r = utf32ValidXmlCharRanges[id];
  598. if (r.min==0)
  599. return true;
  600. if (match>r.max)
  601. return false;
  602. if (match<r.min)
  603. {
  604. appendUtf(target, type, replace);
  605. return true;
  606. }
  607. if (!r.start && start)
  608. appendUtf(target, type, replace);
  609. else
  610. target.append(len, source); //src and target are same, no need to reconvert
  611. return true;
  612. }
  613. MemoryBuffer & utfXmlNameReplacementFunc(MemoryBuffer & target, UTF32 match, UtfReader::UtfFormat type, const void * source, int len, bool start)
  614. {
  615. if (match==':' || match=='_' || (!start && (match=='-' || match=='.' || match==0xB7)))
  616. return target.append(len, source);
  617. for (int i=0; !replaceBelowRange(match, '_', i, target, type, source, len, start); i++);
  618. return target;
  619. }
  620. extern jlib_decl bool appendUtfXmlName(MemoryBuffer & target, UtfReader::UtfFormat type, unsigned sourceLength, const void * source)
  621. {
  622. return replaceUtf(utfXmlNameReplacementFunc, target, type, sourceLength, source);
  623. }