unicodelib.cpp 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146
  1. /*##############################################################################
  2. Copyright (C) 2011 HPCC Systems.
  3. All rights reserved. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU Affero General Public License as
  5. published by the Free Software Foundation, either version 3 of the
  6. License, or (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU Affero General Public License for more details.
  11. You should have received a copy of the GNU Affero General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. ############################################################################## */
  14. #include "jlib.hpp"
  15. #include "jsem.hpp"
  16. #include <string.h>
  17. #include "unicodelib.hpp"
  18. #include "unicode/usearch.h"
  19. #include "unicode/schriter.h"
  20. #include "unicode/locid.h"
  21. #include "unicode/coll.h"
  22. #include "unicode/stsearch.h"
  23. #include "unicode/translit.h"
  24. #include "unicode/rbbi.h"
  25. #include "../stringlib/wildmatch.tpp"
  26. #define UNICODELIB_VERSION "UNICODELIB 1.1.06"
  27. UChar32 const u32comma = ',';
  28. UChar32 const u32space = ' ';
  29. UChar const u16asterisk = '*';
  30. UChar const u16query = '?';
  31. UChar const u16space = ' ';
  // ECL SERVICE definition text for this plugin. getECLPluginDefinition() hands
  // this string to the host via pb->ECL. Each line declares one ECL function and
  // names the C entrypoint (the ul* functions in this file) that implements it.
  // The text is runtime data: do not reformat it.
  32. const char * EclDefinition =
  33. "export UnicodeLib := SERVICE\n"
  34. " unicode UnicodeFilterOut(const unicode src, const unicode _within) : c, pure,entrypoint='ulUnicodeFilterOut'; \n"
  35. " unicode UnicodeFilter(const unicode src, const unicode _within) : c, pure,entrypoint='ulUnicodeFilter'; \n"
  36. " unicode UnicodeSubstituteOut(const unicode src, const unicode _within, const unicode _newchar) : c, pure,entrypoint='ulUnicodeSubsOut'; \n"
  37. " unicode UnicodeSubstitute(const unicode src, const unicode _within, const unicode _newchar) : c, pure,entrypoint='ulUnicodeSubs'; \n"
  38. " unicode UnicodeRepad(const unicode src, unsigned4 size) : c, pure,entrypoint='ulUnicodeRepad'; \n"
  39. " unsigned integer4 UnicodeFind(const unicode src, const unicode tofind, unsigned4 instance) : c, pure,entrypoint='ulUnicodeFind', hole; \n"
  40. " unsigned integer4 UnicodeLocaleFind(const unicode src, const unicode tofind, unsigned4 instance, const varstring localename) : c, pure,entrypoint='ulUnicodeLocaleFind', hole; \n"
  41. " unsigned integer4 UnicodeLocaleFindAtStrength(const unicode src, const unicode tofind, unsigned4 instance, const varstring localename, integer1 strength) : c, pure,entrypoint='ulUnicodeLocaleFindAtStrength', hole; \n"
  42. " unicode UnicodeExtract(const unicode src, unsigned4 instance) : c,pure,entrypoint='ulUnicodeExtract'; \n"
  43. " unicode50 UnicodeExtract50(const unicode src, unsigned4 instance) : c,pure,entrypoint='ulUnicodeExtract50', hole; \n"
  44. " unicode UnicodeToLowerCase(const unicode src) : c,pure,entrypoint='ulUnicodeToLowerCase';\n"
  45. " unicode UnicodeToUpperCase(const unicode src) : c,pure,entrypoint='ulUnicodeToUpperCase';\n"
  46. " unicode UnicodeToProperCase(const unicode src) : c,pure,entrypoint='ulUnicodeToProperCase';\n"
  47. " unicode80 UnicodeToLowerCase80(const unicode src) : c,pure,entrypoint='ulUnicodeToLowerCase80', hole;\n"
  48. " unicode80 UnicodeToUpperCase80(const unicode src) : c,pure,entrypoint='ulUnicodeToUpperCase80', hole;\n"
  49. " unicode80 UnicodeToProperCase80(const unicode src) : c,pure,entrypoint='ulUnicodeToProperCase80', hole;\n"
  50. " unicode UnicodeLocaleToLowerCase(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToLowerCase';\n"
  51. " unicode UnicodeLocaleToUpperCase(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToUpperCase';\n"
  52. " unicode UnicodeLocaleToProperCase(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToProperCase';\n"
  53. " unicode80 UnicodeLocaleToLowerCase80(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToLowerCase80', hole;\n"
  54. " unicode80 UnicodeLocaleToUpperCase80(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToUpperCase80', hole;\n"
  55. " unicode80 UnicodeLocaleToProperCase80(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToProperCase80', hole;\n"
  56. " integer4 UnicodeCompareIgnoreCase(const unicode src1, const unicode src2) : c,pure,entrypoint='ulUnicodeCompareIgnoreCase', hole;\n"
  57. " integer4 UnicodeCompareAtStrength(const unicode src1, const unicode src2, integer1 strength) : c,pure,entrypoint='ulUnicodeCompareAtStrength', hole;\n"
  58. " integer4 UnicodeLocaleCompareIgnoreCase(const unicode src1, const unicode src2, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleCompareIgnoreCase', hole;\n"
  59. " integer4 UnicodeLocaleCompareAtStrength(const unicode src1, const unicode src2, const varstring localename, integer1 strength) : c,pure,entrypoint='ulUnicodeLocaleCompareAtStrength', hole;\n"
  60. " unicode UnicodeReverse(const unicode src) : c,pure,entrypoint='ulUnicodeReverse';\n"
  61. " unicode UnicodeFindReplace(const unicode src, const unicode stok, const unicode rtok) : c,pure,entrypoint='ulUnicodeFindReplace';\n"
  62. " unicode UnicodeLocaleFindReplace(const unicode src, const unicode stok, const unicode rtok, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleFindReplace';\n"
  63. " unicode UnicodeLocaleFindAtStrengthReplace(const unicode src, const unicode stok, const unicode rtok, const varstring localename, integer1 strength) : c,pure,entrypoint='ulUnicodeLocaleFindAtStrengthReplace';\n"
  64. " unicode80 UnicodeFindReplace80(const unicode src, const unicode stok, const unicode rtok) : c,pure,entrypoint='ulUnicodeFindReplace80', hole;\n"
  65. " unicode80 UnicodeLocaleFindReplace80(const unicode src, const unicode stok, const unicode rtok, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleFindReplace80',hole;\n"
  66. " unicode80 UnicodeLocaleFindAtStrengthReplace80(const unicode src, const unicode stok, const unicode rtok, const varstring localename, integer1 strength) : c,pure,entrypoint='ulUnicodeLocaleFindAtStrengthReplace80',hole;\n"
  67. " unicode UnicodeCleanAccents(const unicode src) : c,pure,entrypoint='ulUnicodeCleanAccents'; \n"
  68. " unicode UnicodeCleanSpaces(const unicode src) : c,pure,entrypoint='ulUnicodeCleanSpaces'; \n"
  69. " unicode25 UnicodeCleanSpaces25(const unicode src) : c,pure,entrypoint='ulUnicodeCleanSpaces25', hole; \n"
  70. " unicode80 UnicodeCleanSpaces80(const unicode src) : c,pure,entrypoint='ulUnicodeCleanSpaces80', hole; \n"
  71. " boolean UnicodeWildMatch(const unicode src, const unicode _pattern, boolean _noCase) : c, pure,entrypoint='ulUnicodeWildMatch', hole; \n"
  72. " boolean UnicodeContains(const unicode src, const unicode _pattern, boolean _noCase) : c, pure,entrypoint='ulUnicodeContains', hole; \n"
  73. " unsigned4 UnicodeLocaleEditDistance(const unicode left, const unicode right, const varstring localename) : c, pure,entrypoint='ulUnicodeLocaleEditDistance', hole; \n"
  74. " boolean UnicodeLocaleEditDistanceWithinRadius(const unicode left, const unicode right, unsigned4 radius, const varstring localename) : c, pure,entrypoint='ulUnicodeLocaleEditDistanceWithinRadius', hole; \n"
  75. " unsigned4 UnicodeLocaleWordCount(const unicode text, const varstring localename) : c, pure,entrypoint='ulUnicodeLocaleWordCount', hole; \n"
  76. " unicode UnicodeLocaleGetNthWord(const unicode text, unsigned4 n, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleGetNthWord';\n"
  77. "END;\n";
  // NULL-terminated list of earlier version strings; exposed to hosts that pass
  // an ECLPluginDefinitionBlockEx to getECLPluginDefinition().
  static const char * compatibleVersions[] =
  {
      "UNICODELIB 1.1.01 [64d78857c1cecae15bd238cd7767b3c1]",
      "UNICODELIB 1.1.01 [e8790fe30d9627997749c3c4839b5957]",
      "UNICODELIB 1.1.02",
      "UNICODELIB 1.1.03",
      "UNICODELIB 1.1.04",
      "UNICODELIB 1.1.05",
      NULL
  };
  86. UNICODELIB_API bool getECLPluginDefinition(ECLPluginDefinitionBlock *pb)
  87. {
  88. if (pb->size == sizeof(ECLPluginDefinitionBlockEx))
  89. {
  90. ECLPluginDefinitionBlockEx * pbx = (ECLPluginDefinitionBlockEx *) pb;
  91. pbx->compatibleVersions = compatibleVersions;
  92. }
  93. else if (pb->size != sizeof(ECLPluginDefinitionBlock))
  94. return false;
  95. pb->magicVersion = PLUGIN_VERSION;
  96. pb->version = UNICODELIB_VERSION;
  97. pb->moduleName = "lib_unicodelib";
  98. pb->ECL = EclDefinition;
  99. pb->flags = PLUGIN_IMPLICIT_MODULE | PLUGIN_MULTIPLE_VERSIONS;
  100. pb->description = "UnicodeLib unicode string manipulation library";
  101. return true;
  102. }
  103. namespace nsUnicodelib {
  // Plugin context installed by setPluginContext(); passed to CTXMALLOC whenever
  // an entry point allocates a result buffer. NULL until the host sets it.
  104. IPluginContext * parentCtx = NULL;
  105. void doTrimRight(UnicodeString & source)
  106. {
  107. int32_t oldLength = source.length();
  108. if (!oldLength)
  109. return;
  110. int32_t currentLength = oldLength;
  111. bool uSpace = true;
  112. do {
  113. UChar32 c = source[--currentLength];
  114. if(!(c == 0x20 || u_isWhitespace(c))) {
  115. currentLength++;
  116. uSpace = false;
  117. }
  118. } while (uSpace && currentLength>0);
  119. if (currentLength < oldLength) {
  120. source.truncate(currentLength);
  121. }
  122. }
  123. void forceLength(UnicodeString & str, int32_t len)
  124. {
  125. if(str.length()>len)
  126. str.truncate(len);
  127. else if(str.length()<len)
  128. str.padTrailing(len);
  129. }
  130. void doModifySearchStrength(StringSearch & search, char strength, UErrorCode & error)
  131. {
  132. RuleBasedCollator * coll = search.getCollator();
  133. switch(strength)
  134. {
  135. case 1:
  136. coll->setStrength(Collator::PRIMARY);
  137. break;
  138. case 2:
  139. coll->setStrength(Collator::SECONDARY);
  140. break;
  141. case 3:
  142. coll->setStrength(Collator::TERTIARY);
  143. break;
  144. case 4:
  145. coll->setStrength(Collator::QUATERNARY);
  146. break;
  147. case 5:
  148. default:
  149. coll->setStrength(Collator::IDENTICAL);
  150. }
  151. search.setCollator(coll, error);
  152. }
  153. bool extract(UnicodeString & out, UnicodeString const & in, unsigned instance)
  154. {
  155. if(!instance) return false;
  156. int32_t start = 0;
  157. while(--instance)
  158. {
  159. start = in.indexOf(u32comma, start);
  160. if(start == -1) return false;
  161. start++;
  162. }
  163. int32_t end = in.indexOf(u32comma, start);
  164. if(end == -1)
  165. end = in.length();
  166. out.append(in, start, end-start);
  167. return true;
  168. }
  169. int doUnicodeCompareAtStrength(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2, Collator::ECollationStrength strength)
  170. {
  171. UErrorCode error = U_ZERO_ERROR;
  172. Collator * coll = Collator::createInstance(error);
  173. coll->setStrength(strength);
  174. Collator::EComparisonResult ret = coll->compare(src1, src1Len, src2, src2Len);
  175. delete coll;
  176. return ret;
  177. }
  178. int doUnicodeLocaleCompareAtStrength(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2, char const * localename, Collator::ECollationStrength strength)
  179. {
  180. UErrorCode error = U_ZERO_ERROR;
  181. Locale locale(localename);
  182. Collator * coll = Collator::createInstance(locale, error);
  183. coll->setStrength(strength);
  184. Collator::EComparisonResult ret = coll->compare(src1, src1Len, src2, src2Len);
  185. delete coll;
  186. return ret;
  187. }
  188. void doUnicodeLocaleFindReplace(UnicodeString & source, UnicodeString const & pattern, UnicodeString const & replace, char const * localename)
  189. {
  190. UErrorCode error = U_ZERO_ERROR;
  191. Locale locale(localename);
  192. StringSearch search(pattern, source, locale, 0, error);
  193. int32_t pos = search.first(error);
  194. while(pos != USEARCH_DONE)
  195. {
  196. source.replace(pos, search.getMatchedLength(), replace);
  197. search.setText(source, error);
  198. search.setOffset(pos+replace.length(), error);
  199. pos = search.next(error);
  200. }
  201. }
  202. void doUnicodeLocaleFindAtStrengthReplace(UnicodeString & source, UnicodeString const & pattern, UnicodeString const & replace, char const * localename, char strength)
  203. {
  204. UErrorCode error = U_ZERO_ERROR;
  205. Locale locale(localename);
  206. StringSearch search(pattern, source, locale, 0, error);
  207. doModifySearchStrength(search, strength, error);
  208. int32_t pos = search.first(error);
  209. while(pos != USEARCH_DONE)
  210. {
  211. source.replace(pos, search.getMatchedLength(), replace);
  212. search.setText(source, error);
  213. search.setOffset(pos+replace.length(), error);
  214. pos = search.next(error);
  215. }
  216. }
  217. void doUnicodeCleanSpaces(UnicodeString & source)
  218. {
  219. int32_t srclen;
  220. int32_t pos = source.indexOf(u32space);
  221. int32_t endpos;
  222. int32_t spacelen;
  223. while(pos != -1)
  224. {
  225. srclen = source.length();
  226. for(endpos=pos; endpos<srclen; endpos++)
  227. if(source.charAt(endpos)!=u32space) break;
  228. spacelen = endpos-pos;
  229. if((pos>0) && (endpos<srclen)) spacelen--;
  230. if(spacelen>0) source.remove(pos, spacelen);
  231. pos = source.indexOf(u32space, pos+1);
  232. }
  233. }
  234. /*
  235. N.B. To do 'real' case-insensitive matching we should use full stringwise casefolding on the source and pattern. The simple char-by-char toupper approach has problems with unicode. For example, some chars uppercase to multiple chars (e.g. the German 'sharp s' uppercases to 'SS'). See http://icu-project.org/userguide/posix.html#case_mappings for more examples. Furthermore, converting as 16-bit code units does not work when code points from U+10000 upwards are involved. Nevertheless, we use the simple char-by-char toupper approach for the UnicodeWildMatch function, because it is intended as a high-speed function. For accurate case-folding, you should either use the UnicodeToUpperCase function explicitly on the arguments or use REGEXFIND.
  236. */
  237. inline UChar u16toupper(UChar c)
  238. {
  239. UChar32 o = u_toupper(c);
  240. return U_IS_SUPPLEMENTARY(o) ? c : (UChar)o;
  241. }
  // Shared transliterator instance, initially NULL, and a critical section.
  // NOTE(review): presumably created lazily by the accent-cleaning entry point and
  // guarded by accenterCrit; their use is outside this part of the file - confirm.
  242. static icu::Transliterator* deAccenter = NULL;
  243. static CriticalSection accenterCrit;
  // Returns the smallest of three byte values.
  inline unsigned char min3(unsigned char a, unsigned char b, unsigned char c)
  {
      unsigned char smallest = a;
      if (!(smallest < b))
          smallest = b;
      if (!(smallest < c))
          smallest = c;
      return smallest;
  }
  // Edit distance reported by unicodeEditDistanceV2() when the collation
  // elements of either input could not be produced (CEList::isInvalid()).
  249. #define DISTANCE_ON_ERROR 999
  250. class CEList
  251. {
  252. private:
  253. UnicodeString ustring_;
  254. uint32_t* ces_;
  255. uint32_t length_;
  256. uint32_t capacity_;
  257. bool invalid;
  258. void doCreateCEList(RuleBasedCollator& rbc) {
  259. UErrorCode status = U_ZERO_ERROR;
  260. CollationElementIterator* ceIterator = rbc.createCollationElementIterator( ustring_ );
  261. if (!capacity_) {
  262. capacity_ = ustring_.length();
  263. }
  264. ces_ = new uint32_t[capacity_];
  265. uint32_t ce = 0;
  266. do {
  267. ce = ceIterator->next(status);
  268. if ((length_ == capacity_) || (ce == CollationElementIterator::NULLORDER))
  269. break;
  270. ces_[length_++] = ce;
  271. } while (ce != CollationElementIterator::NULLORDER);
  272. delete ceIterator;
  273. if (U_FAILURE(status)) invalid = true;
  274. }
  275. public:
  276. CEList(RuleBasedCollator& rbc, const UnicodeString & source, uint32_t capacity=0)
  277. : length_(0), capacity_(capacity), ustring_(source), invalid(false)
  278. {
  279. doCreateCEList(rbc);
  280. }
  281. ~CEList()
  282. {
  283. delete[] ces_;
  284. }
  285. uint32_t operator[](uint32_t offset)
  286. {
  287. return (offset < length_ )? ces_[offset]:0xffff;
  288. }
  289. uint32_t length() { return length_;}
  290. uint32_t capacity() {return capacity_;}
  291. inline bool isInvalid() const { return invalid; }
  292. };
  // Maps a row index onto one of the two stripes of the edit-distance matrix (0 or 1).
  inline unsigned mask(unsigned x)
  {
      return x % 2u;
  }
  294. unsigned unicodeEditDistanceV2(UnicodeString & left, UnicodeString & right, RuleBasedCollator& rbc)
  295. {
  296. unsigned char i, j;
  297. doTrimRight(left);
  298. doTrimRight(right);
  299. unsigned leftLen = left.length();
  300. unsigned rightLen = right.length();
  301. if (leftLen > 255)
  302. leftLen = 255;
  303. if (rightLen > 255)
  304. rightLen = 255;
  305. if (leftLen == 0)
  306. return rightLen;
  307. if (rightLen == 0)
  308. return leftLen;
  309. CEList leftCEs(rbc, left, leftLen);
  310. CEList rightCEs(rbc, right, rightLen);
  311. if (leftCEs.isInvalid() || rightCEs.isInvalid())
  312. return DISTANCE_ON_ERROR;
  313. leftLen = leftCEs.length();
  314. rightLen = rightCEs.length();
  315. //Optimize the storage requirements by
  316. //i) Only storing two stripes
  317. //ii) Calculate, but don't store the row comparing against the null string
  318. unsigned char da[2][256];
  319. uint32_t r_0 = rightCEs[0];
  320. uint32_t l_0 = leftCEs[0];
  321. bool matched_l0 = false;
  322. for (j = 0; j < rightLen; j++)
  323. {
  324. if (rightCEs[j] == l_0) matched_l0 = true;
  325. da[0][j] = (matched_l0) ? j : j+1;
  326. }
  327. bool matched_r0 = (l_0 == r_0);
  328. for (i = 1; i < leftLen; i++)
  329. {
  330. uint32_t l_i = leftCEs[i];
  331. if (l_i == r_0)
  332. matched_r0 = true;
  333. byte da_i_0 = matched_r0 ? i : i+1;
  334. da[mask(i)][0] = da_i_0;
  335. byte da_i_prevj = da_i_0;
  336. for (j = 1; j < rightLen; j++)
  337. {
  338. uint32_t r_j = rightCEs[j];
  339. unsigned char next = (l_i == r_j) ? da[mask(i-1)][j-1] :
  340. min3(da[mask(i-1)][j], da_i_prevj, da[mask(i-1)][j-1]) + 1;
  341. da[mask(i)][j] = next;
  342. da_i_prevj = next;
  343. }
  344. }
  345. return da[mask(leftLen-1)][rightLen-1];
  346. }
  //This could be further improved in the following ways:
  // * Only use 2*radius bytes of temporary storage - I doubt it is worth it.
  // * Special-case edit distance 1 - you could use variables for the 6 interesting array elements, and get
  //   rid of the array completely. You could also unwind the first (and last) iterations.
  // * I suspect the early exit condition could be improved depending on the lengths of the strings.
  352. unsigned unicodeEditDistanceV3(UnicodeString & left, UnicodeString & right, unsigned radius, RuleBasedCollator& rbc)
  353. {
  354. if (radius >= 255)
  355. return 255;
  356. doTrimRight(left);
  357. doTrimRight(right);
  358. unsigned leftLen = left.length();
  359. unsigned rightLen = right.length();
  360. unsigned minED = (leftLen < rightLen)? rightLen - leftLen: leftLen - rightLen;
  361. if (minED > radius)
  362. return minED;
  363. if (leftLen > 255)
  364. leftLen = 255;
  365. if (rightLen > 255)
  366. rightLen = 255;
  367. //Checking for leading common substrings actually slows the function down.
  368. if (leftLen == 0)
  369. return rightLen;
  370. if (rightLen == 0)
  371. return leftLen;
  372. CEList leftCEs(rbc, left, leftLen);
  373. CEList rightCEs(rbc, right, rightLen);
  374. leftLen = leftCEs.length();
  375. rightLen = rightCEs.length();
  376. /*
  377. This function applies two optimizations over the function above.
  378. a) Adding a charcter (next row) can at most decrease the edit distance by 1, so short circuit when
  379. we there is no possiblity of getting within the distance.
  380. b) We only need to evaluate the martix da[i-radius..i+radius][j-radius..j+radius]
  381. not taking into account values outside that range [can use max value to prevent access]
  382. */
  383. //Optimize the storage requirements by
  384. //i) Only storing two stripes
  385. //ii) Calculate, but don't store the row comparing against the null string
  386. unsigned char da[2][256];
  387. uint32_t r_0 = rightCEs[0];
  388. uint32_t l_0 = leftCEs[0];
  389. bool matched_l0 = false;
  390. for (unsigned char j = 0; j < rightLen; j++)
  391. {
  392. if (rightCEs[j] == l_0) matched_l0 = true;
  393. da[0][j] = (matched_l0) ? j : j+1;
  394. }
  395. bool matched_r0 = (l_0 == r_0);
  396. for (unsigned char i = 1; i < leftLen; i++)
  397. {
  398. uint32_t l_i = leftCEs[i];
  399. if (l_i == r_0)
  400. matched_r0 = true;
  401. byte da_i_0 = matched_r0 ? i : i+1;
  402. da[mask(i)][0] = da_i_0;
  403. byte da_i_prevj = da_i_0;
  404. unsigned low = i-radius;
  405. unsigned high = i+radius;
  406. unsigned first = (i > radius) ? low : 1;
  407. unsigned last = (high >= rightLen) ? rightLen : high +1;
  408. for (unsigned j = first; j < last; j++)
  409. {
  410. uint32_t r_j = rightCEs[j];
  411. unsigned char next = da[mask(i-1)][j-1];
  412. if (l_i != r_j)
  413. {
  414. if (j != low)
  415. {
  416. if (next > da_i_prevj)
  417. next = da_i_prevj;
  418. }
  419. if (j != high)
  420. {
  421. byte da_previ_j = da[mask(i-1)][j];
  422. if (next > da_previ_j)
  423. next = da_previ_j;
  424. }
  425. next++;
  426. }
  427. da[mask(i)][j] = next;
  428. da_i_prevj = next;
  429. }
  430. // bail out early if ed can't possibly be <= radius
  431. // Only considering a strip down the middle of the matrix, so the maximum the score can ever be adjusted is 2xradius
  432. unsigned max_valid_score = 3*radius;
  433. // But maximum is also 1 for every difference in string length - comes in to play when close to the end.
  434. //In 32bit goes slower for radius=1 I suspect because running out of registers. Retest in 64bit.
  435. if (radius > 1)
  436. {
  437. unsigned max_distance = radius + (leftLen - (i+1)) + (rightLen - last);
  438. if (max_valid_score > max_distance)
  439. max_valid_score = max_distance;
  440. }
  441. if (da_i_prevj > max_valid_score)
  442. return da_i_prevj;
  443. }
  444. return da[mask(leftLen-1)][rightLen-1];
  445. }
  446. UnicodeString getNthWord(RuleBasedBreakIterator& bi, UnicodeString const & source, unsigned n)
  447. {
  448. UnicodeString word;
  449. if (!n) return word;
  450. bi.setText(source);
  451. int32_t start = bi.first();
  452. while (start != BreakIterator::DONE && n) {
  453. int breakType = bi.getRuleStatus();
  454. if (breakType != UBRK_WORD_NONE) {
  455. // Exclude spaces, punctuation, and the like.
  456. // A status value UBRK_WORD_NONE indicates that the boundary does
  457. // not start a word or number.
  458. //
  459. n--;
  460. if (!n) {
  461. unsigned wordBegining = bi.preceding(start);
  462. unsigned wordEnd = bi.next();
  463. source.extractBetween(wordBegining, wordEnd, word);
  464. }
  465. }
  466. start = bi.next();
  467. }
  468. return word;
  469. }
  470. unsigned doCountWords(RuleBasedBreakIterator& bi, UnicodeString const & source)
  471. {
  472. bi.setText(source);
  473. int32_t start = bi.first();
  474. int32_t count = 0;
  475. while (start != BreakIterator::DONE) {
  476. int breakType = bi.getRuleStatus();
  477. if (breakType != UBRK_WORD_NONE) {
  478. // Exclude spaces, punctuation, and the like.
  479. // A status value UBRK_WORD_NONE indicates that the boundary does
  480. // not start a word or number.
  481. //
  482. ++count;
  483. }
  484. start = bi.next();
  485. }
  486. return count;
  487. }
  488. }//namespace
  489. using namespace nsUnicodelib;
  490. UNICODELIB_API void setPluginContext(IPluginContext * _ctx) { parentCtx = _ctx; }
  491. UNICODELIB_API void UNICODELIB_CALL ulUnicodeFilterOut(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit)
  492. {
  493. UnicodeString const in(src, srcLen);
  494. UnicodeString const filter(hit, hitLen);
  495. UnicodeString out;
  496. StringCharacterIterator iter(in);
  497. for(iter.first32(); iter.hasNext(); iter.next32())
  498. {
  499. UChar32 c = iter.current32();
  500. if(filter.indexOf(c) == -1)
  501. out.append(c);
  502. }
  503. tgtLen = out.length();
  504. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  505. out.extract(0, tgtLen, tgt);
  506. }
  507. UNICODELIB_API void UNICODELIB_CALL ulUnicodeFilter(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit)
  508. {
  509. UnicodeString const in(src, srcLen);
  510. UnicodeString const filter(hit, hitLen);
  511. UnicodeString out;
  512. StringCharacterIterator iter(in);
  513. for(iter.first32(); iter.hasNext(); iter.next32())
  514. {
  515. UChar32 c = iter.current32();
  516. if(filter.indexOf(c) != -1)
  517. out.append(c);
  518. }
  519. tgtLen = out.length();
  520. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  521. out.extract(0, tgtLen, tgt);
  522. }
  523. UNICODELIB_API void UNICODELIB_CALL ulUnicodeSubsOut(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned newCharLen, UChar const * newChar)
  524. {
  525. UnicodeString out;
  526. if(newCharLen > 0)
  527. {
  528. UnicodeString const in(src, srcLen);
  529. UnicodeString const filter(hit, hitLen);
  530. UnicodeString const replaceString(newChar, newCharLen);
  531. UChar32 replace = replaceString.char32At(0);
  532. StringCharacterIterator iter(in);
  533. for(iter.first32(); iter.hasNext(); iter.next32())
  534. {
  535. UChar32 c = iter.current32();
  536. if(filter.indexOf(c) == -1)
  537. out.append(c);
  538. else
  539. out.append(replace);
  540. }
  541. }
  542. else
  543. out.append(src, srcLen);
  544. tgtLen = out.length();
  545. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  546. out.extract(0, tgtLen, tgt);
  547. }
  548. UNICODELIB_API void UNICODELIB_CALL ulUnicodeSubs(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned newCharLen, UChar const * newChar)
  549. {
  550. UnicodeString out;
  551. if(newCharLen > 0)
  552. {
  553. UnicodeString const in(src, srcLen);
  554. UnicodeString const filter(hit, hitLen);
  555. UnicodeString const replaceString(newChar, newCharLen);
  556. UChar32 replace = replaceString.char32At(0);
  557. StringCharacterIterator iter(in);
  558. for(iter.first32(); iter.hasNext(); iter.next32())
  559. {
  560. UChar32 c = iter.current32();
  561. if(filter.indexOf(c) != -1)
  562. out.append(c);
  563. else
  564. out.append(replace);
  565. }
  566. }
  567. else
  568. out.append(src, srcLen);
  569. tgtLen = out.length();
  570. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  571. out.extract(0, tgtLen, tgt);
  572. }
  573. UNICODELIB_API void UNICODELIB_CALL ulUnicodeRepad(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned tLen)
  574. {
  575. UnicodeString out(src, srcLen);
  576. out.trim();
  577. forceLength(out, tLen);
  578. tgtLen = out.length();
  579. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  580. out.extract(0, tgtLen, tgt);
  581. }
  582. UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeFind(unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned instance)
  583. {
  584. return ulUnicodeLocaleFind(srcLen, src, hitLen, hit, instance, "");
  585. }
  586. UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleFind(unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned instance, char const * localename)
  587. {
  588. UErrorCode error = U_ZERO_ERROR;
  589. UStringSearch * search = usearch_open(hit, hitLen, src, srcLen, localename, 0, &error);
  590. int32_t pos;
  591. for(pos = usearch_first(search, &error); pos != USEARCH_DONE; pos = usearch_next(search, &error))
  592. {
  593. if(!--instance)
  594. {
  595. usearch_close(search);
  596. return pos+1;
  597. }
  598. }
  599. usearch_close(search);
  600. return 0;
  601. }
  602. UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleFindAtStrength(unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned instance, char const * localename, char strength)
  603. {
  604. UnicodeString const source(src, srcLen);
  605. UnicodeString const pattern(hit, hitLen);
  606. UErrorCode error = U_ZERO_ERROR;
  607. Locale locale(localename);
  608. StringSearch search(pattern, source, locale, 0, error);
  609. doModifySearchStrength(search, strength, error);
  610. int32_t pos = search.first(error);
  611. while(pos != USEARCH_DONE)
  612. {
  613. if(!--instance)
  614. return pos+1;
  615. pos = search.next(error);
  616. }
  617. return 0;
  618. }
  619. UNICODELIB_API void UNICODELIB_CALL ulUnicodeExtract(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned instance){
  620. UnicodeString const in(src, srcLen);
  621. UnicodeString out;
  622. if(extract(out, in, instance))
  623. {
  624. tgtLen = out.length();
  625. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  626. out.extract(0, tgtLen, tgt);
  627. }
  628. else
  629. {
  630. tgtLen = 0;
  631. tgt = 0;
  632. }
  633. }
  634. UNICODELIB_API void UNICODELIB_CALL ulUnicodeExtract50(UChar *tgt, unsigned srcLen, UChar const * src, unsigned instance)
  635. {
  636. UnicodeString const in(src, srcLen);
  637. UnicodeString out;
  638. extract(out, in, instance);
  639. forceLength(out, 50);
  640. out.extract(0, 50, tgt);
  641. }
  642. UNICODELIB_API void UNICODELIB_CALL ulUnicodeToLowerCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
  643. {
  644. UnicodeString unicode(src, srcLen);
  645. unicode.toLower();
  646. tgtLen = unicode.length();
  647. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  648. unicode.extract(0, tgtLen, tgt);
  649. }
  650. UNICODELIB_API void UNICODELIB_CALL ulUnicodeToUpperCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
  651. {
  652. UnicodeString unicode(src, srcLen);
  653. unicode.toUpper();
  654. tgtLen = unicode.length();
  655. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  656. unicode.extract(0, tgtLen, tgt);
  657. }
  658. UNICODELIB_API void UNICODELIB_CALL ulUnicodeToProperCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
  659. {
  660. UnicodeString unicode(src, srcLen);
  661. unicode.toTitle(0);
  662. tgtLen = unicode.length();
  663. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  664. unicode.extract(0, tgtLen, tgt);
  665. }
  666. UNICODELIB_API void UNICODELIB_CALL ulUnicodeToLowerCase80(UChar * tgt, unsigned srcLen, UChar const * src)
  667. {
  668. UnicodeString unicode(src, srcLen);
  669. unicode.toLower();
  670. forceLength(unicode, 80);
  671. unicode.extract(0, 80, tgt);
  672. }
  673. UNICODELIB_API void UNICODELIB_CALL ulUnicodeToUpperCase80(UChar * tgt, unsigned srcLen, UChar const * src)
  674. {
  675. UnicodeString unicode(src, srcLen);
  676. unicode.toUpper();
  677. forceLength(unicode, 80);
  678. unicode.extract(0, 80, tgt);
  679. }
  680. UNICODELIB_API void UNICODELIB_CALL ulUnicodeToProperCase80(UChar * tgt, unsigned srcLen, UChar const * src)
  681. {
  682. UnicodeString unicode(src, srcLen);
  683. unicode.toTitle(0);
  684. forceLength(unicode, 80);
  685. unicode.extract(0, 80, tgt);
  686. }
  687. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToLowerCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, char const * localename)
  688. {
  689. UnicodeString unicode(src, srcLen);
  690. Locale locale(localename);
  691. unicode.toLower(locale);
  692. tgtLen = unicode.length();
  693. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  694. unicode.extract(0, tgtLen, tgt);
  695. }
  696. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToUpperCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, char const * localename)
  697. {
  698. UnicodeString unicode(src, srcLen);
  699. Locale locale(localename);
  700. unicode.toUpper(locale);
  701. tgtLen = unicode.length();
  702. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  703. unicode.extract(0, tgtLen, tgt);
  704. }
  705. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToProperCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, char const * localename)
  706. {
  707. UnicodeString unicode(src, srcLen);
  708. Locale locale(localename);
  709. unicode.toTitle(0, locale);
  710. tgtLen = unicode.length();
  711. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  712. unicode.extract(0, tgtLen, tgt);
  713. }
  714. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToLowerCase80(UChar * tgt, unsigned srcLen, UChar const * src, char const * localename)
  715. {
  716. UnicodeString unicode(src, srcLen);
  717. Locale locale(localename);
  718. unicode.toLower(locale);
  719. forceLength(unicode, 80);
  720. unicode.extract(0, 80, tgt);
  721. }
  722. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToUpperCase80(UChar * tgt, unsigned srcLen, UChar const * src, char const * localename)
  723. {
  724. UnicodeString unicode(src, srcLen);
  725. Locale locale(localename);
  726. unicode.toUpper(locale);
  727. forceLength(unicode, 80);
  728. unicode.extract(0, 80, tgt);
  729. }
  730. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToProperCase80(UChar * tgt, unsigned srcLen, UChar const * src, char const * localename)
  731. {
  732. UnicodeString unicode(src, srcLen);
  733. Locale locale(localename);
  734. unicode.toTitle(0, locale);
  735. forceLength(unicode, 80);
  736. unicode.extract(0, 80, tgt);
  737. }
  738. UNICODELIB_API int UNICODELIB_CALL ulUnicodeCompareIgnoreCase(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2)
  739. {
  740. return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::SECONDARY);
  741. }
  742. UNICODELIB_API int UNICODELIB_CALL ulUnicodeCompareAtStrength(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2, char strength)
  743. {
  744. switch(strength)
  745. {
  746. case 1:
  747. return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::PRIMARY);
  748. case 2:
  749. return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::SECONDARY);
  750. case 3:
  751. return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::TERTIARY);
  752. case 4:
  753. return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::QUATERNARY);
  754. case 5:
  755. default:
  756. return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::IDENTICAL);
  757. }
  758. }
  759. UNICODELIB_API int UNICODELIB_CALL ulUnicodeLocaleCompareIgnoreCase(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2, char const * localename)
  760. {
  761. return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::SECONDARY);
  762. }
  763. UNICODELIB_API int UNICODELIB_CALL ulUnicodeLocaleCompareAtStrength(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2, char const * localename, char strength)
  764. {
  765. switch(strength)
  766. {
  767. case 1:
  768. return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::PRIMARY);
  769. case 2:
  770. return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::SECONDARY);
  771. case 3:
  772. return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::TERTIARY);
  773. case 4:
  774. return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::QUATERNARY);
  775. case 5:
  776. default:
  777. return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::IDENTICAL);
  778. }
  779. }
  780. UNICODELIB_API void UNICODELIB_CALL ulUnicodeReverse(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
  781. {
  782. UnicodeString in(src, srcLen);
  783. UnicodeString out;
  784. StringCharacterIterator iter(in);
  785. for(iter.last32(); iter.hasPrevious(); iter.previous32())
  786. out.append(iter.current32());
  787. if(srcLen) out.append(iter.current32());
  788. tgtLen = out.length();
  789. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  790. out.extract(0, tgtLen, tgt);
  791. }
  792. UNICODELIB_API void UNICODELIB_CALL ulUnicodeFindReplace(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok)
  793. {
  794. UnicodeString source(src, srcLen);
  795. UnicodeString const pattern(stok, stokLen);
  796. UnicodeString const replace(rtok, rtokLen);
  797. source.findAndReplace(pattern, replace);
  798. tgtLen = source.length();
  799. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  800. source.extract(0, tgtLen, tgt);
  801. }
  802. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleFindReplace(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok, char const * localename)
  803. {
  804. UnicodeString source(src, srcLen);
  805. UnicodeString const pattern(stok, stokLen);
  806. UnicodeString const replace(rtok, rtokLen);
  807. doUnicodeLocaleFindReplace(source, pattern, replace, localename);
  808. tgtLen = source.length();
  809. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  810. source.extract(0, tgtLen, tgt);
  811. }
  812. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleFindAtStrengthReplace(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok, char const * localename, char strength)
  813. {
  814. UnicodeString source(src, srcLen);
  815. UnicodeString const pattern(stok, stokLen);
  816. UnicodeString const replace(rtok, rtokLen);
  817. doUnicodeLocaleFindAtStrengthReplace(source, pattern, replace, localename, strength);
  818. tgtLen = source.length();
  819. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  820. source.extract(0, tgtLen, tgt);
  821. }
  822. UNICODELIB_API void UNICODELIB_CALL ulUnicodeFindReplace80(UChar * tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok)
  823. {
  824. UnicodeString source(src, srcLen);
  825. UnicodeString const pattern(stok, stokLen);
  826. UnicodeString const replace(rtok, rtokLen);
  827. source.findAndReplace(pattern, replace);
  828. forceLength(source, 80);
  829. source.extract(0, 80, tgt);
  830. }
  831. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleFindReplace80(UChar * tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok, char const * localename)
  832. {
  833. UnicodeString source(src, srcLen);
  834. UnicodeString const pattern(stok, stokLen);
  835. UnicodeString const replace(rtok, rtokLen);
  836. doUnicodeLocaleFindReplace(source, pattern, replace, localename);
  837. forceLength(source, 80);
  838. source.extract(0, 80, tgt);
  839. }
  840. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleFindAtStrengthReplace80(UChar * tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok, char const * localename, char strength)
  841. {
  842. UnicodeString source(src, srcLen);
  843. UnicodeString const pattern(stok, stokLen);
  844. UnicodeString const replace(rtok, rtokLen);
  845. doUnicodeLocaleFindAtStrengthReplace(source, pattern, replace, localename, strength);
  846. forceLength(source, 80);
  847. source.extract(0, 80, tgt);
  848. }
  849. UNICODELIB_API void UNICODELIB_CALL ulUnicodeCleanSpaces(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
  850. {
  851. UnicodeString source(src, srcLen);
  852. doUnicodeCleanSpaces(source);
  853. tgtLen = source.length();
  854. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  855. source.extract(0, tgtLen, tgt);
  856. }
  857. UNICODELIB_API void UNICODELIB_CALL ulUnicodeCleanSpaces25(UChar * tgt, unsigned srcLen, UChar const * src)
  858. {
  859. UnicodeString source(src, srcLen);
  860. doUnicodeCleanSpaces(source);
  861. forceLength(source, 25);
  862. source.extract(0, 25, tgt);
  863. }
  864. UNICODELIB_API void UNICODELIB_CALL ulUnicodeCleanSpaces80(UChar * tgt, unsigned srcLen, UChar const * src)
  865. {
  866. UnicodeString source(src, srcLen);
  867. doUnicodeCleanSpaces(source);
  868. forceLength(source, 80);
  869. source.extract(0, 80, tgt);
  870. }
  871. UNICODELIB_API bool UNICODELIB_CALL ulUnicodeWildMatch(unsigned srcLen, UChar const * src, unsigned patLen, UChar const * pat, bool noCase)
  872. {
  873. return wildTrimMatch<UChar, u16toupper, u16query, u16asterisk, u16space>(src, srcLen, pat, patLen, noCase);
  874. }
  875. UNICODELIB_API bool UNICODELIB_CALL ulUnicodeContains(unsigned srcLen, UChar const * src, unsigned patLen, UChar const * pat, bool noCase)
  876. {
  877. UnicodeString source(src, srcLen);
  878. UnicodeString pattern(pat, patLen);
  879. if(noCase)
  880. {
  881. source.foldCase();
  882. pattern.foldCase();
  883. }
  884. StringCharacterIterator iter(pattern);
  885. for(iter.first32(); iter.hasNext(); iter.next32())
  886. if(source.indexOf(iter.current32()) == -1)
  887. return false;
  888. return true;
  889. }
  890. UNICODELIB_API void UNICODELIB_CALL ulUnicodeCleanAccents(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
  891. {
  892. if (!deAccenter)
  893. {
  894. CriticalBlock b(accenterCrit);
  895. if (!deAccenter)
  896. {
  897. UErrorCode lStatus = U_ZERO_ERROR;
  898. deAccenter = icu::Transliterator::createInstance("NFD; [:M:] Remove; NFC;", UTRANS_FORWARD, lStatus);
  899. }
  900. }
  901. UnicodeString source(src, srcLen);
  902. deAccenter->transliterate(source);
  903. tgtLen = source.length();
  904. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  905. source.extract(0, tgtLen, tgt);
  906. }
  907. static RuleBasedCollator * createRBCollator(const char * localename)
  908. {
  909. UErrorCode status = U_ZERO_ERROR;
  910. Locale locale(localename);
  911. RuleBasedCollator * rbc = (RuleBasedCollator *)RuleBasedCollator::createInstance(locale, status);
  912. rbc->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
  913. if (U_FAILURE(status))
  914. {
  915. delete rbc;
  916. return NULL;
  917. }
  918. return rbc;
  919. }
  920. class RBCLocale
  921. {
  922. public:
  923. RBCLocale(char const * _locale) : locale(_locale)
  924. {
  925. rbc = createRBCollator(locale);
  926. }
  927. ~RBCLocale()
  928. {
  929. delete rbc;
  930. }
  931. RuleBasedCollator * queryCollator() const { return rbc; }
  932. private:
  933. StringAttr locale;
  934. RuleBasedCollator * rbc;
  935. };
  936. typedef MapStringTo<RBCLocale, char const *> MapStrToRBC;
  937. static MapStrToRBC * localeMap;
  938. static CriticalSection localeCrit;
  939. MODULE_INIT(INIT_PRIORITY_STANDARD)
  940. {
  941. return true;
  942. }
  943. MODULE_EXIT()
  944. {
  945. delete localeMap;
  946. localeMap = NULL;
  947. }
  948. static RuleBasedCollator * queryRBCollator(const char * localename)
  949. {
  950. if (!localename) localename = "";
  951. CriticalBlock b(localeCrit);
  952. if (!localeMap)
  953. localeMap = new MapStrToRBC;
  954. RBCLocale * loc = localeMap->getValue(localename);
  955. if(!loc)
  956. {
  957. //MORE: ECLRTL calls rtlGetNormalizedUnicodeLocaleName(). Should this be happening here?
  958. const char * normalizedlocale = localename;
  959. localeMap->setValue(localename, normalizedlocale);
  960. loc = localeMap->getValue(localename);
  961. }
  962. return loc->queryCollator();
  963. }
  964. UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleEditDistance(unsigned leftLen, UChar const * left, unsigned rightLen, UChar const * right, char const * localename)
  965. {
  966. RuleBasedCollator* rbc = queryRBCollator(localename);
  967. if (!rbc)
  968. return DISTANCE_ON_ERROR;
  969. UnicodeString uLeft(left, leftLen);
  970. UnicodeString uRight(right, rightLen);
  971. unsigned distance = nsUnicodelib::unicodeEditDistanceV2(uLeft, uRight, *rbc);
  972. return distance;
  973. }
  974. UNICODELIB_API bool UNICODELIB_CALL ulUnicodeLocaleEditDistanceWithinRadius(unsigned leftLen, UChar const * left, unsigned rightLen, UChar const * right, unsigned radius, char const * localename)
  975. {
  976. RuleBasedCollator* rbc = queryRBCollator(localename);
  977. if (!rbc)
  978. return false;
  979. UnicodeString uLeft(left, leftLen);
  980. UnicodeString uRight(right, rightLen);
  981. unsigned distance = nsUnicodelib::unicodeEditDistanceV3(uLeft, uRight, radius, *rbc);
  982. return distance <= radius;
  983. }
  984. UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleWordCount(unsigned textLen, UChar const * text, char const * localename)
  985. {
  986. UErrorCode status = U_ZERO_ERROR;
  987. Locale locale(localename);
  988. RuleBasedBreakIterator* bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(locale, status);
  989. UnicodeString uText(text, textLen);
  990. uText.trim();
  991. unsigned count = doCountWords(*bi, uText);
  992. delete bi;
  993. return count;
  994. }
  995. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleGetNthWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned n, char const * localename)
  996. {
  997. UErrorCode status = U_ZERO_ERROR;
  998. Locale locale(localename);
  999. RuleBasedBreakIterator* bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(locale, status);
  1000. UnicodeString uText(text, textLen);
  1001. uText.trim();
  1002. UnicodeString word = getNthWord(*bi, uText, n);
  1003. if(word.length()>0)
  1004. {
  1005. tgtLen = word.length();
  1006. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1007. word.extract(0, tgtLen, tgt);
  1008. }
  1009. else
  1010. {
  1011. tgtLen = 0;
  1012. tgt = 0;
  1013. }
  1014. }