unicodelib.cpp 61 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #include "jlib.hpp"
  14. #include "jsem.hpp"
  15. #include <string.h>
  16. #include "unicodelib.hpp"
  17. #include "unicode/usearch.h"
  18. #include "unicode/schriter.h"
  19. #include "unicode/locid.h"
  20. #include "unicode/coll.h"
  21. #include "unicode/stsearch.h"
  22. #include "unicode/translit.h"
  23. #include "unicode/rbbi.h"
  24. #include "../stringlib/wildmatch.tpp"
  25. #define UNICODELIB_VERSION "UNICODELIB 1.1.06"
  26. static UChar32 const u32comma = ',';
  27. static UChar32 const u32space = ' ';
  28. static UChar const u16asterisk = '*';
  29. static UChar const u16query = '?';
  30. static UChar const u16space = ' ';
  31. static const char * EclDefinition =
  32. "export UnicodeLib := SERVICE:fold\n"
  33. " unicode UnicodeFilterOut(const unicode src, const unicode _within) : c, pure,entrypoint='ulUnicodeFilterOut'; \n"
  34. " unicode UnicodeFilter(const unicode src, const unicode _within) : c, pure,entrypoint='ulUnicodeFilter'; \n"
  35. " unicode UnicodeSubstituteOut(const unicode src, const unicode _within, const unicode _newchar) : c, pure,entrypoint='ulUnicodeSubsOut'; \n"
  36. " unicode UnicodeSubstitute(const unicode src, const unicode _within, const unicode _newchar) : c, pure,entrypoint='ulUnicodeSubs'; \n"
  37. " unicode UnicodeRepad(const unicode src, unsigned4 size) : c, pure,entrypoint='ulUnicodeRepad'; \n"
  38. " unsigned integer4 UnicodeFind(const unicode src, const unicode tofind, unsigned4 instance) : c, pure,entrypoint='ulUnicodeFind', hole; \n"
  39. " unsigned integer4 UnicodeLocaleFind(const unicode src, const unicode tofind, unsigned4 instance, const varstring localename) : c, pure,entrypoint='ulUnicodeLocaleFind', hole; \n"
  40. " unsigned integer4 UnicodeLocaleFindAtStrength(const unicode src, const unicode tofind, unsigned4 instance, const varstring localename, integer1 strength) : c, pure,entrypoint='ulUnicodeLocaleFindAtStrength', hole; \n"
  41. " unicode UnicodeExtract(const unicode src, unsigned4 instance) : c,pure,entrypoint='ulUnicodeExtract'; \n"
  42. " unicode50 UnicodeExtract50(const unicode src, unsigned4 instance) : c,pure,entrypoint='ulUnicodeExtract50', hole; \n"
  43. " unicode UnicodeToLowerCase(const unicode src) : c,pure,entrypoint='ulUnicodeToLowerCase';\n"
  44. " unicode UnicodeToUpperCase(const unicode src) : c,pure,entrypoint='ulUnicodeToUpperCase';\n"
  45. " unicode UnicodeToProperCase(const unicode src) : c,pure,entrypoint='ulUnicodeToProperCase';\n"
  46. " unicode80 UnicodeToLowerCase80(const unicode src) : c,pure,entrypoint='ulUnicodeToLowerCase80', hole;\n"
  47. " unicode80 UnicodeToUpperCase80(const unicode src) : c,pure,entrypoint='ulUnicodeToUpperCase80', hole;\n"
  48. " unicode80 UnicodeToProperCase80(const unicode src) : c,pure,entrypoint='ulUnicodeToProperCase80', hole;\n"
  49. " unicode UnicodeLocaleToLowerCase(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToLowerCase';\n"
  50. " unicode UnicodeLocaleToUpperCase(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToUpperCase';\n"
  51. " unicode UnicodeLocaleToProperCase(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToProperCase';\n"
  52. " unicode80 UnicodeLocaleToLowerCase80(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToLowerCase80', hole;\n"
  53. " unicode80 UnicodeLocaleToUpperCase80(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToUpperCase80', hole;\n"
  54. " unicode80 UnicodeLocaleToProperCase80(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToProperCase80', hole;\n"
  55. " integer4 UnicodeCompareIgnoreCase(const unicode src1, const unicode src2) : c,pure,entrypoint='ulUnicodeCompareIgnoreCase', hole;\n"
  56. " integer4 UnicodeCompareAtStrength(const unicode src1, const unicode src2, integer1 strength) : c,pure,entrypoint='ulUnicodeCompareAtStrength', hole;\n"
  57. " integer4 UnicodeLocaleCompareIgnoreCase(const unicode src1, const unicode src2, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleCompareIgnoreCase', hole;\n"
  58. " integer4 UnicodeLocaleCompareAtStrength(const unicode src1, const unicode src2, const varstring localename, integer1 strength) : c,pure,entrypoint='ulUnicodeLocaleCompareAtStrength', hole;\n"
  59. " unicode UnicodeReverse(const unicode src) : c,pure,entrypoint='ulUnicodeReverse';\n"
  60. " unicode UnicodeFindReplace(const unicode src, const unicode stok, const unicode rtok) : c,pure,entrypoint='ulUnicodeFindReplace';\n"
  61. " unicode UnicodeLocaleFindReplace(const unicode src, const unicode stok, const unicode rtok, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleFindReplace';\n"
  62. " unicode UnicodeLocaleFindAtStrengthReplace(const unicode src, const unicode stok, const unicode rtok, const varstring localename, integer1 strength) : c,pure,entrypoint='ulUnicodeLocaleFindAtStrengthReplace';\n"
  63. " unicode80 UnicodeFindReplace80(const unicode src, const unicode stok, const unicode rtok) : c,pure,entrypoint='ulUnicodeFindReplace80', hole;\n"
  64. " unicode80 UnicodeLocaleFindReplace80(const unicode src, const unicode stok, const unicode rtok, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleFindReplace80',hole;\n"
  65. " unicode80 UnicodeLocaleFindAtStrengthReplace80(const unicode src, const unicode stok, const unicode rtok, const varstring localename, integer1 strength) : c,pure,entrypoint='ulUnicodeLocaleFindAtStrengthReplace80',hole;\n"
  66. " unicode UnicodeCleanAccents(const unicode src) : c,pure,entrypoint='ulUnicodeCleanAccents'; \n"
  67. " unicode UnicodeCleanSpaces(const unicode src) : c,pure,entrypoint='ulUnicodeCleanSpaces'; \n"
  68. " unicode25 UnicodeCleanSpaces25(const unicode src) : c,pure,entrypoint='ulUnicodeCleanSpaces25', hole; \n"
  69. " unicode80 UnicodeCleanSpaces80(const unicode src) : c,pure,entrypoint='ulUnicodeCleanSpaces80', hole; \n"
  70. " boolean UnicodeWildMatch(const unicode src, const unicode _pattern, boolean _noCase) : c, pure,entrypoint='ulUnicodeWildMatch', hole; \n"
  71. " boolean UnicodeContains(const unicode src, const unicode _pattern, boolean _noCase) : c, pure,entrypoint='ulUnicodeContains', hole; \n"
  72. " unsigned4 UnicodeLocaleEditDistance(const unicode left, const unicode right, const varstring localename) : c,time,pure,entrypoint='ulUnicodeLocaleEditDistance', hole; \n"
  73. " boolean UnicodeLocaleEditDistanceWithinRadius(const unicode left, const unicode right, unsigned4 radius, const varstring localename) : c,time,pure,entrypoint='ulUnicodeLocaleEditDistanceWithinRadius', hole; \n"
  74. " unsigned4 UnicodeLocaleWordCount(const unicode text, const varstring localename) : c, pure,entrypoint='ulUnicodeLocaleWordCount', hole; \n"
  75. " unicode UnicodeLocaleGetNthWord(const unicode text, unsigned4 n, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleGetNthWord';\n"
  76. " unicode UnicodeLocaleExcludeNthWord(const unicode text, unsigned4 n, const varstring localename) :c,pure,entrypoint='ulUnicodeLocaleExcludeNthWord';\n"
  77. " unicode UnicodeLocaleExcludeLastWord(const unicode text, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleExcludeLastWord';\n"
  78. " unicode UnicodeLocaleTranslate(const unicode text, unicode sear, unicode repl) :c,pure,entrypoint='ulUnicodeLocaleTranslate';\n"
  79. " boolean UnicodeLocaleStartsWith(const unicode src, unicode pref, string form) :c,pure,entrypoint='ulUnicodeLocaleStartsWith';\n"
  80. " boolean UnicodeLocaleEndsWith(const unicode src, const unicode suff, const string form) :c,pure,entrypoint='ulUnicodeLocaleEndsWith';\n"
  81. " string UnicodeVersion():c,pure,entrypoint='ulUnicodeVersion';\n"
  82. "END;\n";
  83. static const char * compatibleVersions[] = {
  84. "UNICODELIB 1.1.01 [64d78857c1cecae15bd238cd7767b3c1]",
  85. "UNICODELIB 1.1.01 [e8790fe30d9627997749c3c4839b5957]",
  86. "UNICODELIB 1.1.02",
  87. "UNICODELIB 1.1.03",
  88. "UNICODELIB 1.1.04",
  89. "UNICODELIB 1.1.05",
  90. NULL };
  91. UNICODELIB_API bool getECLPluginDefinition(ECLPluginDefinitionBlock *pb)
  92. {
  93. if (pb->size == sizeof(ECLPluginDefinitionBlockEx))
  94. {
  95. ECLPluginDefinitionBlockEx * pbx = (ECLPluginDefinitionBlockEx *) pb;
  96. pbx->compatibleVersions = compatibleVersions;
  97. }
  98. else if (pb->size != sizeof(ECLPluginDefinitionBlock))
  99. return false;
  100. pb->magicVersion = PLUGIN_VERSION;
  101. pb->version = UNICODELIB_VERSION;
  102. pb->moduleName = "lib_unicodelib";
  103. pb->ECL = EclDefinition;
  104. pb->flags = PLUGIN_IMPLICIT_MODULE | PLUGIN_MULTIPLE_VERSIONS;
  105. pb->description = "UnicodeLib unicode string manipulation library";
  106. return true;
  107. }
  108. namespace nsUnicodelib {
  109. IPluginContext * parentCtx = NULL;
  110. void doTrimRight(UnicodeString & source)
  111. {
  112. int32_t oldLength = source.length();
  113. if (!oldLength)
  114. return;
  115. int32_t currentLength = oldLength;
  116. bool uSpace = true;
  117. do {
  118. UChar32 c = source[--currentLength];
  119. if(!(c == 0x20 || u_isWhitespace(c))) {
  120. currentLength++;
  121. uSpace = false;
  122. }
  123. } while (uSpace && currentLength>0);
  124. if (currentLength < oldLength) {
  125. source.truncate(currentLength);
  126. }
  127. }
  128. void forceLength(UnicodeString & str, int32_t len)
  129. {
  130. if(str.length()>len)
  131. str.truncate(len);
  132. else if(str.length()<len)
  133. str.padTrailing(len);
  134. }
  135. void doModifySearchStrength(StringSearch & search, char strength, UErrorCode & error)
  136. {
  137. RuleBasedCollator * coll = search.getCollator();
  138. switch(strength)
  139. {
  140. case 1:
  141. coll->setStrength(Collator::PRIMARY);
  142. break;
  143. case 2:
  144. coll->setStrength(Collator::SECONDARY);
  145. break;
  146. case 3:
  147. coll->setStrength(Collator::TERTIARY);
  148. break;
  149. case 4:
  150. coll->setStrength(Collator::QUATERNARY);
  151. break;
  152. case 5:
  153. default:
  154. coll->setStrength(Collator::IDENTICAL);
  155. }
  156. search.setCollator(coll, error);
  157. }
  158. bool extract(UnicodeString & out, UnicodeString const & in, unsigned instance)
  159. {
  160. if(!instance) return false;
  161. int32_t start = 0;
  162. while(--instance)
  163. {
  164. start = in.indexOf(u32comma, start);
  165. if(start == -1) return false;
  166. start++;
  167. }
  168. int32_t end = in.indexOf(u32comma, start);
  169. if(end == -1)
  170. end = in.length();
  171. out.append(in, start, end-start);
  172. return true;
  173. }
  174. int doUnicodeCompareAtStrength(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2, Collator::ECollationStrength strength)
  175. {
  176. UErrorCode error = U_ZERO_ERROR;
  177. Collator * coll = Collator::createInstance(error);
  178. coll->setStrength(strength);
  179. Collator::EComparisonResult ret = coll->compare(src1, src1Len, src2, src2Len);
  180. delete coll;
  181. return ret;
  182. }
  183. int doUnicodeLocaleCompareAtStrength(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2, char const * localename, Collator::ECollationStrength strength)
  184. {
  185. UErrorCode error = U_ZERO_ERROR;
  186. Locale locale(localename);
  187. Collator * coll = Collator::createInstance(locale, error);
  188. coll->setStrength(strength);
  189. Collator::EComparisonResult ret = coll->compare(src1, src1Len, src2, src2Len);
  190. delete coll;
  191. return ret;
  192. }
  193. void doUnicodeLocaleFindReplace(UnicodeString & source, UnicodeString const & pattern, UnicodeString const & replace, char const * localename)
  194. {
  195. UErrorCode error = U_ZERO_ERROR;
  196. Locale locale(localename);
  197. StringSearch search(pattern, source, locale, 0, error);
  198. int32_t pos = search.first(error);
  199. while(pos != USEARCH_DONE)
  200. {
  201. source.replace(pos, search.getMatchedLength(), replace);
  202. search.setText(source, error);
  203. search.setOffset(pos+replace.length(), error);
  204. pos = search.next(error);
  205. }
  206. }
  207. void doUnicodeLocaleFindAtStrengthReplace(UnicodeString & source, UnicodeString const & pattern, UnicodeString const & replace, char const * localename, char strength)
  208. {
  209. UErrorCode error = U_ZERO_ERROR;
  210. Locale locale(localename);
  211. StringSearch search(pattern, source, locale, 0, error);
  212. doModifySearchStrength(search, strength, error);
  213. int32_t pos = search.first(error);
  214. while(pos != USEARCH_DONE)
  215. {
  216. source.replace(pos, search.getMatchedLength(), replace);
  217. search.setText(source, error);
  218. search.setOffset(pos+replace.length(), error);
  219. pos = search.next(error);
  220. }
  221. }
  222. void doUnicodeCleanSpaces(UnicodeString & source)
  223. {
  224. int32_t srclen;
  225. int32_t pos = source.indexOf(u32space);
  226. int32_t endpos;
  227. int32_t spacelen;
  228. while(pos != -1)
  229. {
  230. srclen = source.length();
  231. for(endpos=pos; endpos<srclen; endpos++)
  232. if(source.charAt(endpos)!=u32space) break;
  233. spacelen = endpos-pos;
  234. if((pos>0) && (endpos<srclen)) spacelen--;
  235. if(spacelen>0) source.remove(pos, spacelen);
  236. pos = source.indexOf(u32space, pos+1);
  237. }
  238. }
  239. /*
  240. N.B. To do 'real' case-insensitive matching we should use full stringwise casefolding on the source and pattern. The simple char-by-char toupper approach has problems with unicode. For example, some chars uppercase to multiple chars (e.g. the German 'sharp s' uppercases to 'SS'). See http://icu-project.org/userguide/posix.html#case_mappings for more examples. Furthermore, converting as 16-bit code units does not work when code points from U+10000 upwards are involved. Nevertheless, we use the simple char-by-char toupper approach for the UnicodeWildMatch function, because it is intended as a high-speed function. For accurate case-folding, you should either use the UnicodeToUpperCase function explicitly on the arguments or use REGEXFIND.
  241. */
  242. inline UChar u16toupper(UChar c)
  243. {
  244. UChar32 o = u_toupper(c);
  245. return U_IS_SUPPLEMENTARY(o) ? c : (UChar)o;
  246. }
  247. static icu::Transliterator* deAccenter = NULL;
  248. static CriticalSection accenterCrit;
  249. inline unsigned char min3(unsigned char a, unsigned char b, unsigned char c)
  250. {
  251. unsigned char min = (a<b)? a:b;
  252. return (min<c)? min:c;
  253. }
  254. // returns the length of Unicode Code Point in 16-bit Code Units
  255. inline int ucpLength(UChar32 c)
  256. {
  257. return U16_IS_SINGLE(c)?1:2;
  258. }
  259. #define DISTANCE_ON_ERROR 999
  260. class UPCList // User perceived character list
  261. {
  262. private:
  263. UnicodeString ustring_;
  264. uint32_t* next_;
  265. uint32_t length_;
  266. uint32_t capacity_;
  267. bool invalid_;
  268. void doCreateUPCList(BreakIterator& cbi) {
  269. UErrorCode status = U_ZERO_ERROR;
  270. if (!capacity_) {
  271. capacity_ = ustring_.length();
  272. }
  273. next_ = new uint32_t[capacity_+1]; // the number of characters is always less or equal to the string length
  274. unsigned index=0;
  275. cbi.setText(ustring_);
  276. next_[index] = cbi.first();
  277. for (int32_t end = cbi.next(); end != BreakIterator::DONE && length_ < capacity_; end = cbi.next())
  278. {
  279. length_++;
  280. next_[++index]=end;
  281. }
  282. if (U_FAILURE(status)) { length_ = 0; capacity_ = 0; invalid_ = true; }
  283. }
  284. void doCreateUPCList() {
  285. if (!capacity_) {
  286. capacity_ = ustring_.length();
  287. }
  288. next_ = new uint32_t[capacity_+1]; // the number of characters is always less or equal to the string length
  289. unsigned index=0;
  290. next_[index] = 0;
  291. int32_t end = 0;
  292. while (end < capacity_)
  293. {
  294. end = end+ucpLength(ustring_[end]);
  295. next_[++index] = end;
  296. }
  297. length_ = index;
  298. }
  299. public:
  300. UPCList(BreakIterator* cbi, const UnicodeString & source, uint32_t capacity=0)
  301. : length_(0), capacity_(capacity),ustring_(source), invalid_(false)
  302. {
  303. !cbi?doCreateUPCList():doCreateUPCList(*cbi);
  304. }
  305. ~UPCList()
  306. {
  307. delete[] next_;
  308. }
  309. uint32_t charOffset(uint32_t index) const
  310. {
  311. return (index < length_ )? next_[index]:0;
  312. }
  313. uint32_t charLength(uint32_t index) const
  314. {
  315. return (index < length_ )? next_[index+1]-next_[index]:0;
  316. }
  317. bool equal(uint32_t index, const UPCList& srcText, uint32_t srcIndex) const
  318. {
  319. uint32_t lLen = charLength(index);
  320. uint32_t rLen = srcText.charLength(srcIndex);
  321. if ( lLen != rLen )
  322. return false;
  323. UChar lChar,rChar;
  324. for (unsigned i=0; i < lLen; i++)
  325. {
  326. lChar = ustring_[charOffset(index)+i];
  327. rChar = srcText.getString()[srcText.charOffset(srcIndex)+i];
  328. if (lChar != rChar)
  329. return false;
  330. }
  331. return true;
  332. }
  333. const UnicodeString& getString() const {return ustring_;}
  334. uint32_t length() const { return length_;}
  335. uint32_t capacity() const {return capacity_;}
  336. inline bool isInvalid() const { return invalid_; }
  337. };
  338. class CEList
  339. {
  340. private:
  341. UnicodeString ustring_;
  342. uint32_t* ces_;
  343. uint32_t length_;
  344. uint32_t capacity_;
  345. bool invalid;
  346. void doCreateCEList(RuleBasedCollator& rbc) {
  347. UErrorCode status = U_ZERO_ERROR;
  348. CollationElementIterator* ceIterator = rbc.createCollationElementIterator( ustring_ );
  349. if (!capacity_) {
  350. capacity_ = ustring_.length();
  351. }
  352. ces_ = new uint32_t[capacity_];
  353. uint32_t ce = 0;
  354. do {
  355. ce = ceIterator->next(status);
  356. if ((length_ == capacity_) || (ce == CollationElementIterator::NULLORDER))
  357. break;
  358. ces_[length_++] = ce;
  359. } while (ce != CollationElementIterator::NULLORDER);
  360. delete ceIterator;
  361. if (U_FAILURE(status)) invalid = true;
  362. }
  363. public:
  364. CEList(RuleBasedCollator& rbc, const UnicodeString & source, uint32_t capacity=0)
  365. : length_(0), capacity_(capacity), ustring_(source), invalid(false)
  366. {
  367. doCreateCEList(rbc);
  368. }
  369. ~CEList()
  370. {
  371. delete[] ces_;
  372. }
  373. uint32_t operator[](uint32_t offset)
  374. {
  375. return (offset < length_ )? ces_[offset]:0xffff;
  376. }
  377. uint32_t length() { return length_;}
  378. uint32_t capacity() {return capacity_;}
  379. inline bool isInvalid() const { return invalid; }
  380. };
  381. inline unsigned mask(unsigned x) { return x & 1; }
  382. unsigned unicodeEditDistanceV2(UnicodeString & left, UnicodeString & right, RuleBasedCollator& rbc)
  383. {
  384. unsigned char i, j;
  385. doTrimRight(left);
  386. doTrimRight(right);
  387. unsigned leftLen = left.length();
  388. unsigned rightLen = right.length();
  389. if (leftLen > 255)
  390. leftLen = 255;
  391. if (rightLen > 255)
  392. rightLen = 255;
  393. if (leftLen == 0)
  394. return rightLen;
  395. if (rightLen == 0)
  396. return leftLen;
  397. CEList leftCEs(rbc, left, leftLen);
  398. CEList rightCEs(rbc, right, rightLen);
  399. if (leftCEs.isInvalid() || rightCEs.isInvalid())
  400. return DISTANCE_ON_ERROR;
  401. leftLen = leftCEs.length();
  402. rightLen = rightCEs.length();
  403. //Optimize the storage requirements by
  404. //i) Only storing two stripes
  405. //ii) Calculate, but don't store the row comparing against the null string
  406. unsigned char da[2][256];
  407. uint32_t r_0 = rightCEs[0];
  408. uint32_t l_0 = leftCEs[0];
  409. bool matched_l0 = false;
  410. for (j = 0; j < rightLen; j++)
  411. {
  412. if (rightCEs[j] == l_0) matched_l0 = true;
  413. da[0][j] = (matched_l0) ? j : j+1;
  414. }
  415. bool matched_r0 = (l_0 == r_0);
  416. for (i = 1; i < leftLen; i++)
  417. {
  418. uint32_t l_i = leftCEs[i];
  419. if (l_i == r_0)
  420. matched_r0 = true;
  421. byte da_i_0 = matched_r0 ? i : i+1;
  422. da[mask(i)][0] = da_i_0;
  423. byte da_i_prevj = da_i_0;
  424. for (j = 1; j < rightLen; j++)
  425. {
  426. uint32_t r_j = rightCEs[j];
  427. unsigned char next = (l_i == r_j) ? da[mask(i-1)][j-1] :
  428. min3(da[mask(i-1)][j], da_i_prevj, da[mask(i-1)][j-1]) + 1;
  429. da[mask(i)][j] = next;
  430. da_i_prevj = next;
  431. }
  432. }
  433. return da[mask(leftLen-1)][rightLen-1];
  434. }
  435. //This could be further improved in the following ways:
  436. // * Only use 2*radius bytes of temporary storage - I doubt it is worth it.
  437. // * special case edit1 - you could use variables for the 6 interesting array elements, and get
  438. // rid of the array completely. You could also unwind the first (and last iterations).
  439. // * I suspect the early exit condition could be improved depending the lengths of the strings.
  440. unsigned unicodeEditDistanceV3(UnicodeString & left, UnicodeString & right, unsigned radius, RuleBasedCollator& rbc)
  441. {
  442. if (radius >= 255)
  443. return 255;
  444. doTrimRight(left);
  445. doTrimRight(right);
  446. unsigned leftLen = left.length();
  447. unsigned rightLen = right.length();
  448. unsigned minED = (leftLen < rightLen)? rightLen - leftLen: leftLen - rightLen;
  449. if (minED > radius)
  450. return minED;
  451. if (leftLen > 255)
  452. leftLen = 255;
  453. if (rightLen > 255)
  454. rightLen = 255;
  455. //Checking for leading common substrings actually slows the function down.
  456. if (leftLen == 0)
  457. return rightLen;
  458. if (rightLen == 0)
  459. return leftLen;
  460. CEList leftCEs(rbc, left, leftLen);
  461. CEList rightCEs(rbc, right, rightLen);
  462. leftLen = leftCEs.length();
  463. rightLen = rightCEs.length();
  464. /*
  465. This function applies two optimizations over the function above.
  466. a) Adding a charcter (next row) can at most decrease the edit distance by 1, so short circuit when
  467. we there is no possiblity of getting within the distance.
  468. b) We only need to evaluate the martix da[i-radius..i+radius][j-radius..j+radius]
  469. not taking into account values outside that range [can use max value to prevent access]
  470. */
  471. //Optimize the storage requirements by
  472. //i) Only storing two stripes
  473. //ii) Calculate, but don't store the row comparing against the null string
  474. unsigned char da[2][256];
  475. uint32_t r_0 = rightCEs[0];
  476. uint32_t l_0 = leftCEs[0];
  477. bool matched_l0 = false;
  478. for (unsigned char j = 0; j < rightLen; j++)
  479. {
  480. if (rightCEs[j] == l_0) matched_l0 = true;
  481. da[0][j] = (matched_l0) ? j : j+1;
  482. }
  483. bool matched_r0 = (l_0 == r_0);
  484. for (unsigned char i = 1; i < leftLen; i++)
  485. {
  486. uint32_t l_i = leftCEs[i];
  487. if (l_i == r_0)
  488. matched_r0 = true;
  489. byte da_i_0 = matched_r0 ? i : i+1;
  490. da[mask(i)][0] = da_i_0;
  491. byte da_i_prevj = da_i_0;
  492. unsigned low = i-radius;
  493. unsigned high = i+radius;
  494. unsigned first = (i > radius) ? low : 1;
  495. unsigned last = (high >= rightLen) ? rightLen : high +1;
  496. for (unsigned j = first; j < last; j++)
  497. {
  498. uint32_t r_j = rightCEs[j];
  499. unsigned char next = da[mask(i-1)][j-1];
  500. if (l_i != r_j)
  501. {
  502. if (j != low)
  503. {
  504. if (next > da_i_prevj)
  505. next = da_i_prevj;
  506. }
  507. if (j != high)
  508. {
  509. byte da_previ_j = da[mask(i-1)][j];
  510. if (next > da_previ_j)
  511. next = da_previ_j;
  512. }
  513. next++;
  514. }
  515. da[mask(i)][j] = next;
  516. da_i_prevj = next;
  517. }
  518. // bail out early if ed can't possibly be <= radius
  519. // Only considering a strip down the middle of the matrix, so the maximum the score can ever be adjusted is 2xradius
  520. unsigned max_valid_score = 3*radius;
  521. // But maximum is also 1 for every difference in string length - comes in to play when close to the end.
  522. //In 32bit goes slower for radius=1 I suspect because running out of registers. Retest in 64bit.
  523. if (radius > 1)
  524. {
  525. unsigned max_distance = radius + (leftLen - (i+1)) + (rightLen - last);
  526. if (max_valid_score > max_distance)
  527. max_valid_score = max_distance;
  528. }
  529. if (da_i_prevj > max_valid_score)
  530. return da_i_prevj;
  531. }
  532. return da[mask(leftLen-1)][rightLen-1];
  533. }
  534. //This function is based on the unicodeEditDistanceV3 to pickup optimizations;
  535. // It replaces RuleBasedCollator with the CharacterIterator
  536. unsigned unicodeEditDistanceV4(UnicodeString & left, UnicodeString & right, unsigned radius, BreakIterator* bi)
  537. {
  538. if (radius >= 255)
  539. return 255;
  540. doTrimRight(left);
  541. doTrimRight(right);
  542. unsigned leftLen = left.length();
  543. unsigned rightLen = right.length();
  544. if (leftLen > 255)
  545. leftLen = 255;
  546. if (rightLen > 255)
  547. rightLen = 255;
  548. //Checking for leading common substrings actually slows the function down.
  549. if (leftLen == 0)
  550. return rightLen;
  551. if (rightLen == 0)
  552. return leftLen;
  553. UPCList leftCs(bi, left, leftLen);
  554. UPCList rightCs(bi, right, rightLen);
  555. if (leftCs.isInvalid() || rightCs.isInvalid())
  556. return DISTANCE_ON_ERROR;
  557. // get Unicode character lengths
  558. leftLen = leftCs.length();
  559. rightLen = rightCs.length();
  560. unsigned minED = (leftLen < rightLen)? rightLen - leftLen: leftLen - rightLen;
  561. if (minED > radius)
  562. return minED;
  563. /*
  564. This function applies two optimizations over the function above.
  565. a) Adding a character (next row) can at most decrease the edit distance by 1, so short circuit when
  566. we there is no possibility of getting within the distance.
  567. b) We only need to evaluate the matrix da[i-radius..i+radius][j-radius..j+radius]
  568. not taking into account values outside that range [can use max value to prevent access]
  569. */
  570. //Optimize the storage requirements by
  571. //i) Only storing two stripes
  572. //ii) Calculate, but don't store the row comparing against the null string
  573. unsigned char da[2][256];
  574. uint32_t rI_0 = 0;
  575. uint32_t lI_0 = 0;
  576. bool matched_l0 = false;
  577. for (unsigned char j = 0; j < rightLen; j++)
  578. {
  579. if (leftCs.equal(lI_0, rightCs, rI_0+j)) matched_l0 = true;
  580. da[0][j] = (matched_l0) ? j : j+1;
  581. }
  582. bool matched_r0 = leftCs.equal(lI_0, rightCs, rI_0);
  583. for (unsigned char i = 1; i < leftLen; i++)
  584. {
  585. uint32_t lI_i = i;
  586. if (leftCs.equal(lI_i, rightCs, rI_0))
  587. matched_r0 = true;
  588. byte da_i_0 = matched_r0 ? i : i+1;
  589. da[mask(i)][0] = da_i_0;
  590. byte da_i_prevj = da_i_0;
  591. unsigned low = i-radius;
  592. unsigned high = i+radius;
  593. unsigned first = (i > radius) ? low : 1;
  594. unsigned last = (high >= rightLen) ? rightLen : high +1;
  595. for (unsigned j = first; j < last; j++)
  596. {
  597. uint32_t rI_j = j;
  598. unsigned char next = da[mask(i-1)][j-1];
  599. if (!leftCs.equal(lI_i, rightCs, rI_j))
  600. {
  601. if (j != low)
  602. {
  603. if (next > da_i_prevj)
  604. next = da_i_prevj;
  605. }
  606. if (j != high)
  607. {
  608. byte da_previ_j = da[mask(i-1)][j];
  609. if (next > da_previ_j)
  610. next = da_previ_j;
  611. }
  612. next++;
  613. }
  614. da[mask(i)][j] = next;
  615. da_i_prevj = next;
  616. }
  617. // bail out early if ed can't possibly be <= radius
  618. // Only considering a strip down the middle of the matrix, so the maximum the score can ever be adjusted is 2xradius
  619. unsigned max_valid_score = 3*radius;
  620. // But maximum is also 1 for every difference in string length - comes in to play when close to the end.
  621. //In 32bit goes slower for radius=1 I suspect because running out of registers. Retest in 64bit.
  622. if (radius > 1)
  623. {
  624. unsigned max_distance = radius + (leftLen - (i+1)) + (rightLen - last);
  625. if (max_valid_score > max_distance)
  626. max_valid_score = max_distance;
  627. }
  628. if (da_i_prevj > max_valid_score)
  629. return da_i_prevj;
  630. }
  631. return da[mask(leftLen-1)][rightLen-1];
  632. }
  633. void normalizationFormCheck(UnicodeString & source, const char * form)
  634. {
  635. #if U_ICU_VERSION_MAJOR_NUM >= 44
  636. UErrorCode errorCode = U_ZERO_ERROR;
  637. if (form[2] == 'C')
  638. {
  639. const Normalizer2 * no = Normalizer2::getInstance(NULL, "nfc", UNormalization2Mode::UNORM2_COMPOSE, errorCode);
  640. source = no->normalize(source, errorCode);
  641. }
  642. else if (form[2] == 'D')
  643. {
  644. const Normalizer2 * no = Normalizer2::getInstance(NULL, "nfc", UNormalization2Mode::UNORM2_DECOMPOSE, errorCode);
  645. source = no->normalize(source, errorCode);
  646. }
  647. else if (form[3] == 'C')
  648. {
  649. const Normalizer2 * no = Normalizer2::getInstance(NULL, "nfkc", UNormalization2Mode::UNORM2_COMPOSE, errorCode);
  650. source = no->normalize(source, errorCode);
  651. }
  652. else if (form[3] == 'D')
  653. {
  654. const Normalizer2 * no = Normalizer2::getInstance(NULL, "nfkc", UNormalization2Mode::UNORM2_DECOMPOSE, errorCode);
  655. source = no->normalize(source, errorCode);
  656. }
  657. #else
  658. UErrorCode errorCode = U_ZERO_ERROR;
  659. UnicodeString result;
  660. if (form[2] == 'C')
  661. {
  662. Normalizer::normalize(source, UNORM_NFC, 0, result, errorCode);
  663. }
  664. else if (form[2] == 'D')
  665. {
  666. Normalizer::normalize(source, UNORM_NFD, 0, result, errorCode);
  667. }
  668. else if (form[3] == 'C')
  669. {
  670. Normalizer::normalize(source, UNORM_NFKC, 0, result, errorCode);
  671. }
  672. else if (form[3] == 'D')
  673. {
  674. Normalizer::normalize(source, UNORM_NFKD, 0, result, errorCode);
  675. }
  676. else
  677. return;
  678. source = result;
  679. #endif
  680. }
  681. static bool endsWith(UnicodeString const & processed, UnicodeString const & suffix)
  682. {
  683. if (processed.isEmpty() || suffix.isEmpty())
  684. {
  685. return false;
  686. }
  687. if (!processed.endsWith(suffix))
  688. {
  689. return false;
  690. }
  691. return true;
  692. }
  693. static bool startsWith(UnicodeString & processed, UnicodeString & prefix)
  694. {
  695. if (processed.isEmpty() || prefix.isEmpty())
  696. {
  697. return false;
  698. }
  699. prefix.trim();
  700. if (processed.compareCodePointOrder(0, prefix.length(), prefix) != 0)
  701. {
  702. return false;
  703. }
  704. return true;
  705. }
  706. void translate(UnicodeString & toProcess, UChar const * sear, unsigned searLen, UChar const * repl, unsigned replLen)
  707. {
  708. UnicodeString search(false, sear, searLen);
  709. UnicodeString replace(repl, replLen);
  710. if (search.countChar32() != replace.countChar32() || toProcess.isEmpty() || search.isEmpty() || replace.isEmpty())
  711. {
  712. return;
  713. }
  714. StringCharacterIterator it(toProcess);
  715. toProcess.remove();
  716. int32_t idx = it.setToStart();
  717. while (idx != it.endIndex())
  718. {
  719. int32_t x = search.lastIndexOf(it.current32());
  720. if (x == -1)
  721. {
  722. toProcess.append(it.current32());
  723. }
  724. else
  725. {
  726. x = replace.moveIndex32(0, x);
  727. toProcess.append(replace.char32At(x));
  728. }
  729. idx = it.move32(1, CharacterIterator::kCurrent);
  730. }
  731. }
  732. void excludeLastWord(RuleBasedBreakIterator& bi, UnicodeString & toProcess)
  733. {
  734. bi.setText(toProcess);
  735. int32_t idx = bi.last();
  736. int32_t wordidx = 0;
  737. int32_t wordBeginning = 0;
  738. int32_t wordEnd = idx;
  739. while (idx != 0)
  740. {
  741. //Backwards iterator operates until the iterator reaches 0 from bi.last()
  742. int breakType = bi.getRuleStatus();
  743. if (breakType != UBRK_WORD_NONE)
  744. {
  745. // Exclude spaces, punctuation, and the like.
  746. // A status value UBRK_WORD_NONE indicates that the boundary does
  747. // not start a word or number.
  748. ++wordidx;
  749. wordBeginning = bi.previous();
  750. //Increments the wordidx count and then moves iterator backwards past the one word that was recorded.
  751. //Iterator located just before the start of the last word.
  752. if (bi.getRuleStatus() != UBRK_WORD_NONE)
  753. {
  754. //Check for languages that do not use space characters to separate words.
  755. //If a word lies before the current location of the iterator,
  756. //incremement the wordidx to prevent removal of this extra word.
  757. ++wordidx;
  758. }
  759. if (bi.previous() == 0 && wordidx == 1)
  760. {
  761. //Check for single word string. In place to remove leading whitespaces if so.
  762. //Moves iterator backwards to the next boundary: either the beginning or end of a word.
  763. //If at the beginning of a word, wordidx should be 2,
  764. //and the condition should fail regardless of the iterator being the first position.
  765. //If at the end of a word, wordidx should be 1,
  766. //and the condition should fail because the iterator is not the first position.
  767. wordBeginning = 0;
  768. }
  769. toProcess.removeBetween(wordBeginning, wordEnd);
  770. return;
  771. }
  772. //Should only be called once before reaching a word or the beginning of the string.
  773. idx = bi.previous();
  774. }
  775. //Called if the string has no words.
  776. toProcess.removeBetween(0, bi.last());
  777. }
  778. void excludeNthWord(RuleBasedBreakIterator& bi, UnicodeString & source, unsigned n)
  779. {
  780. bi.setText(source);
  781. int32_t idx = bi.first();
  782. int32_t wordidx = 0;
  783. unsigned wordBeginning = 0;
  784. while (idx != BreakIterator::DONE)
  785. {
  786. int breakType = bi.getRuleStatus();
  787. if (breakType != UBRK_WORD_NONE)
  788. {
  789. // Exclude spaces, punctuation, and the like.
  790. // A status value UBRK_WORD_NONE indicates that the boundary does
  791. // not start a word or number.
  792. if (++wordidx == n)
  793. {
  794. if (n == 1)
  795. {
  796. wordBeginning = 0;
  797. }
  798. unsigned wordEnd;
  799. do
  800. {
  801. wordEnd = idx;
  802. idx = bi.next();
  803. } while (bi.getRuleStatus() == UBRK_WORD_NONE && idx != BreakIterator::DONE);
  804. source.removeBetween(wordBeginning, wordEnd);
  805. return;
  806. }
  807. }
  808. wordBeginning = idx;
  809. idx = bi.next();
  810. }
  811. if (!wordidx)
  812. {
  813. source.removeBetween(bi.first(), bi.last());
  814. }
  815. }
  816. UnicodeString getNthWord(RuleBasedBreakIterator& bi, UnicodeString const & source, unsigned n)
  817. {
  818. UnicodeString word;
  819. if (!n) return word;
  820. bi.setText(source);
  821. int32_t start = bi.first();
  822. while (start != BreakIterator::DONE && n) {
  823. int breakType = bi.getRuleStatus();
  824. if (breakType != UBRK_WORD_NONE) {
  825. // Exclude spaces, punctuation, and the like.
  826. // A status value UBRK_WORD_NONE indicates that the boundary does
  827. // not start a word or number.
  828. //
  829. n--;
  830. if (!n) {
  831. unsigned wordBegining = bi.preceding(start);
  832. unsigned wordEnd = bi.next();
  833. source.extractBetween(wordBegining, wordEnd, word);
  834. }
  835. }
  836. start = bi.next();
  837. }
  838. return word;
  839. }
  840. unsigned doCountWords(RuleBasedBreakIterator& bi, UnicodeString const & source)
  841. {
  842. bi.setText(source);
  843. int32_t start = bi.first();
  844. int32_t count = 0;
  845. while (start != BreakIterator::DONE) {
  846. int breakType = bi.getRuleStatus();
  847. if (breakType != UBRK_WORD_NONE) {
  848. // Exclude spaces, punctuation, and the like.
  849. // A status value UBRK_WORD_NONE indicates that the boundary does
  850. // not start a word or number.
  851. //
  852. ++count;
  853. }
  854. start = bi.next();
  855. }
  856. return count;
  857. }
  858. static BreakIterator * createCharacterBreakIterator(const char * localename)
  859. {
  860. UErrorCode status = U_ZERO_ERROR;
  861. Locale locale(localename);
  862. BreakIterator * cbi = (BreakIterator *)BreakIterator::createCharacterInstance(locale, status);
  863. if (U_FAILURE(status))
  864. {
  865. delete cbi;
  866. return NULL;
  867. }
  868. return cbi;
  869. }
  870. class CBILocale
  871. {
  872. public:
  873. CBILocale(char const * _locale) : locale(_locale)
  874. {
  875. cbi = createCharacterBreakIterator(locale);
  876. }
  877. ~CBILocale()
  878. {
  879. delete cbi;
  880. }
  881. BreakIterator * queryCharacterBreakIterator() const { return cbi; }
  882. private:
  883. StringAttr locale;
  884. BreakIterator * cbi;
  885. };
  886. typedef MapStringTo<CBILocale, char const *> MapStrToCBI;
  887. static MapStrToCBI * localeCBiMap;
  888. static CriticalSection localeCBiCrit;
  889. static BreakIterator * queryCharacterBreakIterator(const char * localename)
  890. {
  891. if (!localename) localename = "";
  892. CriticalBlock b(localeCBiCrit);
  893. if (!localeCBiMap)
  894. localeCBiMap = new MapStrToCBI;
  895. CBILocale * loc = localeCBiMap->getValue(localename);
  896. if(!loc)
  897. {
  898. const char * normalizedlocale = localename;
  899. localeCBiMap->setValue(localename, normalizedlocale);
  900. loc = localeCBiMap->getValue(localename);
  901. }
  902. return loc->queryCharacterBreakIterator();
  903. }
  904. static RuleBasedCollator * createRBCollator(const char * localename)
  905. {
  906. UErrorCode status = U_ZERO_ERROR;
  907. Locale locale(localename);
  908. RuleBasedCollator * rbc = (RuleBasedCollator *)RuleBasedCollator::createInstance(locale, status);
  909. rbc->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
  910. if (U_FAILURE(status))
  911. {
  912. delete rbc;
  913. return NULL;
  914. }
  915. return rbc;
  916. }
  917. class RBCLocale
  918. {
  919. public:
  920. RBCLocale(char const * _locale) : locale(_locale)
  921. {
  922. rbc = createRBCollator(locale);
  923. }
  924. ~RBCLocale()
  925. {
  926. delete rbc;
  927. }
  928. RuleBasedCollator * queryCollator() const { return rbc; }
  929. private:
  930. StringAttr locale;
  931. RuleBasedCollator * rbc;
  932. };
  933. typedef MapStringTo<RBCLocale, char const *> MapStrToRBC;
  934. static MapStrToRBC * localeMap;
  935. static CriticalSection localeCrit;
  936. static RuleBasedCollator * queryRBCollator(const char * localename)
  937. {
  938. if (!localename) localename = "";
  939. CriticalBlock b(localeCrit);
  940. if (!localeMap)
  941. localeMap = new MapStrToRBC;
  942. RBCLocale * loc = localeMap->getValue(localename);
  943. if(!loc)
  944. {
  945. //MORE: ECLRTL calls rtlGetNormalizedUnicodeLocaleName(). Should this be happening here?
  946. const char * normalizedlocale = localename;
  947. localeMap->setValue(localename, normalizedlocale);
  948. loc = localeMap->getValue(localename);
  949. }
  950. return loc->queryCollator();
  951. }
  952. MODULE_INIT(INIT_PRIORITY_STANDARD)
  953. {
  954. return true;
  955. }
  956. MODULE_EXIT()
  957. {
  958. delete localeMap;
  959. localeMap = NULL;
  960. delete localeCBiMap;
  961. localeCBiMap = NULL;
  962. }
  963. }//namespace
  964. using namespace nsUnicodelib;
  965. UNICODELIB_API void setPluginContext(IPluginContext * _ctx) { parentCtx = _ctx; }
  966. UNICODELIB_API void UNICODELIB_CALL ulUnicodeFilterOut(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit)
  967. {
  968. UnicodeString const in(src, srcLen);
  969. UnicodeString const filter(hit, hitLen);
  970. UnicodeString out;
  971. StringCharacterIterator iter(in);
  972. for(iter.first32(); iter.hasNext(); iter.next32())
  973. {
  974. UChar32 c = iter.current32();
  975. if(filter.indexOf(c) == -1)
  976. out.append(c);
  977. }
  978. tgtLen = out.length();
  979. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  980. out.extract(0, tgtLen, tgt);
  981. }
  982. UNICODELIB_API void UNICODELIB_CALL ulUnicodeFilter(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit)
  983. {
  984. UnicodeString const in(src, srcLen);
  985. UnicodeString const filter(hit, hitLen);
  986. UnicodeString out;
  987. StringCharacterIterator iter(in);
  988. for(iter.first32(); iter.hasNext(); iter.next32())
  989. {
  990. UChar32 c = iter.current32();
  991. if(filter.indexOf(c) != -1)
  992. out.append(c);
  993. }
  994. tgtLen = out.length();
  995. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  996. out.extract(0, tgtLen, tgt);
  997. }
  998. UNICODELIB_API void UNICODELIB_CALL ulUnicodeSubsOut(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned newCharLen, UChar const * newChar)
  999. {
  1000. UnicodeString out;
  1001. if(newCharLen > 0)
  1002. {
  1003. UnicodeString const in(src, srcLen);
  1004. UnicodeString const filter(hit, hitLen);
  1005. UnicodeString const replaceString(newChar, newCharLen);
  1006. UChar32 replace = replaceString.char32At(0);
  1007. StringCharacterIterator iter(in);
  1008. for(iter.first32(); iter.hasNext(); iter.next32())
  1009. {
  1010. UChar32 c = iter.current32();
  1011. if(filter.indexOf(c) == -1)
  1012. out.append(c);
  1013. else
  1014. out.append(replace);
  1015. }
  1016. }
  1017. else
  1018. out.append(src, srcLen);
  1019. tgtLen = out.length();
  1020. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1021. out.extract(0, tgtLen, tgt);
  1022. }
  1023. UNICODELIB_API void UNICODELIB_CALL ulUnicodeSubs(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned newCharLen, UChar const * newChar)
  1024. {
  1025. UnicodeString out;
  1026. if(newCharLen > 0)
  1027. {
  1028. UnicodeString const in(src, srcLen);
  1029. UnicodeString const filter(hit, hitLen);
  1030. UnicodeString const replaceString(newChar, newCharLen);
  1031. UChar32 replace = replaceString.char32At(0);
  1032. StringCharacterIterator iter(in);
  1033. for(iter.first32(); iter.hasNext(); iter.next32())
  1034. {
  1035. UChar32 c = iter.current32();
  1036. if(filter.indexOf(c) != -1)
  1037. out.append(c);
  1038. else
  1039. out.append(replace);
  1040. }
  1041. }
  1042. else
  1043. out.append(src, srcLen);
  1044. tgtLen = out.length();
  1045. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1046. out.extract(0, tgtLen, tgt);
  1047. }
  1048. UNICODELIB_API void UNICODELIB_CALL ulUnicodeRepad(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned tLen)
  1049. {
  1050. UnicodeString out(src, srcLen);
  1051. out.trim();
  1052. forceLength(out, tLen);
  1053. tgtLen = out.length();
  1054. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1055. out.extract(0, tgtLen, tgt);
  1056. }
  1057. UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeFind(unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned instance)
  1058. {
  1059. return ulUnicodeLocaleFind(srcLen, src, hitLen, hit, instance, "");
  1060. }
  1061. UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleFind(unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned instance, char const * localename)
  1062. {
  1063. UErrorCode error = U_ZERO_ERROR;
  1064. UStringSearch * search = usearch_open(hit, hitLen, src, srcLen, localename, 0, &error);
  1065. int32_t pos;
  1066. for(pos = usearch_first(search, &error); pos != USEARCH_DONE; pos = usearch_next(search, &error))
  1067. {
  1068. if(!--instance)
  1069. {
  1070. usearch_close(search);
  1071. return pos+1;
  1072. }
  1073. }
  1074. usearch_close(search);
  1075. return 0;
  1076. }
  1077. UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleFindAtStrength(unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned instance, char const * localename, char strength)
  1078. {
  1079. //Very strange behaviour - if source or pattern lengths are 0 search->getCollator() is invalid
  1080. if (srcLen == 0 || hitLen == 0)
  1081. return 0;
  1082. UnicodeString const source(src, srcLen);
  1083. UnicodeString const pattern(hit, hitLen);
  1084. UErrorCode error = U_ZERO_ERROR;
  1085. Locale locale(localename);
  1086. StringSearch search(pattern, source, locale, 0, error);
  1087. doModifySearchStrength(search, strength, error);
  1088. int32_t pos = search.first(error);
  1089. while(pos != USEARCH_DONE)
  1090. {
  1091. if(!--instance)
  1092. return pos+1;
  1093. pos = search.next(error);
  1094. }
  1095. return 0;
  1096. }
  1097. UNICODELIB_API void UNICODELIB_CALL ulUnicodeExtract(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned instance){
  1098. UnicodeString const in(src, srcLen);
  1099. UnicodeString out;
  1100. if(extract(out, in, instance))
  1101. {
  1102. tgtLen = out.length();
  1103. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1104. out.extract(0, tgtLen, tgt);
  1105. }
  1106. else
  1107. {
  1108. tgtLen = 0;
  1109. tgt = 0;
  1110. }
  1111. }
  1112. UNICODELIB_API void UNICODELIB_CALL ulUnicodeExtract50(UChar *tgt, unsigned srcLen, UChar const * src, unsigned instance)
  1113. {
  1114. UnicodeString const in(src, srcLen);
  1115. UnicodeString out;
  1116. extract(out, in, instance);
  1117. forceLength(out, 50);
  1118. out.extract(0, 50, tgt);
  1119. }
  1120. UNICODELIB_API void UNICODELIB_CALL ulUnicodeToLowerCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
  1121. {
  1122. UnicodeString unicode(src, srcLen);
  1123. unicode.toLower();
  1124. tgtLen = unicode.length();
  1125. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1126. unicode.extract(0, tgtLen, tgt);
  1127. }
  1128. UNICODELIB_API void UNICODELIB_CALL ulUnicodeToUpperCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
  1129. {
  1130. UnicodeString unicode(src, srcLen);
  1131. unicode.toUpper();
  1132. tgtLen = unicode.length();
  1133. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1134. unicode.extract(0, tgtLen, tgt);
  1135. }
  1136. UNICODELIB_API void UNICODELIB_CALL ulUnicodeToProperCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
  1137. {
  1138. UnicodeString unicode(src, srcLen);
  1139. unicode.toTitle(0);
  1140. tgtLen = unicode.length();
  1141. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1142. unicode.extract(0, tgtLen, tgt);
  1143. }
  1144. UNICODELIB_API void UNICODELIB_CALL ulUnicodeToLowerCase80(UChar * tgt, unsigned srcLen, UChar const * src)
  1145. {
  1146. UnicodeString unicode(src, srcLen);
  1147. unicode.toLower();
  1148. forceLength(unicode, 80);
  1149. unicode.extract(0, 80, tgt);
  1150. }
  1151. UNICODELIB_API void UNICODELIB_CALL ulUnicodeToUpperCase80(UChar * tgt, unsigned srcLen, UChar const * src)
  1152. {
  1153. UnicodeString unicode(src, srcLen);
  1154. unicode.toUpper();
  1155. forceLength(unicode, 80);
  1156. unicode.extract(0, 80, tgt);
  1157. }
  1158. UNICODELIB_API void UNICODELIB_CALL ulUnicodeToProperCase80(UChar * tgt, unsigned srcLen, UChar const * src)
  1159. {
  1160. UnicodeString unicode(src, srcLen);
  1161. unicode.toTitle(0);
  1162. forceLength(unicode, 80);
  1163. unicode.extract(0, 80, tgt);
  1164. }
  1165. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToLowerCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, char const * localename)
  1166. {
  1167. UnicodeString unicode(src, srcLen);
  1168. Locale locale(localename);
  1169. unicode.toLower(locale);
  1170. tgtLen = unicode.length();
  1171. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1172. unicode.extract(0, tgtLen, tgt);
  1173. }
  1174. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToUpperCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, char const * localename)
  1175. {
  1176. UnicodeString unicode(src, srcLen);
  1177. Locale locale(localename);
  1178. unicode.toUpper(locale);
  1179. tgtLen = unicode.length();
  1180. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1181. unicode.extract(0, tgtLen, tgt);
  1182. }
  1183. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToProperCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, char const * localename)
  1184. {
  1185. UnicodeString unicode(src, srcLen);
  1186. Locale locale(localename);
  1187. unicode.toTitle(0, locale);
  1188. tgtLen = unicode.length();
  1189. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1190. unicode.extract(0, tgtLen, tgt);
  1191. }
  1192. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToLowerCase80(UChar * tgt, unsigned srcLen, UChar const * src, char const * localename)
  1193. {
  1194. UnicodeString unicode(src, srcLen);
  1195. Locale locale(localename);
  1196. unicode.toLower(locale);
  1197. forceLength(unicode, 80);
  1198. unicode.extract(0, 80, tgt);
  1199. }
  1200. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToUpperCase80(UChar * tgt, unsigned srcLen, UChar const * src, char const * localename)
  1201. {
  1202. UnicodeString unicode(src, srcLen);
  1203. Locale locale(localename);
  1204. unicode.toUpper(locale);
  1205. forceLength(unicode, 80);
  1206. unicode.extract(0, 80, tgt);
  1207. }
  1208. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToProperCase80(UChar * tgt, unsigned srcLen, UChar const * src, char const * localename)
  1209. {
  1210. UnicodeString unicode(src, srcLen);
  1211. Locale locale(localename);
  1212. unicode.toTitle(0, locale);
  1213. forceLength(unicode, 80);
  1214. unicode.extract(0, 80, tgt);
  1215. }
  1216. UNICODELIB_API int UNICODELIB_CALL ulUnicodeCompareIgnoreCase(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2)
  1217. {
  1218. return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::SECONDARY);
  1219. }
  1220. UNICODELIB_API int UNICODELIB_CALL ulUnicodeCompareAtStrength(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2, char strength)
  1221. {
  1222. switch(strength)
  1223. {
  1224. case 1:
  1225. return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::PRIMARY);
  1226. case 2:
  1227. return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::SECONDARY);
  1228. case 3:
  1229. return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::TERTIARY);
  1230. case 4:
  1231. return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::QUATERNARY);
  1232. case 5:
  1233. default:
  1234. return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::IDENTICAL);
  1235. }
  1236. }
  1237. UNICODELIB_API int UNICODELIB_CALL ulUnicodeLocaleCompareIgnoreCase(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2, char const * localename)
  1238. {
  1239. return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::SECONDARY);
  1240. }
  1241. UNICODELIB_API int UNICODELIB_CALL ulUnicodeLocaleCompareAtStrength(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2, char const * localename, char strength)
  1242. {
  1243. switch(strength)
  1244. {
  1245. case 1:
  1246. return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::PRIMARY);
  1247. case 2:
  1248. return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::SECONDARY);
  1249. case 3:
  1250. return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::TERTIARY);
  1251. case 4:
  1252. return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::QUATERNARY);
  1253. case 5:
  1254. default:
  1255. return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::IDENTICAL);
  1256. }
  1257. }
  1258. UNICODELIB_API void UNICODELIB_CALL ulUnicodeReverse(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
  1259. {
  1260. UnicodeString in(src, srcLen);
  1261. UnicodeString out;
  1262. StringCharacterIterator iter(in);
  1263. for(iter.last32(); iter.hasPrevious(); iter.previous32())
  1264. out.append(iter.current32());
  1265. if(srcLen) out.append(iter.current32());
  1266. tgtLen = out.length();
  1267. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1268. out.extract(0, tgtLen, tgt);
  1269. }
  1270. UNICODELIB_API void UNICODELIB_CALL ulUnicodeFindReplace(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok)
  1271. {
  1272. UnicodeString source(src, srcLen);
  1273. UnicodeString const pattern(stok, stokLen);
  1274. UnicodeString const replace(rtok, rtokLen);
  1275. source.findAndReplace(pattern, replace);
  1276. tgtLen = source.length();
  1277. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1278. source.extract(0, tgtLen, tgt);
  1279. }
  1280. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleFindReplace(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok, char const * localename)
  1281. {
  1282. UnicodeString source(src, srcLen);
  1283. UnicodeString const pattern(stok, stokLen);
  1284. UnicodeString const replace(rtok, rtokLen);
  1285. doUnicodeLocaleFindReplace(source, pattern, replace, localename);
  1286. tgtLen = source.length();
  1287. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1288. source.extract(0, tgtLen, tgt);
  1289. }
  1290. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleFindAtStrengthReplace(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok, char const * localename, char strength)
  1291. {
  1292. UnicodeString source(src, srcLen);
  1293. //Very strange behaviour - if source or pattern lengths are 0 search->getCollator() is invalid
  1294. if (srcLen && stokLen)
  1295. {
  1296. UnicodeString const pattern(stok, stokLen);
  1297. UnicodeString const replace(rtok, rtokLen);
  1298. doUnicodeLocaleFindAtStrengthReplace(source, pattern, replace, localename, strength);
  1299. }
  1300. tgtLen = source.length();
  1301. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1302. source.extract(0, tgtLen, tgt);
  1303. }
  1304. UNICODELIB_API void UNICODELIB_CALL ulUnicodeFindReplace80(UChar * tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok)
  1305. {
  1306. UnicodeString source(src, srcLen);
  1307. UnicodeString const pattern(stok, stokLen);
  1308. UnicodeString const replace(rtok, rtokLen);
  1309. source.findAndReplace(pattern, replace);
  1310. forceLength(source, 80);
  1311. source.extract(0, 80, tgt);
  1312. }
  1313. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleFindReplace80(UChar * tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok, char const * localename)
  1314. {
  1315. UnicodeString source(src, srcLen);
  1316. UnicodeString const pattern(stok, stokLen);
  1317. UnicodeString const replace(rtok, rtokLen);
  1318. doUnicodeLocaleFindReplace(source, pattern, replace, localename);
  1319. forceLength(source, 80);
  1320. source.extract(0, 80, tgt);
  1321. }
  1322. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleFindAtStrengthReplace80(UChar * tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok, char const * localename, char strength)
  1323. {
  1324. UnicodeString source(src, srcLen);
  1325. UnicodeString const pattern(stok, stokLen);
  1326. UnicodeString const replace(rtok, rtokLen);
  1327. doUnicodeLocaleFindAtStrengthReplace(source, pattern, replace, localename, strength);
  1328. forceLength(source, 80);
  1329. source.extract(0, 80, tgt);
  1330. }
  1331. UNICODELIB_API void UNICODELIB_CALL ulUnicodeCleanSpaces(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
  1332. {
  1333. UnicodeString source(src, srcLen);
  1334. doUnicodeCleanSpaces(source);
  1335. tgtLen = source.length();
  1336. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1337. source.extract(0, tgtLen, tgt);
  1338. }
  1339. UNICODELIB_API void UNICODELIB_CALL ulUnicodeCleanSpaces25(UChar * tgt, unsigned srcLen, UChar const * src)
  1340. {
  1341. UnicodeString source(src, srcLen);
  1342. doUnicodeCleanSpaces(source);
  1343. forceLength(source, 25);
  1344. source.extract(0, 25, tgt);
  1345. }
  1346. UNICODELIB_API void UNICODELIB_CALL ulUnicodeCleanSpaces80(UChar * tgt, unsigned srcLen, UChar const * src)
  1347. {
  1348. UnicodeString source(src, srcLen);
  1349. doUnicodeCleanSpaces(source);
  1350. forceLength(source, 80);
  1351. source.extract(0, 80, tgt);
  1352. }
  1353. UNICODELIB_API bool UNICODELIB_CALL ulUnicodeWildMatch(unsigned srcLen, UChar const * src, unsigned patLen, UChar const * pat, bool noCase)
  1354. {
  1355. return wildTrimMatch<UChar, u16toupper, u16query, u16asterisk, u16space>(src, srcLen, pat, patLen, noCase);
  1356. }
  1357. UNICODELIB_API bool UNICODELIB_CALL ulUnicodeContains(unsigned srcLen, UChar const * src, unsigned patLen, UChar const * pat, bool noCase)
  1358. {
  1359. UnicodeString source(src, srcLen);
  1360. UnicodeString pattern(pat, patLen);
  1361. if(noCase)
  1362. {
  1363. source.foldCase();
  1364. pattern.foldCase();
  1365. }
  1366. StringCharacterIterator iter(pattern);
  1367. for(iter.first32(); iter.hasNext(); iter.next32())
  1368. if(source.indexOf(iter.current32()) == -1)
  1369. return false;
  1370. return true;
  1371. }
  1372. UNICODELIB_API void UNICODELIB_CALL ulUnicodeCleanAccents(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
  1373. {
  1374. if (!deAccenter)
  1375. {
  1376. CriticalBlock b(accenterCrit);
  1377. if (!deAccenter)
  1378. {
  1379. UErrorCode lStatus = U_ZERO_ERROR;
  1380. deAccenter = icu::Transliterator::createInstance("NFD; [:M:] Remove; NFC;", UTRANS_FORWARD, lStatus);
  1381. }
  1382. }
  1383. UnicodeString source(src, srcLen);
  1384. deAccenter->transliterate(source);
  1385. tgtLen = source.length();
  1386. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1387. source.extract(0, tgtLen, tgt);
  1388. }
  1389. UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleEditDistance(unsigned leftLen, UChar const * left, unsigned rightLen, UChar const * right, char const * localename)
  1390. {
  1391. BreakIterator* bi = 0;
  1392. if (localename && *localename)
  1393. {
  1394. bi = queryCharacterBreakIterator(localename);
  1395. if (!bi)
  1396. return DISTANCE_ON_ERROR;
  1397. }
  1398. UnicodeString uLeft(false, left, leftLen); // Readonly-aliasing UChar* constructor.
  1399. UnicodeString uRight(false, right, rightLen);
  1400. unsigned distance = nsUnicodelib::unicodeEditDistanceV4(uLeft, uRight, 254, bi);
  1401. return distance;
  1402. }
  1403. UNICODELIB_API bool UNICODELIB_CALL ulUnicodeLocaleEditDistanceWithinRadius(unsigned leftLen, UChar const * left, unsigned rightLen, UChar const * right, unsigned radius, char const * localename)
  1404. {
  1405. BreakIterator* bi = 0;
  1406. if (localename && *localename)
  1407. {
  1408. bi = queryCharacterBreakIterator(localename);
  1409. if (!bi)
  1410. return false;
  1411. }
  1412. UnicodeString uLeft(false, left, leftLen); // Readonly-aliasing UChar* constructor.
  1413. UnicodeString uRight(false, right, rightLen);
  1414. unsigned distance = nsUnicodelib::unicodeEditDistanceV4(uLeft, uRight, radius, bi);
  1415. return distance <= radius;
  1416. }
  1417. UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleWordCount(unsigned textLen, UChar const * text, char const * localename)
  1418. {
  1419. UErrorCode status = U_ZERO_ERROR;
  1420. Locale locale(localename);
  1421. RuleBasedBreakIterator* bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(locale, status);
  1422. UnicodeString uText(text, textLen);
  1423. uText.trim();
  1424. unsigned count = doCountWords(*bi, uText);
  1425. delete bi;
  1426. return count;
  1427. }
  1428. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleGetNthWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned n, char const * localename)
  1429. {
  1430. UErrorCode status = U_ZERO_ERROR;
  1431. Locale locale(localename);
  1432. RuleBasedBreakIterator* bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(locale, status);
  1433. UnicodeString uText(text, textLen);
  1434. uText.trim();
  1435. UnicodeString word = getNthWord(*bi, uText, n);
  1436. delete bi;
  1437. if(word.length()>0)
  1438. {
  1439. tgtLen = word.length();
  1440. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1441. word.extract(0, tgtLen, tgt);
  1442. }
  1443. else
  1444. {
  1445. tgtLen = 0;
  1446. tgt = 0;
  1447. }
  1448. }
  1449. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleExcludeNthWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned n, char const * localename)
  1450. {
  1451. UErrorCode status = U_ZERO_ERROR;
  1452. Locale locale(localename);
  1453. RuleBasedBreakIterator* bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(locale, status);
  1454. UnicodeString processed(text, textLen);
  1455. excludeNthWord(*bi, processed, n);
  1456. delete bi;
  1457. if (processed.length()>0)
  1458. {
  1459. tgtLen = processed.length();
  1460. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
  1461. processed.extract(0, tgtLen, tgt);
  1462. }
  1463. else
  1464. {
  1465. tgtLen = 0;
  1466. tgt = 0;
  1467. }
  1468. }
  1469. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleExcludeLastWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, char const * localename)
  1470. {
  1471. UErrorCode status = U_ZERO_ERROR;
  1472. Locale locale(localename);
  1473. RuleBasedBreakIterator* bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(locale, status);
  1474. UnicodeString processed(text, textLen);
  1475. excludeLastWord(*bi, processed);
  1476. delete bi;
  1477. if (processed.length()>0)
  1478. {
  1479. tgtLen = processed.length();
  1480. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen * 2);
  1481. processed.extract(0, tgtLen, tgt);
  1482. }
  1483. else
  1484. {
  1485. tgtLen = 0;
  1486. tgt = 0;
  1487. }
  1488. }
  1489. UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleTranslate(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned searLen, UChar const * sear, unsigned replLen, UChar * repl)
  1490. {
  1491. UnicodeString processed(text, textLen);
  1492. translate(processed, sear, searLen, repl, replLen);
  1493. if (processed.length() > 0)
  1494. {
  1495. tgtLen = processed.length();
  1496. tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen * 2);
  1497. processed.extract(0, tgtLen, tgt);
  1498. }
  1499. else
  1500. {
  1501. tgtLen = 0;
  1502. tgt = 0;
  1503. }
  1504. }
  1505. UNICODELIB_API bool UNICODELIB_CALL ulUnicodeLocaleStartsWith(unsigned srcLen, UChar const * src, unsigned prefLen, UChar const * pref, unsigned formLen, char const * form)
  1506. {
  1507. UErrorCode errorCode = U_ZERO_ERROR;
  1508. UnicodeString pro(src, srcLen);
  1509. UnicodeString pre(pref, prefLen);
  1510. if (formLen == 3 || formLen == 4)
  1511. {
  1512. normalizationFormCheck(pro, form);
  1513. normalizationFormCheck(pre, form);
  1514. }
  1515. return startsWith(pro, pre);
  1516. }
  1517. UNICODELIB_API bool UNICODELIB_CALL ulUnicodeLocaleEndsWith(unsigned srcLen, UChar const * src, unsigned suffLen, UChar const * suff, unsigned formLen, char const * form)
  1518. {
  1519. UnicodeString pro(src, srcLen);
  1520. UnicodeString suf(suff, suffLen);
  1521. suf.trim();
  1522. if (formLen == 3 || formLen == 4)
  1523. {
  1524. normalizationFormCheck(pro, form);
  1525. normalizationFormCheck(suf, form);
  1526. }
  1527. return endsWith(pro, suf);
  1528. }
  1529. UNICODELIB_API void UNICODELIB_CALL ulUnicodeVersion(unsigned & tgtLen, char * & tgt)
  1530. {
  1531. char version[U_MAX_VERSION_STRING_LENGTH];
  1532. UVersionInfo versionInfo;
  1533. u_getVersion(versionInfo);
  1534. u_versionToString(versionInfo, version);
  1535. tgtLen = strlen(version);
  1536. tgt = (char *)CTXMALLOC(parentCtx, tgtLen);
  1537. memcpy(tgt, version, tgtLen);
  1538. }