12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146 |
- /*##############################################################################
- Copyright (C) 2011 HPCC Systems.
- All rights reserved. This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as
- published by the Free Software Foundation, either version 3 of the
- License, or (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- ############################################################################## */
- #include "jlib.hpp"
- #include "jsem.hpp"
- #include <string.h>
- #include "unicodelib.hpp"
- #include "unicode/usearch.h"
- #include "unicode/schriter.h"
- #include "unicode/locid.h"
- #include "unicode/coll.h"
- #include "unicode/stsearch.h"
- #include "unicode/translit.h"
- #include "unicode/rbbi.h"
- #include "../stringlib/wildmatch.tpp"
- #define UNICODELIB_VERSION "UNICODELIB 1.1.06"
- UChar32 const u32comma = ',';
- UChar32 const u32space = ' ';
- UChar const u16asterisk = '*';
- UChar const u16query = '?';
- UChar const u16space = ' ';
// ECL SERVICE definition text: one line per exported function, each naming the
// C entry point that implements it.
const char *EclDefinition =
    "export UnicodeLib := SERVICE\n"
    " unicode UnicodeFilterOut(const unicode src, const unicode _within) : c, pure,entrypoint='ulUnicodeFilterOut'; \n"
    " unicode UnicodeFilter(const unicode src, const unicode _within) : c, pure,entrypoint='ulUnicodeFilter'; \n"
    " unicode UnicodeSubstituteOut(const unicode src, const unicode _within, const unicode _newchar) : c, pure,entrypoint='ulUnicodeSubsOut'; \n"
    " unicode UnicodeSubstitute(const unicode src, const unicode _within, const unicode _newchar) : c, pure,entrypoint='ulUnicodeSubs'; \n"
    " unicode UnicodeRepad(const unicode src, unsigned4 size) : c, pure,entrypoint='ulUnicodeRepad'; \n"
    " unsigned integer4 UnicodeFind(const unicode src, const unicode tofind, unsigned4 instance) : c, pure,entrypoint='ulUnicodeFind', hole; \n"
    " unsigned integer4 UnicodeLocaleFind(const unicode src, const unicode tofind, unsigned4 instance, const varstring localename) : c, pure,entrypoint='ulUnicodeLocaleFind', hole; \n"
    " unsigned integer4 UnicodeLocaleFindAtStrength(const unicode src, const unicode tofind, unsigned4 instance, const varstring localename, integer1 strength) : c, pure,entrypoint='ulUnicodeLocaleFindAtStrength', hole; \n"
    " unicode UnicodeExtract(const unicode src, unsigned4 instance) : c,pure,entrypoint='ulUnicodeExtract'; \n"
    " unicode50 UnicodeExtract50(const unicode src, unsigned4 instance) : c,pure,entrypoint='ulUnicodeExtract50', hole; \n"
    " unicode UnicodeToLowerCase(const unicode src) : c,pure,entrypoint='ulUnicodeToLowerCase';\n"
    " unicode UnicodeToUpperCase(const unicode src) : c,pure,entrypoint='ulUnicodeToUpperCase';\n"
    " unicode UnicodeToProperCase(const unicode src) : c,pure,entrypoint='ulUnicodeToProperCase';\n"
    " unicode80 UnicodeToLowerCase80(const unicode src) : c,pure,entrypoint='ulUnicodeToLowerCase80', hole;\n"
    " unicode80 UnicodeToUpperCase80(const unicode src) : c,pure,entrypoint='ulUnicodeToUpperCase80', hole;\n"
    " unicode80 UnicodeToProperCase80(const unicode src) : c,pure,entrypoint='ulUnicodeToProperCase80', hole;\n"
    " unicode UnicodeLocaleToLowerCase(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToLowerCase';\n"
    " unicode UnicodeLocaleToUpperCase(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToUpperCase';\n"
    " unicode UnicodeLocaleToProperCase(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToProperCase';\n"
    " unicode80 UnicodeLocaleToLowerCase80(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToLowerCase80', hole;\n"
    " unicode80 UnicodeLocaleToUpperCase80(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToUpperCase80', hole;\n"
    " unicode80 UnicodeLocaleToProperCase80(const unicode src, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleToProperCase80', hole;\n"
    " integer4 UnicodeCompareIgnoreCase(const unicode src1, const unicode src2) : c,pure,entrypoint='ulUnicodeCompareIgnoreCase', hole;\n"
    " integer4 UnicodeCompareAtStrength(const unicode src1, const unicode src2, integer1 strength) : c,pure,entrypoint='ulUnicodeCompareAtStrength', hole;\n"
    " integer4 UnicodeLocaleCompareIgnoreCase(const unicode src1, const unicode src2, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleCompareIgnoreCase', hole;\n"
    " integer4 UnicodeLocaleCompareAtStrength(const unicode src1, const unicode src2, const varstring localename, integer1 strength) : c,pure,entrypoint='ulUnicodeLocaleCompareAtStrength', hole;\n"
    " unicode UnicodeReverse(const unicode src) : c,pure,entrypoint='ulUnicodeReverse';\n"
    " unicode UnicodeFindReplace(const unicode src, const unicode stok, const unicode rtok) : c,pure,entrypoint='ulUnicodeFindReplace';\n"
    " unicode UnicodeLocaleFindReplace(const unicode src, const unicode stok, const unicode rtok, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleFindReplace';\n"
    " unicode UnicodeLocaleFindAtStrengthReplace(const unicode src, const unicode stok, const unicode rtok, const varstring localename, integer1 strength) : c,pure,entrypoint='ulUnicodeLocaleFindAtStrengthReplace';\n"
    " unicode80 UnicodeFindReplace80(const unicode src, const unicode stok, const unicode rtok) : c,pure,entrypoint='ulUnicodeFindReplace80', hole;\n"
    " unicode80 UnicodeLocaleFindReplace80(const unicode src, const unicode stok, const unicode rtok, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleFindReplace80',hole;\n"
    " unicode80 UnicodeLocaleFindAtStrengthReplace80(const unicode src, const unicode stok, const unicode rtok, const varstring localename, integer1 strength) : c,pure,entrypoint='ulUnicodeLocaleFindAtStrengthReplace80',hole;\n"
    " unicode UnicodeCleanAccents(const unicode src) : c,pure,entrypoint='ulUnicodeCleanAccents'; \n"
    " unicode UnicodeCleanSpaces(const unicode src) : c,pure,entrypoint='ulUnicodeCleanSpaces'; \n"
    " unicode25 UnicodeCleanSpaces25(const unicode src) : c,pure,entrypoint='ulUnicodeCleanSpaces25', hole; \n"
    " unicode80 UnicodeCleanSpaces80(const unicode src) : c,pure,entrypoint='ulUnicodeCleanSpaces80', hole; \n"
    " boolean UnicodeWildMatch(const unicode src, const unicode _pattern, boolean _noCase) : c, pure,entrypoint='ulUnicodeWildMatch', hole; \n"
    " boolean UnicodeContains(const unicode src, const unicode _pattern, boolean _noCase) : c, pure,entrypoint='ulUnicodeContains', hole; \n"
    " unsigned4 UnicodeLocaleEditDistance(const unicode left, const unicode right, const varstring localename) : c, pure,entrypoint='ulUnicodeLocaleEditDistance', hole; \n"
    " boolean UnicodeLocaleEditDistanceWithinRadius(const unicode left, const unicode right, unsigned4 radius, const varstring localename) : c, pure,entrypoint='ulUnicodeLocaleEditDistanceWithinRadius', hole; \n"
    " unsigned4 UnicodeLocaleWordCount(const unicode text, const varstring localename) : c, pure,entrypoint='ulUnicodeLocaleWordCount', hole; \n"
    " unicode UnicodeLocaleGetNthWord(const unicode text, unsigned4 n, const varstring localename) : c,pure,entrypoint='ulUnicodeLocaleGetNthWord';\n"
    "END;\n";
// NULL-terminated list of earlier version strings; handed to the caller through
// ECLPluginDefinitionBlockEx::compatibleVersions.
static const char *compatibleVersions[] =
{
    "UNICODELIB 1.1.01 [64d78857c1cecae15bd238cd7767b3c1]",
    "UNICODELIB 1.1.01 [e8790fe30d9627997749c3c4839b5957]",
    "UNICODELIB 1.1.02",
    "UNICODELIB 1.1.03",
    "UNICODELIB 1.1.04",
    "UNICODELIB 1.1.05",
    NULL
};
- UNICODELIB_API bool getECLPluginDefinition(ECLPluginDefinitionBlock *pb)
- {
- if (pb->size == sizeof(ECLPluginDefinitionBlockEx))
- {
- ECLPluginDefinitionBlockEx * pbx = (ECLPluginDefinitionBlockEx *) pb;
- pbx->compatibleVersions = compatibleVersions;
- }
- else if (pb->size != sizeof(ECLPluginDefinitionBlock))
- return false;
- pb->magicVersion = PLUGIN_VERSION;
- pb->version = UNICODELIB_VERSION;
- pb->moduleName = "lib_unicodelib";
- pb->ECL = EclDefinition;
- pb->flags = PLUGIN_IMPLICIT_MODULE | PLUGIN_MULTIPLE_VERSIONS;
- pb->description = "UnicodeLib unicode string manipulation library";
- return true;
- }
- namespace nsUnicodelib {
- IPluginContext * parentCtx = NULL;
- void doTrimRight(UnicodeString & source)
- {
- int32_t oldLength = source.length();
- if (!oldLength)
- return;
- int32_t currentLength = oldLength;
- bool uSpace = true;
- do {
- UChar32 c = source[--currentLength];
- if(!(c == 0x20 || u_isWhitespace(c))) {
- currentLength++;
- uSpace = false;
- }
- } while (uSpace && currentLength>0);
- if (currentLength < oldLength) {
- source.truncate(currentLength);
- }
- }
- void forceLength(UnicodeString & str, int32_t len)
- {
- if(str.length()>len)
- str.truncate(len);
- else if(str.length()<len)
- str.padTrailing(len);
- }
- void doModifySearchStrength(StringSearch & search, char strength, UErrorCode & error)
- {
- RuleBasedCollator * coll = search.getCollator();
- switch(strength)
- {
- case 1:
- coll->setStrength(Collator::PRIMARY);
- break;
- case 2:
- coll->setStrength(Collator::SECONDARY);
- break;
- case 3:
- coll->setStrength(Collator::TERTIARY);
- break;
- case 4:
- coll->setStrength(Collator::QUATERNARY);
- break;
- case 5:
- default:
- coll->setStrength(Collator::IDENTICAL);
- }
- search.setCollator(coll, error);
- }
- bool extract(UnicodeString & out, UnicodeString const & in, unsigned instance)
- {
- if(!instance) return false;
- int32_t start = 0;
- while(--instance)
- {
- start = in.indexOf(u32comma, start);
- if(start == -1) return false;
- start++;
- }
- int32_t end = in.indexOf(u32comma, start);
- if(end == -1)
- end = in.length();
- out.append(in, start, end-start);
- return true;
- }
- int doUnicodeCompareAtStrength(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2, Collator::ECollationStrength strength)
- {
- UErrorCode error = U_ZERO_ERROR;
- Collator * coll = Collator::createInstance(error);
- coll->setStrength(strength);
- Collator::EComparisonResult ret = coll->compare(src1, src1Len, src2, src2Len);
- delete coll;
- return ret;
- }
- int doUnicodeLocaleCompareAtStrength(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2, char const * localename, Collator::ECollationStrength strength)
- {
- UErrorCode error = U_ZERO_ERROR;
- Locale locale(localename);
- Collator * coll = Collator::createInstance(locale, error);
- coll->setStrength(strength);
- Collator::EComparisonResult ret = coll->compare(src1, src1Len, src2, src2Len);
- delete coll;
- return ret;
- }
- void doUnicodeLocaleFindReplace(UnicodeString & source, UnicodeString const & pattern, UnicodeString const & replace, char const * localename)
- {
- UErrorCode error = U_ZERO_ERROR;
- Locale locale(localename);
- StringSearch search(pattern, source, locale, 0, error);
- int32_t pos = search.first(error);
- while(pos != USEARCH_DONE)
- {
- source.replace(pos, search.getMatchedLength(), replace);
- search.setText(source, error);
- search.setOffset(pos+replace.length(), error);
- pos = search.next(error);
- }
- }
- void doUnicodeLocaleFindAtStrengthReplace(UnicodeString & source, UnicodeString const & pattern, UnicodeString const & replace, char const * localename, char strength)
- {
- UErrorCode error = U_ZERO_ERROR;
- Locale locale(localename);
- StringSearch search(pattern, source, locale, 0, error);
- doModifySearchStrength(search, strength, error);
- int32_t pos = search.first(error);
- while(pos != USEARCH_DONE)
- {
- source.replace(pos, search.getMatchedLength(), replace);
- search.setText(source, error);
- search.setOffset(pos+replace.length(), error);
- pos = search.next(error);
- }
- }
- void doUnicodeCleanSpaces(UnicodeString & source)
- {
- int32_t srclen;
- int32_t pos = source.indexOf(u32space);
- int32_t endpos;
- int32_t spacelen;
- while(pos != -1)
- {
- srclen = source.length();
- for(endpos=pos; endpos<srclen; endpos++)
- if(source.charAt(endpos)!=u32space) break;
- spacelen = endpos-pos;
- if((pos>0) && (endpos<srclen)) spacelen--;
- if(spacelen>0) source.remove(pos, spacelen);
- pos = source.indexOf(u32space, pos+1);
- }
- }
- /*
- N.B. To do 'real' case-insensitive matching we should use full stringwise casefolding on the source and pattern. The simple char-by-char toupper approach has problems with unicode. For example, some chars uppercase to multiple chars (e.g. the German 'sharp s' uppercases to 'SS'). See http://icu-project.org/userguide/posix.html#case_mappings for more examples. Furthermore, converting as 16-bit code units does not work when code points from U+10000 upwards are involved. Nevertheless, we use the simple char-by-char toupper approach for the UnicodeWildMatch function, because it is intended as a high-speed function. For accurate case-folding, you should either use the UnicodeToUpperCase function explicitly on the arguments or use REGEXFIND.
- */
- inline UChar u16toupper(UChar c)
- {
- UChar32 o = u_toupper(c);
- return U_IS_SUPPLEMENTARY(o) ? c : (UChar)o;
- }
- static icu::Transliterator* deAccenter = NULL;
- static CriticalSection accenterCrit;
// Returns the smallest of three unsigned char values.
inline unsigned char min3(unsigned char a, unsigned char b, unsigned char c)
{
    unsigned char smallest = a;
    if (b < smallest)
        smallest = b;
    if (c < smallest)
        smallest = c;
    return smallest;
}
// Edit-distance value returned when collation elements could not be produced for an input.
#define DISTANCE_ON_ERROR   999
- class CEList
- {
- private:
- UnicodeString ustring_;
- uint32_t* ces_;
- uint32_t length_;
- uint32_t capacity_;
- bool invalid;
- void doCreateCEList(RuleBasedCollator& rbc) {
- UErrorCode status = U_ZERO_ERROR;
- CollationElementIterator* ceIterator = rbc.createCollationElementIterator( ustring_ );
- if (!capacity_) {
- capacity_ = ustring_.length();
- }
- ces_ = new uint32_t[capacity_];
- uint32_t ce = 0;
- do {
- ce = ceIterator->next(status);
- if ((length_ == capacity_) || (ce == CollationElementIterator::NULLORDER))
- break;
- ces_[length_++] = ce;
- } while (ce != CollationElementIterator::NULLORDER);
- delete ceIterator;
- if (U_FAILURE(status)) invalid = true;
- }
- public:
- CEList(RuleBasedCollator& rbc, const UnicodeString & source, uint32_t capacity=0)
- : length_(0), capacity_(capacity), ustring_(source), invalid(false)
- {
- doCreateCEList(rbc);
- }
- ~CEList()
- {
- delete[] ces_;
- }
-
- uint32_t operator[](uint32_t offset)
- {
- return (offset < length_ )? ces_[offset]:0xffff;
- }
- uint32_t length() { return length_;}
- uint32_t capacity() {return capacity_;}
- inline bool isInvalid() const { return invalid; }
- };
// Returns the low bit of x (0 or 1); used to pick one of two alternating rows.
inline unsigned mask(unsigned x)
{
    return x % 2u;
}
- unsigned unicodeEditDistanceV2(UnicodeString & left, UnicodeString & right, RuleBasedCollator& rbc)
- {
- unsigned char i, j;
- doTrimRight(left);
- doTrimRight(right);
- unsigned leftLen = left.length();
- unsigned rightLen = right.length();
- if (leftLen > 255)
- leftLen = 255;
- if (rightLen > 255)
- rightLen = 255;
- if (leftLen == 0)
- return rightLen;
- if (rightLen == 0)
- return leftLen;
- CEList leftCEs(rbc, left, leftLen);
- CEList rightCEs(rbc, right, rightLen);
- if (leftCEs.isInvalid() || rightCEs.isInvalid())
- return DISTANCE_ON_ERROR;
- leftLen = leftCEs.length();
- rightLen = rightCEs.length();
- //Optimize the storage requirements by
- //i) Only storing two stripes
- //ii) Calculate, but don't store the row comparing against the null string
- unsigned char da[2][256];
- uint32_t r_0 = rightCEs[0];
- uint32_t l_0 = leftCEs[0];
- bool matched_l0 = false;
- for (j = 0; j < rightLen; j++)
- {
- if (rightCEs[j] == l_0) matched_l0 = true;
- da[0][j] = (matched_l0) ? j : j+1;
- }
- bool matched_r0 = (l_0 == r_0);
- for (i = 1; i < leftLen; i++)
- {
- uint32_t l_i = leftCEs[i];
- if (l_i == r_0)
- matched_r0 = true;
- byte da_i_0 = matched_r0 ? i : i+1;
- da[mask(i)][0] = da_i_0;
- byte da_i_prevj = da_i_0;
- for (j = 1; j < rightLen; j++)
- {
- uint32_t r_j = rightCEs[j];
- unsigned char next = (l_i == r_j) ? da[mask(i-1)][j-1] :
- min3(da[mask(i-1)][j], da_i_prevj, da[mask(i-1)][j-1]) + 1;
- da[mask(i)][j] = next;
- da_i_prevj = next;
- }
- }
- return da[mask(leftLen-1)][rightLen-1];
- }
// This could be further improved in the following ways:
// * Only use 2*radius bytes of temporary storage - I doubt it is worth it.
// * Special-case edit1 - you could use variables for the 6 interesting array elements, and get
//   rid of the array completely. You could also unwind the first (and last) iterations.
// * I suspect the early exit condition could be improved depending on the lengths of the strings.
- unsigned unicodeEditDistanceV3(UnicodeString & left, UnicodeString & right, unsigned radius, RuleBasedCollator& rbc)
- {
- if (radius >= 255)
- return 255;
- doTrimRight(left);
- doTrimRight(right);
- unsigned leftLen = left.length();
- unsigned rightLen = right.length();
- unsigned minED = (leftLen < rightLen)? rightLen - leftLen: leftLen - rightLen;
- if (minED > radius)
- return minED;
- if (leftLen > 255)
- leftLen = 255;
- if (rightLen > 255)
- rightLen = 255;
- //Checking for leading common substrings actually slows the function down.
- if (leftLen == 0)
- return rightLen;
- if (rightLen == 0)
- return leftLen;
- CEList leftCEs(rbc, left, leftLen);
- CEList rightCEs(rbc, right, rightLen);
- leftLen = leftCEs.length();
- rightLen = rightCEs.length();
- /*
- This function applies two optimizations over the function above.
- a) Adding a charcter (next row) can at most decrease the edit distance by 1, so short circuit when
- we there is no possiblity of getting within the distance.
- b) We only need to evaluate the martix da[i-radius..i+radius][j-radius..j+radius]
- not taking into account values outside that range [can use max value to prevent access]
- */
- //Optimize the storage requirements by
- //i) Only storing two stripes
- //ii) Calculate, but don't store the row comparing against the null string
- unsigned char da[2][256];
- uint32_t r_0 = rightCEs[0];
- uint32_t l_0 = leftCEs[0];
- bool matched_l0 = false;
- for (unsigned char j = 0; j < rightLen; j++)
- {
- if (rightCEs[j] == l_0) matched_l0 = true;
- da[0][j] = (matched_l0) ? j : j+1;
- }
- bool matched_r0 = (l_0 == r_0);
- for (unsigned char i = 1; i < leftLen; i++)
- {
- uint32_t l_i = leftCEs[i];
- if (l_i == r_0)
- matched_r0 = true;
- byte da_i_0 = matched_r0 ? i : i+1;
- da[mask(i)][0] = da_i_0;
- byte da_i_prevj = da_i_0;
- unsigned low = i-radius;
- unsigned high = i+radius;
- unsigned first = (i > radius) ? low : 1;
- unsigned last = (high >= rightLen) ? rightLen : high +1;
- for (unsigned j = first; j < last; j++)
- {
- uint32_t r_j = rightCEs[j];
- unsigned char next = da[mask(i-1)][j-1];
- if (l_i != r_j)
- {
- if (j != low)
- {
- if (next > da_i_prevj)
- next = da_i_prevj;
- }
- if (j != high)
- {
- byte da_previ_j = da[mask(i-1)][j];
- if (next > da_previ_j)
- next = da_previ_j;
- }
- next++;
- }
- da[mask(i)][j] = next;
- da_i_prevj = next;
- }
- // bail out early if ed can't possibly be <= radius
- // Only considering a strip down the middle of the matrix, so the maximum the score can ever be adjusted is 2xradius
- unsigned max_valid_score = 3*radius;
- // But maximum is also 1 for every difference in string length - comes in to play when close to the end.
- //In 32bit goes slower for radius=1 I suspect because running out of registers. Retest in 64bit.
- if (radius > 1)
- {
- unsigned max_distance = radius + (leftLen - (i+1)) + (rightLen - last);
- if (max_valid_score > max_distance)
- max_valid_score = max_distance;
- }
- if (da_i_prevj > max_valid_score)
- return da_i_prevj;
- }
- return da[mask(leftLen-1)][rightLen-1];
- }
- UnicodeString getNthWord(RuleBasedBreakIterator& bi, UnicodeString const & source, unsigned n)
- {
- UnicodeString word;
- if (!n) return word;
- bi.setText(source);
- int32_t start = bi.first();
- while (start != BreakIterator::DONE && n) {
- int breakType = bi.getRuleStatus();
- if (breakType != UBRK_WORD_NONE) {
- // Exclude spaces, punctuation, and the like.
- // A status value UBRK_WORD_NONE indicates that the boundary does
- // not start a word or number.
- //
- n--;
- if (!n) {
- unsigned wordBegining = bi.preceding(start);
- unsigned wordEnd = bi.next();
- source.extractBetween(wordBegining, wordEnd, word);
- }
- }
- start = bi.next();
- }
- return word;
- }
- unsigned doCountWords(RuleBasedBreakIterator& bi, UnicodeString const & source)
- {
- bi.setText(source);
- int32_t start = bi.first();
- int32_t count = 0;
- while (start != BreakIterator::DONE) {
- int breakType = bi.getRuleStatus();
- if (breakType != UBRK_WORD_NONE) {
- // Exclude spaces, punctuation, and the like.
- // A status value UBRK_WORD_NONE indicates that the boundary does
- // not start a word or number.
- //
- ++count;
- }
- start = bi.next();
- }
- return count;
- }
- }//namespace
- using namespace nsUnicodelib;
- UNICODELIB_API void setPluginContext(IPluginContext * _ctx) { parentCtx = _ctx; }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeFilterOut(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit)
- {
- UnicodeString const in(src, srcLen);
- UnicodeString const filter(hit, hitLen);
- UnicodeString out;
- StringCharacterIterator iter(in);
- for(iter.first32(); iter.hasNext(); iter.next32())
- {
- UChar32 c = iter.current32();
- if(filter.indexOf(c) == -1)
- out.append(c);
- }
- tgtLen = out.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- out.extract(0, tgtLen, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeFilter(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit)
- {
- UnicodeString const in(src, srcLen);
- UnicodeString const filter(hit, hitLen);
- UnicodeString out;
- StringCharacterIterator iter(in);
- for(iter.first32(); iter.hasNext(); iter.next32())
- {
- UChar32 c = iter.current32();
- if(filter.indexOf(c) != -1)
- out.append(c);
- }
- tgtLen = out.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- out.extract(0, tgtLen, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeSubsOut(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned newCharLen, UChar const * newChar)
- {
- UnicodeString out;
- if(newCharLen > 0)
- {
- UnicodeString const in(src, srcLen);
- UnicodeString const filter(hit, hitLen);
- UnicodeString const replaceString(newChar, newCharLen);
- UChar32 replace = replaceString.char32At(0);
- StringCharacterIterator iter(in);
- for(iter.first32(); iter.hasNext(); iter.next32())
- {
- UChar32 c = iter.current32();
- if(filter.indexOf(c) == -1)
- out.append(c);
- else
- out.append(replace);
- }
- }
- else
- out.append(src, srcLen);
- tgtLen = out.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- out.extract(0, tgtLen, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeSubs(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned newCharLen, UChar const * newChar)
- {
- UnicodeString out;
- if(newCharLen > 0)
- {
- UnicodeString const in(src, srcLen);
- UnicodeString const filter(hit, hitLen);
- UnicodeString const replaceString(newChar, newCharLen);
- UChar32 replace = replaceString.char32At(0);
- StringCharacterIterator iter(in);
- for(iter.first32(); iter.hasNext(); iter.next32())
- {
- UChar32 c = iter.current32();
- if(filter.indexOf(c) != -1)
- out.append(c);
- else
- out.append(replace);
- }
- }
- else
- out.append(src, srcLen);
- tgtLen = out.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- out.extract(0, tgtLen, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeRepad(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned tLen)
- {
- UnicodeString out(src, srcLen);
- out.trim();
- forceLength(out, tLen);
- tgtLen = out.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- out.extract(0, tgtLen, tgt);
- }
- UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeFind(unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned instance)
- {
- return ulUnicodeLocaleFind(srcLen, src, hitLen, hit, instance, "");
- }
- UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleFind(unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned instance, char const * localename)
- {
- UErrorCode error = U_ZERO_ERROR;
- UStringSearch * search = usearch_open(hit, hitLen, src, srcLen, localename, 0, &error);
- int32_t pos;
- for(pos = usearch_first(search, &error); pos != USEARCH_DONE; pos = usearch_next(search, &error))
- {
- if(!--instance)
- {
- usearch_close(search);
- return pos+1;
- }
- }
- usearch_close(search);
- return 0;
- }
- UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleFindAtStrength(unsigned srcLen, UChar const * src, unsigned hitLen, UChar const * hit, unsigned instance, char const * localename, char strength)
- {
- UnicodeString const source(src, srcLen);
- UnicodeString const pattern(hit, hitLen);
- UErrorCode error = U_ZERO_ERROR;
- Locale locale(localename);
- StringSearch search(pattern, source, locale, 0, error);
- doModifySearchStrength(search, strength, error);
- int32_t pos = search.first(error);
- while(pos != USEARCH_DONE)
- {
- if(!--instance)
- return pos+1;
- pos = search.next(error);
- }
- return 0;
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeExtract(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned instance){
- UnicodeString const in(src, srcLen);
- UnicodeString out;
- if(extract(out, in, instance))
- {
- tgtLen = out.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- out.extract(0, tgtLen, tgt);
- }
- else
- {
- tgtLen = 0;
- tgt = 0;
- }
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeExtract50(UChar *tgt, unsigned srcLen, UChar const * src, unsigned instance)
- {
- UnicodeString const in(src, srcLen);
- UnicodeString out;
- extract(out, in, instance);
- forceLength(out, 50);
- out.extract(0, 50, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeToLowerCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
- {
- UnicodeString unicode(src, srcLen);
- unicode.toLower();
- tgtLen = unicode.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- unicode.extract(0, tgtLen, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeToUpperCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
- {
- UnicodeString unicode(src, srcLen);
- unicode.toUpper();
- tgtLen = unicode.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- unicode.extract(0, tgtLen, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeToProperCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
- {
- UnicodeString unicode(src, srcLen);
- unicode.toTitle(0);
- tgtLen = unicode.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- unicode.extract(0, tgtLen, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeToLowerCase80(UChar * tgt, unsigned srcLen, UChar const * src)
- {
- UnicodeString unicode(src, srcLen);
- unicode.toLower();
- forceLength(unicode, 80);
- unicode.extract(0, 80, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeToUpperCase80(UChar * tgt, unsigned srcLen, UChar const * src)
- {
- UnicodeString unicode(src, srcLen);
- unicode.toUpper();
- forceLength(unicode, 80);
- unicode.extract(0, 80, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeToProperCase80(UChar * tgt, unsigned srcLen, UChar const * src)
- {
- UnicodeString unicode(src, srcLen);
- unicode.toTitle(0);
- forceLength(unicode, 80);
- unicode.extract(0, 80, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToLowerCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, char const * localename)
- {
- UnicodeString unicode(src, srcLen);
- Locale locale(localename);
- unicode.toLower(locale);
- tgtLen = unicode.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- unicode.extract(0, tgtLen, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToUpperCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, char const * localename)
- {
- UnicodeString unicode(src, srcLen);
- Locale locale(localename);
- unicode.toUpper(locale);
- tgtLen = unicode.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- unicode.extract(0, tgtLen, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToProperCase(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, char const * localename)
- {
- UnicodeString unicode(src, srcLen);
- Locale locale(localename);
- unicode.toTitle(0, locale);
- tgtLen = unicode.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- unicode.extract(0, tgtLen, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToLowerCase80(UChar * tgt, unsigned srcLen, UChar const * src, char const * localename)
- {
- UnicodeString unicode(src, srcLen);
- Locale locale(localename);
- unicode.toLower(locale);
- forceLength(unicode, 80);
- unicode.extract(0, 80, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToUpperCase80(UChar * tgt, unsigned srcLen, UChar const * src, char const * localename)
- {
- UnicodeString unicode(src, srcLen);
- Locale locale(localename);
- unicode.toUpper(locale);
- forceLength(unicode, 80);
- unicode.extract(0, 80, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleToProperCase80(UChar * tgt, unsigned srcLen, UChar const * src, char const * localename)
- {
- UnicodeString unicode(src, srcLen);
- Locale locale(localename);
- unicode.toTitle(0, locale);
- forceLength(unicode, 80);
- unicode.extract(0, 80, tgt);
- }
- UNICODELIB_API int UNICODELIB_CALL ulUnicodeCompareIgnoreCase(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2)
- {
- return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::SECONDARY);
- }
- UNICODELIB_API int UNICODELIB_CALL ulUnicodeCompareAtStrength(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2, char strength)
- {
- switch(strength)
- {
- case 1:
- return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::PRIMARY);
- case 2:
- return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::SECONDARY);
- case 3:
- return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::TERTIARY);
- case 4:
- return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::QUATERNARY);
- case 5:
- default:
- return doUnicodeCompareAtStrength(src1Len, src1, src2Len, src2, Collator::IDENTICAL);
- }
- }
- UNICODELIB_API int UNICODELIB_CALL ulUnicodeLocaleCompareIgnoreCase(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2, char const * localename)
- {
- return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::SECONDARY);
- }
- UNICODELIB_API int UNICODELIB_CALL ulUnicodeLocaleCompareAtStrength(unsigned src1Len, UChar const * src1, unsigned src2Len, UChar const * src2, char const * localename, char strength)
- {
- switch(strength)
- {
- case 1:
- return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::PRIMARY);
- case 2:
- return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::SECONDARY);
- case 3:
- return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::TERTIARY);
- case 4:
- return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::QUATERNARY);
- case 5:
- default:
- return doUnicodeLocaleCompareAtStrength(src1Len, src1, src2Len, src2, localename, Collator::IDENTICAL);
- }
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeReverse(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
- {
- UnicodeString in(src, srcLen);
- UnicodeString out;
- StringCharacterIterator iter(in);
- for(iter.last32(); iter.hasPrevious(); iter.previous32())
- out.append(iter.current32());
- if(srcLen) out.append(iter.current32());
- tgtLen = out.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- out.extract(0, tgtLen, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeFindReplace(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok)
- {
- UnicodeString source(src, srcLen);
- UnicodeString const pattern(stok, stokLen);
- UnicodeString const replace(rtok, rtokLen);
- source.findAndReplace(pattern, replace);
- tgtLen = source.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- source.extract(0, tgtLen, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleFindReplace(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok, char const * localename)
- {
- UnicodeString source(src, srcLen);
- UnicodeString const pattern(stok, stokLen);
- UnicodeString const replace(rtok, rtokLen);
- doUnicodeLocaleFindReplace(source, pattern, replace, localename);
- tgtLen = source.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- source.extract(0, tgtLen, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleFindAtStrengthReplace(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok, char const * localename, char strength)
- {
- UnicodeString source(src, srcLen);
- UnicodeString const pattern(stok, stokLen);
- UnicodeString const replace(rtok, rtokLen);
- doUnicodeLocaleFindAtStrengthReplace(source, pattern, replace, localename, strength);
- tgtLen = source.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- source.extract(0, tgtLen, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeFindReplace80(UChar * tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok)
- {
- UnicodeString source(src, srcLen);
- UnicodeString const pattern(stok, stokLen);
- UnicodeString const replace(rtok, rtokLen);
- source.findAndReplace(pattern, replace);
- forceLength(source, 80);
- source.extract(0, 80, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleFindReplace80(UChar * tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok, char const * localename)
- {
- UnicodeString source(src, srcLen);
- UnicodeString const pattern(stok, stokLen);
- UnicodeString const replace(rtok, rtokLen);
- doUnicodeLocaleFindReplace(source, pattern, replace, localename);
- forceLength(source, 80);
- source.extract(0, 80, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleFindAtStrengthReplace80(UChar * tgt, unsigned srcLen, UChar const * src, unsigned stokLen, UChar const * stok, unsigned rtokLen, UChar const * rtok, char const * localename, char strength)
- {
- UnicodeString source(src, srcLen);
- UnicodeString const pattern(stok, stokLen);
- UnicodeString const replace(rtok, rtokLen);
- doUnicodeLocaleFindAtStrengthReplace(source, pattern, replace, localename, strength);
- forceLength(source, 80);
- source.extract(0, 80, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeCleanSpaces(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
- {
- UnicodeString source(src, srcLen);
- doUnicodeCleanSpaces(source);
- tgtLen = source.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- source.extract(0, tgtLen, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeCleanSpaces25(UChar * tgt, unsigned srcLen, UChar const * src)
- {
- UnicodeString source(src, srcLen);
- doUnicodeCleanSpaces(source);
- forceLength(source, 25);
- source.extract(0, 25, tgt);
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeCleanSpaces80(UChar * tgt, unsigned srcLen, UChar const * src)
- {
- UnicodeString source(src, srcLen);
- doUnicodeCleanSpaces(source);
- forceLength(source, 80);
- source.extract(0, 80, tgt);
- }
- UNICODELIB_API bool UNICODELIB_CALL ulUnicodeWildMatch(unsigned srcLen, UChar const * src, unsigned patLen, UChar const * pat, bool noCase)
- {
- return wildTrimMatch<UChar, u16toupper, u16query, u16asterisk, u16space>(src, srcLen, pat, patLen, noCase);
- }
- UNICODELIB_API bool UNICODELIB_CALL ulUnicodeContains(unsigned srcLen, UChar const * src, unsigned patLen, UChar const * pat, bool noCase)
- {
- UnicodeString source(src, srcLen);
- UnicodeString pattern(pat, patLen);
- if(noCase)
- {
- source.foldCase();
- pattern.foldCase();
- }
- StringCharacterIterator iter(pattern);
- for(iter.first32(); iter.hasNext(); iter.next32())
- if(source.indexOf(iter.current32()) == -1)
- return false;
- return true;
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeCleanAccents(unsigned & tgtLen, UChar * & tgt, unsigned srcLen, UChar const * src)
- {
- if (!deAccenter)
- {
- CriticalBlock b(accenterCrit);
- if (!deAccenter)
- {
- UErrorCode lStatus = U_ZERO_ERROR;
- deAccenter = icu::Transliterator::createInstance("NFD; [:M:] Remove; NFC;", UTRANS_FORWARD, lStatus);
- }
- }
- UnicodeString source(src, srcLen);
- deAccenter->transliterate(source);
- tgtLen = source.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- source.extract(0, tgtLen, tgt);
- }
- static RuleBasedCollator * createRBCollator(const char * localename)
- {
- UErrorCode status = U_ZERO_ERROR;
- Locale locale(localename);
- RuleBasedCollator * rbc = (RuleBasedCollator *)RuleBasedCollator::createInstance(locale, status);
- rbc->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
- if (U_FAILURE(status))
- {
- delete rbc;
- return NULL;
- }
- return rbc;
- }
- class RBCLocale
- {
- public:
- RBCLocale(char const * _locale) : locale(_locale)
- {
- rbc = createRBCollator(locale);
- }
- ~RBCLocale()
- {
- delete rbc;
- }
- RuleBasedCollator * queryCollator() const { return rbc; }
- private:
- StringAttr locale;
- RuleBasedCollator * rbc;
- };
- typedef MapStringTo<RBCLocale, char const *> MapStrToRBC;
- static MapStrToRBC * localeMap;
- static CriticalSection localeCrit;
- MODULE_INIT(INIT_PRIORITY_STANDARD)
- {
- return true;
- }
- MODULE_EXIT()
- {
- delete localeMap;
- localeMap = NULL;
- }
- static RuleBasedCollator * queryRBCollator(const char * localename)
- {
- if (!localename) localename = "";
- CriticalBlock b(localeCrit);
- if (!localeMap)
- localeMap = new MapStrToRBC;
- RBCLocale * loc = localeMap->getValue(localename);
- if(!loc)
- {
- //MORE: ECLRTL calls rtlGetNormalizedUnicodeLocaleName(). Should this be happening here?
- const char * normalizedlocale = localename;
- localeMap->setValue(localename, normalizedlocale);
- loc = localeMap->getValue(localename);
- }
- return loc->queryCollator();
- }
- UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleEditDistance(unsigned leftLen, UChar const * left, unsigned rightLen, UChar const * right, char const * localename)
- {
- RuleBasedCollator* rbc = queryRBCollator(localename);
- if (!rbc)
- return DISTANCE_ON_ERROR;
- UnicodeString uLeft(left, leftLen);
- UnicodeString uRight(right, rightLen);
- unsigned distance = nsUnicodelib::unicodeEditDistanceV2(uLeft, uRight, *rbc);
- return distance;
- }
- UNICODELIB_API bool UNICODELIB_CALL ulUnicodeLocaleEditDistanceWithinRadius(unsigned leftLen, UChar const * left, unsigned rightLen, UChar const * right, unsigned radius, char const * localename)
- {
- RuleBasedCollator* rbc = queryRBCollator(localename);
- if (!rbc)
- return false;
- UnicodeString uLeft(left, leftLen);
- UnicodeString uRight(right, rightLen);
- unsigned distance = nsUnicodelib::unicodeEditDistanceV3(uLeft, uRight, radius, *rbc);
- return distance <= radius;
- }
- UNICODELIB_API unsigned UNICODELIB_CALL ulUnicodeLocaleWordCount(unsigned textLen, UChar const * text, char const * localename)
- {
- UErrorCode status = U_ZERO_ERROR;
- Locale locale(localename);
- RuleBasedBreakIterator* bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(locale, status);
- UnicodeString uText(text, textLen);
- uText.trim();
- unsigned count = doCountWords(*bi, uText);
- delete bi;
- return count;
- }
- UNICODELIB_API void UNICODELIB_CALL ulUnicodeLocaleGetNthWord(unsigned & tgtLen, UChar * & tgt, unsigned textLen, UChar const * text, unsigned n, char const * localename)
- {
- UErrorCode status = U_ZERO_ERROR;
- Locale locale(localename);
- RuleBasedBreakIterator* bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(locale, status);
- UnicodeString uText(text, textLen);
- uText.trim();
- UnicodeString word = getNthWord(*bi, uText, n);
- if(word.length()>0)
- {
- tgtLen = word.length();
- tgt = (UChar *)CTXMALLOC(parentCtx, tgtLen*2);
- word.extract(0, tgtLen, tgt);
- }
- else
- {
- tgtLen = 0;
- tgt = 0;
- }
- }
|