stringlib.cpp 52 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762
  1. /*##############################################################################
  2. Copyright (C) 2011 HPCC Systems.
  3. All rights reserved. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU Affero General Public License as
  5. published by the Free Software Foundation, either version 3 of the
  6. License, or (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU Affero General Public License for more details.
  11. You should have received a copy of the GNU Affero General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. ############################################################################## */
  14. #include <time.h>
  15. #include <stdlib.h>
  16. #include <string.h>
  17. #include <ctype.h>
  18. #include <assert.h>
  19. #include "stringlib.hpp"
  20. #include "wildmatch.tpp"
  // Version strings of earlier StringLib releases. The NULL-terminated list is
  // handed to the host through ECLPluginDefinitionBlockEx::compatibleVersions
  // in getECLPluginDefinition().
  static const char * compatibleVersions[] =
  {
      "STRINGLIB 1.1.06 [fd997dc3feb4ca385d59a12b9dc4beab]", // windows version
      "STRINGLIB 1.1.06 [f8305e66ca26a1447dee66d4a36d88dc]", // linux version
      "STRINGLIB 1.1.07",
      "STRINGLIB 1.1.08",
      "STRINGLIB 1.1.09",
      "STRINGLIB 1.1.10",
      "STRINGLIB 1.1.11",
      "STRINGLIB 1.1.12",
      "STRINGLIB 1.1.13",
      NULL
  };

  // Version string reported for this build (plugin definition and GetBuildInfo).
  #define STRINGLIB_VERSION "STRINGLIB 1.1.14"
  // ECL-side declaration of the StringLib service. Each quoted line declares one
  // ECL function together with the name of the C entry point that implements it
  // (entrypoint='sl...'). The complete text is published to the host through
  // pb->ECL in getECLPluginDefinition(). The text is runtime data: do not reflow it.
  33. const char * EclDefinition =
  34. "export StringLib := SERVICE\n"
  35. " string StringFilterOut(const string src, const string _within) : c, pure,entrypoint='slStringFilterOut'; \n"
  36. " string StringFilter(const string src, const string _within) : c, pure,entrypoint='slStringFilter'; \n"
  37. " string StringSubstituteOut(const string src, const string _within, const string _newchar) : c, pure,entrypoint='slStringSubsOut'; \n"
  38. " string StringSubstitute(const string src, const string _within, const string _newchar) : c, pure,entrypoint='slStringSubs'; \n"
  39. " string StringRepad(const string src, unsigned4 size) : c, pure,entrypoint='slStringRepad'; \n"
  40. " unsigned integer4 StringFind(const string src, const string tofind, unsigned4 instance ) : c, pure,entrypoint='slStringFind'; \n"
  41. " unsigned integer4 StringUnboundedUnsafeFind(const string src, const string tofind ) : c, pure,entrypoint='slStringFind2'; \n"
  42. " unsigned integer4 StringFindCount(const string src, const string tofind) : c, pure,entrypoint='slStringFindCount'; \n"
  43. " unsigned integer4 EbcdicStringFind(const ebcdic string src, const ebcdic string tofind , unsigned4 instance ) : c,pure,entrypoint='slStringFind'; \n"
  44. " unsigned integer4 EbcdicStringUnboundedUnsafeFind(const ebcdic string src, const ebcdic string tofind ) : c,pure,entrypoint='slStringFind2'; \n"
  45. " string StringExtract(const string src, unsigned4 instance) : c,pure,entrypoint='slStringExtract'; \n"
  46. " string8 GetDateYYYYMMDD() : c,once,entrypoint='slGetDateYYYYMMDD2';\n"
  47. " varstring GetBuildInfo() : c,once,entrypoint='slGetBuildInfo';\n"
  48. " string Data2String(const data src) : c,pure,entrypoint='slData2String';\n"
  49. " data String2Data(const string src) : c,pure,entrypoint='slString2Data';\n"
  50. " string StringToLowerCase(const string src) : c,pure,entrypoint='slStringToLowerCase';\n"
  51. " string StringToUpperCase(const string src) : c,pure,entrypoint='slStringToUpperCase';\n"
  52. " string StringToProperCase(const string src) : c,pure,entrypoint='slStringToProperCase';\n"
  53. " string StringToCapitalCase(const string src) : c,pure,entrypoint='slStringToCapitalCase';\n"
  54. " string StringToTitleCase(const string src) : c,pure,entrypoint='slStringToTitleCase';\n"
  55. " integer4 StringCompareIgnoreCase(const string src1, string src2) : c,pure,entrypoint='slStringCompareIgnoreCase';\n"
  56. " string StringReverse(const string src) : c,pure,entrypoint='slStringReverse';\n"
  57. " string StringFindReplace(const string src, const string stok, const string rtok) : c,pure,entrypoint='slStringFindReplace';\n"
  58. " string StringCleanSpaces(const string src) : c,pure,entrypoint='slStringCleanSpaces'; \n"
  59. " boolean StringWildMatch(const string src, const string _pattern, boolean _noCase) : c, pure,entrypoint='slStringWildMatch'; \n"
  60. " boolean StringWildExactMatch(const string src, const string _pattern, boolean _noCase) : c, pure,entrypoint='slStringWildExactMatch'; \n"
  61. " boolean StringContains(const string src, const string _pattern, boolean _noCase) : c, pure,entrypoint='slStringContains'; \n"
  62. " string StringExtractMultiple(const string src, unsigned8 mask) : c,pure,entrypoint='slStringExtractMultiple'; \n"
  63. " unsigned integer4 EditDistance(const string l, const string r) : c, pure,entrypoint='slEditDistance'; \n"
  64. " boolean EditDistanceWithinRadius(const string l, const string r, unsigned4 radius) : c,pure,entrypoint='slEditDistanceWithinRadius'; \n"
  65. " unsigned integer4 EditDistanceV2(const string l, const string r) : c, pure,entrypoint='slEditDistanceV2'; \n"
  66. " boolean EditDistanceWithinRadiusV2(const string l, const string r, unsigned4 radius) : c,pure,entrypoint='slEditDistanceWithinRadiusV2'; \n"
  67. " string StringGetNthWord(const string src, unsigned4 n) : c, pure,entrypoint='slStringGetNthWord'; \n"
  68. " unsigned4 StringWordCount(const string src) : c, pure,entrypoint='slStringWordCount'; \n"
  69. " unsigned4 CountWords(const string src, const string _separator, BOOLEAN allow_blanks) : c, pure,entrypoint='slCountWords'; \n"
  70. " SET OF STRING SplitWords(const string src, const string _separator, BOOLEAN allow_blanks) : c, pure,entrypoint='slSplitWords'; \n"
  71. " STRING CombineWords(set of string src, const string _separator) : c, pure,entrypoint='slCombineWords'; \n"
  72. " UNSIGNED4 StringToDate(const string src, const varstring format) : c, pure,entrypoint='slStringToDate'; \n"
  73. " UNSIGNED4 MatchDate(const string src, set of varstring formats) : c, pure,entrypoint='slMatchDate'; \n"
  74. " STRING FormatDate(UNSIGNED4 date, const varstring format) : c, pure,entrypoint='slFormatDate'; \n"
  75. "END;";
  76. STRINGLIB_API bool getECLPluginDefinition(ECLPluginDefinitionBlock *pb)
  77. {
  78. if (pb->size == sizeof(ECLPluginDefinitionBlockEx))
  79. {
  80. ECLPluginDefinitionBlockEx * pbx = (ECLPluginDefinitionBlockEx *) pb;
  81. pbx->compatibleVersions = compatibleVersions;
  82. }
  83. else if (pb->size != sizeof(ECLPluginDefinitionBlock))
  84. return false;
  85. pb->magicVersion = PLUGIN_VERSION;
  86. pb->version = STRINGLIB_VERSION;
  87. pb->moduleName = "lib_stringlib";
  88. pb->ECL = EclDefinition;
  89. pb->flags = PLUGIN_IMPLICIT_MODULE | PLUGIN_MULTIPLE_VERSIONS;
  90. pb->description = "StringLib string manipulation library";
  91. return true;
  92. }
  93. namespace nsStringlib {
  94. IPluginContext * parentCtx = NULL;
  95. enum { bitsInUnsigned = sizeof(unsigned) * 8 };
  96. static const char hexchar[] = "0123456789ABCDEF";
  // Convert one hexadecimal digit character (either case) to its value 0..15.
  // Any character that is not a hex digit - including NUL - yields 0.
  static unsigned hex2digit(char c)
  {
      if (c >= '1' && c <= '9')
          return (unsigned)(c - '0');
      if (c >= 'a' && c <= 'f')
          return 10u + (unsigned)(c - 'a');
      if (c >= 'A' && c <= 'F')
          return 10u + (unsigned)(c - 'A');
      return 0;   // '0' itself and every invalid character
  }
  // Upper-case a single character.
  // toupper() is only defined for values representable as unsigned char (or EOF),
  // so a plain char - which may be negative - is converted through unsigned char first.
  inline char char_toupper(char c) { return (char)toupper((unsigned char)c); }
  // Reduce len so that s[0..len) no longer ends in space characters.
  // len becomes 0 when the whole range is spaces; s itself is not modified.
  inline void clip(unsigned &len, const char * s)
  {
      unsigned trimmed = len;
      while (trimmed != 0 && s[trimmed - 1] == ' ')
          --trimmed;
      len = trimmed;
  }
  // Smallest of three unsigned values.
  inline unsigned min3(unsigned a, unsigned b, unsigned c)
  {
      const unsigned smallerOfAB = (b < a) ? b : a;
      return (c < smallerOfAB) ? c : smallerOfAB;
  }
  139. class CEditDistance
  140. {
  141. private:
  142. unsigned char da[256][256];
  143. public:
  144. unsigned editDistance(unsigned leftLen, const char * left, unsigned rightLen, const char * right)
  145. {
  146. unsigned i, j, cost;
  147. char l_i, r_j;
  148. clip(leftLen, left);
  149. clip(rightLen, right);
  150. if (leftLen > 255)
  151. {
  152. leftLen = 255;
  153. }
  154. if (rightLen > 255)
  155. {
  156. rightLen = 255;
  157. }
  158. if (leftLen == 0)
  159. {
  160. return rightLen;
  161. }
  162. if (rightLen == 0)
  163. {
  164. return leftLen;
  165. }
  166. for (i = 0; i <= leftLen; i++)
  167. {
  168. da[i][0] = i;
  169. }
  170. for (j = 0; j <= rightLen; j++)
  171. {
  172. da[0][j] = j;
  173. }
  174. for (i = 1; i <= leftLen; i++)
  175. {
  176. l_i = left[i - 1];
  177. for (j = 1; j <= rightLen; j++)
  178. {
  179. r_j = right[j - 1];
  180. cost = (l_i == r_j) ? 0 : 1;
  181. da[i][j] = min3(da[i-1][j]+1, da[i][j-1]+1, da[i-1][j-1] + cost);
  182. }
  183. }
  184. return da[leftLen][rightLen];
  185. }
  186. unsigned editDistance(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius)
  187. {
  188. unsigned i, j, cost;
  189. char l_i, r_j;
  190. clip(leftLen, left);
  191. clip(rightLen, right);
  192. if (leftLen > 255)
  193. {
  194. leftLen = 255;
  195. }
  196. if (rightLen > 255)
  197. {
  198. rightLen = 255;
  199. }
  200. if (leftLen == 0)
  201. {
  202. return rightLen;
  203. }
  204. if (rightLen == 0)
  205. {
  206. return leftLen;
  207. }
  208. if (leftLen > rightLen)
  209. {
  210. const char *tstr = left;
  211. left = right;
  212. right = tstr;
  213. unsigned tlen = leftLen;
  214. leftLen = rightLen;
  215. rightLen = tlen;
  216. }
  217. for (i = 0; i <= leftLen; i++)
  218. {
  219. da[i][0] = i;
  220. }
  221. for (j = 0; j <= rightLen; j++)
  222. {
  223. da[0][j] = j;
  224. }
  225. for (i = 1; i <= leftLen; i++)
  226. {
  227. l_i = left[i - 1];
  228. for (j = 1; j <= rightLen; j++)
  229. {
  230. r_j = right[j - 1];
  231. cost = (l_i == r_j) ? 0 : 1;
  232. da[i][j] = min3(da[i-1][j]+1, da[i][j-1]+1, da[i-1][j-1] + cost);
  233. }
  234. // bail out early if ed can't possibly be <= radius
  235. if ((da[i][rightLen] - (leftLen - i)) > radius)
  236. return da[i][rightLen];
  237. }
  238. return da[leftLen][rightLen];
  239. }
  240. };
  241. //--- Optimized versions of the edit distance functions above.
  // Parity of x (0 or 1); selects which of the two stored stripes a row uses.
  inline unsigned mask(unsigned x)
  {
      return x % 2u;
  }
  243. unsigned editDistanceV2(unsigned leftLen, const char * left, unsigned rightLen, const char * right)
  244. {
  245. unsigned i, j;
  246. clip(leftLen, left);
  247. clip(rightLen, right);
  248. if (leftLen > 255)
  249. leftLen = 255;
  250. if (rightLen > 255)
  251. rightLen = 255;
  252. if (leftLen == 0)
  253. return rightLen;
  254. if (rightLen == 0)
  255. return leftLen;
  256. //Optimize the storage requirements by
  257. //i) Only storing two stripes
  258. //ii) Calculate, but don't store the row comparing against the null string
  259. unsigned char da[2][256];
  260. char r_0 = right[0];
  261. char l_0 = left[0];
  262. bool matched_l0 = false;
  263. for (j = 0; j < rightLen; j++)
  264. {
  265. if (right[j] == l_0) matched_l0 = true;
  266. da[0][j] = (matched_l0) ? j : j+1;
  267. }
  268. bool matched_r0 = (l_0 == r_0);
  269. for (i = 1; i < leftLen; i++)
  270. {
  271. char l_i = left[i];
  272. if (l_i == r_0)
  273. matched_r0 = true;
  274. byte da_i_0 = matched_r0 ? i : i+1;
  275. da[mask(i)][0] = da_i_0;
  276. byte da_i_prevj = da_i_0;
  277. for (j = 1; j < rightLen; j++)
  278. {
  279. char r_j = right[j];
  280. unsigned char next = (l_i == r_j) ? da[mask(i-1)][j-1] :
  281. min3(da[mask(i-1)][j], da_i_prevj, da[mask(i-1)][j-1]) + 1;
  282. da[mask(i)][j] = next;
  283. da_i_prevj = next;
  284. }
  285. }
  286. return da[mask(leftLen-1)][rightLen-1];
  287. }
  288. unsigned editDistanceV2(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius)
  289. {
  290. unsigned i, j;
  291. clip(leftLen, left);
  292. clip(rightLen, right);
  293. unsigned minED = (leftLen < rightLen)? rightLen - leftLen: leftLen - rightLen;
  294. if (minED > radius)
  295. return minED;
  296. if (leftLen > 255)
  297. leftLen = 255;
  298. if (rightLen > 255)
  299. rightLen = 255;
  300. //Checking for leading common substrings actually slows the function down.
  301. if (leftLen == 0)
  302. return rightLen;
  303. if (rightLen == 0)
  304. return leftLen;
  305. /*
  306. This function applies two optimizations over the function above.
  307. a) Adding a charcter (next row) can at most decrease the edit distance by 1, so short circuit when
  308. we there is no possiblity of getting within the distance.
  309. b) We only need to evaluate the martix da[i-radius..i+radius][j-radius..j+radius]
  310. not taking into account values outside that range [can use max value to prevent access]
  311. */
  312. //Optimize the storage requirements by
  313. //i) Only storing two stripes
  314. //ii) Calculate, but don't store the row comparing against the null string
  315. //NB: A byte array is ok because the +1 is added after the minimum, and that will always include 254 as an option.
  316. unsigned char da[2][255];
  317. char r_0 = right[0];
  318. char l_0 = left[0];
  319. bool matched_l0 = false;
  320. for (j = 0; j < rightLen; j++)
  321. {
  322. if (right[j] == l_0) matched_l0 = true;
  323. da[0][j] = (matched_l0) ? j : j+1;
  324. }
  325. bool matched_r0 = (l_0 == r_0);
  326. for (i = 1; i < leftLen; i++)
  327. {
  328. char l_i = left[i];
  329. if (l_i == r_0)
  330. matched_r0 = true;
  331. byte da_i_0 = matched_r0 ? i : i+1;
  332. da[mask(i)][0] = da_i_0;
  333. byte da_i_prevj = da_i_0;
  334. unsigned first = (i > radius) ? i-radius : 1;
  335. unsigned last = (i + radius > rightLen) ? rightLen : i + radius;
  336. for (j = 1; j < rightLen; j++)
  337. {
  338. char r_j = right[j];
  339. unsigned char next = (l_i == r_j) ? da[mask(i-1)][j-1] :
  340. min3(da[mask(i-1)][j], da_i_prevj, da[mask(i-1)][j-1]) + 1;
  341. da[mask(i)][j] = next;
  342. da_i_prevj = next;
  343. }
  344. // bail out early if ed can't possibly be <= radius
  345. unsigned maxdelta = (leftLen - (i+1));
  346. if (da_i_prevj > radius + maxdelta) // if da_i_prvj - maxdelta > radius can't ever get low enough
  347. return da_i_prevj;
  348. }
  349. return da[mask(leftLen-1)][rightLen-1];
  350. }
  351. //This could be further improved in the following ways:
  352. // * Only use 2*radius bytes of temporary storage - I doubt it is worth it.
  353. // * special case edit1 - you could use variables for the 6 interesting array elements, and get
  354. // rid of the array completely. You could also unwind the first (and last iterations).
  355. // * I suspect the early exit condition could be improved depending the lengths of the strings.
  356. unsigned editDistanceV3(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius)
  357. {
  358. if (radius >= 255)
  359. return 255;
  360. clip(leftLen, left);
  361. clip(rightLen, right);
  362. unsigned minED = (leftLen < rightLen)? rightLen - leftLen: leftLen - rightLen;
  363. if (minED > radius)
  364. return minED;
  365. if (leftLen > 255)
  366. leftLen = 255;
  367. if (rightLen > 255)
  368. rightLen = 255;
  369. //Checking for leading common substrings actually slows the function down.
  370. if (leftLen == 0)
  371. return rightLen;
  372. if (rightLen == 0)
  373. return leftLen;
  374. /*
  375. This function applies two optimizations over the function above.
  376. a) Adding a charcter (next row) can at most decrease the edit distance by 1, so short circuit when
  377. we there is no possiblity of getting within the distance.
  378. b) We only need to evaluate the martix da[i-radius..i+radius][j-radius..j+radius]
  379. not taking into account values outside that range [can use max value to prevent access]
  380. */
  381. //Optimize the storage requirements by
  382. //i) Only storing two stripes
  383. //ii) Calculate, but don't store the row comparing against the null string
  384. unsigned char da[2][256];
  385. char r_0 = right[0];
  386. char l_0 = left[0];
  387. bool matched_l0 = false;
  388. for (unsigned j = 0; j < rightLen; j++)
  389. {
  390. if (right[j] == l_0) matched_l0 = true;
  391. da[0][j] = (matched_l0) ? j : j+1;
  392. }
  393. bool matched_r0 = (l_0 == r_0);
  394. for (unsigned i = 1; i < leftLen; i++)
  395. {
  396. char l_i = left[i];
  397. if (l_i == r_0)
  398. matched_r0 = true;
  399. byte da_i_0 = matched_r0 ? i : i+1;
  400. da[mask(i)][0] = da_i_0;
  401. byte da_i_prevj = da_i_0;
  402. unsigned low = i-radius;
  403. unsigned high = i+radius;
  404. unsigned first = (i > radius) ? low : 1;
  405. unsigned last = (high >= rightLen) ? rightLen : high +1;
  406. for (unsigned j = first; j < last; j++)
  407. {
  408. char r_j = right[j];
  409. unsigned next = da[mask(i-1)][j-1];
  410. if (l_i != r_j)
  411. {
  412. if (j != low)
  413. {
  414. if (next > da_i_prevj)
  415. next = da_i_prevj;
  416. }
  417. if (j != high)
  418. {
  419. byte da_previ_j = da[mask(i-1)][j];
  420. if (next > da_previ_j)
  421. next = da_previ_j;
  422. }
  423. next++;
  424. }
  425. da[mask(i)][j] = next;
  426. da_i_prevj = next;
  427. }
  428. // bail out early if ed can't possibly be <= radius
  429. // Only considering a strip down the middle of the matrix, so the maximum the score can ever be adjusted is 2xradius
  430. unsigned max_valid_score = 3*radius;
  431. // But maximum is also 1 for every difference in string length - comes in to play when close to the end.
  432. //In 32bit goes slower for radius=1 I suspect because running out of registers. Retest in 64bit.
  433. if (radius > 1)
  434. {
  435. unsigned max_distance = radius + (leftLen - (i+1)) + (rightLen - last);
  436. if (max_valid_score > max_distance)
  437. max_valid_score = max_distance;
  438. }
  439. if (da_i_prevj > max_valid_score)
  440. return da_i_prevj;
  441. }
  442. return da[mask(leftLen-1)][rightLen-1];
  443. }
  444. } // namespace
  445. //-------------------------------------------------------------------------------------------------------------------------------------------
  446. // Exported functions are NOT in the namespace
  447. using namespace nsStringlib;
  448. STRINGLIB_API void setPluginContext(IPluginContext * _ctx) { parentCtx = _ctx; }
  449. STRINGLIB_API void STRINGLIB_CALL slStringFilterOut(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src, unsigned hitLen, const char * hit)
  450. {
  451. char *temp = (char *)CTXMALLOC(parentCtx, srcLen);
  452. unsigned tlen = 0;
  453. if (hitLen==1)
  454. {
  455. char test = *hit;
  456. for ( unsigned i = 0; i < srcLen; i++ )
  457. {
  458. char c = src[i];
  459. if (c!=test)
  460. temp[tlen++] = c;
  461. }
  462. }
  463. else {
  464. unsigned filter[256/bitsInUnsigned];
  465. memset(filter,0,sizeof(filter));
  466. for (unsigned j = 0; j < hitLen; j++ )
  467. {
  468. unsigned c = (unsigned char)hit[j];
  469. filter[c/bitsInUnsigned] |= (1<<(c%bitsInUnsigned));
  470. }
  471. for ( unsigned i = 0; i < srcLen; i++ )
  472. {
  473. unsigned c = (unsigned char)src[i];
  474. if ((filter[c/bitsInUnsigned] & (1<<(c%bitsInUnsigned))) == 0)
  475. temp[tlen++] = (char)c;
  476. }
  477. }
  478. tgt = (char *)CTXREALLOC(parentCtx, temp, tlen);
  479. tgtLen = tlen;
  480. }
  481. STRINGLIB_API void STRINGLIB_CALL slStringFilter(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src, unsigned hitLen, const char * hit)
  482. {
  483. char *temp = (char *)CTXMALLOC(parentCtx, srcLen);
  484. unsigned tlen = 0;
  485. unsigned filter[256/bitsInUnsigned];
  486. memset(filter,0,sizeof(filter));
  487. for (unsigned j = 0; j < hitLen; j++ )
  488. {
  489. unsigned c = (unsigned char)hit[j];
  490. filter[c/bitsInUnsigned] |= (1<<(c%bitsInUnsigned));
  491. }
  492. for ( unsigned i = 0; i < srcLen; i++ )
  493. {
  494. unsigned c = (unsigned char)src[i];
  495. if ((filter[c/bitsInUnsigned] & (1<<(c%bitsInUnsigned))) != 0)
  496. temp[tlen++] = (char)c;
  497. }
  498. tgt = (char *)CTXREALLOC(parentCtx, temp, tlen);
  499. tgtLen = tlen;
  500. }
  501. STRINGLIB_API void STRINGLIB_CALL slStringSubsOut(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src, unsigned hitLen, const char * hit, unsigned newCharLen, const char * newChar)
  502. {
  503. bool filter[256];
  504. memset(filter,0,sizeof(filter));
  505. for (unsigned j = 0; j < hitLen; j++ )
  506. {
  507. unsigned char c = ((unsigned char *)hit)[j];
  508. filter[c] = true;
  509. }
  510. tgt = (char *)CTXMALLOC(parentCtx, srcLen);
  511. if (newCharLen > 0)
  512. {
  513. for ( unsigned i = 0; i < srcLen; i++ )
  514. {
  515. unsigned char c = ((unsigned char *)src)[i];
  516. if (!filter[c])
  517. tgt[i] = c;
  518. else
  519. tgt[i] = ((char *)newChar)[0];
  520. }
  521. }
  522. else
  523. {
  524. memcpy(tgt, src, srcLen);
  525. }
  526. tgtLen = srcLen;
  527. }
  528. STRINGLIB_API void STRINGLIB_CALL slStringSubs(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src, unsigned hitLen, const char * hit, unsigned newCharLen, const char * newChar)
  529. {
  530. bool filter[256];
  531. memset(filter,0,sizeof(filter));
  532. for (unsigned j = 0; j < hitLen; j++ )
  533. {
  534. unsigned char c = ((unsigned char *)hit)[j];
  535. filter[c] = true;
  536. }
  537. tgt = (char *)CTXMALLOC(parentCtx, srcLen);
  538. if (newCharLen > 0)
  539. {
  540. for ( unsigned i = 0; i < srcLen; i++ )
  541. {
  542. unsigned char c = ((unsigned char *)src)[i];
  543. if (filter[c])
  544. tgt[i] = c;
  545. else
  546. tgt[i] = ((char *)newChar)[0];
  547. }
  548. }
  549. else
  550. {
  551. memcpy(tgt, src, srcLen);
  552. }
  553. tgtLen = srcLen;
  554. }
  555. STRINGLIB_API void STRINGLIB_CALL slStringRepad(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src, unsigned tLen)
  556. {
  557. char *base = (char *)src;
  558. while ( srcLen && *base == ' ' )
  559. {
  560. srcLen--;
  561. base++;
  562. }
  563. while ( srcLen && base[srcLen-1] == ' ' )
  564. srcLen--;
  565. if ( srcLen > tLen )
  566. srcLen = tLen;
  567. tgt = (char *)CTXMALLOC(parentCtx, tLen);
  568. tgtLen = tLen;
  569. memcpy(tgt,base,srcLen);
  570. memset(tgt+srcLen,' ',tLen-srcLen);
  571. }
  572. STRINGLIB_API unsigned STRINGLIB_CALL slStringFind(unsigned srcLen, const char * src, unsigned hitLen, const char * hit, unsigned instance)
  573. {
  574. if ( srcLen < hitLen )
  575. return 0;
  576. if (hitLen==1) { // common case optimization
  577. const char *p=src;
  578. const char *e = p+srcLen;
  579. char c = *hit;
  580. while (p!=e)
  581. if ((*(p++)==c))
  582. if (!--instance)
  583. return (unsigned)(p-src);
  584. }
  585. else
  586. {
  587. unsigned steps = srcLen-hitLen+1;
  588. for ( unsigned i = 0; i < steps; i++ )
  589. {
  590. if ( !memcmp((char *)src+i,hit,hitLen) )
  591. {
  592. if ( !--instance )
  593. return i+1;
  594. if (hitLen > 1)
  595. i += (hitLen-1);
  596. }
  597. }
  598. }
  599. return 0;
  600. }
  601. STRINGLIB_API unsigned STRINGLIB_CALL slStringFindCount(unsigned srcLen, const char * src, unsigned hitLen, const char * hit)
  602. {
  603. if ( srcLen < hitLen )
  604. return 0;
  605. unsigned matches = 0;
  606. if (hitLen==1) { // common case optimization
  607. const char *p=src;
  608. const char *e = p+srcLen;
  609. char c = *hit;
  610. while (p!=e)
  611. if ((*(p++)==c))
  612. matches++;
  613. }
  614. else
  615. {
  616. unsigned steps = srcLen-hitLen+1;
  617. for ( unsigned i = 0; i < steps; i++ )
  618. {
  619. if ( !memcmp((char *)src+i,hit,hitLen) )
  620. {
  621. matches++;
  622. if (hitLen > 1)
  623. i += (hitLen-1);
  624. }
  625. }
  626. }
  627. return matches;
  628. }
  629. STRINGLIB_API unsigned STRINGLIB_CALL slStringFind2(unsigned /*srcLen*/, const char * src, unsigned hitLen, const char * hit)
  630. {
  631. if (hitLen==1) { // common case optimization
  632. const char *p=src;
  633. char c = *hit;
  634. while (*(p++)!=c);
  635. return (unsigned)(p-src);
  636. }
  637. for ( unsigned i = 0; ; i++ )
  638. if ( !memcmp((char *)src+i,hit,hitLen) )
  639. return i+1;
  640. return 0;
  641. }
  642. STRINGLIB_API void STRINGLIB_CALL slStringExtract(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src, unsigned instance)
  643. {
  644. tgtLen = 0;
  645. tgt = NULL;
  646. char * finger = (char *)src;
  647. if ( !instance )
  648. return;
  649. while ( --instance )
  650. {
  651. while ( srcLen && *finger != ',' )
  652. {
  653. srcLen--;
  654. finger++;
  655. }
  656. if ( !srcLen )
  657. return;
  658. srcLen--; // Skip ,
  659. finger++;
  660. }
  661. unsigned len = 0;
  662. for ( ; len < srcLen; len++ )
  663. if ( finger[len] == ',' )
  664. break;
  665. tgt = (char *)CTXMALLOC(parentCtx, len);
  666. memcpy(tgt,finger,len);
  667. tgtLen = len;
  668. }
  669. STRINGLIB_API void STRINGLIB_CALL slStringExtractMultiple(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src, unsigned __int64 mask)
  670. {
  671. tgtLen = 0;
  672. tgt = NULL;
  673. char * finger = (char *)src;
  674. unsigned __int64 thisInstance = 1;
  675. while (mask)
  676. {
  677. while ( srcLen && *finger != ',' )
  678. {
  679. srcLen--;
  680. finger++;
  681. }
  682. if (mask & thisInstance)
  683. {
  684. mask &= ~thisInstance;
  685. unsigned matchLen = (unsigned)(finger - src);
  686. if (!tgt)
  687. tgt = (char *) CTXMALLOC(parentCtx, matchLen + srcLen);
  688. else
  689. tgt[tgtLen++] = ',';
  690. memcpy(tgt+tgtLen, src, finger - src);
  691. tgtLen += matchLen;
  692. }
  693. thisInstance <<= 1;
  694. if ( !srcLen )
  695. break;
  696. srcLen--; // Skip the ','
  697. finger++;
  698. src = finger;
  699. }
  700. }
  701. STRINGLIB_API char * STRINGLIB_CALL slGetDateYYYYMMDD(void)
  702. {
  703. char * result = (char *)CTXMALLOC(parentCtx, 9);
  704. time_t ltime;
  705. time( &ltime );
  706. tm *today = localtime( &ltime );
  707. strftime(result, 9, "%Y%m%d", today);
  708. return result;
  709. }
  710. STRINGLIB_API void STRINGLIB_CALL slGetDateYYYYMMDD2(char * ret)
  711. {
  712. char temp[9];
  713. time_t ltime;
  714. time( &ltime );
  715. tm *today = localtime( &ltime );
  716. strftime(temp, 9, "%Y%m%d", today);
  717. memcpy(ret, temp, 8);
  718. }
  719. STRINGLIB_API char * STRINGLIB_CALL slGetBuildInfo(void)
  720. {
  721. return CTXSTRDUP(parentCtx, STRINGLIB_VERSION);
  722. }
  723. STRINGLIB_API void STRINGLIB_CALL slData2String(size32_t & __ret_len,char * & __ret_str,unsigned _len_y, const void * y)
  724. {
  725. char *out = (char *)CTXMALLOC(parentCtx, _len_y * 2);
  726. char *res = out;
  727. unsigned char *yy = (unsigned char *) y;
  728. for (unsigned int i = 0; i < _len_y; i++)
  729. {
  730. *out++ = hexchar[yy[i] >> 4];
  731. *out++ = hexchar[yy[i] & 0x0f];
  732. }
  733. __ret_len = _len_y * 2;
  734. __ret_str = res;
  735. }
  736. STRINGLIB_API void STRINGLIB_CALL slString2Data(size32_t & __ret_len,void * & __ret_str,unsigned _len_src,const char * src)
  737. {
  738. // trailing nibbles are ignored
  739. // embedded spaces are ignored
  740. // illegal hex values are treated as zero
  741. // we could do a stricter one if it was considered desirable.
  742. char *out = (char *)CTXMALLOC(parentCtx, _len_src / 2);
  743. char *target = out;
  744. for (;;)
  745. {
  746. while (_len_src > 1 && isspace(*src))
  747. {
  748. src++;
  749. _len_src--;
  750. }
  751. if (_len_src < 2)
  752. break;
  753. *target++ = (hex2digit(src[0]) << 4) | hex2digit(src[1]);
  754. _len_src -= 2;
  755. src += 2;
  756. }
  757. __ret_len = (size32_t)(target - out);
  758. __ret_str = out;
  759. }
  760. // -----------------------------------------------------------------
  761. STRINGLIB_API void STRINGLIB_CALL slStringToLowerCase(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src)
  762. {
  763. char * res = (char *)CTXMALLOC(parentCtx, srcLen);
  764. for (unsigned int i=0;i<srcLen;i++)
  765. res[i] = tolower(src[i]);
  766. tgt = res;
  767. tgtLen = srcLen;
  768. }
  769. // -----------------------------------------------------------------
  770. STRINGLIB_API void STRINGLIB_CALL slStringToUpperCase(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src)
  771. {
  772. char * res = (char *)CTXMALLOC(parentCtx, srcLen);
  773. for (unsigned int i=0;i<srcLen;i++)
  774. res[i] = toupper(src[i]);
  775. tgt = res;
  776. tgtLen = srcLen;
  777. }
  778. // -----------------------------------------------------------------
  779. STRINGLIB_API void STRINGLIB_CALL slStringToProperCase(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src)
  780. {
  781. tgt = (char *)CTXMALLOC(parentCtx, srcLen);
  782. char * res = tgt;
  783. bool seenSpace = true;
  784. for (unsigned int i=0;i<srcLen;i++)
  785. {
  786. char c = src[i];
  787. *tgt++ = seenSpace ? toupper(c) : c;
  788. seenSpace = (c==' ');
  789. }
  790. tgt = res;
  791. tgtLen = srcLen;
  792. }
  793. // -----------------------------------------------------------------
  794. STRINGLIB_API void STRINGLIB_CALL slStringToCapitalCase(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src)
  795. {
  796. char * const result = (char *)CTXMALLOC(parentCtx, srcLen);
  797. bool upperPending = true;
  798. for (unsigned int i=0;i<srcLen;i++)
  799. {
  800. byte c = src[i];
  801. result[i] = upperPending ? toupper(c) : c;
  802. upperPending = !isalnum(c);
  803. }
  804. tgt = result;
  805. tgtLen = srcLen;
  806. }
  807. // -----------------------------------------------------------------
  808. STRINGLIB_API void STRINGLIB_CALL slStringToTitleCase(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src)
  809. {
  810. char * const result = (char *)CTXMALLOC(parentCtx, srcLen);
  811. bool upperPending = true;
  812. for (unsigned int i=0;i<srcLen;i++)
  813. {
  814. byte c = src[i];
  815. result[i] = upperPending ? toupper(c) : tolower(c);
  816. upperPending = !isalnum(c);
  817. }
  818. tgt = result;
  819. tgtLen = srcLen;
  820. }
  821. // -----------------------------------------------------------------
  822. STRINGLIB_API int STRINGLIB_CALL slStringCompareIgnoreCase (unsigned src1Len, const char * src1, unsigned src2Len, const char * src2)
  823. {
  824. unsigned int i;
  825. for (i=0;i < src1Len && i < src2Len;i++)
  826. {
  827. byte lc = src1[i];
  828. byte rc = src2[i];
  829. if (lc != rc)
  830. {
  831. lc = tolower(lc);
  832. rc = tolower(rc);
  833. if (lc != rc)
  834. return lc > rc ? 1 : -1;
  835. }
  836. }
  837. while (i < src1Len)
  838. {
  839. if (src1[i++] != ' ')
  840. return 1;
  841. }
  842. while (i < src2Len)
  843. {
  844. if (src2[i++] != ' ')
  845. return -1;
  846. }
  847. return 0;
  848. }
  849. // -----------------------------------------------------------------
  850. STRINGLIB_API void STRINGLIB_CALL slStringReverse (unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src)
  851. {
  852. char * res = (char *)CTXMALLOC(parentCtx, srcLen);
  853. unsigned int n = srcLen - 1;
  854. for (unsigned int i=0;i<srcLen;i++)
  855. res[i] = src[n-i];
  856. tgt = res;
  857. tgtLen = srcLen;
  858. }
  859. // -----------------------------------------------------------------
  860. STRINGLIB_API void STRINGLIB_CALL slStringFindReplace (unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src, unsigned stokLen, const char * stok, unsigned rtokLen, const char * rtok)
  861. {
  862. if ( srcLen < stokLen || stokLen == 0)
  863. {
  864. tgt = (char *) CTXMALLOC(parentCtx, srcLen);
  865. memcpy(tgt, src, srcLen);
  866. tgtLen = srcLen;
  867. }
  868. else
  869. {
  870. unsigned steps = srcLen-stokLen+1;
  871. unsigned tgtmax = rtokLen > stokLen ? srcLen + steps * (rtokLen - stokLen) : srcLen;
  872. // This is the upper limit on target size - not a problem if we allocate a bit too much
  873. char * res = (char *)CTXMALLOC(parentCtx, tgtmax);
  874. tgt = res;
  875. unsigned i;
  876. for ( i = 0; i < steps; )
  877. {
  878. if ( !memcmp(src+i,stok,stokLen) )
  879. {
  880. memcpy(res, rtok, rtokLen);
  881. res += rtokLen;
  882. i += stokLen;
  883. }
  884. else
  885. *res++ = src[i++];
  886. }
  887. while (i <srcLen)
  888. *res++ = src[i++];
  889. tgtLen = (size32_t)(res - tgt);
  890. }
  891. }
  892. // -----------------------------------------------------------------
  893. STRINGLIB_API void STRINGLIB_CALL slStringCleanSpaces(size32_t & __ret_len,char * & __ret_str,unsigned _len_instr,const char * instr)
  894. {
  895. // remove double spaces
  896. char *out = (char *) CTXMALLOC(parentCtx, _len_instr);
  897. char *origout = out;
  898. bool spacePending = false;
  899. bool atStart = true;
  900. for(unsigned idx = 0; idx < _len_instr; idx++)
  901. {
  902. char c = *instr++;
  903. switch (c)
  904. {
  905. case ' ':
  906. case '\t':
  907. spacePending = true;
  908. break;
  909. default:
  910. if (spacePending && !atStart)
  911. *out++ = ' ';
  912. spacePending = false;
  913. atStart = false;
  914. *out++ = c;
  915. break;
  916. }
  917. }
  918. __ret_str = origout;
  919. __ret_len = (size32_t)(out - origout);
  920. }
  921. STRINGLIB_API bool STRINGLIB_CALL slStringWildMatch(unsigned srcLen, const char * src, unsigned patLen, const char * pat, bool noCase)
  922. {
  923. return wildTrimMatch<char, char_toupper, '?', '*', ' '>(src, srcLen, pat, patLen, noCase);
  924. }
  925. STRINGLIB_API bool STRINGLIB_CALL slStringWildExactMatch(unsigned srcLen, const char * src, unsigned patLen, const char * pat, bool noCase)
  926. {
  927. return wildMatch<char, char_toupper, '?', '*'>(src, srcLen, pat, patLen, noCase);
  928. }
  929. STRINGLIB_API bool STRINGLIB_CALL slStringContains(unsigned srcLen, const char * src, unsigned patLen, const char * pat, bool noCase)
  930. {
  931. unsigned char srcCount[256];
  932. memset(srcCount, 0, 256);
  933. while (srcLen && src[srcLen-1]==' ')
  934. srcLen--;
  935. while(srcLen-- > 0)
  936. {
  937. byte c = *src++;
  938. if (noCase)
  939. c = toupper(c);
  940. srcCount[c]++;
  941. }
  942. while (patLen && pat[patLen-1]==' ')
  943. patLen--;
  944. while(patLen-- > 0)
  945. {
  946. byte c = *pat++;
  947. if (noCase)
  948. c = toupper(c);
  949. if (srcCount[c] == 0)
  950. return false;
  951. else
  952. srcCount[c]--;
  953. }
  954. return true;
  955. }
  956. STRINGLIB_API unsigned STRINGLIB_CALL slEditDistance(unsigned leftLen, const char * left, unsigned rightLen, const char * right)
  957. {
  958. CEditDistance * ed = new CEditDistance();
  959. unsigned rval = ed->editDistance(leftLen, left, rightLen, right);
  960. delete ed;
  961. return rval;
  962. }
  963. STRINGLIB_API bool STRINGLIB_CALL slEditDistanceWithinRadius(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius)
  964. {
  965. unsigned minED = (leftLen < rightLen)? rightLen - leftLen: leftLen - rightLen;
  966. if (minED > radius)
  967. {
  968. return false;
  969. }
  970. else
  971. {
  972. CEditDistance *ed = new CEditDistance();
  973. unsigned rval = ed->editDistance(leftLen, left, rightLen, right, radius);
  974. delete ed;
  975. return (rval <= radius);
  976. }
  977. }
  978. STRINGLIB_API unsigned STRINGLIB_CALL slEditDistanceV2(unsigned leftLen, const char * left, unsigned rightLen, const char * right)
  979. {
  980. return nsStringlib::editDistanceV2(leftLen, left, rightLen, right);
  981. }
  982. STRINGLIB_API bool STRINGLIB_CALL slEditDistanceWithinRadiusV2(unsigned leftLen, const char * left, unsigned rightLen, const char * right, unsigned radius)
  983. {
  984. return nsStringlib::editDistanceV3(leftLen, left, rightLen, right, radius) <= radius;
  985. }
  986. STRINGLIB_API void STRINGLIB_CALL slStringGetNthWord(unsigned & tgtLen, char * & tgt, unsigned srcLen, const char * src, unsigned n)
  987. {
  988. const char* start = 0;
  989. const char* end = 0;
  990. // skip any leading white space
  991. while (srcLen>0 && (unsigned char)*src<=0x20) {
  992. src++;
  993. srcLen--;
  994. }
  995. while (srcLen>0 && n>0) {
  996. start = src;
  997. n--;
  998. // go to the next white space
  999. while (srcLen>0 && (unsigned char)*src>0x20) {
  1000. src++;
  1001. srcLen--;
  1002. }
  1003. end = src;
  1004. // skip white space again
  1005. while (srcLen>0 && (unsigned char)*src<=0x20) {
  1006. src++;
  1007. srcLen--;
  1008. }
  1009. }
  1010. if (!n && (end-start)) {
  1011. tgt = (char *)CTXMALLOC(parentCtx, end-start);
  1012. memcpy(tgt,start,end-start);
  1013. tgtLen = end-start;
  1014. } else {
  1015. tgt = 0;
  1016. tgtLen = 0;
  1017. }
  1018. }
  1019. STRINGLIB_API unsigned STRINGLIB_CALL slStringWordCount(unsigned srcLen,const char * src)
  1020. {
  1021. // skip any leading white space
  1022. unsigned word_count = 0;
  1023. while (srcLen>0 && (unsigned char)*src<=0x20) {
  1024. src++;
  1025. srcLen--;
  1026. }
  1027. while (srcLen>0) {
  1028. word_count++;
  1029. // go to the next white space
  1030. while (srcLen>0 && (unsigned char)*src>0x20) {
  1031. src++;
  1032. srcLen--;
  1033. }
  1034. // skip white space again
  1035. while (srcLen>0 && (unsigned char)*src<=0x20) {
  1036. src++;
  1037. srcLen--;
  1038. }
  1039. }
  1040. return word_count;
  1041. }
  1042. //--------------------------------------------------------------------------------------------------------------------
  1043. STRINGLIB_API unsigned STRINGLIB_CALL slCountWords(size32_t lenSrc, const char * src, size32_t lenSeparator, const char * separator, bool allowBlankItems)
  1044. {
  1045. if (lenSrc == 0)
  1046. return 0;
  1047. if ((lenSeparator == 0) || (lenSrc < lenSeparator))
  1048. return 1;
  1049. unsigned numWords=0;
  1050. const char * end = src + lenSrc;
  1051. const char * max = end - (lenSeparator - 1);
  1052. const char * cur = src;
  1053. const char * startWord = NULL;
  1054. //MORE: optimize lenSeparator == 1!
  1055. while (cur < max)
  1056. {
  1057. if (memcmp(cur, separator, lenSeparator) == 0)
  1058. {
  1059. if (startWord || allowBlankItems)
  1060. {
  1061. numWords++;
  1062. startWord = NULL;
  1063. }
  1064. cur += lenSeparator;
  1065. }
  1066. else
  1067. {
  1068. if (!startWord)
  1069. startWord = cur;
  1070. cur++;
  1071. }
  1072. }
  1073. if (startWord || (cur != end) || allowBlankItems)
  1074. numWords++;
  1075. return numWords;
  1076. }
  1077. static unsigned calcWordSetSize(size32_t lenSrc, const char * src, size32_t lenSeparator, const char * separator, bool allowBlankItems)
  1078. {
  1079. if (lenSrc == 0)
  1080. return 0;
  1081. if ((lenSeparator == 0) || (lenSrc < lenSeparator))
  1082. return sizeof(size32_t) + lenSrc;
  1083. unsigned sizeWords=0;
  1084. const char * end = src + lenSrc;
  1085. const char * max = end - (lenSeparator - 1);
  1086. const char * cur = src;
  1087. const char * startWord = NULL;
  1088. //MORE: optimize lenSeparator == 1!
  1089. while (cur < max)
  1090. {
  1091. if (memcmp(cur, separator, lenSeparator) == 0)
  1092. {
  1093. if (startWord)
  1094. {
  1095. sizeWords += sizeof(size32_t) + (cur - startWord);
  1096. startWord = NULL;
  1097. }
  1098. else if (allowBlankItems)
  1099. sizeWords += sizeof(size32_t);
  1100. cur += lenSeparator;
  1101. }
  1102. else
  1103. {
  1104. if (!startWord)
  1105. startWord = cur;
  1106. cur++;
  1107. }
  1108. }
  1109. if (startWord || (cur != end) || allowBlankItems)
  1110. {
  1111. if (!startWord)
  1112. startWord = cur;
  1113. sizeWords += sizeof(size32_t) + (end - startWord);
  1114. }
  1115. return sizeWords;
  1116. }
  1117. STRINGLIB_API void STRINGLIB_CALL slSplitWords(bool & __isAllResult, size32_t & __lenResult, void * & __result, size32_t lenSrc, const char * src, size32_t lenSeparator, const char * separator, bool allowBlankItems)
  1118. {
  1119. unsigned sizeRequired = calcWordSetSize(lenSrc, src, lenSeparator, separator, allowBlankItems);
  1120. char * const result = static_cast<char *>(CTXMALLOC(parentCtx, sizeRequired));
  1121. __isAllResult = false;
  1122. __lenResult = sizeRequired;
  1123. __result = result;
  1124. if (lenSrc == 0)
  1125. return;
  1126. if ((lenSeparator == 0) || (lenSrc < lenSeparator))
  1127. {
  1128. *((size32_t *)result) = lenSrc;
  1129. memcpy(result+sizeof(size32_t), src, lenSrc);
  1130. return;
  1131. }
  1132. unsigned sizeWords=0;
  1133. char * target = result;
  1134. const char * end = src + lenSrc;
  1135. const char * max = end - (lenSeparator - 1);
  1136. const char * cur = src;
  1137. const char * startWord = NULL;
  1138. //MORE: optimize lenSeparator == 1!
  1139. while (cur < max)
  1140. {
  1141. if (memcmp(cur, separator, lenSeparator) == 0)
  1142. {
  1143. if (startWord || allowBlankItems)
  1144. {
  1145. size32_t len = startWord ? (cur - startWord) : 0;
  1146. memcpy(target, &len, sizeof(len));
  1147. memcpy(target+sizeof(size32_t), startWord, len);
  1148. target += sizeof(size32_t) + len;
  1149. startWord = NULL;
  1150. }
  1151. cur += lenSeparator;
  1152. }
  1153. else
  1154. {
  1155. if (!startWord)
  1156. startWord = cur;
  1157. cur++;
  1158. }
  1159. }
  1160. if (startWord || (cur != end) || allowBlankItems)
  1161. {
  1162. if (!startWord)
  1163. startWord = cur;
  1164. size32_t len = (end - startWord);
  1165. memcpy(target, &len, sizeof(len));
  1166. memcpy(target+sizeof(size32_t), startWord, len);
  1167. target += sizeof(size32_t) + len;
  1168. }
  1169. assert(target == result + sizeRequired);
  1170. // ctx->fail(1, "Size mismatch in StringLib.SplitWords");
  1171. }
  1172. static unsigned countWords(size32_t lenSrc, const char * src)
  1173. {
  1174. unsigned count = 0;
  1175. unsigned offset = 0;
  1176. while (offset < lenSrc)
  1177. {
  1178. size32_t len;
  1179. memcpy(&len, src+offset, sizeof(len));
  1180. offset += sizeof(len) + len;
  1181. count++;
  1182. }
  1183. return count;
  1184. }
  1185. STRINGLIB_API void STRINGLIB_CALL slCombineWords(size32_t & __lenResult, void * & __result, bool isAllSrc, size32_t lenSrc, const char * src, size32_t lenSeparator, const char * separator, bool allowBlankItems)
  1186. {
  1187. if (lenSrc == 0)
  1188. {
  1189. __lenResult = 0;
  1190. __result = NULL;
  1191. return;
  1192. }
  1193. unsigned numWords = countWords(lenSrc, src);
  1194. size32_t sizeRequired = lenSrc - numWords * sizeof(size32_t) + (numWords-1) * lenSeparator;
  1195. char * const result = static_cast<char *>(CTXMALLOC(parentCtx, sizeRequired));
  1196. __lenResult = sizeRequired;
  1197. __result = result;
  1198. char * target = result;
  1199. unsigned offset = 0;
  1200. while (offset < lenSrc)
  1201. {
  1202. if ((offset != 0) && lenSeparator)
  1203. {
  1204. memcpy(target, separator, lenSeparator);
  1205. target += lenSeparator;
  1206. }
  1207. size32_t len;
  1208. memcpy(&len, src+offset, sizeof(len));
  1209. offset += sizeof(len);
  1210. memcpy(target, src+offset, len);
  1211. target += len;
  1212. offset += len;
  1213. }
  1214. assert(target == result + sizeRequired);
  1215. }
  1216. //--------------------------------------------------------------------------------------------------------------------
  1217. inline bool readValue(unsigned & value, size32_t & _offset, size32_t lenStr, const char * str, unsigned max)
  1218. {
  1219. unsigned total = 0;
  1220. unsigned offset = _offset;
  1221. if (lenStr - offset < max)
  1222. max = lenStr - offset;
  1223. unsigned i=0;
  1224. for (; i < max; i++)
  1225. {
  1226. char next = str[offset+i];
  1227. if (next >= '0' && next <= '9')
  1228. total = total * 10 + (next - '0');
  1229. else
  1230. break;
  1231. }
  1232. if (i == 0)
  1233. return false;
  1234. value = total;
  1235. _offset = offset+i;
  1236. return true;
  1237. }
  1238. const char * const monthNames[12] = { "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December" };
  1239. inline bool matchString(unsigned & value, size32_t & strOffset, size32_t lenStr, const byte * str, unsigned num, const char * const * strings, unsigned minMatch)
  1240. {
  1241. unsigned startOffset = strOffset;
  1242. for (unsigned i =0; i < num; i++)
  1243. {
  1244. const char * cur = strings[i];
  1245. unsigned offset = startOffset;
  1246. while (offset < lenStr)
  1247. {
  1248. byte next = *cur++;
  1249. if (!next || toupper(next) != toupper(str[offset]))
  1250. break;
  1251. offset++;
  1252. }
  1253. if (offset - startOffset >= minMatch)
  1254. {
  1255. value = i;
  1256. strOffset = offset;
  1257. return true;
  1258. }
  1259. }
  1260. return false;
  1261. }
  1262. //This implements a subset of the specifiers allowed for strptime
  1263. //Another difference is it works on a string with a separate length
  1264. static const char * simple_strptime(size32_t lenStr, const char * str, const char * format, struct tm * tm)
  1265. {
  1266. const char * curFormat = format;
  1267. size32_t offset = 0;
  1268. const byte * src = (const byte *)str;
  1269. unsigned value;
  1270. byte next;
  1271. while ((next = *curFormat++) != '\0')
  1272. {
  1273. if (next == '%')
  1274. {
  1275. switch (*curFormat++)
  1276. {
  1277. case 't':
  1278. while ((offset < lenStr) && isspace(src[offset]))
  1279. offset++;
  1280. break;
  1281. case 'Y':
  1282. if (!readValue(value, offset, lenStr, str, 4))
  1283. return NULL;
  1284. tm->tm_year = value-1900;
  1285. break;
  1286. case 'y':
  1287. if (!readValue(value, offset, lenStr, str, 2))
  1288. return NULL;
  1289. tm->tm_year = value > 68 ? value : value + 100;
  1290. break;
  1291. case 'm':
  1292. if (!readValue(value, offset, lenStr, str, 2) || (value < 1) || (value > 12))
  1293. return NULL;
  1294. tm->tm_mon = value-1;
  1295. break;
  1296. case 'd':
  1297. if (!readValue(value, offset, lenStr, str, 2) || (value < 1) || (value > 31))
  1298. return NULL;
  1299. tm->tm_mday = value;
  1300. break;
  1301. case 'b':
  1302. case 'B':
  1303. case 'h':
  1304. if (!matchString(value, offset, lenStr, src, sizeof(monthNames)/sizeof(*monthNames), monthNames, 3))
  1305. return NULL;
  1306. tm->tm_mon = value;
  1307. break;
  1308. case 'H':
  1309. if (!readValue(value, offset, lenStr, str, 2)|| (value > 24))
  1310. return NULL;
  1311. tm->tm_hour = value;
  1312. break;
  1313. case 'M':
  1314. if (!readValue(value, offset, lenStr, str, 2)|| (value > 59))
  1315. return NULL;
  1316. tm->tm_min = value;
  1317. break;
  1318. case 'S':
  1319. if (!readValue(value, offset, lenStr, str, 2)|| (value > 59))
  1320. return NULL;
  1321. tm->tm_sec = value;
  1322. break;
  1323. default:
  1324. return NULL;
  1325. }
  1326. }
  1327. else
  1328. {
  1329. if (isspace(next))
  1330. {
  1331. while ((offset < lenStr) && isspace(src[offset]))
  1332. offset++;
  1333. }
  1334. else
  1335. {
  1336. if ((offset >= lenStr) || (src[offset++] != next))
  1337. return NULL;
  1338. }
  1339. }
  1340. }
  1341. return str+offset;
  1342. }
  1343. inline unsigned makeDate(const tm & tm)
  1344. {
  1345. return (tm.tm_year + 1900) * 10000 + (tm.tm_mon + 1) * 100 + tm.tm_mday;
  1346. }
  1347. inline void extractDate(tm & tm, unsigned date)
  1348. {
  1349. tm.tm_year = (date / 10000) - 1900;
  1350. tm.tm_mon = ((date / 100) % 100) - 1;
  1351. tm.tm_mday = (date % 100);
  1352. }
  1353. STRINGLIB_API unsigned STRINGLIB_CALL slStringToDate(size32_t lenS, const char * s, const char * fmtin)
  1354. {
  1355. struct tm tm;
  1356. memset(&tm, 0, sizeof(tm));
  1357. if (simple_strptime(lenS, s, fmtin, &tm))
  1358. return makeDate(tm);
  1359. return 0;
  1360. }
  1361. STRINGLIB_API unsigned STRINGLIB_CALL slMatchDate(size32_t lenS, const char * s, bool isAllFormats, unsigned lenFormats, const void * _formats)
  1362. {
  1363. struct tm tm;
  1364. memset(&tm, 0, sizeof(tm));
  1365. const char * formats = (const char *)_formats;
  1366. for (unsigned off=0; off < lenFormats; )
  1367. {
  1368. const char * curFormat = formats+off;
  1369. if (simple_strptime(lenS, s, curFormat, &tm))
  1370. return makeDate(tm);
  1371. off += strlen(curFormat) + 1;
  1372. }
  1373. return 0;
  1374. }
  1375. STRINGLIB_API void STRINGLIB_CALL slFormatDate(size32_t & __lenResult, char * & __result, unsigned date, const char * format)
  1376. {
  1377. size32_t len = 0;
  1378. char * out = NULL;
  1379. if (date)
  1380. {
  1381. struct tm tm;
  1382. memset(&tm, 0, sizeof(tm));
  1383. extractDate(tm, date);
  1384. char buf[255];
  1385. strftime(buf, sizeof(buf), format, &tm);
  1386. len = strlen(buf);
  1387. out = static_cast<char *>(CTXMALLOC(parentCtx, len));
  1388. memcpy(out, buf, len);
  1389. }
  1390. __lenResult = len;
  1391. __result = out;
  1392. }
  1393. //--------------------------------------------------------------------------------------------------------------------
  1394. //--------------------------------------------------------------------------------------------------------------------
  1395. //--------------------------------------------------------------------------------------------------------------------
  1396. // Legacy functions that only work on fixed length strings
  1397. //--------------------------------------------------------------------------------------------------------------------
  1398. //--------------------------------------------------------------------------------------------------------------------
  1399. //--------------------------------------------------------------------------------------------------------------------
  1400. STRINGLIB_API void STRINGLIB_CALL slStringExtract50(char *tgt, unsigned srcLen, const char * src, unsigned instance)
  1401. {
  1402. unsigned lenret;
  1403. char * resret;
  1404. slStringExtract(lenret,resret,srcLen,src,instance);
  1405. if (lenret >= 50)
  1406. memcpy(tgt,resret,50);
  1407. else
  1408. {
  1409. memcpy(tgt,resret,lenret);
  1410. memset(tgt+lenret,' ',50-lenret);
  1411. }
  1412. CTXFREE(parentCtx, resret);
  1413. }
  1414. STRINGLIB_API void STRINGLIB_CALL slGetBuildInfo100(char *tgt)
  1415. {
  1416. size32_t len = (size32_t) strlen(STRINGLIB_VERSION);
  1417. if (len >= 100)
  1418. len = 100;
  1419. memcpy(tgt, STRINGLIB_VERSION, len);
  1420. memset(tgt+len, ' ', 100-len);
  1421. }
  1422. // -----------------------------------------------------------------
  1423. STRINGLIB_API void STRINGLIB_CALL slStringToLowerCase80(char *tgt, unsigned srcLen, const char * src)
  1424. {
  1425. unsigned int i;
  1426. for (i=0;i<srcLen && i < 80;i++)
  1427. *tgt++ = tolower(src[i]);
  1428. while (i < 80)
  1429. {
  1430. *tgt++=' ';
  1431. i++;
  1432. }
  1433. }
  1434. // -----------------------------------------------------------------
  1435. STRINGLIB_API void STRINGLIB_CALL slStringToUpperCase80(char *tgt, unsigned srcLen, const char * src)
  1436. {
  1437. unsigned int i;
  1438. for (i=0;i<srcLen && i < 80;i++)
  1439. *tgt++ = toupper(src[i]);
  1440. while (i < 80)
  1441. {
  1442. *tgt++=' ';
  1443. i++;
  1444. }
  1445. }
  1446. // -----------------------------------------------------------------
  1447. STRINGLIB_API void STRINGLIB_CALL slStringFindReplace80(char * tgt, unsigned srcLen, const char * src, unsigned stokLen, const char * stok, unsigned rtokLen, const char * rtok)
  1448. {
  1449. if ( srcLen < stokLen )
  1450. {
  1451. if (srcLen > 80)
  1452. srcLen = 80;
  1453. memcpy(tgt, src, srcLen);
  1454. if (srcLen < 80)
  1455. memset(tgt+srcLen, ' ', 80 - srcLen);
  1456. }
  1457. else
  1458. {
  1459. unsigned steps = srcLen-stokLen+1;
  1460. unsigned i;
  1461. unsigned lim = 80;
  1462. for ( i = 0; i < steps && lim > 0; )
  1463. {
  1464. if ( !memcmp(src+i,stok,stokLen) )
  1465. {
  1466. if (rtokLen > lim)
  1467. rtokLen = lim;
  1468. memcpy(tgt, rtok, rtokLen);
  1469. tgt += rtokLen;
  1470. i += stokLen;
  1471. lim -= rtokLen;
  1472. }
  1473. else
  1474. {
  1475. *tgt++ = src[i++];
  1476. lim--;
  1477. }
  1478. }
  1479. while (i < srcLen && lim > 0)
  1480. {
  1481. *tgt++ = src[i++];
  1482. lim--;
  1483. }
  1484. if (lim)
  1485. memset(tgt, ' ', lim);
  1486. }
  1487. }
  1488. STRINGLIB_API void STRINGLIB_CALL slStringCleanSpaces25(char *__ret_str,unsigned _len_instr,const char * instr)
  1489. {
  1490. // remove double spaces
  1491. // Fixed width version for Hole
  1492. unsigned outlen = _len_instr;
  1493. if (outlen < 25)
  1494. outlen = 25;
  1495. char *out = (char *) alloca(outlen);
  1496. char *origout = out;
  1497. bool spacePending = false;
  1498. bool atStart = true;
  1499. for(unsigned idx = 0; idx < _len_instr; idx++)
  1500. {
  1501. char c = *instr++;
  1502. switch (c)
  1503. {
  1504. case ' ':
  1505. case '\t':
  1506. spacePending = true;
  1507. break;
  1508. default:
  1509. if (spacePending && !atStart)
  1510. *out++ = ' ';
  1511. spacePending = false;
  1512. atStart = false;
  1513. *out++ = c;
  1514. break;
  1515. }
  1516. }
  1517. unsigned len = (size32_t)(out-origout);
  1518. if (len < 25)
  1519. memset(out, ' ', 25 - len);
  1520. memcpy(__ret_str, origout, 25);
  1521. }
  1522. STRINGLIB_API void STRINGLIB_CALL slStringCleanSpaces80(char *__ret_str,unsigned _len_instr,const char * instr)
  1523. {
  1524. // remove double spaces
  1525. // Another fixed width version for Hole
  1526. unsigned outlen = _len_instr;
  1527. if (outlen < 80)
  1528. outlen = 80;
  1529. char *out = (char *) alloca(outlen);
  1530. char *origout = out;
  1531. bool spacePending = false;
  1532. bool atStart = true;
  1533. for(unsigned idx = 0; idx < _len_instr; idx++)
  1534. {
  1535. char c = *instr++;
  1536. switch (c)
  1537. {
  1538. case ' ':
  1539. case '\t':
  1540. spacePending = true;
  1541. break;
  1542. default:
  1543. if (spacePending && !atStart)
  1544. *out++ = ' ';
  1545. spacePending = false;
  1546. atStart = false;
  1547. *out++ = c;
  1548. break;
  1549. }
  1550. }
  1551. unsigned len = (unsigned)(out-origout);
  1552. if (len < 80)
  1553. memset(out, ' ', 80 - len);
  1554. memcpy(__ret_str, origout, 80);
  1555. }