Uni.ecl 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536
  1. /*##############################################################################
  2. ## HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®. All rights reserved.
  3. ############################################################################## */
  4. IMPORT lib_unicodelib;
  EXPORT Uni := MODULE

  /**
   * Returns the source string with every character that appears in the filter string removed.
   *
   * @param src       The string that is being filtered.
   * @param filter    The string containing the set of characters to be excluded.
   * @see             Std.Uni.Filter
   */
  EXPORT UNICODE FilterOut(UNICODE src, UNICODE filter) :=
    lib_unicodelib.UnicodeLib.UnicodeFilterOut(src, filter);

  /**
   * Returns the source string with every character that does not appear in the filter string removed.
   *
   * @param src       The string that is being filtered.
   * @param filter    The string containing the set of characters to be included.
   * @see             Std.Uni.FilterOut
   */
  EXPORT UNICODE Filter(UNICODE src, UNICODE filter) :=
    lib_unicodelib.UnicodeLib.UnicodeFilter(src, filter);

  /**
   * Returns the source string with the replacement character substituted for all characters included in the
   * filter string.
   * MORE: Should this be a general string substitution?
   *
   * Note: this deliberately maps onto the library's UnicodeSubstituteOut entry point.
   *
   * @param src           The string that is being transformed.
   * @param filter        The string containing the set of characters to be replaced.
   * @param replace_char  The character to be substituted into the result.
   * @see                 Std.Uni.SubstituteExcluded
   */
  EXPORT UNICODE SubstituteIncluded(UNICODE src, UNICODE filter, UNICODE replace_char) :=
    lib_unicodelib.UnicodeLib.UnicodeSubstituteOut(src, filter, replace_char);

  /**
   * Returns the source string with the replacement character substituted for all characters not included in the
   * filter string.
   * MORE: Should this be a general string substitution?
   *
   * Note: this deliberately maps onto the library's UnicodeSubstitute entry point.
   *
   * @param src           The string that is being transformed.
   * @param filter        The string containing the set of characters to be kept.
   * @param replace_char  The character to be substituted into the result.
   * @see                 Std.Uni.SubstituteIncluded
   */
  EXPORT UNICODE SubstituteExcluded(UNICODE src, UNICODE filter, UNICODE replace_char) :=
    lib_unicodelib.UnicodeLib.UnicodeSubstitute(src, filter, replace_char);

  /**
   * Returns the character position of the nth match of the search string within the first string.
   * If no match is found the attribute returns 0.
   * If instance is omitted the position of the first instance is returned.
   *
   * @param src       The string that is searched.
   * @param sought    The string being sought.
   * @param instance  Which match instance is wanted. Defaults to 1 (the first match).
   */
  EXPORT UNSIGNED4 Find(UNICODE src, UNICODE sought, UNSIGNED4 instance = 1) :=
    lib_unicodelib.UnicodeLib.UnicodeFind(src, sought, instance);

  /**
   * Tests if the search string contains the supplied word as a whole word.
   *
   * NOTE: word is concatenated into the regular expression as-is (between \b anchors), so any
   * regular-expression metacharacters it contains are interpreted as pattern syntax, not literally.
   *
   * @param src           The string that is being tested.
   * @param word          The word to be searched for.
   * @param ignore_case   Whether to ignore differences in case between characters.
   */
  EXPORT BOOLEAN FindWord(UNICODE src, UNICODE word, BOOLEAN ignore_case = FALSE) := FUNCTION
    // Build the word-boundary pattern once so both branches share the same expression.
    UNICODE bounded := u'\\b' + word + u'\\b';
    RETURN IF(ignore_case,
              REGEXFIND(bounded, src, NOCASE),
              REGEXFIND(bounded, src));
  END;

  /**
   * Returns the character position of the nth match of the search string within the first string,
   * matching according to the given locale.
   * If no match is found the attribute returns 0.
   * The instance must be supplied; pass 1 for the first match.
   *
   * @param src           The string that is searched.
   * @param sought        The string being sought.
   * @param instance      Which match instance is wanted.
   * @param locale_name   The locale to use for the comparison.
   */
  EXPORT UNSIGNED4 LocaleFind(UNICODE src, UNICODE sought, UNSIGNED4 instance, VARSTRING locale_name) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleFind(src, sought, instance, locale_name);

  /**
   * Returns the character position of the nth match of the search string within the first string,
   * matching according to the given locale and comparison strength.
   * If no match is found the attribute returns 0.
   * The instance must be supplied; pass 1 for the first match.
   *
   * @param src           The string that is searched.
   * @param tofind        The string being sought.
   * @param instance      Which match instance is wanted.
   * @param locale_name   The locale to use for the comparison.
   * @param strength      The strength of the comparison:
   *                      1 ignores accents and case, differentiating only between letters.
   *                      2 ignores case but differentiates between accents.
   *                      3 differentiates between accents and case but ignores e.g. differences between Hiragana and Katakana.
   *                      4 differentiates between accents and case and e.g. Hiragana/Katakana, but ignores e.g. Hebrew cantillation marks.
   *                      5 differentiates between all strings whose canonically decomposed forms (NFD - Normalization Form D) are non-identical.
   */
  EXPORT UNSIGNED4 LocaleFindAtStrength(UNICODE src, UNICODE tofind, UNSIGNED4 instance, VARSTRING locale_name, INTEGER1 strength) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleFindAtStrength(src, tofind, instance, locale_name, strength);

  /**
   * Returns the nth element from a comma separated string.
   *
   * @param src       The string containing the comma separated list.
   * @param instance  Which item to select from the list.
   */
  EXPORT UNICODE Extract(UNICODE src, UNSIGNED4 instance) :=
    lib_unicodelib.UnicodeLib.UnicodeExtract(src, instance);

  /**
   * Returns the argument string with all upper case characters converted to lower case.
   *
   * @param src       The string that is being converted.
   */
  EXPORT UNICODE ToLowerCase(UNICODE src) :=
    lib_unicodelib.UnicodeLib.UnicodeToLowerCase(src);

  /**
   * Returns the argument string with all lower case characters converted to upper case.
   *
   * @param src       The string that is being converted.
   */
  EXPORT UNICODE ToUpperCase(UNICODE src) :=
    lib_unicodelib.UnicodeLib.UnicodeToUpperCase(src);

  /**
   * Returns the title case (proper case) variant of the string.
   *
   * @param src       The string that is being converted.
   */
  EXPORT UNICODE ToTitleCase(UNICODE src) :=
    lib_unicodelib.UnicodeLib.UnicodeToProperCase(src);

  /**
   * Returns the lower case variant of the string using the rules for a particular locale.
   *
   * @param src           The string that is being converted.
   * @param locale_name   The locale whose rules are used for the conversion.
   */
  EXPORT UNICODE LocaleToLowerCase(UNICODE src, VARSTRING locale_name) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleToLowerCase(src, locale_name);

  /**
   * Returns the upper case variant of the string using the rules for a particular locale.
   *
   * @param src           The string that is being converted.
   * @param locale_name   The locale whose rules are used for the conversion.
   */
  EXPORT UNICODE LocaleToUpperCase(UNICODE src, VARSTRING locale_name) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleToUpperCase(src, locale_name);

  /**
   * Returns the title case (proper case) variant of the string using the rules for a particular locale.
   *
   * @param src           The string that is being converted.
   * @param locale_name   The locale whose rules are used for the conversion.
   */
  EXPORT UNICODE LocaleToTitleCase(UNICODE src, VARSTRING locale_name) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleToProperCase(src, locale_name);

  /**
   * Compares the two strings case insensitively. Equivalent to comparing at strength 2.
   *
   * @param src1      The first string to be compared.
   * @param src2      The second string to be compared.
   * @see             Std.Uni.CompareAtStrength
   */
  EXPORT INTEGER4 CompareIgnoreCase(UNICODE src1, UNICODE src2) :=
    lib_unicodelib.UnicodeLib.UnicodeCompareIgnoreCase(src1, src2);

  /**
   * Compares the two strings at the requested comparison strength.
   *
   * @param src1      The first string to be compared.
   * @param src2      The second string to be compared.
   * @param strength  The strength of the comparison:
   *                  1 ignores accents and case, differentiating only between letters.
   *                  2 ignores case but differentiates between accents.
   *                  3 differentiates between accents and case but ignores e.g. differences between Hiragana and Katakana.
   *                  4 differentiates between accents and case and e.g. Hiragana/Katakana, but ignores e.g. Hebrew cantillation marks.
   *                  5 differentiates between all strings whose canonically decomposed forms (NFD - Normalization Form D) are non-identical.
   * @see             Std.Uni.CompareIgnoreCase
   */
  EXPORT INTEGER4 CompareAtStrength(UNICODE src1, UNICODE src2, INTEGER1 strength) :=
    lib_unicodelib.UnicodeLib.UnicodeCompareAtStrength(src1, src2, strength);

  /**
   * Compares the two strings case insensitively using the rules for a particular locale.
   * Equivalent to comparing at strength 2.
   *
   * @param src1          The first string to be compared.
   * @param src2          The second string to be compared.
   * @param locale_name   The locale to use for the comparison.
   * @see                 Std.Uni.LocaleCompareAtStrength
   */
  EXPORT INTEGER4 LocaleCompareIgnoreCase(UNICODE src1, UNICODE src2, VARSTRING locale_name) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleCompareIgnoreCase(src1, src2, locale_name);

  /**
   * Compares the two strings at the requested comparison strength using the rules for a particular locale.
   *
   * @param src1          The first string to be compared.
   * @param src2          The second string to be compared.
   * @param locale_name   The locale to use for the comparison.
   * @param strength      The strength of the comparison:
   *                      1 ignores accents and case, differentiating only between letters.
   *                      2 ignores case but differentiates between accents.
   *                      3 differentiates between accents and case but ignores e.g. differences between Hiragana and Katakana.
   *                      4 differentiates between accents and case and e.g. Hiragana/Katakana, but ignores e.g. Hebrew cantillation marks.
   *                      5 differentiates between all strings whose canonically decomposed forms (NFD - Normalization Form D) are non-identical.
   * @see                 Std.Uni.LocaleCompareIgnoreCase
   */
  EXPORT INTEGER4 LocaleCompareAtStrength(UNICODE src1, UNICODE src2, VARSTRING locale_name, INTEGER1 strength) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleCompareAtStrength(src1, src2, locale_name, strength);

  /**
   * Returns the argument string with all characters in reverse order.
   * Note the argument is not TRIMMED before it is reversed.
   *
   * @param src       The string that is being reversed.
   */
  EXPORT UNICODE Reverse(UNICODE src) :=
    lib_unicodelib.UnicodeLib.UnicodeReverse(src);

  /**
   * Returns the source string with the replacement string substituted for all instances of the search string.
   *
   * @param src           The string that is being transformed.
   * @param sought        The string to be replaced.
   * @param replacement   The string to be substituted into the result.
   */
  EXPORT UNICODE FindReplace(UNICODE src, UNICODE sought, UNICODE replacement) :=
    lib_unicodelib.UnicodeLib.UnicodeFindReplace(src, sought, replacement);

  /**
   * Returns the source string with the replacement string substituted for all instances of the search string,
   * matching according to the given locale.
   *
   * @param src           The string that is being transformed.
   * @param sought        The string to be replaced.
   * @param replacement   The string to be substituted into the result.
   * @param locale_name   The locale to use for the comparison.
   */
  EXPORT UNICODE LocaleFindReplace(UNICODE src, UNICODE sought, UNICODE replacement, VARSTRING locale_name) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleFindReplace(src, sought, replacement, locale_name);

  /**
   * Returns the source string with the replacement string substituted for all instances of the search string,
   * matching according to the given locale and comparison strength.
   *
   * @param src           The string that is being transformed.
   * @param sought        The string to be replaced.
   * @param replacement   The string to be substituted into the result.
   * @param locale_name   The locale to use for the comparison.
   * @param strength      The strength of the comparison.
   * @see                 Std.Uni.LocaleFindAtStrength
   */
  EXPORT UNICODE LocaleFindAtStrengthReplace(UNICODE src, UNICODE sought, UNICODE replacement, VARSTRING locale_name, INTEGER1 strength) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleFindAtStrengthReplace(src, sought, replacement, locale_name, strength);

  /**
   * Returns the source string with all accented characters replaced with unaccented.
   *
   * @param src       The string that is being transformed.
   */
  EXPORT UNICODE CleanAccents(UNICODE src) :=
    lib_unicodelib.UnicodeLib.UnicodeCleanAccents(src);

  /**
   * Returns the source string with all instances of multiple adjacent space characters (2 or more spaces together)
   * reduced to a single space character. Leading and trailing spaces are removed, and tab characters are converted
   * to spaces.
   *
   * @param src       The string to be cleaned.
   */
  EXPORT UNICODE CleanSpaces(UNICODE src) :=
    lib_unicodelib.UnicodeLib.UnicodeCleanSpaces(src);

  /**
   * Tests if the search string matches the pattern.
   * The pattern can contain wildcards '?' (single character) and '*' (multiple character).
   *
   * @param src       The string that is being tested.
   * @param _pattern  The pattern to match against.
   * @param _noCase   Whether to ignore differences in case between characters.
   */
  EXPORT BOOLEAN WildMatch(UNICODE src, UNICODE _pattern, BOOLEAN _noCase) :=
    lib_unicodelib.UnicodeLib.UnicodeWildMatch(src, _pattern, _noCase);

  /**
   * Tests if the search string contains each of the characters in the pattern.
   * If the pattern contains duplicate characters those characters will match once for each occurrence in the pattern.
   *
   * @param src       The string that is being tested.
   * @param _pattern  The pattern to match against.
   * @param _noCase   Whether to ignore differences in case between characters.
   */
  EXPORT BOOLEAN Contains(UNICODE src, UNICODE _pattern, BOOLEAN _noCase) :=
    lib_unicodelib.UnicodeLib.UnicodeContains(src, _pattern, _noCase);

  /**
   * Returns the minimum edit distance between the two strings. An insert, change, or delete each count as a
   * single edit. The two strings are trimmed before comparing.
   *
   * @param _left         The first string to be compared.
   * @param _right        The second string to be compared.
   * @param localename    The locale to use for the comparison. Defaults to ''.
   * @param radius        The maximum edit distance that is acceptable, or 0 for no limit. Defaults to 0.
   * @return              The minimum edit distance between the two strings. Edit distances above radius will
   *                      return an arbitrary value larger than radius.
   */
  EXPORT UNSIGNED4 EditDistance(UNICODE _left, UNICODE _right, VARSTRING localename = '', UNSIGNED4 radius = 0) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleEditDistanceV2(_left, _right, localename, radius);

  /**
   * Returns true if the minimum edit distance between the two strings is within a specific range.
   * The two strings are trimmed before comparing.
   *
   * @param _left         The first string to be compared.
   * @param _right        The second string to be compared.
   * @param radius        The maximum edit distance that is acceptable.
   * @param localename    The locale to use for the comparison. Defaults to ''.
   * @return              Whether or not the two strings are within the given specified edit distance.
   */
  EXPORT BOOLEAN EditDistanceWithinRadius(UNICODE _left, UNICODE _right, UNSIGNED4 radius, VARSTRING localename = '') :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleEditDistanceWithinRadius(_left, _right, radius, localename);

  /**
   * Returns the number of words in the string. Word boundaries are marked by the unicode break semantics.
   *
   * @param text          The string to be broken into words.
   * @param localename    The locale to use for the break semantics. Defaults to ''.
   * @return              The number of words in the string.
   */
  EXPORT UNSIGNED4 WordCount(UNICODE text, VARSTRING localename = '') :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleWordCount(text, localename);

  /**
   * Returns the n-th word from the string. Word boundaries are marked by the unicode break semantics.
   *
   * @param text          The string to be broken into words.
   * @param n             Which word should be returned from the function.
   * @param localename    The locale to use for the break semantics. Defaults to ''.
   * @return              The n-th word of the string.
   */
  EXPORT UNICODE GetNthWord(UNICODE text, UNSIGNED4 n, VARSTRING localename = '') :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleGetNthWord(text, n, localename);

  /**
   * Returns everything but the string's nth word and some whitespaces. Words are marked by the unicode break semantics.
   * Trailing whitespaces are always removed with the word.
   * Leading whitespaces are only removed with the word if the nth word is the first word.
   * Returns a blank string if there are no words in the source string.
   * Returns the source string if the number of words in the string is less than the n parameter's assigned value.
   *
   * @param text          The string to be broken into words.
   * @param n             Which word should be removed from the string.
   * @param localename    The locale to use for the break semantics. Defaults to ''.
   * @return              The string excluding the nth word.
   */
  EXPORT ExcludeNthWord(UNICODE text, UNSIGNED4 n, VARSTRING localename = '') :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleExcludeNthWord(text, n, localename);

  /**
   * Returns everything except the first word from the string. Words are marked by the unicode break semantics.
   * Whitespace before and after the first word is also removed.
   * Implemented as ExcludeNthWord with n fixed at 1.
   *
   * @param text          The string to be broken into words.
   * @param localename    The locale to use for the break semantics. Defaults to ''.
   * @return              The string excluding the first word.
   */
  EXPORT ExcludeFirstWord(UNICODE text, VARSTRING localename = '') :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleExcludeNthWord(text, 1, localename);

  /**
   * Returns everything except the last word from the string. Word boundaries are marked by the unicode break semantics.
   * Whitespace after a word is removed with the word and leading whitespace is removed with the first word.
   *
   * @param text          The string to be broken into words.
   * @param localename    The locale to use for the break semantics. Defaults to ''.
   * @return              The string excluding the last word.
   */
  EXPORT UNICODE ExcludeLastWord(UNICODE text, VARSTRING localename = '') :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleExcludeLastWord(text, localename);

  /**
   * Returns the source string with all the characters that match characters in the search string replaced
   * with the character at the corresponding position in the replacement string.
   * The isEmpty() tests in the beginning of the function check for invalid sequences in addition to blank strings.
   * If any of the isEmpty() tests are true, the function will return the source string.
   *
   * @param text          The string that is being transformed.
   * @param search        The string containing the set of characters to be replaced.
   * @param replacement   The string containing the characters to act as replacements.
   * @return              The string containing the source string but with the translated characters.
   */
  EXPORT Translate(UNICODE text, UNICODE search, UNICODE replacement) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleTranslate(text, search, replacement);

  /**
   * Returns true if the prefix string matches the leading characters in the source string. Trailing and leading spaces
   * are stripped from the prefix before matching. Unless specified, normalization will not occur. Unless initiated as hex and
   * then converted to Unicode using TRANSFER, ECL will perform its own normalization on your declared Unicode string.
   *
   * @param src       The string being searched in.
   * @param prefix    The prefix to search for.
   * @param form      The type of normalization to be employed.
   */
  EXPORT BOOLEAN StartsWith(UNICODE src, UNICODE prefix, STRING form) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleStartsWith(src, prefix, form);

  /**
   * Returns true if the suffix string matches the trailing characters in the source string. Trailing and leading spaces
   * are stripped from the suffix before matching. Unless specified, normalization will not occur. Unless initiated as hex and
   * then converted to Unicode using TRANSFER, ECL will perform its own normalization on your declared Unicode string.
   *
   * @param src       The string being searched in.
   * @param suffix    The suffix to search for.
   * @param form      The type of normalization to be employed.
   */
  EXPORT BOOLEAN EndsWith(UNICODE src, UNICODE suffix, STRING form) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleEndsWith(src, suffix, form);

  /**
   * Returns a string containing the version of ICU being used to implement the unicode library.
   */
  EXPORT STRING Version() := lib_unicodelib.UnicodeLib.UnicodeVersion();

  /**
   * Removes the suffix from the search string, if present, and returns the result. Trailing spaces are
   * stripped from both strings before matching.
   *
   * @param src       The string being searched in.
   * @param suffix    The suffix to search for.
   * @param form      The type of normalization to be employed.
   * @return          The string excluding the suffix, if EndsWith is true.
   */
  EXPORT RemoveSuffix(UNICODE src, UNICODE suffix, STRING form) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleRemoveSuffix(src, suffix, form);

  /**
   * Returns a string containing text repeated n times.
   *
   * @param text      The string to be repeated.
   * @param n         Number of repetitions.
   * @return          A string containing n concatenations of the string text.
   */
  EXPORT Repeat(UNICODE text, UNSIGNED4 n) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleRepeat(text, n);

  /**
   * Returns the number of occurrences of the second string within the first string.
   *
   * @param src       The string that is searched.
   * @param sought    The string being sought.
   * @param form      The normalization form to apply (must be supplied; this signature declares no default).
   * @return          The number of occurrences, matches.
   */
  EXPORT UNSIGNED4 FindCount(UNICODE src, UNICODE sought, STRING form) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleFindCount(src, sought, form);

  /**
   * Returns the number of words that the string contains. Words are separated by one or more separator strings. No
   * spaces are stripped from either string before matching. allow_blank is set to false by default.
   *
   * @param src           The string being searched in.
   * @param separator     The string used to separate words.
   * @param allow_blank   Indicates if empty/blank string items are included in the results.
   * @return              The number of delimited tokens in the source string.
   */
  EXPORT UNSIGNED4 CountWords(UNICODE src, UNICODE separator, BOOLEAN allow_blank = FALSE) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleCountWords(src, separator, allow_blank);

  /**
   * Returns the delimited words that the string contains in a UnicodeSet. Words are separated by one or more separator strings. No
   * spaces are stripped from either string before matching. allow_blank is set to false by default.
   *
   * @param src           The string being searched in.
   * @param separator     The string used to separate words.
   * @param allow_blank   Indicates if empty/blank string items are included in the results.
   * @return              A UnicodeSet whose members are the delimited words.
   */
  EXPORT SplitWords(UNICODE src, UNICODE separator, BOOLEAN allow_blank = FALSE) :=
    lib_unicodelib.UnicodeLib.UnicodeLocaleSplitWords(src, separator, allow_blank);

  END;