rune.c 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. /*
  2. * The authors of this software are Rob Pike and Ken Thompson.
  3. * Copyright (c) 2002 by Lucent Technologies.
  4. * Permission to use, copy, modify, and distribute this software for any
  5. * purpose without fee is hereby granted, provided that this entire notice
  6. * is included in all copies of any software which is or includes a copy
  7. * or modification of this software and in all copies of the supporting
  8. * documentation for such software.
  9. * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
  10. * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
  11. * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
  12. * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
  13. */
  14. #include <stdarg.h>
  15. #include <string.h>
  16. #include "third_party/utf/utf.h"
  17. #include "third_party/utf/utfdef.h"
  18. enum
  19. {
  20. Bit1 = 7,
  21. Bitx = 6,
  22. Bit2 = 5,
  23. Bit3 = 4,
  24. Bit4 = 3,
  25. Bit5 = 2,
  26. T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
  27. Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
  28. T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
  29. T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
  30. T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
  31. T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
  32. Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
  33. Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
  34. Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
  35. Rune4 = (1<<(Bit4+3*Bitx))-1,
  36. /* 0001 1111 1111 1111 1111 1111 */
  37. Maskx = (1<<Bitx)-1, /* 0011 1111 */
  38. Testx = Maskx ^ 0xFF, /* 1100 0000 */
  39. Bad = Runeerror,
  40. };
  41. /*
  42. * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
  43. * This is a slower but "safe" version of the old chartorune
  44. * that works on strings that are not necessarily null-terminated.
  45. *
  46. * If you know for sure that your string is null-terminated,
  47. * chartorune will be a bit faster.
  48. *
  49. * It is guaranteed not to attempt to access "length"
  50. * past the incoming pointer. This is to avoid
  51. * possible access violations. If the string appears to be
  52. * well-formed but incomplete (i.e., to get the whole Rune
  53. * we'd need to read past str+length) then we'll set the Rune
  54. * to Bad and return 0.
  55. *
  56. * Note that if we have decoding problems for other
  57. * reasons, we return 1 instead of 0.
  58. */
  59. int
  60. charntorune(Rune *rune, const char *str, int length)
  61. {
  62. int c, c1, c2, c3;
  63. long l;
  64. /* When we're not allowed to read anything */
  65. if(length <= 0) {
  66. goto badlen;
  67. }
  68. /*
  69. * one character sequence (7-bit value)
  70. * 00000-0007F => T1
  71. */
  72. c = *(uchar*)str;
  73. if(c < Tx) {
  74. *rune = c;
  75. return 1;
  76. }
  77. // If we can't read more than one character we must stop
  78. if(length <= 1) {
  79. goto badlen;
  80. }
  81. /*
  82. * two character sequence (11-bit value)
  83. * 0080-07FF => T2 Tx
  84. */
  85. c1 = *(uchar*)(str+1) ^ Tx;
  86. if(c1 & Testx)
  87. goto bad;
  88. if(c < T3) {
  89. if(c < T2)
  90. goto bad;
  91. l = ((c << Bitx) | c1) & Rune2;
  92. if(l <= Rune1)
  93. goto bad;
  94. *rune = l;
  95. return 2;
  96. }
  97. // If we can't read more than two characters we must stop
  98. if(length <= 2) {
  99. goto badlen;
  100. }
  101. /*
  102. * three character sequence (16-bit value)
  103. * 0800-FFFF => T3 Tx Tx
  104. */
  105. c2 = *(uchar*)(str+2) ^ Tx;
  106. if(c2 & Testx)
  107. goto bad;
  108. if(c < T4) {
  109. l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
  110. if(l <= Rune2)
  111. goto bad;
  112. *rune = l;
  113. return 3;
  114. }
  115. if (length <= 3)
  116. goto badlen;
  117. /*
  118. * four character sequence (21-bit value)
  119. * 10000-1FFFFF => T4 Tx Tx Tx
  120. */
  121. c3 = *(uchar*)(str+3) ^ Tx;
  122. if (c3 & Testx)
  123. goto bad;
  124. if (c < T5) {
  125. l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
  126. if (l <= Rune3)
  127. goto bad;
  128. if (l > Runemax)
  129. goto bad;
  130. *rune = l;
  131. return 4;
  132. }
  133. // Support for 5-byte or longer UTF-8 would go here, but
  134. // since we don't have that, we'll just fall through to bad.
  135. /*
  136. * bad decoding
  137. */
  138. bad:
  139. *rune = Bad;
  140. return 1;
  141. badlen:
  142. *rune = Bad;
  143. return 0;
  144. }
  145. /*
  146. * This is the older "unsafe" version, which works fine on
  147. * null-terminated strings.
  148. */
  149. int
  150. chartorune(Rune *rune, const char *str)
  151. {
  152. int c, c1, c2, c3;
  153. long l;
  154. /*
  155. * one character sequence
  156. * 00000-0007F => T1
  157. */
  158. c = *(uchar*)str;
  159. if(c < Tx) {
  160. *rune = c;
  161. return 1;
  162. }
  163. /*
  164. * two character sequence
  165. * 0080-07FF => T2 Tx
  166. */
  167. c1 = *(uchar*)(str+1) ^ Tx;
  168. if(c1 & Testx)
  169. goto bad;
  170. if(c < T3) {
  171. if(c < T2)
  172. goto bad;
  173. l = ((c << Bitx) | c1) & Rune2;
  174. if(l <= Rune1)
  175. goto bad;
  176. *rune = l;
  177. return 2;
  178. }
  179. /*
  180. * three character sequence
  181. * 0800-FFFF => T3 Tx Tx
  182. */
  183. c2 = *(uchar*)(str+2) ^ Tx;
  184. if(c2 & Testx)
  185. goto bad;
  186. if(c < T4) {
  187. l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
  188. if(l <= Rune2)
  189. goto bad;
  190. *rune = l;
  191. return 3;
  192. }
  193. /*
  194. * four character sequence (21-bit value)
  195. * 10000-1FFFFF => T4 Tx Tx Tx
  196. */
  197. c3 = *(uchar*)(str+3) ^ Tx;
  198. if (c3 & Testx)
  199. goto bad;
  200. if (c < T5) {
  201. l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
  202. if (l <= Rune3)
  203. goto bad;
  204. if (l > Runemax)
  205. goto bad;
  206. *rune = l;
  207. return 4;
  208. }
  209. /*
  210. * Support for 5-byte or longer UTF-8 would go here, but
  211. * since we don't have that, we'll just fall through to bad.
  212. */
  213. /*
  214. * bad decoding
  215. */
  216. bad:
  217. *rune = Bad;
  218. return 1;
  219. }
  220. int
  221. isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
  222. *consumed = charntorune(rune, str, length);
  223. return *rune != Runeerror || *consumed == 3;
  224. }
  225. int
  226. runetochar(char *str, const Rune *rune)
  227. {
  228. /* Runes are signed, so convert to unsigned for range check. */
  229. unsigned long c;
  230. /*
  231. * one character sequence
  232. * 00000-0007F => 00-7F
  233. */
  234. c = *rune;
  235. if(c <= Rune1) {
  236. str[0] = c;
  237. return 1;
  238. }
  239. /*
  240. * two character sequence
  241. * 0080-07FF => T2 Tx
  242. */
  243. if(c <= Rune2) {
  244. str[0] = T2 | (c >> 1*Bitx);
  245. str[1] = Tx | (c & Maskx);
  246. return 2;
  247. }
  248. /*
  249. * If the Rune is out of range, convert it to the error rune.
  250. * Do this test here because the error rune encodes to three bytes.
  251. * Doing it earlier would duplicate work, since an out of range
  252. * Rune wouldn't have fit in one or two bytes.
  253. */
  254. if (c > Runemax)
  255. c = Runeerror;
  256. /*
  257. * three character sequence
  258. * 0800-FFFF => T3 Tx Tx
  259. */
  260. if (c <= Rune3) {
  261. str[0] = T3 | (c >> 2*Bitx);
  262. str[1] = Tx | ((c >> 1*Bitx) & Maskx);
  263. str[2] = Tx | (c & Maskx);
  264. return 3;
  265. }
  266. /*
  267. * four character sequence (21-bit value)
  268. * 10000-1FFFFF => T4 Tx Tx Tx
  269. */
  270. str[0] = T4 | (c >> 3*Bitx);
  271. str[1] = Tx | ((c >> 2*Bitx) & Maskx);
  272. str[2] = Tx | ((c >> 1*Bitx) & Maskx);
  273. str[3] = Tx | (c & Maskx);
  274. return 4;
  275. }
  276. int
  277. runelen(Rune rune)
  278. {
  279. char str[10];
  280. return runetochar(str, &rune);
  281. }
  282. int
  283. runenlen(const Rune *r, int nrune)
  284. {
  285. int nb;
  286. ulong c; /* Rune is signed, so use unsigned for range check. */
  287. nb = 0;
  288. while(nrune--) {
  289. c = *r++;
  290. if (c <= Rune1)
  291. nb++;
  292. else if (c <= Rune2)
  293. nb += 2;
  294. else if (c <= Rune3)
  295. nb += 3;
  296. else if (c <= Runemax)
  297. nb += 4;
  298. else
  299. nb += 3; /* Runeerror = 0xFFFD, see runetochar */
  300. }
  301. return nb;
  302. }
  303. int
  304. fullrune(const char *str, int n)
  305. {
  306. if (n > 0) {
  307. int c = *(uchar*)str;
  308. if (c < Tx)
  309. return 1;
  310. if (n > 1) {
  311. if (c < T3)
  312. return 1;
  313. if (n > 2) {
  314. if (c < T4 || n > 3)
  315. return 1;
  316. }
  317. }
  318. }
  319. return 0;
  320. }