unicode.c 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750
  1. #define STB_DEFINE
  2. #include "../stb.h"
  3. // create unicode mappings
  4. //
  5. // Two kinds of mappings:
  6. // map to a number
  7. // map to a bit
  8. //
  9. // For mapping to a number, we use the following strategy:
  10. //
  11. // User supplies:
  12. // 1. a table of numbers (for now we use uint16, so full Unicode table is 4MB)
  13. // 2. a "don't care" value
  14. // 3. define a 'fallback' value (typically 0)
  15. // 4. define a fast-path range (typically 0..255 or 0..1023) [@TODO: automate detecting this]
  16. //
  17. // Code:
  18. // 1. Determine range of *end* of unicode codepoints (U+10FFFF and down) which
  19. // all have the same value (or don't care). If large enough, emit this as a
  20. // special case in the code.
  21. // 2. Repeat above, limited to at most U+FFFF.
  22. // 3. Cluster the data into intervals of 8,16,32,64,128,256 numeric values.
  23. // 3a. If all the values in an interval are fallback/dont-care, no further processing
  24. // 3b. Find the "trimmed range" outside which all the values are the fallback or don't care
  25. // 3c. Find the "special trimmed range" outside which all the values are some constant or don't care
  26. // 4. Pack the clusters into continuous memory, and find previous instances of
  27. // the cluster. Repeat for trimmed & special-trimmed. In the first case, find
  28. // previous instances of the cluster (allow don't-care to match in either
  29. // direction), both aligned and mis-aligned; in the latter, starting where
  30. // things start or mis-aligned. Build an index table specifying the
  31. // location of each cluster (and its length). Allow an extra indirection here;
  32. // the full-sized index can index a smaller table which has the actual offset
  33. // (and lengths).
  34. // 5. Associate with each packed continuous memory above the amount of memory
  35. // required to store the data w/ smallest datatype (of uint8, uint16, uint32).
  36. // Discard the continuous memory. Recurse on each index table, but avoid the
  37. // smaller packing.
  38. //
  39. // For mapping to a bit, we pack the results for 8 characters into a byte, and then apply
  40. // the above strategy. Note that there may be more optimal approaches with e.g. packing
  41. // 8 different bits into a single structure, though, which we should explore eventually.
  42. // currently we limit *indices* to being 2^16, and we pack them as
  43. // index + end_trim*2^16 + start_trim*2^24; specials have to go in a separate table
  44. typedef uint32 uval;
  45. #define UVAL_DONT_CARE_DEFAULT 0xffffffff
  46. typedef struct
  47. {
  48. uval *input;
  49. uint32 dont_care;
  50. uint32 fallback;
  51. int fastpath;
  52. int length;
  53. int depth;
  54. int has_sign;
  55. int splittable;
  56. int replace_fallback_with_codepoint;
  57. size_t input_size;
  58. size_t inherited_storage;
  59. } table;
  60. typedef struct
  61. {
  62. int split_log2;
  63. table result; // index into not-returned table
  64. int storage;
  65. } output;
  66. typedef struct
  67. {
  68. table t;
  69. char **output_name;
  70. } info;
  71. typedef struct
  72. {
  73. size_t path;
  74. size_t size;
  75. } result;
  76. typedef struct
  77. {
  78. uint8 trim_end;
  79. uint8 trim_start;
  80. uint8 special;
  81. uint8 aligned;
  82. uint8 indirect;
  83. uint16 overhead; // add some forced overhead for each mode to avoid getting complex encoding when it doesn't save much
  84. } mode_info;
  85. mode_info modes[] =
  86. {
  87. { 0,0,0,0,0, 32, },
  88. { 0,0,0,0,1, 100, },
  89. { 0,0,0,1,0, 32, },
  90. { 0,0,0,1,1, 100, },
  91. { 0,0,1,0,1, 100, },
  92. { 0,0,1,1,0, 32, },
  93. { 0,0,1,1,1, 200, },
  94. { 1,0,0,0,0, 100, },
  95. { 1,0,0,0,1, 120, },
  96. { 1,1,0,0,0, 100, },
  97. { 1,1,0,0,1, 130, },
  98. { 1,0,1,0,0, 130, },
  99. { 1,0,1,0,1, 180, },
  100. { 1,1,1,0,0, 180, },
  101. { 1,1,1,0,1, 200, },
  102. };
  103. #define MODECOUNT (sizeof(modes)/sizeof(modes[0]))
  104. #define CLUSTERSIZECOUNT 6 // 8,16, 32,64, 128,256
  105. size_t size_for_max_number(uint32 number)
  106. {
  107. if (number == 0) return 0;
  108. if (number < 256) return 1;
  109. if (number < 256*256) return 2;
  110. if (number < 256*256*256) return 3;
  111. return 4;
  112. }
  113. size_t size_for_max_number_aligned(uint32 number)
  114. {
  115. size_t n = size_for_max_number(number);
  116. return n == 3 ? 4 : n;
  117. }
  118. uval get_data(uval *data, int offset, uval *end)
  119. {
  120. if (data + offset >= end)
  121. return 0;
  122. else
  123. return data[offset];
  124. }
  125. int safe_len(uval *data, int len, uval *end)
  126. {
  127. if (len > end - data)
  128. return end - data;
  129. return len;
  130. }
  131. uval tempdata[256];
  132. int dirty=0;
  133. size_t find_packed(uval **packed, uval *data, int len, int aligned, int fastpath, uval *end, int offset, int replace)
  134. {
  135. int packlen = stb_arr_len(*packed);
  136. int i,p;
  137. if (data+len > end || replace) {
  138. int safelen = safe_len(data, len, end);
  139. memset(tempdata, 0, dirty*sizeof(tempdata[0]));
  140. memcpy(tempdata, data, safelen * sizeof(data[0]));
  141. data = tempdata;
  142. dirty = len;
  143. }
  144. if (replace) {
  145. int i;
  146. int safelen = safe_len(data, len, end);
  147. for (i=0; i < safelen; ++i)
  148. if (data[i] == 0)
  149. data[i] = offset+i;
  150. }
  151. if (len <= 0)
  152. return 0;
  153. if (!fastpath) {
  154. if (aligned) {
  155. for (i=0; i < packlen; i += len)
  156. if ((*packed)[i] == data[0] && 0==memcmp(&(*packed)[i], data, len * sizeof(uval)))
  157. return i / len;
  158. } else {
  159. for (i=0; i < packlen-len+1; i += 1 )
  160. if ((*packed)[i] == data[0] && 0==memcmp(&(*packed)[i], data, len * sizeof(uval)))
  161. return i;
  162. }
  163. }
  164. p = stb_arr_len(*packed);
  165. for (i=0; i < len; ++i)
  166. stb_arr_push(*packed, data[i]);
  167. return p;
  168. }
  169. void output_table(char *name1, char *name2, uval *data, int length, int sign, char **names)
  170. {
  171. char temp[20];
  172. uval maxv = 0;
  173. int bytes, numlen, at_newline;
  174. int linelen = 79; // @TODO: make table more readable by choosing a length that's a multiple?
  175. int i,pos, do_split=0;
  176. for (i=0; i < length; ++i)
  177. if (sign)
  178. maxv = stb_max(maxv, (uval)abs((int)data[i]));
  179. else
  180. maxv = stb_max(maxv, data[i]);
  181. bytes = size_for_max_number_aligned(maxv);
  182. sprintf(temp, "%d", maxv);
  183. numlen=strlen(temp);
  184. if (sign)
  185. ++numlen;
  186. if (bytes == 0)
  187. return;
  188. printf("uint%d %s%s[%d] = {\n", bytes*8, name1, name2, length);
  189. at_newline = 1;
  190. for (i=0; i < length; ++i) {
  191. if (pos + numlen + 2 > linelen) {
  192. printf("\n");
  193. at_newline = 1;
  194. pos = 0;
  195. }
  196. if (at_newline) {
  197. printf(" ");
  198. pos = 2;
  199. at_newline = 0;
  200. } else {
  201. printf(" ");
  202. ++pos;
  203. }
  204. printf("%*d,", numlen, data[i]);
  205. pos += numlen+1;
  206. }
  207. if (!at_newline) printf("\n");
  208. printf("};\n");
  209. }
  210. void output_table_with_trims(char *name1, char *name2, uval *data, int length)
  211. {
  212. uval maxt=0, maxp=0;
  213. int i,d,s,e, count;
  214. // split the table into two pieces
  215. uval *trims = NULL;
  216. if (length == 0)
  217. return;
  218. for (i=0; i < stb_arr_len(data); ++i) {
  219. stb_arr_push(trims, data[i] >> 16);
  220. data[i] &= 0xffff;
  221. maxt = stb_max(maxt, trims[i]);
  222. maxp = stb_max(maxp, data[i]);
  223. }
  224. d=s=e=1;
  225. if (maxt >= 256) {
  226. // need to output start & end values
  227. if (maxp >= 256) {
  228. // can pack into a single table
  229. printf("struct { uint16 val; uint8 start, end; } %s%s[%d] = {\n", name1, name2, length);
  230. } else {
  231. output_table(name1, name2, data, length, 0, 0);
  232. d=0;
  233. printf("struct { uint8 start, end; } %s%s_trim[%d] = {\n", name1, name2, length);
  234. }
  235. } else if (maxt > 0) {
  236. if (maxp >= 256) {
  237. output_table(name1, name2, data, length, 0, 0);
  238. output_table(name1, stb_sprintf("%s_end", name2), trims, length, 0, 0);
  239. return;
  240. } else {
  241. printf("struct { uint8 val, end; } %s%s[%d] = {\n", name1, name2, length);
  242. s=0;
  243. }
  244. } else {
  245. output_table(name1, name2, data, length, 0, 0);
  246. return;
  247. }
  248. // d or s can be zero (but not both), e is always present and last
  249. count = d + s + e;
  250. assert(count >= 2 && count <= 3);
  251. {
  252. char temp[60];
  253. uval maxv = 0;
  254. int numlen, at_newline, len;
  255. int linelen = 79; // @TODO: make table more readable by choosing a length that's a multiple?
  256. int i,pos, do_split=0;
  257. numlen = 0;
  258. for (i=0; i < length; ++i) {
  259. if (count == 2)
  260. sprintf(temp, "{%d,%d}", d ? data[i] : (trims[i]>>8), trims[i]&255);
  261. else
  262. sprintf(temp, "{%d,%d,%d}", data[i], trims[i]>>8, trims[i]&255);
  263. len = strlen(temp);
  264. numlen = stb_max(len, numlen);
  265. }
  266. at_newline = 1;
  267. for (i=0; i < length; ++i) {
  268. if (pos + numlen + 2 > linelen) {
  269. printf("\n");
  270. at_newline = 1;
  271. pos = 0;
  272. }
  273. if (at_newline) {
  274. printf(" ");
  275. pos = 2;
  276. at_newline = 0;
  277. } else {
  278. printf(" ");
  279. ++pos;
  280. }
  281. if (count == 2)
  282. sprintf(temp, "{%d,%d}", d ? data[i] : (trims[i]>>8), trims[i]&255);
  283. else
  284. sprintf(temp, "{%d,%d,%d}", data[i], trims[i]>>8, trims[i]&255);
  285. printf("%*s,", numlen, temp);
  286. pos += numlen+1;
  287. }
  288. if (!at_newline) printf("\n");
  289. printf("};\n");
  290. }
  291. }
  292. int weight=1;
  293. table pack_for_mode(table *t, int mode, char *table_name)
  294. {
  295. size_t extra_size;
  296. int i;
  297. uval maxv;
  298. mode_info mi = modes[mode % MODECOUNT];
  299. int size = 8 << (mode / MODECOUNT);
  300. table newtab;
  301. uval *packed = NULL;
  302. uval *index = NULL;
  303. uval *indirect = NULL;
  304. uval *specials = NULL;
  305. newtab.dont_care = UVAL_DONT_CARE_DEFAULT;
  306. if (table_name)
  307. printf("// clusters of %d\n", size);
  308. for (i=0; i < t->length; i += size) {
  309. uval newval;
  310. int fastpath = (i < t->fastpath);
  311. if (mi.special) {
  312. int end_trim = size-1;
  313. int start_trim = 0;
  314. uval special;
  315. // @TODO: pick special from start or end instead of only end depending on which is longer
  316. for(;;) {
  317. special = t->input[i + end_trim];
  318. if (special != t->dont_care || end_trim == 0)
  319. break;
  320. --end_trim;
  321. }
  322. // at this point, special==inp[end_trim], and end_trim >= 0
  323. if (special == t->dont_care && !fastpath) {
  324. // entire block is don't care, so OUTPUT don't care
  325. stb_arr_push(index, newtab.dont_care);
  326. continue;
  327. } else {
  328. uval pos, trim;
  329. if (mi.trim_end && !fastpath) {
  330. while (end_trim >= 0) {
  331. if (t->input[i + end_trim] == special || t->input[i + end_trim] == t->dont_care)
  332. --end_trim;
  333. else
  334. break;
  335. }
  336. }
  337. if (mi.trim_start && !fastpath) {
  338. while (start_trim < end_trim) {
  339. if (t->input[i + start_trim] == special || t->input[i + start_trim] == t->dont_care)
  340. ++start_trim;
  341. else
  342. break;
  343. }
  344. }
  345. // end_trim points to the last character we have to output
  346. // find the first match, or add it
  347. pos = find_packed(&packed, &t->input[i+start_trim], end_trim-start_trim+1, mi.aligned, fastpath, &t->input[t->length], i+start_trim, t->replace_fallback_with_codepoint);
  348. // encode as a uval
  349. if (!mi.trim_end) {
  350. if (end_trim == 0)
  351. pos = special;
  352. else
  353. pos = pos | 0x80000000;
  354. } else {
  355. assert(end_trim < size && end_trim >= -1);
  356. if (!fastpath) assert(end_trim < size-1); // special always matches last one
  357. assert(end_trim < size && end_trim+1 >= 0);
  358. if (!fastpath) assert(end_trim+1 < size);
  359. if (mi.trim_start)
  360. trim = start_trim*256 + (end_trim+1);
  361. else
  362. trim = end_trim+1;
  363. assert(pos < 65536); // @TODO: if this triggers, just bail on this search path
  364. pos = pos + (trim << 16);
  365. }
  366. newval = pos;
  367. stb_arr_push(specials, special);
  368. }
  369. } else if (mi.trim_end) {
  370. int end_trim = size-1;
  371. int start_trim = 0;
  372. uval pos, trim;
  373. while (end_trim >= 0 && !fastpath)
  374. if (t->input[i + end_trim] == t->fallback || t->input[i + end_trim] == t->dont_care)
  375. --end_trim;
  376. else
  377. break;
  378. if (mi.trim_start && !fastpath) {
  379. while (start_trim < end_trim) {
  380. if (t->input[i + start_trim] == t->fallback || t->input[i + start_trim] == t->dont_care)
  381. ++start_trim;
  382. else
  383. break;
  384. }
  385. }
  386. // end_trim points to the last character we have to output, and can be -1
  387. ++end_trim; // make exclusive at end
  388. if (end_trim == 0 && size == 256)
  389. start_trim = end_trim = 1; // we can't make encode a length from 0..256 in 8 bits, so restrict end_trim to 1..256
  390. // find the first match, or add it
  391. pos = find_packed(&packed, &t->input[i+start_trim], end_trim - start_trim, mi.aligned, fastpath, &t->input[t->length], i+start_trim, t->replace_fallback_with_codepoint);
  392. assert(end_trim <= size && end_trim >= 0);
  393. if (size == 256)
  394. assert(end_trim-1 < 256 && end_trim-1 >= 0);
  395. else
  396. assert(end_trim < 256 && end_trim >= 0);
  397. if (size == 256)
  398. --end_trim;
  399. if (mi.trim_start)
  400. trim = start_trim*256 + end_trim;
  401. else
  402. trim = end_trim;
  403. assert(pos < 65536); // @TODO: if this triggers, just bail on this search path
  404. pos = pos + (trim << 16);
  405. newval = pos;
  406. } else {
  407. newval = find_packed(&packed, &t->input[i], size, mi.aligned, fastpath, &t->input[t->length], i, t->replace_fallback_with_codepoint);
  408. }
  409. if (mi.indirect) {
  410. int j;
  411. for (j=0; j < stb_arr_len(indirect); ++j)
  412. if (indirect[j] == newval)
  413. break;
  414. if (j == stb_arr_len(indirect))
  415. stb_arr_push(indirect, newval);
  416. stb_arr_push(index, j);
  417. } else {
  418. stb_arr_push(index, newval);
  419. }
  420. }
  421. // total up the new size for everything but the index table
  422. extra_size = mi.overhead * weight; // not the actual overhead cost; a penalty to avoid excessive complexity
  423. extra_size += 150; // per indirection
  424. if (table_name)
  425. extra_size = 0;
  426. if (t->has_sign) {
  427. // 'packed' contains two values, which should be packed positive & negative for size
  428. uval maxv2;
  429. for (i=0; i < stb_arr_len(packed); ++i)
  430. if (packed[i] & 0x80000000)
  431. maxv2 = stb_max(maxv2, packed[i]);
  432. else
  433. maxv = stb_max(maxv, packed[i]);
  434. maxv = stb_max(maxv, maxv2) << 1;
  435. } else {
  436. maxv = 0;
  437. for (i=0; i < stb_arr_len(packed); ++i)
  438. if (packed[i] > maxv && packed[i] != t->dont_care)
  439. maxv = packed[i];
  440. }
  441. extra_size += stb_arr_len(packed) * (t->splittable ? size_for_max_number(maxv) : size_for_max_number_aligned(maxv));
  442. if (table_name) {
  443. if (t->splittable)
  444. output_table_with_trims(table_name, "", packed, stb_arr_len(packed));
  445. else
  446. output_table(table_name, "", packed, stb_arr_len(packed), t->has_sign, NULL);
  447. }
  448. maxv = 0;
  449. for (i=0; i < stb_arr_len(specials); ++i)
  450. if (specials[i] > maxv)
  451. maxv = specials[i];
  452. extra_size += stb_arr_len(specials) * size_for_max_number_aligned(maxv);
  453. if (table_name)
  454. output_table(table_name, "_default", specials, stb_arr_len(specials), 0, NULL);
  455. maxv = 0;
  456. for (i=0; i < stb_arr_len(indirect); ++i)
  457. if (indirect[i] > maxv)
  458. maxv = indirect[i];
  459. extra_size += stb_arr_len(indirect) * size_for_max_number(maxv);
  460. if (table_name && stb_arr_len(indirect)) {
  461. if (mi.trim_end)
  462. output_table_with_trims(table_name, "_index", indirect, stb_arr_len(indirect));
  463. else {
  464. assert(0); // this case should only trigger in very extreme circumstances
  465. output_table(table_name, "_index", indirect, stb_arr_len(indirect), 0, NULL);
  466. }
  467. mi.trim_end = mi.special = 0;
  468. }
  469. if (table_name)
  470. printf("// above tables should be %d bytes\n", extra_size);
  471. maxv = 0;
  472. for (i=0; i < stb_arr_len(index); ++i)
  473. if (index[i] > maxv && index[i] != t->dont_care)
  474. maxv = index[i];
  475. newtab.splittable = mi.trim_end;
  476. newtab.input_size = newtab.splittable ? size_for_max_number(maxv) : size_for_max_number_aligned(maxv);
  477. newtab.input = index;
  478. newtab.length = stb_arr_len(index);
  479. newtab.inherited_storage = t->inherited_storage + extra_size;
  480. newtab.fastpath = 0;
  481. newtab.depth = t->depth+1;
  482. stb_arr_free(indirect);
  483. stb_arr_free(packed);
  484. stb_arr_free(specials);
  485. return newtab;
  486. }
  487. result pack_table(table *t, size_t path, int min_storage)
  488. {
  489. int i;
  490. result best;
  491. best.size = t->inherited_storage + t->input_size * t->length;
  492. best.path = path;
  493. if ((int) t->inherited_storage > min_storage) {
  494. best.size = stb_max(best.size, t->inherited_storage);
  495. return best;
  496. }
  497. if (t->length <= 256 || t->depth >= 4) {
  498. //printf("%08x: %7d\n", best.path, best.size);
  499. return best;
  500. }
  501. path <<= 7;
  502. for (i=0; i < MODECOUNT * CLUSTERSIZECOUNT; ++i) {
  503. table newtab;
  504. result r;
  505. newtab = pack_for_mode(t, i, 0);
  506. r = pack_table(&newtab, path+i+1, min_storage);
  507. if (r.size < best.size)
  508. best = r;
  509. stb_arr_free(newtab.input);
  510. //printf("Size: %6d + %6d\n", newtab.inherited_storage, newtab.input_size * newtab.length);
  511. }
  512. return best;
  513. }
  514. int pack_table_by_modes(table *t, int *modes)
  515. {
  516. table s = *t;
  517. while (*modes > -1) {
  518. table newtab;
  519. newtab = pack_for_mode(&s, *modes, 0);
  520. if (s.input != t->input)
  521. stb_arr_free(s.input);
  522. s = newtab;
  523. ++modes;
  524. }
  525. return s.inherited_storage + s.input_size * s.length;
  526. }
  527. int strip_table(table *t, int exceptions)
  528. {
  529. uval terminal_value;
  530. int p = t->length-1;
  531. while (t->input[p] == t->dont_care)
  532. --p;
  533. terminal_value = t->input[p];
  534. while (p >= 0x10000) {
  535. if (t->input[p] != terminal_value && t->input[p] != t->dont_care) {
  536. if (exceptions)
  537. --exceptions;
  538. else
  539. break;
  540. }
  541. --p;
  542. }
  543. return p+1; // p is a character we must output
  544. }
  545. void optimize_table(table *t, char *table_name)
  546. {
  547. int modelist[3] = { 85, -1 };
  548. int modes[8];
  549. int num_modes = 0;
  550. int decent_size;
  551. result r;
  552. size_t path;
  553. table s;
  554. // strip tail end of table
  555. int orig_length = t->length;
  556. int threshhold = 0xffff;
  557. int p = strip_table(t, 2);
  558. int len_saved = t->length - p;
  559. if (len_saved >= threshhold) {
  560. t->length = p;
  561. while (p > 0x10000) {
  562. p = strip_table(t, 0);
  563. len_saved = t->length - p;
  564. if (len_saved < 0x10000)
  565. break;
  566. len_saved = orig_length - p;
  567. if (len_saved < threshhold)
  568. break;
  569. threshhold *= 2;
  570. }
  571. }
  572. t->depth = 1;
  573. // find size of table if we use path 86
  574. decent_size = pack_table_by_modes(t, modelist);
  575. #if 1
  576. // find best packing of remainder of table by exploring tree of packings
  577. r = pack_table(t, 0, decent_size);
  578. // use the computed 'path' to evaluate and output tree
  579. path = r.path;
  580. #else
  581. path = 86;//90;//132097;
  582. #endif
  583. while (path) {
  584. modes[num_modes++] = (path & 127) - 1;
  585. path >>= 7;
  586. }
  587. printf("// modes: %d\n", r.path);
  588. s = *t;
  589. while (num_modes > 0) {
  590. char name[256];
  591. sprintf(name, "%s_%d", table_name, num_modes+1);
  592. --num_modes;
  593. s = pack_for_mode(&s, modes[num_modes], name);
  594. }
  595. // output the final table as-is
  596. if (s.splittable)
  597. output_table_with_trims(table_name, "_1", s.input, s.length);
  598. else
  599. output_table(table_name, "_1", s.input, s.length, 0, NULL);
  600. }
  601. uval unicode_table[0x110000];
  602. typedef struct
  603. {
  604. uval lo,hi;
  605. } char_range;
  606. char_range get_range(char *str)
  607. {
  608. char_range cr;
  609. char *p;
  610. cr.lo = strtol(str, &p, 16);
  611. p = stb_skipwhite(p);
  612. if (*p == '.')
  613. cr.hi = strtol(p+2, NULL, 16);
  614. else
  615. cr.hi = cr.lo;
  616. return cr;
  617. }
  // Step over 'count' semicolons in 's' and return a pointer to the
  // character that follows the last one skipped.  Asserts when the string
  // runs out of semicolons.
  char *skip_semi(char *s, int count)
  {
     for (; count; --count) {
        char *semi = strchr(s, ';');
        assert(semi != NULL);
        s = semi + 1;
     }
     return s;
  }
  628. int main(int argc, char **argv)
  629. {
  630. table t;
  631. uval maxv=0;
  632. int i,n=0;
  633. char **s = stb_stringfile("../../data/UnicodeData.txt", &n);
  634. assert(s);
  635. for (i=0; i < n; ++i) {
  636. if (s[i][0] == '#' || s[i][0] == '\n' || s[i][0] == 0)
  637. ;
  638. else {
  639. char_range cr = get_range(s[i]);
  640. char *t = skip_semi(s[i], 13);
  641. uval j, v;
  642. if (*t == ';' || *t == '\n' || *t == 0)
  643. v = 0;
  644. else {
  645. v = strtol(t, NULL, 16);
  646. if (v < 65536) {
  647. maxv = stb_max(v, maxv);
  648. for (j=cr.lo; j <= cr.hi; ++j) {
  649. unicode_table[j] = v;
  650. //printf("%06x => %06x\n", j, v);
  651. }
  652. }
  653. }
  654. }
  655. }
  656. t.depth = 0;
  657. t.dont_care = UVAL_DONT_CARE_DEFAULT;
  658. t.fallback = 0;
  659. t.fastpath = 256;
  660. t.inherited_storage = 0;
  661. t.has_sign = 0;
  662. t.splittable = 0;
  663. t.input = unicode_table;
  664. t.input_size = size_for_max_number(maxv);
  665. t.length = 0x110000;
  666. t.replace_fallback_with_codepoint = 1;
  667. optimize_table(&t, "stbu_upppercase");
  668. return 0;
  669. }