123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750 |
- #define STB_DEFINE
- #include "../stb.h"
- // create unicode mappings
- //
- // Two kinds of mappings:
- // map to a number
- // map to a bit
- //
- // For mapping to a number, we use the following strategy:
- //
- // User supplies:
- // 1. a table of numbers (for now we use uint16, so full Unicode table is 4MB)
- // 2. a "don't care" value
- // 3. define a 'fallback' value (typically 0)
- // 4. define a fast-path range (typically 0..255 or 0..1023) [@TODO: automate detecting this]
- //
- // Code:
- // 1. Determine range of *end* of unicode codepoints (U+10FFFF and down) which
- // all have the same value (or don't care). If large enough, emit this as a
- // special case in the code.
- // 2. Repeat above, limited to at most U+FFFF.
- // 3. Cluster the data into intervals of 8,16,32,64,128,256 numeric values.
- // 3a. If all the values in an interval are fallback/dont-care, no further processing
- // 3b. Find the "trimmed range" outside which all the values are the fallback or don't care
- // 3c. Find the "special trimmed range" outside which all the values are some constant or don't care
- // 4. Pack the clusters into continuous memory, and find previous instances of
- // the cluster. Repeat for trimmed & special-trimmed. In the first case, find
- // previous instances of the cluster (allow don't-care to match in either
- // direction), both aligned and mis-aligned; in the latter, starting where
- // things start or mis-aligned. Build an index table specifying the
- // location of each cluster (and its length). Allow an extra indirection here;
- // the full-sized index can index a smaller table which has the actual offset
- // (and lengths).
- // 5. Associate with each packed continuous memory above the amount of memory
- // required to store the data w/ smallest datatype (of uint8, uint16, uint32).
- // Discard the continuous memory. Recurse on each index table, but avoid the
- // smaller packing.
- //
- // For mapping to a bit, we pack the results for 8 characters into a byte, and then apply
- // the above strategy. Note that there may be more optimal approaches with e.g. packing
- // 8 different bits into a single structure, though, which we should explore eventually.
- // currently we limit *indices* to being 2^16, and we pack them as
- // index + end_trim*2^16 + start_trim*2^24; specials have to go in a separate table
- typedef uint32 uval;
- #define UVAL_DONT_CARE_DEFAULT 0xffffffff
- typedef struct
- {
- uval *input;
- uint32 dont_care;
- uint32 fallback;
- int fastpath;
- int length;
- int depth;
- int has_sign;
- int splittable;
- int replace_fallback_with_codepoint;
- size_t input_size;
- size_t inherited_storage;
- } table;
- typedef struct
- {
- int split_log2;
- table result; // index into not-returned table
- int storage;
- } output;
- typedef struct
- {
- table t;
- char **output_name;
- } info;
- typedef struct
- {
- size_t path;
- size_t size;
- } result;
- typedef struct
- {
- uint8 trim_end;
- uint8 trim_start;
- uint8 special;
- uint8 aligned;
- uint8 indirect;
- uint16 overhead; // add some forced overhead for each mode to avoid getting complex encoding when it doesn't save much
- } mode_info;
- mode_info modes[] =
- {
- { 0,0,0,0,0, 32, },
- { 0,0,0,0,1, 100, },
- { 0,0,0,1,0, 32, },
- { 0,0,0,1,1, 100, },
- { 0,0,1,0,1, 100, },
- { 0,0,1,1,0, 32, },
- { 0,0,1,1,1, 200, },
- { 1,0,0,0,0, 100, },
- { 1,0,0,0,1, 120, },
- { 1,1,0,0,0, 100, },
- { 1,1,0,0,1, 130, },
- { 1,0,1,0,0, 130, },
- { 1,0,1,0,1, 180, },
- { 1,1,1,0,0, 180, },
- { 1,1,1,0,1, 200, },
- };
- #define MODECOUNT (sizeof(modes)/sizeof(modes[0]))
- #define CLUSTERSIZECOUNT 6 // 8,16, 32,64, 128,256
- size_t size_for_max_number(uint32 number)
- {
- if (number == 0) return 0;
- if (number < 256) return 1;
- if (number < 256*256) return 2;
- if (number < 256*256*256) return 3;
- return 4;
- }
- size_t size_for_max_number_aligned(uint32 number)
- {
- size_t n = size_for_max_number(number);
- return n == 3 ? 4 : n;
- }
- uval get_data(uval *data, int offset, uval *end)
- {
- if (data + offset >= end)
- return 0;
- else
- return data[offset];
- }
- int safe_len(uval *data, int len, uval *end)
- {
- if (len > end - data)
- return end - data;
- return len;
- }
- uval tempdata[256];
- int dirty=0;
- size_t find_packed(uval **packed, uval *data, int len, int aligned, int fastpath, uval *end, int offset, int replace)
- {
- int packlen = stb_arr_len(*packed);
- int i,p;
- if (data+len > end || replace) {
- int safelen = safe_len(data, len, end);
- memset(tempdata, 0, dirty*sizeof(tempdata[0]));
- memcpy(tempdata, data, safelen * sizeof(data[0]));
- data = tempdata;
- dirty = len;
- }
- if (replace) {
- int i;
- int safelen = safe_len(data, len, end);
- for (i=0; i < safelen; ++i)
- if (data[i] == 0)
- data[i] = offset+i;
- }
- if (len <= 0)
- return 0;
- if (!fastpath) {
- if (aligned) {
- for (i=0; i < packlen; i += len)
- if ((*packed)[i] == data[0] && 0==memcmp(&(*packed)[i], data, len * sizeof(uval)))
- return i / len;
- } else {
- for (i=0; i < packlen-len+1; i += 1 )
- if ((*packed)[i] == data[0] && 0==memcmp(&(*packed)[i], data, len * sizeof(uval)))
- return i;
- }
- }
- p = stb_arr_len(*packed);
- for (i=0; i < len; ++i)
- stb_arr_push(*packed, data[i]);
- return p;
- }
- void output_table(char *name1, char *name2, uval *data, int length, int sign, char **names)
- {
- char temp[20];
- uval maxv = 0;
- int bytes, numlen, at_newline;
- int linelen = 79; // @TODO: make table more readable by choosing a length that's a multiple?
- int i,pos, do_split=0;
- for (i=0; i < length; ++i)
- if (sign)
- maxv = stb_max(maxv, (uval)abs((int)data[i]));
- else
- maxv = stb_max(maxv, data[i]);
- bytes = size_for_max_number_aligned(maxv);
- sprintf(temp, "%d", maxv);
- numlen=strlen(temp);
- if (sign)
- ++numlen;
-
- if (bytes == 0)
- return;
- printf("uint%d %s%s[%d] = {\n", bytes*8, name1, name2, length);
- at_newline = 1;
- for (i=0; i < length; ++i) {
- if (pos + numlen + 2 > linelen) {
- printf("\n");
- at_newline = 1;
- pos = 0;
- }
- if (at_newline) {
- printf(" ");
- pos = 2;
- at_newline = 0;
- } else {
- printf(" ");
- ++pos;
- }
- printf("%*d,", numlen, data[i]);
- pos += numlen+1;
- }
- if (!at_newline) printf("\n");
- printf("};\n");
- }
- void output_table_with_trims(char *name1, char *name2, uval *data, int length)
- {
- uval maxt=0, maxp=0;
- int i,d,s,e, count;
- // split the table into two pieces
- uval *trims = NULL;
- if (length == 0)
- return;
- for (i=0; i < stb_arr_len(data); ++i) {
- stb_arr_push(trims, data[i] >> 16);
- data[i] &= 0xffff;
- maxt = stb_max(maxt, trims[i]);
- maxp = stb_max(maxp, data[i]);
- }
- d=s=e=1;
- if (maxt >= 256) {
- // need to output start & end values
- if (maxp >= 256) {
- // can pack into a single table
- printf("struct { uint16 val; uint8 start, end; } %s%s[%d] = {\n", name1, name2, length);
- } else {
- output_table(name1, name2, data, length, 0, 0);
- d=0;
- printf("struct { uint8 start, end; } %s%s_trim[%d] = {\n", name1, name2, length);
- }
- } else if (maxt > 0) {
- if (maxp >= 256) {
- output_table(name1, name2, data, length, 0, 0);
- output_table(name1, stb_sprintf("%s_end", name2), trims, length, 0, 0);
- return;
- } else {
- printf("struct { uint8 val, end; } %s%s[%d] = {\n", name1, name2, length);
- s=0;
- }
- } else {
- output_table(name1, name2, data, length, 0, 0);
- return;
- }
- // d or s can be zero (but not both), e is always present and last
- count = d + s + e;
- assert(count >= 2 && count <= 3);
- {
- char temp[60];
- uval maxv = 0;
- int numlen, at_newline, len;
- int linelen = 79; // @TODO: make table more readable by choosing a length that's a multiple?
- int i,pos, do_split=0;
- numlen = 0;
- for (i=0; i < length; ++i) {
- if (count == 2)
- sprintf(temp, "{%d,%d}", d ? data[i] : (trims[i]>>8), trims[i]&255);
- else
- sprintf(temp, "{%d,%d,%d}", data[i], trims[i]>>8, trims[i]&255);
- len = strlen(temp);
- numlen = stb_max(len, numlen);
- }
-
- at_newline = 1;
- for (i=0; i < length; ++i) {
- if (pos + numlen + 2 > linelen) {
- printf("\n");
- at_newline = 1;
- pos = 0;
- }
- if (at_newline) {
- printf(" ");
- pos = 2;
- at_newline = 0;
- } else {
- printf(" ");
- ++pos;
- }
- if (count == 2)
- sprintf(temp, "{%d,%d}", d ? data[i] : (trims[i]>>8), trims[i]&255);
- else
- sprintf(temp, "{%d,%d,%d}", data[i], trims[i]>>8, trims[i]&255);
- printf("%*s,", numlen, temp);
- pos += numlen+1;
- }
- if (!at_newline) printf("\n");
- printf("};\n");
- }
- }
- int weight=1;
- table pack_for_mode(table *t, int mode, char *table_name)
- {
- size_t extra_size;
- int i;
- uval maxv;
- mode_info mi = modes[mode % MODECOUNT];
- int size = 8 << (mode / MODECOUNT);
- table newtab;
- uval *packed = NULL;
- uval *index = NULL;
- uval *indirect = NULL;
- uval *specials = NULL;
- newtab.dont_care = UVAL_DONT_CARE_DEFAULT;
- if (table_name)
- printf("// clusters of %d\n", size);
- for (i=0; i < t->length; i += size) {
- uval newval;
- int fastpath = (i < t->fastpath);
- if (mi.special) {
- int end_trim = size-1;
- int start_trim = 0;
- uval special;
- // @TODO: pick special from start or end instead of only end depending on which is longer
- for(;;) {
- special = t->input[i + end_trim];
- if (special != t->dont_care || end_trim == 0)
- break;
- --end_trim;
- }
- // at this point, special==inp[end_trim], and end_trim >= 0
- if (special == t->dont_care && !fastpath) {
- // entire block is don't care, so OUTPUT don't care
- stb_arr_push(index, newtab.dont_care);
- continue;
- } else {
- uval pos, trim;
- if (mi.trim_end && !fastpath) {
- while (end_trim >= 0) {
- if (t->input[i + end_trim] == special || t->input[i + end_trim] == t->dont_care)
- --end_trim;
- else
- break;
- }
- }
- if (mi.trim_start && !fastpath) {
- while (start_trim < end_trim) {
- if (t->input[i + start_trim] == special || t->input[i + start_trim] == t->dont_care)
- ++start_trim;
- else
- break;
- }
- }
- // end_trim points to the last character we have to output
- // find the first match, or add it
- pos = find_packed(&packed, &t->input[i+start_trim], end_trim-start_trim+1, mi.aligned, fastpath, &t->input[t->length], i+start_trim, t->replace_fallback_with_codepoint);
- // encode as a uval
- if (!mi.trim_end) {
- if (end_trim == 0)
- pos = special;
- else
- pos = pos | 0x80000000;
- } else {
- assert(end_trim < size && end_trim >= -1);
- if (!fastpath) assert(end_trim < size-1); // special always matches last one
- assert(end_trim < size && end_trim+1 >= 0);
- if (!fastpath) assert(end_trim+1 < size);
- if (mi.trim_start)
- trim = start_trim*256 + (end_trim+1);
- else
- trim = end_trim+1;
- assert(pos < 65536); // @TODO: if this triggers, just bail on this search path
- pos = pos + (trim << 16);
- }
- newval = pos;
- stb_arr_push(specials, special);
- }
- } else if (mi.trim_end) {
- int end_trim = size-1;
- int start_trim = 0;
- uval pos, trim;
- while (end_trim >= 0 && !fastpath)
- if (t->input[i + end_trim] == t->fallback || t->input[i + end_trim] == t->dont_care)
- --end_trim;
- else
- break;
- if (mi.trim_start && !fastpath) {
- while (start_trim < end_trim) {
- if (t->input[i + start_trim] == t->fallback || t->input[i + start_trim] == t->dont_care)
- ++start_trim;
- else
- break;
- }
- }
- // end_trim points to the last character we have to output, and can be -1
- ++end_trim; // make exclusive at end
- if (end_trim == 0 && size == 256)
- start_trim = end_trim = 1; // we can't make encode a length from 0..256 in 8 bits, so restrict end_trim to 1..256
- // find the first match, or add it
- pos = find_packed(&packed, &t->input[i+start_trim], end_trim - start_trim, mi.aligned, fastpath, &t->input[t->length], i+start_trim, t->replace_fallback_with_codepoint);
- assert(end_trim <= size && end_trim >= 0);
- if (size == 256)
- assert(end_trim-1 < 256 && end_trim-1 >= 0);
- else
- assert(end_trim < 256 && end_trim >= 0);
- if (size == 256)
- --end_trim;
- if (mi.trim_start)
- trim = start_trim*256 + end_trim;
- else
- trim = end_trim;
- assert(pos < 65536); // @TODO: if this triggers, just bail on this search path
- pos = pos + (trim << 16);
- newval = pos;
- } else {
- newval = find_packed(&packed, &t->input[i], size, mi.aligned, fastpath, &t->input[t->length], i, t->replace_fallback_with_codepoint);
- }
- if (mi.indirect) {
- int j;
- for (j=0; j < stb_arr_len(indirect); ++j)
- if (indirect[j] == newval)
- break;
- if (j == stb_arr_len(indirect))
- stb_arr_push(indirect, newval);
- stb_arr_push(index, j);
- } else {
- stb_arr_push(index, newval);
- }
- }
- // total up the new size for everything but the index table
- extra_size = mi.overhead * weight; // not the actual overhead cost; a penalty to avoid excessive complexity
- extra_size += 150; // per indirection
- if (table_name)
- extra_size = 0;
-
- if (t->has_sign) {
- // 'packed' contains two values, which should be packed positive & negative for size
- uval maxv2;
- for (i=0; i < stb_arr_len(packed); ++i)
- if (packed[i] & 0x80000000)
- maxv2 = stb_max(maxv2, packed[i]);
- else
- maxv = stb_max(maxv, packed[i]);
- maxv = stb_max(maxv, maxv2) << 1;
- } else {
- maxv = 0;
- for (i=0; i < stb_arr_len(packed); ++i)
- if (packed[i] > maxv && packed[i] != t->dont_care)
- maxv = packed[i];
- }
- extra_size += stb_arr_len(packed) * (t->splittable ? size_for_max_number(maxv) : size_for_max_number_aligned(maxv));
- if (table_name) {
- if (t->splittable)
- output_table_with_trims(table_name, "", packed, stb_arr_len(packed));
- else
- output_table(table_name, "", packed, stb_arr_len(packed), t->has_sign, NULL);
- }
- maxv = 0;
- for (i=0; i < stb_arr_len(specials); ++i)
- if (specials[i] > maxv)
- maxv = specials[i];
- extra_size += stb_arr_len(specials) * size_for_max_number_aligned(maxv);
- if (table_name)
- output_table(table_name, "_default", specials, stb_arr_len(specials), 0, NULL);
- maxv = 0;
- for (i=0; i < stb_arr_len(indirect); ++i)
- if (indirect[i] > maxv)
- maxv = indirect[i];
- extra_size += stb_arr_len(indirect) * size_for_max_number(maxv);
- if (table_name && stb_arr_len(indirect)) {
- if (mi.trim_end)
- output_table_with_trims(table_name, "_index", indirect, stb_arr_len(indirect));
- else {
- assert(0); // this case should only trigger in very extreme circumstances
- output_table(table_name, "_index", indirect, stb_arr_len(indirect), 0, NULL);
- }
- mi.trim_end = mi.special = 0;
- }
- if (table_name)
- printf("// above tables should be %d bytes\n", extra_size);
- maxv = 0;
- for (i=0; i < stb_arr_len(index); ++i)
- if (index[i] > maxv && index[i] != t->dont_care)
- maxv = index[i];
- newtab.splittable = mi.trim_end;
- newtab.input_size = newtab.splittable ? size_for_max_number(maxv) : size_for_max_number_aligned(maxv);
- newtab.input = index;
- newtab.length = stb_arr_len(index);
- newtab.inherited_storage = t->inherited_storage + extra_size;
- newtab.fastpath = 0;
- newtab.depth = t->depth+1;
- stb_arr_free(indirect);
- stb_arr_free(packed);
- stb_arr_free(specials);
- return newtab;
- }
- result pack_table(table *t, size_t path, int min_storage)
- {
- int i;
- result best;
- best.size = t->inherited_storage + t->input_size * t->length;
- best.path = path;
- if ((int) t->inherited_storage > min_storage) {
- best.size = stb_max(best.size, t->inherited_storage);
- return best;
- }
- if (t->length <= 256 || t->depth >= 4) {
- //printf("%08x: %7d\n", best.path, best.size);
- return best;
- }
- path <<= 7;
- for (i=0; i < MODECOUNT * CLUSTERSIZECOUNT; ++i) {
- table newtab;
- result r;
- newtab = pack_for_mode(t, i, 0);
- r = pack_table(&newtab, path+i+1, min_storage);
- if (r.size < best.size)
- best = r;
- stb_arr_free(newtab.input);
- //printf("Size: %6d + %6d\n", newtab.inherited_storage, newtab.input_size * newtab.length);
- }
- return best;
- }
- int pack_table_by_modes(table *t, int *modes)
- {
- table s = *t;
- while (*modes > -1) {
- table newtab;
- newtab = pack_for_mode(&s, *modes, 0);
- if (s.input != t->input)
- stb_arr_free(s.input);
- s = newtab;
- ++modes;
- }
- return s.inherited_storage + s.input_size * s.length;
- }
- int strip_table(table *t, int exceptions)
- {
- uval terminal_value;
- int p = t->length-1;
- while (t->input[p] == t->dont_care)
- --p;
- terminal_value = t->input[p];
- while (p >= 0x10000) {
- if (t->input[p] != terminal_value && t->input[p] != t->dont_care) {
- if (exceptions)
- --exceptions;
- else
- break;
- }
- --p;
- }
- return p+1; // p is a character we must output
- }
- void optimize_table(table *t, char *table_name)
- {
- int modelist[3] = { 85, -1 };
- int modes[8];
- int num_modes = 0;
- int decent_size;
- result r;
- size_t path;
- table s;
- // strip tail end of table
- int orig_length = t->length;
- int threshhold = 0xffff;
- int p = strip_table(t, 2);
- int len_saved = t->length - p;
- if (len_saved >= threshhold) {
- t->length = p;
- while (p > 0x10000) {
- p = strip_table(t, 0);
- len_saved = t->length - p;
- if (len_saved < 0x10000)
- break;
- len_saved = orig_length - p;
- if (len_saved < threshhold)
- break;
- threshhold *= 2;
- }
- }
- t->depth = 1;
- // find size of table if we use path 86
- decent_size = pack_table_by_modes(t, modelist);
- #if 1
- // find best packing of remainder of table by exploring tree of packings
- r = pack_table(t, 0, decent_size);
- // use the computed 'path' to evaluate and output tree
- path = r.path;
- #else
- path = 86;//90;//132097;
- #endif
- while (path) {
- modes[num_modes++] = (path & 127) - 1;
- path >>= 7;
- }
- printf("// modes: %d\n", r.path);
- s = *t;
- while (num_modes > 0) {
- char name[256];
- sprintf(name, "%s_%d", table_name, num_modes+1);
- --num_modes;
- s = pack_for_mode(&s, modes[num_modes], name);
- }
- // output the final table as-is
- if (s.splittable)
- output_table_with_trims(table_name, "_1", s.input, s.length);
- else
- output_table(table_name, "_1", s.input, s.length, 0, NULL);
- }
- uval unicode_table[0x110000];
- typedef struct
- {
- uval lo,hi;
- } char_range;
- char_range get_range(char *str)
- {
- char_range cr;
- char *p;
- cr.lo = strtol(str, &p, 16);
- p = stb_skipwhite(p);
- if (*p == '.')
- cr.hi = strtol(p+2, NULL, 16);
- else
- cr.hi = cr.lo;
- return cr;
- }
- char *skip_semi(char *s, int count)
- {
- while (count) {
- s = strchr(s, ';');
- assert(s != NULL);
- ++s;
- --count;
- }
- return s;
- }
- int main(int argc, char **argv)
- {
- table t;
- uval maxv=0;
- int i,n=0;
- char **s = stb_stringfile("../../data/UnicodeData.txt", &n);
- assert(s);
- for (i=0; i < n; ++i) {
- if (s[i][0] == '#' || s[i][0] == '\n' || s[i][0] == 0)
- ;
- else {
- char_range cr = get_range(s[i]);
- char *t = skip_semi(s[i], 13);
- uval j, v;
- if (*t == ';' || *t == '\n' || *t == 0)
- v = 0;
- else {
- v = strtol(t, NULL, 16);
- if (v < 65536) {
- maxv = stb_max(v, maxv);
- for (j=cr.lo; j <= cr.hi; ++j) {
- unicode_table[j] = v;
- //printf("%06x => %06x\n", j, v);
- }
- }
- }
- }
- }
- t.depth = 0;
- t.dont_care = UVAL_DONT_CARE_DEFAULT;
- t.fallback = 0;
- t.fastpath = 256;
- t.inherited_storage = 0;
- t.has_sign = 0;
- t.splittable = 0;
- t.input = unicode_table;
- t.input_size = size_for_max_number(maxv);
- t.length = 0x110000;
- t.replace_fallback_with_codepoint = 1;
- optimize_table(&t, "stbu_upppercase");
- return 0;
- }
|