diff options
Diffstat (limited to 'gen/string/gbrk')
-rwxr-xr-x | gen/string/gbrk | 243 |
1 files changed, 160 insertions, 83 deletions
diff --git a/gen/string/gbrk b/gen/string/gbrk index e0acca7..8397e90 100755 --- a/gen/string/gbrk +++ b/gen/string/gbrk @@ -1,97 +1,174 @@ -#!/bin/sh +#!/usr/bin/python3 + +import math + +from lib import * + + +MAP = { + 'Control': 1, + 'Extended_Pictographic': 2, + 'Extend': 3, + 'L': 4, + 'LV': 5, + 'LVT': 6, + 'Prepend': 7, + 'Regional_Indicator': 8, + 'SpacingMark': 9, + 'T': 10, + 'V': 11, + 'ZWJ': 12, + + 'InCB; Consonant': 0b0100_0000, + 'InCB; Extend': 0b1000_0000, + 'InCB; Linker': 0b1100_0000, +} + +longest = 3 + +def parse(*files: str) -> list[bool]: + global longest + + xs = [0] * 0x110000 + + lines = [] + for file in files: + with open(file, 'r') as f: + lines.extend(f.readlines()) + + for line in lines: + if len(line.strip()) == 0 or line[0] == '#': + continue + + parts = line.split(';') + ranges = [int(x, 16) for x in parts[0].strip().split('..')] + + if parts[1].strip() == 'InCB': + p = 'InCB; ' + parts[2].split('#')[0].strip() + else: + p = parts[1].split('#')[0].strip() + if p not in MAP: + continue + + for i in range(ranges[0], ranges[len(ranges) - 1] + 1): + xs[i] |= MAP[p] + return list(map(str, xs)) -set -e -cd "${0%/*}/../.." -exec >include/unicode/_gbrk.h +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: + Cs = cs + cs = list(dict.fromkeys(Cs)) -cat <<C + print('''\ /* This file is autogenerated by gen/string/gbrk; DO NOT EDIT. */ #ifndef MLIB_UNICODE__GBRK_H #define MLIB_UNICODE__GBRK_H -/* clang-format off */ +#include <stdint.h> +#include "_attrs.h" #include "_rune.h" -typedef enum { - GBP_OTHER = 0, - - GBP_CTRL = 1 << 0, /* Control */ - GBP_EXT = 1 << 1, /* Extend */ - GBP_PIC = 1 << 2, /* Extended_Pictographic */ - GBP_PREP = 1 << 3, /* Prepend */ - GBP_RI = 1 << 4, /* Regional_Indicator */ - GBP_SM = 1 << 5, /* SpacingMark */ - GBP_ZWJ = 1 << 6, /* ZWJ */ - - GBP_HNGL_L = 1 << 7, /* Hangul L */ - GBP_HNGL_LV = 1 << 8, /* Hangul LV */ - GBP_HNGL_LVT = 1 << 9, /* Hangul LVT */ - GBP_HNGL_T = 1 << 10, /* Hangul T */ - GBP_HNGL_V = 1 << 11, /* Hangul V */ - - GBP_INDC_CNSNT = 1 << 12, /* Indic Consonant */ - GBP_INDC_EXT = 1 << 13, /* Indic Extend */ - GBP_INDC_LNK = 1 << 14, /* Indic Linker */ -} gbrk_prop; - -static const struct { - rune lo, hi; - gbrk_prop val; -} gbrk_prop_tbl[] = { -C - -gawk ' -BEGIN { - FS = "( *#.*| +; +)" - map["Control"] = "CTRL" - map["Extend"] = "EXT" - map["Extended_Pictographic"] = "PIC" - map["Prepend"] = "PREP" - map["Regional_Indicator"] = "RI" - map["SpacingMark"] = "SM" - map["ZWJ"] = "ZWJ" - - map["L"] = "HNGL_L" - map["LV"] = "HNGL_LV" - map["LVT"] = "HNGL_LVT" - map["T"] = "HNGL_T" - map["V"] = "HNGL_V" - - map["InCB; Consonant"] = "INDC_CNSNT" - map["InCB; Extend"] = "INDC_EXT" - map["InCB; Linker"] = "INDC_LNK" -} +#define GBRK_PROP_HI(x) ((x) >> 6) +#define GBRK_PROP_LO(x) ((x) & 63) -map[$2] { - n = split($1, a, /\.\./) - lo = strtonum("0X" a[1]) - hi = strtonum("0X" a[n]) - - for (i = lo; i <= hi; i++) { - s = "GBP_" map[$2] - props[i] = props[i] ? props[i] " | " s : s - } -} - -END { - for (i = 0; i <= 0x10FFFF; i++) { - if (!props[i]) - continue - lo = i - while (props[lo] == props[i + 1]) - i++ - printf "\t{0x%06X, 0x%06X, %s},\n", lo, i, props[lo] - } -} -' data/GraphemeBreakProperty \ - data/DerivedCoreProperties \ - data/emoji-data \ -| sort +enum uprop_gbrk_indc { + GBRK_INDC_CNSNT = 1, /* Consonant */ + GBRK_INDC_EXT, /* Extend */ + GBRK_INDC_LNK, /* Linker */ +}; -cat <<C +enum uprop_gbrk { + GBRK_XX = 0, /* Other */ + GBRK_CN, /* Control */ + GBRK_EXT_PICT, /* Extended Pictographic */ + GBRK_EX, /* Extend */ + GBRK_HST_L, /* L */ + GBRK_HST_LV, /* LV */ + GBRK_HST_LVT, /* LVT */ + GBRK_PP, /* Prepend */ + GBRK_RI, /* Regional Indicator */ + GBRK_SM, /* SpacingMark */ + GBRK_HST_T, /* T */ + GBRK_HST_V, /* V */ + GBRK_ZWJ, /* ZWJ */ + _GBRK_LO_CNT, }; -#endif /* !MLIB_UNICODE__GBRK_H */ -C +static_assert(_GBRK_LO_CNT - 1 <= 0b0011'1111, + "2 bits are required to pack Indic syllables"); +''') + + print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') + for i, c in enumerate(Cs): + print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') + if i % 16 == 15: + print() + print('};') + + print() + + ppc = columns(blksize, longest + 1) + print(f'static constexpr uint8_t stage2[][{blksize}] = {{') + for c in cs: + for i in range(blksize // ppc): + print('\t{' if i == 0 else '\t ', end='') + for j in range(ppc): + print(c[i*ppc + j], end='') + if i < blksize // ppc - 1 or j < ppc - 1: + print(',', end='') + if j < ppc - 1: + print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') + if i < blksize // ppc - 1: + print() + print('},') + print('};') + + print() + + print(f'''\ +[[_mlib_pure, _mlib_inline]] +static void +uprop_get_gbrk(enum uprop_gbrk *x, enum uprop_gbrk_indc *y, rune ch) +{{ + uint8_t z = stage2[stage1[ch / {blksize}]][ch % {blksize}]; + *x = GBRK_PROP_LO(z); + *y = GBRK_PROP_HI(z); +}} + +#endif /* !MLIB_UNICODE__GBRK_H */''') + +def main() -> None: + cwd_init() + xs = parse( + 'data/GraphemeBreakProperty', + 'data/DerivedCoreProperties', + 'data/emoji-data', + ) + + blksize = -1 + smallest = math.inf + + for bs in powers_of_2(): + if bs > len(xs): + break + Cs = [tuple(x) for x in chunks(xs, bs)] + cs = set(Cs) + + sz_s1 = len(Cs) * isize(len(cs) - 1) + sz_s2 = len(cs) * bs + sz = sz_s1 + sz_s2 + + if sz < smallest: + smallest = sz + blksize = bs + + Cs = [tuple(x) for x in chunks(xs, blksize)] + with open('include/unicode/_gbrk.h', 'w') as f: + sys.stdout = f + genfile(Cs, blksize) + + report_size(len(xs), smallest) + +if __name__ == '__main__': + main() |