#!/usr/bin/python3
"""Generate ``include/unicode/_gbrk.h``.

Reads Unicode property data files, packs the grapheme-break related
properties of every code point into one byte, compresses the resulting
0x110000-entry array into a two-stage lookup table, and writes that table
(plus the accessor ``uprop_get_gbrk``) out as a C header.
"""

import contextlib
import math

# Supplies the project helpers used below: chunks, columns, cwd_init, isize,
# powers_of_2, report_size and typename.
from lib import *

# Value stored for each recognised property.  Plain names occupy the low
# 6 bits of an entry; the 'InCB; <value>' keys occupy the high 2 bits, so a
# single byte can carry one of each (see GBRK_PROP_LO / GBRK_PROP_HI in the
# generated header).
MAP = {
    'Control': 1,
    'Extended_Pictographic': 2,
    'Extend': 3,
    'L': 4,
    'LV': 5,
    'LVT': 6,
    'Prepend': 7,
    'Regional_Indicator': 8,
    'SpacingMark': 9,
    'T': 10,
    'V': 11,
    'ZWJ': 12,
    'InCB; Consonant': 0b0100_0000,
    'InCB; Extend': 0b1000_0000,
    'InCB; Linker': 0b1100_0000,
}

# Width, in characters, of the widest stage-2 entry.  Entries are OR-ed
# combinations of the MAP values above, so they never exceed three decimal
# digits.
longest = 3


def parse(*files: str) -> list[str]:
    """Build the per-code-point property table from Unicode data files.

    Each non-blank, non-comment line is expected to look like
    ``START[..END] ; PROPERTY [; VALUE] # comment``.  For ``InCB`` lines the
    lookup key is ``'InCB; VALUE'``; for every other line it is the property
    name itself.  Lines whose key is not in ``MAP`` are ignored.

    Args:
        files: Paths of the data files to read; all are merged into a
            single table.

    Returns:
        A list of 0x110000 decimal strings, one per code point, each the
        bitwise OR of the ``MAP`` values that apply to that code point
        (``'0'`` when none apply).
    """
    xs = [0] * 0x110000

    for file in files:
        # The Unicode data files are UTF-8; do not depend on the locale.
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip() or line[0] == '#':
                    continue

                parts = line.split(';')
                ranges = [int(x, 16) for x in parts[0].strip().split('..')]

                if parts[1].strip() == 'InCB':
                    p = 'InCB; ' + parts[2].split('#')[0].strip()
                else:
                    p = parts[1].split('#')[0].strip()

                if p not in MAP:
                    continue

                # A lone code point yields a one-element list, so first and
                # last coincide and the range covers exactly that point.
                for i in range(ranges[0], ranges[-1] + 1):
                    xs[i] |= MAP[p]

    return list(map(str, xs))


def genfile(cs: list[tuple[str, ...]], blksize: int) -> None:
    """Print the generated C header to standard output.

    Args:
        cs: The full table split into blocks of ``blksize`` entries, in
            code-point order.  Duplicate blocks are expected; they are
            stored once in ``stage2`` and referenced by index from
            ``stage1``.
        blksize: Number of entries per block.
    """
    blocks = cs
    # dict.fromkeys() de-duplicates while keeping first-seen order, so the
    # stage-2 layout is deterministic from run to run.
    unique = list(dict.fromkeys(blocks))
    # Block -> position in `unique`; avoids a linear list.index() search for
    # every stage-1 entry.
    index_of = {blk: n for n, blk in enumerate(unique)}

    # NOTE(review): the operand of the first #include was missing from the
    # source under review; <stdint.h> is restored here because the header
    # uses uint8_t - confirm against the project's original template.
    print('''\
/* This file is autogenerated by gen/string/gbrk; DO NOT EDIT. */

#ifndef MLIB_UNICODE__GBRK_H
#define MLIB_UNICODE__GBRK_H

#include <stdint.h>

#include "_attrs.h"
#include "_rune.h"

#define GBRK_PROP_HI(x) ((x) >> 6)
#define GBRK_PROP_LO(x) ((x) & 63)

enum uprop_gbrk_indc {
	GBRK_INDC_CNSNT = 1, /* Consonant */
	GBRK_INDC_EXT,       /* Extend */
	GBRK_INDC_LNK,       /* Linker */
};

enum uprop_gbrk {
	GBRK_XX = 0,   /* Other */
	GBRK_CN,       /* Control */
	GBRK_EXT_PICT, /* Extended Pictographic */
	GBRK_EX,       /* Extend */
	GBRK_HST_L,    /* L */
	GBRK_HST_LV,   /* LV */
	GBRK_HST_LVT,  /* LVT */
	GBRK_PP,       /* Prepend */
	GBRK_RI,       /* Regional Indicator */
	GBRK_SM,       /* SpacingMark */
	GBRK_HST_T,    /* T */
	GBRK_HST_V,    /* V */
	GBRK_ZWJ,      /* ZWJ */

	_GBRK_LO_CNT,
};

static_assert(_GBRK_LO_CNT - 1 <= 0b0011'1111,
              "2 bits are required to pack Indic syllables");
''')

    # Stage 1: for every block of code points, the index of its (shared)
    # stage-2 row; sixteen indices per output line.
    width = len(str(len(unique) - 1))
    print(f'static constexpr {typename(len(unique) - 1)} stage1[] = {{')
    for i, blk in enumerate(blocks):
        lead = '\t' if i % 16 == 0 else ' '
        print(f'{lead}{index_of[blk]:{width}d},', end='')
        if i % 16 == 15:
            print()
    print('};')
    print()

    # Stage 2: the distinct blocks themselves, `ppc` entries per output line.
    ppc = columns(blksize, longest + 1)
    rows = blksize // ppc
    print(f'static constexpr uint8_t stage2[][{blksize}] = {{')
    for blk in unique:
        for i in range(rows):
            print('\t{' if i == 0 else '\t ', end='')
            for j in range(ppc):
                cell = blk[i * ppc + j]
                print(cell, end='')
                # Comma after every entry except the very last of the block.
                if i < rows - 1 or j < ppc - 1:
                    print(',', end='')
                # Pad so that the columns line up, except at end of line.
                if j < ppc - 1:
                    print(' ' * (longest + 1 - len(cell)), end='')
            if i < rows - 1:
                print()
        print('},')
    print('};')
    print()

    print(f'''\
[[_mlib_pure, _mlib_inline]] static void
uprop_get_gbrk(enum uprop_gbrk *x, enum uprop_gbrk_indc *y, rune ch)
{{
	uint8_t z = stage2[stage1[ch / {blksize}]][ch % {blksize}];
	*x = GBRK_PROP_LO(z);
	*y = GBRK_PROP_HI(z);
}}

#endif /* !MLIB_UNICODE__GBRK_H */''')


def main() -> None:
    """Pick the block size giving the smallest table and write the header."""
    cwd_init()

    xs = parse(
        'data/GraphemeBreakProperty',
        'data/DerivedCoreProperties',
        'data/emoji-data',
    )

    blksize = -1
    smallest = math.inf

    # Try each candidate block size and keep the one that minimises the
    # combined size estimate of both stages.
    for bs in powers_of_2():
        if bs > len(xs):
            break

        blocks = [tuple(x) for x in chunks(xs, bs)]
        unique = set(blocks)

        sz_s1 = len(blocks) * isize(len(unique) - 1)
        sz_s2 = len(unique) * bs
        sz = sz_s1 + sz_s2

        if sz < smallest:
            smallest = sz
            blksize = bs

    Cs = [tuple(x) for x in chunks(xs, blksize)]
    # redirect_stdout() puts the real stdout back when the block exits, so
    # sys.stdout is never left pointing at the closed header file.
    with open('include/unicode/_gbrk.h', 'w') as f, \
            contextlib.redirect_stdout(f):
        genfile(Cs, blksize)

    report_size(len(xs), smallest)


if __name__ == '__main__':
    main()