#!/usr/bin/python3
"""Generate include/unicode/_wbrk.h, a two-stage Word_Break lookup table.

The Unicode data files are parsed into one enumerator name per code point.
That flat table is then split into fixed-size blocks; identical blocks are
stored once (stage 2) and referenced by index (stage 1).
"""

import contextlib
import math

from lib import *

# Property names as spelled in the data files -> enumerator suffix.  Records
# whose property is not listed here are ignored by parse().
MAP = {
    'ALetter': 'LE',
    'CR': 'CR',
    'Double_Quote': 'DQ',
    'E_Base': 'EB',
    'E_Base_GAZ': 'EBG',
    'E_Modifier': 'EM',
    'Extended_Pictographic': 'EXTPICT',
    'Extend': 'EXTEND',
    'ExtendNumLet': 'EX',
    'Format': 'FO',
    'Glue_After_Zwj': 'GAZ',
    'Hebrew_Letter': 'HL',
    'Katakana': 'KA',
    'LF': 'LF',
    'MidLetter': 'ML',
    'MidNumLet': 'MB',
    'MidNum': 'MN',
    'Newline': 'NL',
    'Numeric': 'NU',
    'Other': 'XX',
    'Regional_Indicator': 'RI',
    'Single_Quote': 'SQ',
    'WSegSpace': 'WSEGSPACE',
    'ZWJ': 'ZWJ',
}

# Length of the longest enumerator name stored by parse(); genfile() uses it
# to align the columns of the stage-2 table.
longest = -1

# A code point can be both ALetter and Extended_Pictographic.  The generated
# enum has a dedicated enumerator for that case, so the two must be merged
# rather than letting whichever record comes last win.
_COMBINED = 'WBRK_EXTPICT_LE'
_COMBINABLE = frozenset({'WBRK_LE', 'WBRK_EXTPICT'})


def _combine(old: str, new: str) -> str:
    """Return the enumerator to store when *new* is assigned over *old*."""
    if new in _COMBINABLE:
        if old == _COMBINED or (old in _COMBINABLE and old != new):
            return _COMBINED
    return new


def parse(*files: str) -> list[str]:
    """Build the per-code-point enumerator table from Unicode data files.

    Each data line has the form ``XXXX[..YYYY] ; Property # comment``.  Blank
    lines, lines starting with ``#`` and records whose property is not in
    MAP are skipped.  Files are processed in the order given.

    Returns a list of 0x110000 enumerator names (``'WBRK_XX'`` for code
    points no record mentions) and updates the module-level ``longest``.
    """
    global longest

    xs = ['WBRK_XX'] * 0x110000

    for file in files:
        # The Unicode data files are UTF-8 (the emoji data carries literal
        # emoji in its comments); do not depend on the locale's encoding.
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip() or line.startswith('#'):
                    continue

                parts = line.split(';')
                name = parts[1].split('#')[0].strip()
                if name not in MAP:
                    continue
                prop = 'WBRK_' + MAP[name]
                longest = max(longest, len(prop))

                bounds = [int(x, 16) for x in parts[0].strip().split('..')]
                for cp in range(bounds[0], bounds[-1] + 1):
                    merged = _combine(xs[cp], prop)
                    if merged is not prop:
                        longest = max(longest, len(merged))
                    xs[cp] = merged

    return xs


def genfile(cs: list[tuple[str, ...]], blksize: int) -> None:
    """Print the C header for the given blocks to standard output.

    cs       the full table split into blocks of *blksize* enumerator
             names, in code-point order (duplicates included)
    blksize  number of code points per block

    Relies on the module-level ``longest`` having been set by parse().
    """
    blocks = cs
    # Deduplicate while keeping first-seen order so that stage-1 indices are
    # stable from run to run.
    unique = list(dict.fromkeys(blocks))
    index_of = {blk: i for i, blk in enumerate(unique)}

    print('''\
/* This file is autogenerated by gen/string/wbrk; DO NOT EDIT. */

#ifndef MLIB_UNICODE__WBRK_H
#define MLIB_UNICODE__WBRK_H

#include <stdint.h>

#include "_attrs.h"
#include "_rune.h"

enum uprop_wbrk : uint_least8_t {
\tWBRK_XX = 0, /* Other */
\tWBRK_CR, /* CR */
\tWBRK_DQ, /* Double Quote */
\tWBRK_EB, /* E Base */
\tWBRK_EBG, /* E Base GAZ */
\tWBRK_EM, /* E Modifier */
\tWBRK_EOT, /* End of Text */
\tWBRK_EX, /* ExtendNumLet */
\tWBRK_EXTEND, /* Extend */
\tWBRK_EXTPICT, /* Extended Pictographic */
\tWBRK_EXTPICT_LE, /* Extended Pictographic and ALetter */
\tWBRK_FO, /* Format */
\tWBRK_GAZ, /* Glue After Zwj */
\tWBRK_HL, /* Hebrew Letter */
\tWBRK_KA, /* Katakana */
\tWBRK_LE, /* ALetter */
\tWBRK_LF, /* LF */
\tWBRK_MB, /* MidNumLet */
\tWBRK_ML, /* MidLetter */
\tWBRK_MN, /* MidNum */
\tWBRK_NL, /* Newline */
\tWBRK_NU, /* Numeric */
\tWBRK_RI, /* Regional Indicator */
\tWBRK_SQ, /* Single Quote */
\tWBRK_WSEGSPACE, /* WSegSpace */
\tWBRK_ZWJ, /* ZWJ */
};
''')

    # Stage 1: for every block of the full table, the index of its
    # deduplicated copy in stage 2; sixteen entries per output line.
    width = len(str(len(unique) - 1))
    print(f'static constexpr {typename(len(unique) - 1)} stage1[] = {{')
    for i, blk in enumerate(blocks):
        lead = '\t' if i % 16 == 0 else ' '
        print(f'{lead}{index_of[blk]:{width}d},', end='')
        if i % 16 == 15:
            print()
    print('};')
    print()

    # Stage 2: the unique blocks themselves, laid out `ppc` names per line.
    # NOTE(review): presumably columns() returns a divisor of blksize;
    # otherwise the trailing names of each block would not be printed.
    ppc = columns(blksize, longest + 1)
    rows = blksize // ppc
    print(f'static constexpr enum uprop_wbrk stage2[][{blksize}] = {{')
    for blk in unique:
        for i in range(rows):
            print('\t{' if i == 0 else '\t ', end='')
            for j in range(ppc):
                name = blk[i * ppc + j]
                print(name, end='')
                # Every name except the very last of the block gets a comma.
                if i < rows - 1 or j < ppc - 1:
                    print(',', end='')
                # Pad to the column width, except at the end of a line.
                if j < ppc - 1:
                    print(' ' * (longest + 1 - len(name)), end='')
            if i < rows - 1:
                print()
        print('},')
    print('};')
    print()

    print(f'''\
[[_mlib_pure, _mlib_inline]]
static enum uprop_wbrk
uprop_get_wbrk(rune ch)
{{
\treturn stage2[stage1[ch / {blksize}]][ch % {blksize}];
}}

#endif /* !MLIB_UNICODE__WBRK_H */''')


def main() -> None:
    """Pick the block size with the smallest tables and write the header."""
    cwd_init()
    xs = parse('data/WordBreakProperty', 'data/emoji-data')

    blksize = -1
    smallest = math.inf

    for bs in powers_of_2():
        if bs > len(xs):
            break
        blocks = [tuple(x) for x in chunks(xs, bs)]
        unique = set(blocks)

        # Stage 1 holds one index per block; stage 2 holds `bs` entries for
        # every distinct block.
        sz_s1 = len(blocks) * isize(len(unique) - 1)
        sz_s2 = len(unique) * bs
        sz = sz_s1 + sz_s2

        if sz < smallest:
            smallest = sz
            blksize = bs

    blocks = [tuple(x) for x in chunks(xs, blksize)]
    # redirect_stdout restores sys.stdout on exit, so nothing that runs
    # afterwards is left writing to a closed file.
    with open('include/unicode/_wbrk.h', 'w') as f, \
            contextlib.redirect_stdout(f):
        genfile(blocks, blksize)

    report_size(len(xs), smallest)


if __name__ == '__main__':
    main()