#!/usr/bin/python3
"""Generate the C source for uprop_get_ccc() as a two-stage lookup table.

Reads data/UnicodeData, maps the value in field 3 of every record to a
CCC_* enumerator name (presumably the Canonical_Combining_Class; confirm
against the enum uprop_ccc declaration), and writes
lib/unicode/prop/uprop_get_ccc.c containing:

  * stage1 -- one entry per block of code points, indexing into stage2;
  * stage2 -- the deduplicated blocks of CCC_* values.

The block size is chosen among powers of two to minimise the estimated
combined size of both tables.
"""

import contextlib
import math

from lib import *


# UnicodeData field-3 value -> suffix of the CCC_* enumerator emitted in C.
MAP = {
    '0': 'NR',
    '1': 'OV',
    '6': 'HANR',
    '7': 'NK',
    '8': 'KV',
    '9': 'VR',
    '10': '10',
    '11': '11',
    '12': '12',
    '13': '13',
    '14': '14',
    '15': '15',
    '16': '16',
    '17': '17',
    '18': '18',
    '19': '19',
    '20': '20',
    '21': '21',
    '22': '22',
    '23': '23',
    '24': '24',
    '25': '25',
    '26': '26',
    '27': '27',
    '28': '28',
    '29': '29',
    '30': '30',
    '31': '31',
    '32': '32',
    '33': '33',
    '34': '34',
    '35': '35',
    '36': '36',
    '84': '84',
    '91': '91',
    '103': '103',
    '107': '107',
    '118': '118',
    '122': '122',
    '129': '129',
    '130': '130',
    '132': '132',
    '133': '133',
    '200': 'ATBL',
    '202': 'ATB',
    '214': 'ATA',
    '216': 'ATAR',
    '218': 'BL',
    '220': 'B',
    '222': 'BR',
    '224': 'L',
    '226': 'R',
    '228': 'AL',
    '230': 'A',
    '232': 'AR',
    '233': 'DB',
    '234': 'DA',
    '240': 'IS',
}

# Length of the longest CCC_* name assigned by parse(); genfile() uses it
# to align the stage2 columns.
longest = 0


def parse(file: str) -> list[str]:
    """Return the CCC_* enumerator name for every code point.

    The result has 0x110000 entries, indexed by code point.  Code points
    not mentioned in *file* keep the default 'CCC_NR'.  Records whose name
    field contains 'First' / 'Last' delimit a range; every code point in
    the inclusive range receives the value of the 'Last' record.

    Side effect: raises the module-level ``longest`` to the length of the
    longest name assigned.

    Raises:
        ValueError: a 'Last' record appears without a preceding 'First'.
        KeyError: field 3 holds a value that is not a key of MAP.
    """
    global longest

    xs = ['CCC_NR'] * 0x110000
    range_start: int | None = None

    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split(';')
            cp = int(fields[0], 16)

            if 'First' in fields[1]:
                # Only remember where the range begins; the value is
                # taken from the matching 'Last' record.
                range_start = cp
                continue

            if 'Last' in fields[1]:
                if range_start is None:
                    raise ValueError(
                        f'{file}: range end U+{cp:04X} without a range start'
                    )
                targets = range(range_start, cp + 1)
                range_start = None
            else:
                targets = range(cp, cp + 1)

            name = f'CCC_{MAP[fields[3]]}'
            for i in targets:
                xs[i] = name
            if targets:
                longest = max(longest, len(name))

    return xs


def genfile(cs: list[tuple[str, ...]], blksize: int) -> None:
    """Print the generated C translation unit to standard output.

    Args:
        cs: the code-point table split into consecutive blocks of
            *blksize* CCC_* names each, in code-point order.
        blksize: number of entries per block.

    Relies on the module-level ``longest`` (set by parse()) for column
    alignment, and on typename() / columns() from lib.
    """
    blocks = cs
    # Deduplicate while keeping first-seen order, so stage2 indices are
    # stable; the dict gives O(1) block -> index lookups for stage1.
    unique = list(dict.fromkeys(blocks))
    index_of = {block: n for n, block in enumerate(unique)}

    # NOTE(review): the line breaks inside the emitted C templates below
    # were reconstructed; confirm against a previously generated file.
    print('''\
/* This file is autogenerated by gen/prop/ccc; DO NOT EDIT. */

#include "unicode/prop.h"
''')

    # stage1: 16 right-aligned indices per line.
    width = len(str(len(unique) - 1))
    print(f'static constexpr {typename(len(unique) - 1)} stage1[] = {{')
    for i, block in enumerate(blocks):
        lead = '\t' if i % 16 == 0 else ' '
        print(f'{lead}{index_of[block]:{width}d},', end='')
        if i % 16 == 15:
            print()
    print('};')
    print()

    # stage2: one brace-enclosed initialiser per unique block, laid out
    # in rows of `ppc` names padded to a common column width.
    ppc = columns(blksize, longest + 1)
    rows = blksize // ppc
    print(f'static constexpr enum uprop_ccc stage2[][{blksize}] = {{')
    for block in unique:
        lines = []
        for i in range(rows):
            cells = [block[i * ppc + j] for j in range(ppc)]
            # Every cell but the last in a row is "name," padded so the
            # next name starts `longest + 2` columns later.
            padded = [(cell + ',').ljust(longest + 2) for cell in cells[:-1]]
            # The very last name of the block takes no trailing comma.
            last = cells[-1] + (',' if i < rows - 1 else '')
            lead = '\t{' if i == 0 else '\t '
            lines.append(lead + ''.join(padded) + last)
        print('\n'.join(lines) + '},')
    print('};')
    print()

    print(f'''\
enum uprop_ccc
uprop_get_ccc(rune ch)
{{
\treturn stage2[stage1[ch / {blksize}]][ch % {blksize}];
}}''')


def main() -> None:
    """Pick the smallest table layout and write the generated C file."""
    cwd_init()

    # The output file is opened (and truncated) before parsing, and
    # everything below runs with stdout redirected into it; the context
    # managers close the file and restore stdout afterwards.
    with open('lib/unicode/prop/uprop_get_ccc.c', 'w', encoding='utf-8') as out, \
            contextlib.redirect_stdout(out):
        xs = parse('data/UnicodeData')

        blksize = -1
        smallest = math.inf

        for bs in powers_of_2():
            if bs > len(xs):
                break
            blocks = [tuple(x) for x in chunks(xs, bs)]
            distinct = set(blocks)

            sz_s1 = len(blocks) * isize(len(distinct) - 1)
            # NOTE(review): the factor 2 is presumably the byte size of
            # one stage2 entry; confirm against enum uprop_ccc.
            sz_s2 = len(distinct) * bs * 2
            sz = sz_s1 + sz_s2

            if sz < smallest:
                smallest = sz
                blksize = bs

        blocks = [tuple(x) for x in chunks(xs, blksize)]
        genfile(blocks, blksize)
        # Kept inside the redirected scope: this call ran with stdout
        # still redirected before this change.
        report_size(len(xs), smallest)


if __name__ == '__main__':
    main()