#!/usr/bin/python3
#
# gen/prop/ccc -- generate lib/unicode/prop/uprop_get_ccc.c
#
# Reconstructed from commit 34c55c4d07af131c9da06c367ac2958a6090f2a3
# ("Add more 2-stage lookup tables", Thomas Voss, 2024-04-30), which
# replaced the earlier gawk-based generator (sorted range table plus binary
# search) with this two-stage lookup-table generator.

import contextlib
import math

from lib import *


# Canonical_Combining_Class values that have a symbolic name in the
# generated 'enum uprop_ccc' (each is emitted with a 'CCC_' prefix).
_NAMED_CLASSES = {
    '0': 'NR',
    '1': 'OV',
    '6': 'HANR',
    '7': 'NK',
    '8': 'KV',
    '9': 'VR',
    '200': 'ATBL',
    '202': 'ATB',
    '214': 'ATA',
    '216': 'ATAR',
    '218': 'BL',
    '220': 'B',
    '222': 'BR',
    '224': 'L',
    '226': 'R',
    '228': 'AL',
    '230': 'A',
    '232': 'AR',
    '233': 'DB',
    '234': 'DA',
    '240': 'IS',
}

# Classes that are only known by their number (CCC_10, CCC_11, ...).
_NUMERIC_CLASSES = (
    *range(10, 37),
    84, 91, 103, 107, 118, 122, 129, 130, 132, 133,
)

# Maps the textual class number found in the data file to the suffix of the
# corresponding enumerator name.
MAP = _NAMED_CLASSES | {str(n): str(n) for n in _NUMERIC_CLASSES}

# Size in bytes assumed for one 'enum uprop_ccc' element when estimating the
# size of stage2.  NOTE(review): assumes the enum is stored as a 4-byte
# integer -- confirm against the C declaration.
ENUM_SIZE = 4

DEFAULT_NAME = 'CCC_NR'
CODEPOINT_COUNT = 0x110000
INPUT_PATH = 'data/UnicodeData'
OUTPUT_PATH = 'lib/unicode/prop/uprop_get_ccc.c'

# Length of the longest enumerator name seen so far; set by parse() and read
# by genfile() to align the columns of the stage2 table.
longest = 0


def parse(file: str) -> list[str]:
    """Read a UnicodeData-style file and return one name per code point.

    The result has 0x110000 entries; entry N is the 'CCC_*' enumerator name
    for code point N.  Code points not listed in the file keep the default
    'CCC_NR'.  A pair of records whose names contain 'First' and 'Last'
    assigns the class of the 'Last' record to the whole inclusive range.

    As a side effect the module-level 'longest' is raised to the length of
    the longest name in the table (including the default).

    Raises:
        ValueError: a 'Last' record appears without a preceding 'First'.
        KeyError: a record carries a class number that is not in MAP.
    """
    global longest

    xs = [DEFAULT_NAME] * CODEPOINT_COUNT
    # The default name is part of the table too, so it has to be accounted
    # for even when the data file never assigns it explicitly.
    longest = max(longest, len(DEFAULT_NAME))

    range_start = None
    with open(file, 'r', encoding='utf-8') as f:
        for lineno, line in enumerate(f, 1):
            if not line.strip():
                continue
            parts = line.split(';')
            codepoint = int(parts[0], 16)

            if 'First' in parts[1]:
                range_start = codepoint
                continue

            name = f'CCC_{MAP[parts[3]]}'
            longest = max(longest, len(name))

            if 'Last' in parts[1]:
                if range_start is None:
                    raise ValueError(
                        f'{file}:{lineno}: range end without range start'
                    )
                for i in range(range_start, codepoint + 1):
                    xs[i] = name
                range_start = None
            else:
                xs[codepoint] = name
    return xs


def genfile(cs: list[tuple[str, ...]], blksize: int) -> None:
    """Print the C source of the two-stage lookup table to stdout.

    Args:
        cs: the per-code-point names split into consecutive blocks of
            'blksize' entries each.
        blksize: number of entries per block.

    stage1 maps a block number to a row of stage2; stage2 holds each distinct
    block exactly once, in order of first appearance.
    """
    blocks = cs
    uniq = list(dict.fromkeys(blocks))
    # Precompute block -> row so that emitting stage1 is linear in the number
    # of blocks instead of doing a list search per block.
    row_of = {blk: row for row, blk in enumerate(uniq)}

    print('''\
/* This file is autogenerated by gen/prop/ccc; DO NOT EDIT. */

#include "unicode/prop.h"
''')

    # NOTE(review): typename() comes from lib; presumably it returns the
    # narrowest C integer type able to hold its argument -- confirm.
    width = len(str(len(uniq) - 1))
    print(f'static constexpr {typename(len(uniq) - 1)} stage1[] = {{')
    for i, blk in enumerate(blocks):
        sep = '\t' if i % 16 == 0 else ' '
        print(f'{sep}{row_of[blk]:{width}d},', end='')
        if i % 16 == 15:
            print()
    if len(blocks) % 16:
        # Terminate a partially filled last line before the closing brace.
        print()
    print('};')

    print()

    # NOTE(review): columns() comes from lib; it is used here as the number
    # of entries printed per output line -- confirm.
    ppc = columns(blksize, longest + 1)
    rows = blksize // ppc
    print(f'static constexpr enum uprop_ccc stage2[][{blksize}] = {{')
    for blk in uniq:
        for i in range(rows):
            cells = blk[i * ppc:(i + 1) * ppc]
            last_row = i == rows - 1
            # Every cell but the last in a line is followed by a comma and
            # padded so that the columns line up.
            text = ''.join(f'{c},'.ljust(longest + 2) for c in cells[:-1])
            text += cells[-1] + ('' if last_row else ',')
            print(('\t{' if i == 0 else '\t ') + text,
                  end='' if last_row else '\n')
        print('},')
    print('};')

    print()

    print(f'''\
enum uprop_ccc
uprop_get_ccc(rune ch)
{{
\treturn stage2[stage1[ch / {blksize}]][ch % {blksize}];
}}''')


def main() -> None:
    """Pick the block size with the smallest tables and write the C file."""
    cwd_init()
    # Everything printed from here on goes to the generated file; the file is
    # flushed and closed, and stdout restored, when the block is left.
    with open(OUTPUT_PATH, 'w') as out, contextlib.redirect_stdout(out):
        xs = parse(INPUT_PATH)

        blksize = -1
        smallest = math.inf

        for bs in powers_of_2():
            if bs > len(xs):
                break
            blocks = [tuple(x) for x in chunks(xs, bs)]
            uniq = set(blocks)

            # NOTE(review): isize() comes from lib; presumably the byte size
            # of the integer type needed for a stage1 entry -- confirm.
            sz_s1 = len(blocks) * isize(len(uniq) - 1)
            sz_s2 = len(uniq) * bs * ENUM_SIZE
            sz = sz_s1 + sz_s2

            if sz < smallest:
                smallest = sz
                blksize = bs

        blocks = [tuple(x) for x in chunks(xs, blksize)]
        genfile(blocks, blksize)

        report_size(len(xs), smallest)


if __name__ == '__main__':
    main()