#!/usr/bin/python3
"""Generate lib/unicode/prop/uprop_get_scx.c from data/ScriptExtensions.

The generated C file implements uprop_get_scx() with a two-stage lookup
table: the code-point space is cut into fixed-size blocks, identical
blocks are stored once in ``stage2``, and ``stage1`` maps each block
number to its slot in ``stage2``.
"""

import contextlib
import math

from lib import *


# Length of the longest C initializer produced by parse(); genfile() uses
# it to align the columns of the stage-2 table.
longest = 0


def parse(file: str) -> list[str]:
    """Read a ScriptExtensions-style data file into a per-code-point list.

    Each data line has the form ``LO[..HI] ; script script ... # comment``.
    Blank lines and lines starting with ``#`` are skipped.

    Returns a list of 0x110000 strings indexed by code point.  Listed code
    points get a C initializer such as ``_(SC_BENG, SC_DEVA)``; all others
    keep the empty initializer ``{}``, which the emitted lookup treats as
    "no entry" through its null-pointer check.

    Side effect: raises the module-level ``longest`` to the length of the
    longest initializer seen.
    """
    global longest

    xs = ['{}'] * 0x110000
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip() or line[0] == '#':
                continue

            fields = line.split(';')
            bounds = [int(x, 16) for x in fields[0].strip().split('..')]
            scripts = fields[1].split('#')[0].strip().upper().split()
            # Built by concatenation rather than a nested-quote f-string so
            # the script also parses on interpreters older than 3.12.
            prop = '_(' + ', '.join(f'SC_{s}' for s in scripts) + ')'
            longest = max(longest, len(prop))

            # A single code point yields one bound, so bounds[0] and
            # bounds[-1] coincide and the range covers just that point.
            for cp in range(bounds[0], bounds[-1] + 1):
                xs[cp] = prop
    return xs


def _emit_stage1(blocks: list[tuple[str, ...]],
                 unique: list[tuple[str, ...]]) -> None:
    """Print the stage-1 array: one stage-2 slot number per block."""
    # Dict lookup instead of list.index() keeps this linear in the number
    # of blocks.
    slot = {blk: i for i, blk in enumerate(unique)}
    width = len(str(len(unique) - 1))

    # typename() comes from lib (not shown here); it is given the largest
    # slot number that has to be representable.
    print(f'static constexpr {typename(len(unique) - 1)} stage1[] = {{')
    for start in range(0, len(blocks), 16):
        row = blocks[start:start + 16]
        # Every row, including a short final one, ends with a newline so
        # the closing brace always sits on its own line.
        print('\t' + ' '.join(f'{slot[blk]:{width}d},' for blk in row))
    print('};')


def _emit_stage2(unique: list[tuple[str, ...]], blksize: int) -> None:
    """Print the stage-2 table: one row of initializers per unique block."""
    cell_width = longest + 1
    # columns() comes from lib (not shown here); its result is used as the
    # number of initializers printed per output line.
    ppc = columns(blksize, cell_width)
    rows = blksize // ppc

    print(f'static const struct uprop_sc_view stage2[][{blksize}] = {{')
    for blk in unique:
        lines = []
        for r in range(rows):
            parts = []
            for j in range(ppc):
                cell = blk[r * ppc + j]
                last_in_row = j == ppc - 1
                last_in_block = last_in_row and r == rows - 1
                text = cell
                if not last_in_block:
                    text += ','
                if not last_in_row:
                    # Pad so the next initializer starts on a column
                    # boundary.
                    text += ' ' * (cell_width - len(cell))
                parts.append(text)
            lines.append(('\t{' if r == 0 else '\t ') + ''.join(parts))
        print('\n'.join(lines) + '},')
    print('};')


def genfile(cs: list[tuple[str, ...]], blksize: int) -> None:
    """Print the complete generated C source to standard output.

    ``cs`` is the code-point table from parse() cut into blocks of
    ``blksize`` entries, in code-point order.  ``blksize`` is also the
    divisor used by the emitted lookup function.
    """
    # Unique blocks in first-seen order; dict.fromkeys keeps that order.
    unique = list(dict.fromkeys(cs))

    # NOTE(review): the first #include below has no header name.  The
    # "<...>" part appears to have been lost when this source was
    # extracted; presumably a standard header that declares the stage1
    # element type.  Confirm against the repository before running.
    print('''\
/* This file is autogenerated by gen/prop/scx; DO NOT EDIT. */

#include

#include "macros.h"
#include "unicode/prop.h"

#define CAST(...) (const enum uprop_sc []){__VA_ARGS__}
#define _(...) {CAST(__VA_ARGS__), lengthof(CAST(__VA_ARGS__))}

static constexpr enum uprop_sc fallback[] = {
\tSC_ZZZZ, SC_ADLM, SC_AGHB, SC_AHOM, SC_ARAB, SC_ARMI, SC_ARMN, SC_AVST,
\tSC_BALI, SC_BAMU, SC_BASS, SC_BATK, SC_BENG, SC_BHKS, SC_BOPO, SC_BRAH,
\tSC_BRAI, SC_BUGI, SC_BUHD, SC_CAKM, SC_CANS, SC_CARI, SC_CHAM, SC_CHER,
\tSC_CHRS, SC_COPT, SC_CPMN, SC_CPRT, SC_CYRL, SC_DEVA, SC_DIAK, SC_DOGR,
\tSC_DSRT, SC_DUPL, SC_EGYP, SC_ELBA, SC_ELYM, SC_ETHI, SC_GEOR, SC_GLAG,
\tSC_GONG, SC_GONM, SC_GOTH, SC_GRAN, SC_GREK, SC_GUJR, SC_GURU, SC_HANG,
\tSC_HANI, SC_HANO, SC_HATR, SC_HEBR, SC_HIRA, SC_HLUW, SC_HMNG, SC_HMNP,
\tSC_HRKT, SC_HUNG, SC_ITAL, SC_JAVA, SC_KALI, SC_KANA, SC_KAWI, SC_KHAR,
\tSC_KHMR, SC_KHOJ, SC_KITS, SC_KNDA, SC_KTHI, SC_LANA, SC_LAOO, SC_LATN,
\tSC_LEPC, SC_LIMB, SC_LINA, SC_LINB, SC_LISU, SC_LYCI, SC_LYDI, SC_MAHJ,
\tSC_MAKA, SC_MAND, SC_MANI, SC_MARC, SC_MEDF, SC_MEND, SC_MERC, SC_MERO,
\tSC_MLYM, SC_MODI, SC_MONG, SC_MROO, SC_MTEI, SC_MULT, SC_MYMR, SC_NAGM,
\tSC_NAND, SC_NARB, SC_NBAT, SC_NEWA, SC_NKOO, SC_NSHU, SC_OGAM, SC_OLCK,
\tSC_ORKH, SC_ORYA, SC_OSGE, SC_OSMA, SC_OUGR, SC_PALM, SC_PAUC, SC_PERM,
\tSC_PHAG, SC_PHLI, SC_PHLP, SC_PHNX, SC_PLRD, SC_PRTI, SC_RJNG, SC_ROHG,
\tSC_RUNR, SC_SAMR, SC_SARB, SC_SAUR, SC_SGNW, SC_SHAW, SC_SHRD, SC_SIDD,
\tSC_SIND, SC_SINH, SC_SOGD, SC_SOGO, SC_SORA, SC_SOYO, SC_SUND, SC_SYLO,
\tSC_SYRC, SC_TAGB, SC_TAKR, SC_TALE, SC_TALU, SC_TAML, SC_TANG, SC_TAVT,
\tSC_TELU, SC_TFNG, SC_TGLG, SC_THAA, SC_THAI, SC_TIBT, SC_TIRH, SC_TNSA,
\tSC_TOTO, SC_UGAR, SC_VAII, SC_VITH, SC_WARA, SC_WCHO, SC_XPEO, SC_XSUX,
\tSC_YEZI, SC_YIII, SC_ZANB, SC_ZINH, SC_ZYYY,
};
''')

    _emit_stage1(cs, unique)
    print()
    _emit_stage2(unique, blksize)
    print()

    # A stage-2 entry with a null pointer means the code point has no
    # explicit entry; the lookup then answers with the one-element slice of
    # `fallback` selected by uprop_get_sc(ch).
    print(f'''\
struct uprop_sc_view
uprop_get_scx(rune ch)
{{
\tstruct uprop_sc_view scv = stage2[stage1[ch / {blksize}]][ch % {blksize}];
\treturn scv.p == nullptr
\t\t? (struct uprop_sc_view){{fallback + uprop_get_sc(ch), 1}}
\t\t: scv;
}}''')


def main() -> None:
    """Pick the cheapest block size and write the generated C file."""
    # cwd_init(), powers_of_2(), chunks(), isize() and report_size() come
    # from lib (not shown here).
    cwd_init()
    xs = parse('data/ScriptExtensions')

    blksize = -1
    smallest = math.inf

    # Try each candidate block size and keep the one with the lowest
    # estimated table cost.
    for bs in powers_of_2():
        if bs > len(xs):
            break
        blocks = [tuple(x) for x in chunks(xs, bs)]
        unique = set(blocks)

        # NOTE(review): the stage-1 cost is scaled by isize() while the
        # stage-2 cost is a plain entry count; confirm the two are meant
        # to be summed in different units.
        sz_s1 = len(blocks) * isize(len(unique) - 1)
        sz_s2 = len(unique) * bs
        sz = sz_s1 + sz_s2

        if sz < smallest:
            smallest = sz
            blksize = bs

    blocks = [tuple(x) for x in chunks(xs, blksize)]
    with open('lib/unicode/prop/uprop_get_scx.c', 'w') as f:
        # redirect_stdout restores sys.stdout on exit, so nothing printed
        # afterwards goes to the already-closed output file.
        with contextlib.redirect_stdout(f):
            genfile(blocks, blksize)

    report_size(len(xs), smallest)


if __name__ == '__main__':
    main()