diff options
| Field | Value | Date |
|---|---|---|
| author | Thomas Voss <mail@thomasvoss.com> | 2024-04-28 06:15:55 +0200 |
| committer | Thomas Voss <mail@thomasvoss.com> | 2024-04-28 06:15:55 +0200 |
| commit | 4ea2dd117e656f950c41f9954bd593c313e34ee2 (patch) | |
| tree | d239c3426240bb886f129bccf901b0fc970c6073 /gen | |
| parent | 50787ecf06854f99eaf4b966abd11d23554bd221 (diff) | |
Implement boolean props using 2-stage lookup
Diffstat (limited to 'gen')
| Mode | File | Lines changed |
|---|---|---|
| -rwxr-xr-x | gen/prop/bool-props | 5 |
| -rw-r--r-- | gen/prop/bool-props.awk | 79 |
| -rwxr-xr-x | gen/prop/bool-props.py | 148 |
3 files changed, 150 insertions, 82 deletions
diff --git a/gen/prop/bool-props b/gen/prop/bool-props index b98b9d8..0a8330f 100755 --- a/gen/prop/bool-props +++ b/gen/prop/bool-props @@ -79,9 +79,8 @@ ri=Regional_Indicator gen() { local p=${1%%=*} - gawk -M -v prop=${1#*=} -v short=$p \ - -f gen/prop/bool-props.awk data/$2 \ - >lib/unicode/prop/uprop_is_${p}.c + gen/prop/bool-props.py ${1#*=} $p data/$2 \ + >lib/unicode/prop/uprop_is_$p.c printf 'DONE uprop_is_%s()\n' $p >&2 } diff --git a/gen/prop/bool-props.awk b/gen/prop/bool-props.awk deleted file mode 100644 index 6362dea..0000000 --- a/gen/prop/bool-props.awk +++ /dev/null @@ -1,79 +0,0 @@ -BEGIN { - FS = "( *#.*| +; +)" - - print "/* This file is autogenerated by gen/prop/bool-props; DO NOT EDIT. */" - print "" - print "#include \"_bsearch.h\"" - print "#include \"bitset.h\"" - print "#include \"rune.h\"" - print "#include \"unicode/prop.h\"" - print "" - print "/* clang-format off */" - print "" -} - -$2 == prop || (prop == "Indic_Conjunct_Break" && $2 ~ /InCB;/) { - n = split($1, a, /\.\./) - lo = strtonum("0x" a[1]) - hi = strtonum("0x" a[n]) - - for (i = lo; i <= hi; i++) - xs[i] = 1 -} - -END { - for (i = 0; i <= 0xFF; i++) { - if (xs[i]) - mask = or(mask, lshift(1, i)) - } - - if (mask > 0) { - print "static constexpr bitset(bs, LATIN1_MAX) = {" - for (i = 0; i < 32; i++) { - if (i % 8 == 0) - printf "\t" - printf "0x%02X,", and(rshift(mask, 8 * i), 0xFF) - printf((i % 8 == 7) ? 
"\n" : " ") - } - print "};" - print "" - } - - for (i = 0x100; i <= 0x10FFFF; i++) { - if (xs[i]) { - need_big_lookup = 1 - break - } - } - - if (need_big_lookup) { - print "static const struct {" - print "\trune lo, hi;" - print "} lookup[] = {" - - for (i = 0x100; i <= 0x10FFFF; i++) { - if (!xs[i]) - continue - lo = i - while (xs[i + 1]) - i++ - printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X)},\n", lo, i - } - - print "};" - print "" - print "_MLIB_DEFINE_BSEARCH_CONTAINS(lookup)" - print "" - } - - print "bool" - printf "uprop_is_%s(rune ch)\n", short - print "{" - if (mask > 0 && need_big_lookup) - print "\treturn ch <= LATIN1_MAX ? TESTBIT(bs, ch) : mlib_lookup_contains(ch);" - else if (need_big_lookup) - print "\treturn mlib_lookup_contains(ch);" - else - print "\treturn ch <= LATIN1_MAX && TESTBIT(bs, ch);" - print "}" -} diff --git a/gen/prop/bool-props.py b/gen/prop/bool-props.py new file mode 100755 index 0000000..a913904 --- /dev/null +++ b/gen/prop/bool-props.py @@ -0,0 +1,148 @@ +#!/usr/bin/python3 + +import functools +import math +import sys +from typing import Generator + + +def chunks[T](xs: list[T], n: int) -> Generator[list[T], None, None]: + for i in range(0, len(xs), n): + yield xs[i:i + n] + +def powers_of_2() -> Generator[int, None, None]: + i = 0 + while True: + yield 2 ** i + i += 1 + +def bytes_per_col(n: int) -> int: + xs = list(set(functools.reduce(list.__add__, ( + [i, n // i] for i in range(1, int(n ** 0.5) + 1) if n % i == 0) + ))) + for x in sorted(xs, reverse=True): + y = 5 + y += x * 5 + y += x - 1 + if y <= 80: + return x + + raise ValueError + +def isize(x: int) -> int: + if x < 256: + return 1 + if x < 65535: + return 2 + if x < 4294967295: + return 3 + if x < 18446744073709551615: + return 4 + raise ValueError + +def typename(x: int) -> str: + if x < 256: + return "uint8_t" + if x < 65535: + return "uint16_t" + if x < 4294967295: + return "uint32_t" + if x < 18446744073709551615: + return "uint64_t" + raise ValueError + +def 
parse(file: str) -> list[bool]: + xs = [False] * 0x110000 + if sys.argv[1] == 'Indic_Conjunct_Break': + sys.argv[1] = 'InCB;' + with open(file, 'r') as f: + for line in f.readlines(): + if ( + len(line) == 0 + or line[0] == '#' + or sys.argv[1] not in line + ): + continue + parts = [int(x, 16) for x in line.split(';')[0].strip().split('..')] + for i in range(parts[0], parts[len(parts) - 1] + 1): + xs[i] = True + return xs + +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: + Cs = cs + cs = list(dict.fromkeys(Cs)) + + print('''\ +/* This file is autogenerated by gen/prop/bool-props; DO NOT EDIT. */ + +#include "bitset.h" +#include "unicode/prop.h" +''') + + print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') + for i, c in enumerate(Cs): + print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') + if i % 16 == 15: + print() + print('};') + + print() + + bcnt = blksize // 8 + bpc = bytes_per_col(bcnt) + print(f'static constexpr unsigned char stage2[][{bcnt}] = {{') + for c in cs: + x = sum(map(lambda x: x[1] << x[0], enumerate(c))) + + for i in range(bcnt // bpc): + print('\t{' if i == 0 else '\t ', end='') + for j in range(bpc): + print('0x%02X' % (x & 0xFF), end='') + x >>= 8 + if i < bcnt // bpc - 1 or j < bpc - 1: + print(',', end='') + if j < bpc - 1: + print(' ', end='') + if i < bcnt // bpc - 1: + print() + print('},') + print('};') + + print() + + print(f'''\ +bool +uprop_is_{sys.argv[2]}(rune ch) +{{ + return TESTBIT(stage2[stage1[ch / {blksize}]], ch % {blksize}); +}}''') + +def main() -> None: + if len(sys.argv) != 4: + print('Usage: bool-props.py name shortname file', file=sys.stderr) + exit(1) + + xs = parse(sys.argv[3]) + + blksize = -1 + smallest = math.inf + + for bs in powers_of_2(): + if bs > len(xs): + break + Cs = [tuple(x) for x in chunks(xs, bs)] + cs = list(dict.fromkeys(Cs)) + + sz_s1 = len(Cs) * isize(len(cs) - 1) + sz_s2 = len(cs) * bs + sz = sz_s1 + sz_s2 + + if sz < smallest: + 
smallest = sz + blksize = bs + + Cs = [tuple(x) for x in chunks(xs, blksize)] + genfile(Cs, blksize) + +if __name__ == '__main__': + main() |