aboutsummaryrefslogtreecommitdiff
path: root/gen
diff options
context:
space:
mode:
author: Thomas Voss <mail@thomasvoss.com> 2024-04-28 06:15:55 +0200
committer: Thomas Voss <mail@thomasvoss.com> 2024-04-28 06:15:55 +0200
commit: 4ea2dd117e656f950c41f9954bd593c313e34ee2 (patch)
tree: d239c3426240bb886f129bccf901b0fc970c6073 /gen
parent: 50787ecf06854f99eaf4b966abd11d23554bd221 (diff)
Implement boolean props using 2-stage lookup
Diffstat (limited to 'gen')
-rwxr-xr-x  gen/prop/bool-props        5
-rw-r--r--  gen/prop/bool-props.awk   79
-rwxr-xr-x  gen/prop/bool-props.py   148
3 files changed, 150 insertions, 82 deletions
diff --git a/gen/prop/bool-props b/gen/prop/bool-props
index b98b9d8..0a8330f 100755
--- a/gen/prop/bool-props
+++ b/gen/prop/bool-props
@@ -79,9 +79,8 @@ ri=Regional_Indicator
gen()
{
local p=${1%%=*}
- gawk -M -v prop=${1#*=} -v short=$p \
- -f gen/prop/bool-props.awk data/$2 \
- >lib/unicode/prop/uprop_is_${p}.c
+ gen/prop/bool-props.py ${1#*=} $p data/$2 \
+ >lib/unicode/prop/uprop_is_$p.c
printf 'DONE uprop_is_%s()\n' $p >&2
}
diff --git a/gen/prop/bool-props.awk b/gen/prop/bool-props.awk
deleted file mode 100644
index 6362dea..0000000
--- a/gen/prop/bool-props.awk
+++ /dev/null
@@ -1,79 +0,0 @@
-BEGIN {
- FS = "( *#.*| +; +)"
-
- print "/* This file is autogenerated by gen/prop/bool-props; DO NOT EDIT. */"
- print ""
- print "#include \"_bsearch.h\""
- print "#include \"bitset.h\""
- print "#include \"rune.h\""
- print "#include \"unicode/prop.h\""
- print ""
- print "/* clang-format off */"
- print ""
-}
-
-$2 == prop || (prop == "Indic_Conjunct_Break" && $2 ~ /InCB;/) {
- n = split($1, a, /\.\./)
- lo = strtonum("0x" a[1])
- hi = strtonum("0x" a[n])
-
- for (i = lo; i <= hi; i++)
- xs[i] = 1
-}
-
-END {
- for (i = 0; i <= 0xFF; i++) {
- if (xs[i])
- mask = or(mask, lshift(1, i))
- }
-
- if (mask > 0) {
- print "static constexpr bitset(bs, LATIN1_MAX) = {"
- for (i = 0; i < 32; i++) {
- if (i % 8 == 0)
- printf "\t"
- printf "0x%02X,", and(rshift(mask, 8 * i), 0xFF)
- printf((i % 8 == 7) ? "\n" : " ")
- }
- print "};"
- print ""
- }
-
- for (i = 0x100; i <= 0x10FFFF; i++) {
- if (xs[i]) {
- need_big_lookup = 1
- break
- }
- }
-
- if (need_big_lookup) {
- print "static const struct {"
- print "\trune lo, hi;"
- print "} lookup[] = {"
-
- for (i = 0x100; i <= 0x10FFFF; i++) {
- if (!xs[i])
- continue
- lo = i
- while (xs[i + 1])
- i++
- printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X)},\n", lo, i
- }
-
- print "};"
- print ""
- print "_MLIB_DEFINE_BSEARCH_CONTAINS(lookup)"
- print ""
- }
-
- print "bool"
- printf "uprop_is_%s(rune ch)\n", short
- print "{"
- if (mask > 0 && need_big_lookup)
- print "\treturn ch <= LATIN1_MAX ? TESTBIT(bs, ch) : mlib_lookup_contains(ch);"
- else if (need_big_lookup)
- print "\treturn mlib_lookup_contains(ch);"
- else
- print "\treturn ch <= LATIN1_MAX && TESTBIT(bs, ch);"
- print "}"
-}
diff --git a/gen/prop/bool-props.py b/gen/prop/bool-props.py
new file mode 100755
index 0000000..a913904
--- /dev/null
+++ b/gen/prop/bool-props.py
@@ -0,0 +1,148 @@
+#!/usr/bin/python3
+
+import functools
+import math
+import sys
+from typing import Generator
+
+
+def chunks[T](xs: list[T], n: int) -> Generator[list[T], None, None]:
+    """Yield consecutive slices of ``xs`` that hold ``n`` items each.
+
+    The final slice is shorter when ``len(xs)`` is not a multiple of ``n``,
+    and an empty ``xs`` yields nothing.
+    """
+    yield from (xs[start:start + n] for start in range(0, len(xs), n))
+
+def powers_of_2() -> Generator[int, None, None]:
+    """Yield 1, 2, 4, 8, ... without end; the caller decides when to stop."""
+    power = 1
+    while True:
+        yield power
+        power <<= 1
+
+def bytes_per_col(n: int) -> int:
+    """Return how many bytes of an ``n``-byte table row to print per line.
+
+    The result is the largest divisor of ``n`` whose rendering fits in 80
+    columns, so that a row always splits into lines of equal length.  A
+    line is budgeted as a fixed overhead of 5 columns, 5 columns per byte
+    (``0xNN`` plus its comma) and one space between neighbouring bytes.
+
+    Raises:
+        ValueError: if ``n`` is not positive, since no divisor exists.
+    """
+    if n < 1:
+        raise ValueError(f'cannot lay out a row of {n} bytes')
+
+    # 5 + 5x + (x - 1) <= 80 caps x at 12, so no larger divisor can fit.
+    widest = (80 - 5 + 1) // (5 + 1)
+    for x in range(min(n, widest), 0, -1):
+        if n % x == 0:
+            return x
+
+    raise ValueError  # not reached: 1 divides every positive n
+
+def isize(x: int) -> int:
+    """Return the size in bytes of the narrowest unsigned type holding ``x``.
+
+    The answer is 1, 2, 4 or 8, matching ``uint8_t``, ``uint16_t``,
+    ``uint32_t`` and ``uint64_t``.
+
+    Raises:
+        ValueError: if ``x`` does not fit in 64 bits.
+    """
+    # Each bound is the first value that no longer fits, so the largest
+    # value of every type is still assigned to that type.
+    if x < 1 << 8:
+        return 1
+    if x < 1 << 16:
+        return 2
+    if x < 1 << 32:
+        return 4
+    if x < 1 << 64:
+        return 8
+    raise ValueError(f'{x} does not fit in 64 bits')
+
+def typename(x: int) -> str:
+    """Return the name of the narrowest unsigned C type that can hold ``x``.
+
+    The answer is one of ``uint8_t``, ``uint16_t``, ``uint32_t`` or
+    ``uint64_t``.
+
+    Raises:
+        ValueError: if ``x`` does not fit in 64 bits.
+    """
+    # Each bound is the first value that no longer fits, so the largest
+    # value of every type is still assigned to that type.
+    if x < 1 << 8:
+        return "uint8_t"
+    if x < 1 << 16:
+        return "uint16_t"
+    if x < 1 << 32:
+        return "uint32_t"
+    if x < 1 << 64:
+        return "uint64_t"
+    raise ValueError(f'{x} does not fit in 64 bits')
+
+def parse(file: str) -> list[bool]:
+    """Build the per-code-point truth table for the requested property.
+
+    The property name is read from ``sys.argv[1]``.  ``file`` is expected
+    to hold data lines of the form ``XXXX[..YYYY] ; Property # comment``
+    with hexadecimal code points; text after ``#`` is ignored.
+
+    Returns:
+        A list of 0x110000 booleans that is ``True`` at every code point
+        covered by a line whose property field equals the requested name.
+    """
+    prop = sys.argv[1]
+    if prop == 'Indic_Conjunct_Break':
+        # These lines carry the short alias plus a value field
+        # ("; InCB; Linker"); every value counts as a match.
+        prop = 'InCB'
+
+    xs = [False] * 0x110000
+    with open(file, 'r', encoding='utf-8') as f:
+        for line in f:
+            fields = [s.strip() for s in line.split('#', 1)[0].split(';')]
+            # Compare the property field exactly.  A substring test on the
+            # whole line would also accept longer names (Emoji_Component
+            # when Emoji is requested) and stray words in the comment.
+            if len(fields) < 2 or fields[1] != prop:
+                continue
+            lo, _, hi = fields[0].partition('..')
+            first = int(lo, 16)
+            last = int(hi, 16) if hi else first
+            xs[first:last + 1] = [True] * (last - first + 1)
+    return xs
+
+def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None:
+    """Print the generated C source for the property to standard output.
+
+    ``cs`` is the code-point table cut into blocks of ``blksize`` booleans,
+    in code-point order.  The output holds a ``stage1`` array giving, for
+    each block, the index of its row in ``stage2``; a ``stage2`` array with
+    one bit-packed row per distinct block; and a ``uprop_is_<short>()``
+    function that looks a rune up through both stages.  ``<short>`` is
+    taken from ``sys.argv[2]``.
+    """
+    blocks = cs
+    uniq = list(dict.fromkeys(blocks))  # distinct blocks, first-seen order
+    # Row number of every distinct block, so that filling stage 1 does not
+    # rescan ``uniq`` once per entry.
+    index_of = {blk: i for i, blk in enumerate(uniq)}
+
+    print('''\
+/* This file is autogenerated by gen/prop/bool-props; DO NOT EDIT. */
+
+#include "bitset.h"
+#include "unicode/prop.h"
+''')
+
+    # Right-align every index to the width of the largest one.
+    width = len(str(len(uniq) - 1))
+    print(f'static constexpr {typename(len(uniq) - 1)} stage1[] = {{')
+    for i, blk in enumerate(blocks):
+        lead = '\t' if i % 16 == 0 else ' '
+        print(f'{lead}{index_of[blk]:{width}d},', end='')
+        if i % 16 == 15:
+            print()
+    print('};')
+
+    print()
+
+    bcnt = blksize // 8           # bytes in one stage-2 row
+    bpc = bytes_per_col(bcnt)     # bytes printed on one line
+    nlines = bcnt // bpc          # printed lines per row
+    print(f'static constexpr unsigned char stage2[][{bcnt}] = {{')
+    for blk in uniq:
+        # Pack the block into one integer, entry k in bit k, then peel it
+        # off a byte at a time starting with the lowest byte.
+        bits = sum(bit << pos for pos, bit in enumerate(blk))
+
+        for i in range(nlines):
+            print('\t{' if i == 0 else '\t ', end='')
+            for j in range(bpc):
+                print('0x%02X' % (bits & 0xFF), end='')
+                bits >>= 8
+                # A comma follows every byte except the last one of the row.
+                if i < nlines - 1 or j < bpc - 1:
+                    print(',', end='')
+                if j < bpc - 1:
+                    print(' ', end='')
+            if i < nlines - 1:
+                print()
+        print('},')
+    print('};')
+
+    print()
+
+    print(f'''\
+bool
+uprop_is_{sys.argv[2]}(rune ch)
+{{
+ return TESTBIT(stage2[stage1[ch / {blksize}]], ch % {blksize});
+}}''')
+
+def main() -> None:
+    """Generate the two-stage lookup table for one boolean property.
+
+    Expects ``sys.argv`` to be ``[prog, name, shortname, file]``.  The
+    property table is parsed from ``file``, every power-of-two block size
+    is scored by the total size in bytes of the tables it would need, and
+    the C source for the smallest layout is printed.  Exits with status 1
+    after a usage message when the argument count is wrong.
+    """
+    if len(sys.argv) != 4:
+        print('Usage: bool-props.py name shortname file', file=sys.stderr)
+        # sys.exit, not the bare exit() helper, which only the site module
+        # provides.
+        sys.exit(1)
+
+    xs = parse(sys.argv[3])
+
+    blksize = -1
+    best: list[tuple[bool, ...]] = []
+    smallest = math.inf
+
+    for bs in powers_of_2():
+        if bs > len(xs):
+            break
+        if bs < 8:
+            # Stage 2 is emitted in whole bytes, so a block narrower than
+            # one byte cannot be laid out.
+            continue
+        blocks = [tuple(x) for x in chunks(xs, bs)]
+        uniq = len(set(blocks))
+
+        sz_s1 = len(blocks) * isize(uniq - 1)
+        # Stage 2 stores one bit per code point; count it in bytes so that
+        # both stages are added in the same unit.
+        sz_s2 = uniq * (bs // 8)
+        sz = sz_s1 + sz_s2
+
+        if sz < smallest:
+            smallest = sz
+            blksize = bs
+            best = blocks
+
+    genfile(best, blksize)
+
+# Generate output only when run as a script, not when imported.
+if __name__ == '__main__':
+ main()