aboutsummaryrefslogtreecommitdiff
path: root/gen/prop/ccc
diff options
context:
space:
mode:
author Thomas Voss <mail@thomasvoss.com> 2024-04-30 20:08:37 +0200
committer Thomas Voss <mail@thomasvoss.com> 2024-04-30 20:08:37 +0200
commit 34c55c4d07af131c9da06c367ac2958a6090f2a3 (patch)
tree 2d0fe61b618928feb3a0fffa031e9285a816f0cd /gen/prop/ccc
parent 04e8ee70d94a579f1d24aaa80e9341c9000d2dec (diff)
Add more 2-stage lookup tables
Diffstat (limited to 'gen/prop/ccc')
-rwxr-xr-x gen/prop/ccc 269
1 files changed, 158 insertions, 111 deletions
diff --git a/gen/prop/ccc b/gen/prop/ccc
index 4f370e7..5339748 100755
--- a/gen/prop/ccc
+++ b/gen/prop/ccc
@@ -1,116 +1,163 @@
-#!/bin/sh
-
-set -e
-cd "${0%/*}/../.."
-exec >lib/unicode/prop/uprop_get_ccc.c
-
-gawk '
-BEGIN {
- FS = ";"
-
- map[1] = "OV"
- map[6] = "HANR"
- map[7] = "NK"
- map[8] = "KV"
- map[9] = "VR"
- map[10] = "CCC10"
- map[11] = "CCC11"
- map[12] = "CCC12"
- map[13] = "CCC13"
- map[14] = "CCC14"
- map[15] = "CCC15"
- map[16] = "CCC16"
- map[17] = "CCC17"
- map[18] = "CCC18"
- map[19] = "CCC19"
- map[20] = "CCC20"
- map[21] = "CCC21"
- map[22] = "CCC22"
- map[23] = "CCC23"
- map[24] = "CCC24"
- map[25] = "CCC25"
- map[26] = "CCC26"
- map[27] = "CCC27"
- map[28] = "CCC28"
- map[29] = "CCC29"
- map[30] = "CCC30"
- map[31] = "CCC31"
- map[32] = "CCC32"
- map[33] = "CCC33"
- map[34] = "CCC34"
- map[35] = "CCC35"
- map[36] = "CCC36"
- map[84] = "CCC84"
- map[91] = "CCC91"
- map[103] = "CCC103"
- map[107] = "CCC107"
- map[118] = "CCC118"
- map[122] = "CCC122"
- map[129] = "CCC129"
- map[130] = "CCC130"
- map[132] = "CCC132"
- map[133] = "CCC133"
- map[200] = "ATBL"
- map[202] = "ATB"
- map[214] = "ATA"
- map[216] = "ATAR"
- map[218] = "BL"
- map[220] = "B"
- map[222] = "BR"
- map[224] = "L"
- map[226] = "R"
- map[228] = "AL"
- map[230] = "A"
- map[232] = "AR"
- map[233] = "DB"
- map[234] = "DA"
- map[240] = "IS"
-
- print "/* This file is autogenerated by gen/prop/ccc; DO NOT EDIT. */"
- print ""
- print "#include \"_bsearch.h\""
- print "#include \"macros.h\""
- print "#include \"rune.h\""
- print "#include \"unicode/prop.h\""
- print ""
-}
+#!/usr/bin/python3
-{
- s = "CCC_" (map[$4] ? map[$4] : "NR")
- lo = strtonum("0X" $1)
+import math
- if ($2 ~ /First/) {
- getline
- hi = strtonum("0X" $1)
- } else
- hi = lo
+from lib import *
- for (i = lo; i <= hi; i++)
- props[i] = s
-}
-END {
- print "static const struct {"
- print "\trune lo, hi;"
- print "\tenum uprop_ccc val;"
- print "} lookup[] = {"
-
- for (i = 0; i <= 0x10FFFF; i++) {
- if (!props[i] || props[i] == "CCC_NR")
- continue
- for (lo = i; props[lo] == props[i + 1]; i++)
- ;
- printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[lo]
- }
-
- print "};"
- print ""
- print "_MLIB_DEFINE_BSEARCH(enum uprop_ccc, lookup, CCC_NR)"
- print ""
- print "enum uprop_ccc"
- print "uprop_get_ccc(rune ch)"
- print "{"
- print "\treturn ch < lookup[0].lo ? CCC_NR : mlib_lookup(ch);"
- print "}"
+# Lookup table used by parse(): the key is field 3 of a data-file record (a
+# decimal number kept as a string), the value is the suffix appended to
+# 'CCC_' to form the enumerator name written into the generated C file.
+# NOTE(review): field 3 is presumably the Canonical_Combining_Class column of
+# UnicodeData.txt -- confirm against the data file.
+MAP = {
+ '0' : 'NR',
+ '1' : 'OV',
+ '6' : 'HANR',
+ '7' : 'NK',
+ '8' : 'KV',
+ '9' : 'VR',
+ '10' : '10',
+ '11' : '11',
+ '12' : '12',
+ '13' : '13',
+ '14' : '14',
+ '15' : '15',
+ '16' : '16',
+ '17' : '17',
+ '18' : '18',
+ '19' : '19',
+ '20' : '20',
+ '21' : '21',
+ '22' : '22',
+ '23' : '23',
+ '24' : '24',
+ '25' : '25',
+ '26' : '26',
+ '27' : '27',
+ '28' : '28',
+ '29' : '29',
+ '30' : '30',
+ '31' : '31',
+ '32' : '32',
+ '33' : '33',
+ '34' : '34',
+ '35' : '35',
+ '36' : '36',
+ '84' : '84',
+ '91' : '91',
+ '103': '103',
+ '107': '107',
+ '118': '118',
+ '122': '122',
+ '129': '129',
+ '130': '130',
+ '132': '132',
+ '133': '133',
+ '200': 'ATBL',
+ '202': 'ATB',
+ '214': 'ATA',
+ '216': 'ATAR',
+ '218': 'BL',
+ '220': 'B',
+ '222': 'BR',
+ '224': 'L',
+ '226': 'R',
+ '228': 'AL',
+ '230': 'A',
+ '232': 'AR',
+ '233': 'DB',
+ '234': 'DA',
+ '240': 'IS',
}
-' data/UnicodeData
+
+# Length of the longest 'CCC_*' name stored so far by parse(); genfile()
+# reads it to pad the columns of the stage2 table.
+longest = 0
+
+def parse(file: str) -> list[str]:
+    """Read a UnicodeData-style file into a per-code-point table of names.
+
+    Every record is one ';'-separated line: field 0 is a hexadecimal code
+    point, field 1 the character name and field 3 a key of MAP.  A record
+    whose name contains 'First' opens a range which the next record whose
+    name contains 'Last' closes; all code points of the range receive the
+    value of the 'Last' record.  Code points without a record keep the
+    default 'CCC_NR'.
+
+    Returns a list of 0x110000 strings of the form 'CCC_<MAP value>',
+    indexed by code point.
+
+    Side effect: raises the module-level `longest` to the length of the
+    longest name stored.
+
+    Raises KeyError if field 3 of a record is not a key of MAP.
+    """
+    global longest
+
+    xs = ['CCC_NR'] * 0x110000
+    with open(file, 'r') as f:
+        # Iterate the file lazily; there is no need to hold every line.
+        for line in f:
+            # Tolerate blank lines (e.g. a trailing newline at end of file).
+            if not line.strip():
+                continue
+            parts = line.split(';')
+            cp = int(parts[0], 16)
+            if 'First' in parts[1]:
+                # Only the start is recorded here; the value comes from the
+                # matching 'Last' record.
+                lo = cp
+                continue
+            # Build the name once per record rather than once per code point.
+            name = f'CCC_{MAP[parts[3]]}'
+            longest = max(longest, len(name))
+            if 'Last' in parts[1]:
+                for i in range(lo, cp + 1):
+                    xs[i] = name
+            else:
+                xs[cp] = name
+    return xs
+
+def genfile(cs: list[tuple[str, ...]], blksize: int) -> None:
+    """Print the C source of the two-stage lookup table to stdout.
+
+    cs       blocks of 'CCC_*' names in code point order, one tuple per
+             block; each tuple is indexed up to blksize entries
+    blksize  number of code points covered by one block
+
+    Emits, in order: a header comment and include, stage1[] (for every
+    block the index of its row in stage2), stage2[][blksize] (the distinct
+    blocks in order of first appearance) and the uprop_get_ccc() function
+    that chains both lookups.  Reads the module-level `longest` to align
+    the stage2 columns.
+    """
+    Cs = cs
+    # Distinct blocks in first-appearance order, so output is deterministic.
+    cs = list(dict.fromkeys(Cs))
+    # Block -> stage2 row.  A dict lookup avoids a linear cs.index() scan
+    # for every single stage1 entry.
+    index = {c: i for i, c in enumerate(cs)}
+
+    print('''\
+/* This file is autogenerated by gen/prop/ccc; DO NOT EDIT. */
+
+#include "unicode/prop.h"
+''')
+
+    # Right-align every index to the width of the largest one.
+    width = len(str(len(cs) - 1))
+    print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{')
+    for i, c in enumerate(Cs):
+        sep = '\t' if i % 16 == 0 else ' '
+        print(f'{sep}{index[c]:{width}d},', end='')
+        if i % 16 == 15:
+            print()
+    # Terminate a partially filled last row so that the closing brace does
+    # not end up on the same line as the data.
+    if len(Cs) % 16 != 0:
+        print()
+    print('};')
+
+    print()
+
+    # NOTE(review): columns() is defined elsewhere; presumably it returns how
+    # many entries of the given width fit on one output line -- confirm.
+    ppc = columns(blksize, longest + 1)
+    rows = blksize // ppc
+    print(f'static constexpr enum uprop_ccc stage2[][{blksize}] = {{')
+    for c in cs:
+        for i in range(rows):
+            print('\t{' if i == 0 else '\t ', end='')
+            for j in range(ppc):
+                name = c[i*ppc + j]
+                print(name, end='')
+                # No comma after the very last entry of a block.
+                if i < rows - 1 or j < ppc - 1:
+                    print(',', end='')
+                # Pad up to the next column, except after the last column.
+                if j < ppc - 1:
+                    print(' ' * (longest + 1 - len(name)), end='')
+            if i < rows - 1:
+                print()
+        print('},')
+    print('};')
+
+    print()
+
+    print(f'''\
+enum uprop_ccc
+uprop_get_ccc(rune ch)
+{{
+ return stage2[stage1[ch / {blksize}]][ch % {blksize}];
+}}''')
+
+def main() -> None:
+    """Generate lib/unicode/prop/uprop_get_ccc.c from data/UnicodeData.
+
+    Tries every block size yielded by powers_of_2() that does not exceed
+    the table length, estimates the size of the resulting two tables and
+    writes the file using the block size with the smallest estimate.
+    stdout is rebound to the output file for the rest of the process.
+    """
+    cwd_init()
+    sys.stdout = open('lib/unicode/prop/uprop_get_ccc.c', 'w')
+    props = parse('data/UnicodeData')
+
+    best_bs, best_sz = -1, math.inf
+    for candidate in powers_of_2():
+        if candidate > len(props):
+            break
+        blocks = [tuple(b) for b in chunks(props, candidate)]
+        nuniq = len(set(blocks))
+
+        # NOTE(review): isize() is defined elsewhere; presumably the byte
+        # width of an integer able to hold its argument -- confirm.  The
+        # factor 4 is the assumed size of one stage2 entry.
+        total = len(blocks) * isize(nuniq - 1) + nuniq * candidate * 4
+
+        # Strict '<' keeps the first (smallest) block size on a tie.
+        if total < best_sz:
+            best_bs, best_sz = candidate, total
+
+    genfile([tuple(b) for b in chunks(props, best_bs)], best_bs)
+
+    report_size(len(props), best_sz)
+
+if __name__ == '__main__':
+    main()