diff options
| author | Thomas Voss <mail@thomasvoss.com> | 2024-05-04 21:47:33 +0200 | 
|---|---|---|
| committer | Thomas Voss <mail@thomasvoss.com> | 2024-05-04 21:47:33 +0200 | 
| commit | f5268368fbfd88cb3259a8f4313abd06a1c57d70 (patch) | |
| tree | 115c5c9678a7f4325343db5dcea34c25bdfacba9 /gen | |
| parent | 3c6ca49b23fd6a2df735e0eaf93432bfef3cba97 (diff) | |
More 2-stage lookup tables
Diffstat (limited to 'gen')
| -rwxr-xr-x | gen/prop/scx | 240 | 
1 files changed, 143 insertions, 97 deletions
| diff --git a/gen/prop/scx b/gen/prop/scx index ec5b03f..0d6664e 100755 --- a/gen/prop/scx +++ b/gen/prop/scx @@ -1,97 +1,143 @@ -#!/bin/sh - -set -e -cd "${0%/*}/../.." -exec >lib/unicode/prop/uprop_get_scx.c - -gawk ' -BEGIN { -	FS = " *(; *|#.*)" - -	print "/* This file is autogenerated by gen/prop/scx; DO NOT EDIT. */" -	print "" -	print "#include \"_bsearch.h\"" -	print "#include \"macros.h\"" -	print "#include \"rune.h\"" -	print "#include \"unicode/prop.h\"" -	print "" -	print "#define CAST(...) (const enum uprop_sc []){__VA_ARGS__}" -	print "#define _(...)    {CAST(__VA_ARGS__), lengthof(CAST(__VA_ARGS__))}" -	print "" -	print "struct uprop_sc_view {" -	print "\tconst enum uprop_sc *p;" -	print "\tsize_t n;" -	print "};" -	print "" -	print "static constexpr enum uprop_sc fallback[] = {" -	print "\tSC_ZZZZ, SC_ADLM, SC_AGHB, SC_AHOM, SC_ARAB, SC_ARMI, SC_ARMN, SC_AVST," -	print "\tSC_BALI, SC_BAMU, SC_BASS, SC_BATK, SC_BENG, SC_BHKS, SC_BOPO, SC_BRAH," -	print "\tSC_BRAI, SC_BUGI, SC_BUHD, SC_CAKM, SC_CANS, SC_CARI, SC_CHAM, SC_CHER," -	print "\tSC_CHRS, SC_COPT, SC_CPMN, SC_CPRT, SC_CYRL, SC_DEVA, SC_DIAK, SC_DOGR," -	print "\tSC_DSRT, SC_DUPL, SC_EGYP, SC_ELBA, SC_ELYM, SC_ETHI, SC_GEOR, SC_GLAG," -	print "\tSC_GONG, SC_GONM, SC_GOTH, SC_GRAN, SC_GREK, SC_GUJR, SC_GURU, SC_HANG," -	print "\tSC_HANI, SC_HANO, SC_HATR, SC_HEBR, SC_HIRA, SC_HLUW, SC_HMNG, SC_HMNP," -	print "\tSC_HRKT, SC_HUNG, SC_ITAL, SC_JAVA, SC_KALI, SC_KANA, SC_KAWI, SC_KHAR," -	print "\tSC_KHMR, SC_KHOJ, SC_KITS, SC_KNDA, SC_KTHI, SC_LANA, SC_LAOO, SC_LATN," -	print "\tSC_LEPC, SC_LIMB, SC_LINA, SC_LINB, SC_LISU, SC_LYCI, SC_LYDI, SC_MAHJ," -	print "\tSC_MAKA, SC_MAND, SC_MANI, SC_MARC, SC_MEDF, SC_MEND, SC_MERC, SC_MERO," -	print "\tSC_MLYM, SC_MODI, SC_MONG, SC_MROO, SC_MTEI, SC_MULT, SC_MYMR, SC_NAGM," -	print "\tSC_NAND, SC_NARB, SC_NBAT, SC_NEWA, SC_NKOO, SC_NSHU, SC_OGAM, SC_OLCK," -	print "\tSC_ORKH, SC_ORYA, SC_OSGE, SC_OSMA, SC_OUGR, SC_PALM, SC_PAUC, SC_PERM," -	print "\tSC_PHAG, SC_PHLI, SC_PHLP, SC_PHNX, SC_PLRD, SC_PRTI, SC_RJNG, SC_ROHG," -	print "\tSC_RUNR, SC_SAMR, SC_SARB, SC_SAUR, SC_SGNW, SC_SHAW, SC_SHRD, SC_SIDD," -	print "\tSC_SIND, SC_SINH, SC_SOGD, SC_SOGO, SC_SORA, SC_SOYO, SC_SUND, SC_SYLO," -	print "\tSC_SYRC, SC_TAGB, SC_TAKR, SC_TALE, SC_TALU, SC_TAML, SC_TANG, SC_TAVT," -	print "\tSC_TELU, SC_TFNG, SC_TGLG, SC_THAA, SC_THAI, SC_TIBT, SC_TIRH, SC_TNSA," -	print "\tSC_TOTO, SC_UGAR, SC_VAII, SC_VITH, SC_WARA, SC_WCHO, SC_XPEO, SC_XSUX," -	print "\tSC_YEZI, SC_YIII, SC_ZANB, SC_ZINH, SC_ZYYY," -	print "};" -	print "" -} - -/^[A-F0-9]/ { -	n = split($1, a, /\.\./) -	lo = strtonum("0X" a[1]) -	hi = strtonum("0X" a[n]) - -	for (i = lo; i <= hi; i++) -		props[i] = $2 -} - -END { -	print "static const struct {" -	print "\trune lo, hi;" -	print "\tstruct uprop_sc_view val;" -	print "} lookup[] = {" - -	for (i = 0; i <= 0x10FFFF; i++) { -		if (!props[i]) -			continue -		for (lo = i; props[lo] == props[i + 1]; i++) -			; -		printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), _(", lo, i -		split(props[i], xs, / /) -		for (j in xs) { -			printf "SC_%s", toupper(xs[j]) -			if (j < length(xs)) -				printf ", " -		} -		printf ")},\n" -	} - -	print "};" -	print "" -	print "_MLIB_DEFINE_BSEARCH(struct uprop_sc_view, lookup, ((struct uprop_sc_view){" -	print "\t.p = fallback + uprop_get_sc(ch)," -	print "\t.n = 1," -	print "}))" -	print "" -	print "const enum uprop_sc *" -	print "uprop_get_scx(rune ch, size_t *n)" -	print "{" -	print "\tstruct uprop_sc_view v = mlib_lookup(ch);" -	print "\t*n = v.n;" -	print "\treturn v.p;" -	print "}" -} -' data/ScriptExtensions +#!/usr/bin/python3 + +import math + +from lib import * + + +longest = 0 + +def parse(file: str) -> list[bool]: +	global longest + +	xs = ['{}'] * 0x110000 +	with open(file, 'r') as f: +		for line in f.readlines(): +			if len(line.strip()) == 0 or line[0] == '#': +				continue + +			parts = line.split(';') +			ranges = [int(x, 16) for x in parts[0].strip().split('..')] +			scs = [ +				f'SC_{x}' for x in ( +					parts[1] +						.split('#')[0] +						.strip() +						.upper() +						.split() +				) +			] +			prop = f'_({', '.join(scs)})' +			longest = max(longest, len(prop)) + +			for i in range(ranges[0], ranges[len(ranges) - 1] + 1): +				xs[i] = prop +	return xs + +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: +	Cs = cs +	cs = list(dict.fromkeys(Cs)) + +	print('''\ +/* This file is autogenerated by gen/prop/scx; DO NOT EDIT. */ + +#include <stdint.h> + +#include "macros.h" +#include "unicode/prop.h" + +#define CAST(...) (const enum uprop_sc []){__VA_ARGS__} +#define _(...)    {CAST(__VA_ARGS__), lengthof(CAST(__VA_ARGS__))} + +static constexpr enum uprop_sc fallback[] = { +	SC_ZZZZ, SC_ADLM, SC_AGHB, SC_AHOM, SC_ARAB, SC_ARMI, SC_ARMN, SC_AVST, +	SC_BALI, SC_BAMU, SC_BASS, SC_BATK, SC_BENG, SC_BHKS, SC_BOPO, SC_BRAH, +	SC_BRAI, SC_BUGI, SC_BUHD, SC_CAKM, SC_CANS, SC_CARI, SC_CHAM, SC_CHER, +	SC_CHRS, SC_COPT, SC_CPMN, SC_CPRT, SC_CYRL, SC_DEVA, SC_DIAK, SC_DOGR, +	SC_DSRT, SC_DUPL, SC_EGYP, SC_ELBA, SC_ELYM, SC_ETHI, SC_GEOR, SC_GLAG, +	SC_GONG, SC_GONM, SC_GOTH, SC_GRAN, SC_GREK, SC_GUJR, SC_GURU, SC_HANG, +	SC_HANI, SC_HANO, SC_HATR, SC_HEBR, SC_HIRA, SC_HLUW, SC_HMNG, SC_HMNP, +	SC_HRKT, SC_HUNG, SC_ITAL, SC_JAVA, SC_KALI, SC_KANA, SC_KAWI, SC_KHAR, +	SC_KHMR, SC_KHOJ, SC_KITS, SC_KNDA, SC_KTHI, SC_LANA, SC_LAOO, SC_LATN, +	SC_LEPC, SC_LIMB, SC_LINA, SC_LINB, SC_LISU, SC_LYCI, SC_LYDI, SC_MAHJ, +	SC_MAKA, SC_MAND, SC_MANI, SC_MARC, SC_MEDF, SC_MEND, SC_MERC, SC_MERO, +	SC_MLYM, SC_MODI, SC_MONG, SC_MROO, SC_MTEI, SC_MULT, SC_MYMR, SC_NAGM, +	SC_NAND, SC_NARB, SC_NBAT, SC_NEWA, SC_NKOO, SC_NSHU, SC_OGAM, SC_OLCK, +	SC_ORKH, SC_ORYA, SC_OSGE, SC_OSMA, SC_OUGR, SC_PALM, SC_PAUC, SC_PERM, +	SC_PHAG, SC_PHLI, SC_PHLP, SC_PHNX, SC_PLRD, SC_PRTI, SC_RJNG, SC_ROHG, +	SC_RUNR, SC_SAMR, SC_SARB, SC_SAUR, SC_SGNW, SC_SHAW, SC_SHRD, SC_SIDD, +	SC_SIND, SC_SINH, SC_SOGD, SC_SOGO, SC_SORA, SC_SOYO, SC_SUND, SC_SYLO, +	SC_SYRC, SC_TAGB, SC_TAKR, SC_TALE, SC_TALU, SC_TAML, SC_TANG, SC_TAVT, +	SC_TELU, SC_TFNG, SC_TGLG, SC_THAA, SC_THAI, SC_TIBT, SC_TIRH, SC_TNSA, +	SC_TOTO, SC_UGAR, SC_VAII, SC_VITH, SC_WARA, SC_WCHO, SC_XPEO, SC_XSUX, +	SC_YEZI, SC_YIII, SC_ZANB, SC_ZINH, SC_ZYYY, +}; +''') + +	print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') +	for i, c in enumerate(Cs): +		print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') +		if i % 16 == 15: +			print() +	print('};') + +	print() + +	ppc = columns(blksize, longest + 1) +	print(f'static const struct uprop_sc_view stage2[][{blksize}] = {{') +	for c in cs: +		for i in range(blksize // ppc): +			print('\t{' if i == 0 else '\t ', end='') +			for j in range(ppc): +				print(c[i*ppc + j], end='') +				if i < blksize // ppc - 1 or j < ppc - 1: +					print(',', end='') +				if j < ppc - 1: +					print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') +			if i < blksize // ppc - 1: +				print() +		print('},') +	print('};') + +	print() + +	print(f'''\ +struct uprop_sc_view +uprop_get_scx(rune ch) +{{ +	struct uprop_sc_view scv = stage2[stage1[ch / {blksize}]][ch % {blksize}]; +	return scv.p == nullptr +		? (struct uprop_sc_view){{fallback + uprop_get_sc(ch), 1}} +		: scv; +}}''') + +def main() -> None: +	cwd_init() +	xs = parse('data/ScriptExtensions') + +	blksize = -1 +	smallest = math.inf + +	for bs in powers_of_2(): +		if bs > len(xs): +			break +		Cs = [tuple(x) for x in chunks(xs, bs)] +		cs = set(Cs) + +		sz_s1 = len(Cs) * isize(len(cs) - 1) +		sz_s2 = len(cs) * bs +		sz = sz_s1 + sz_s2 + +		if sz < smallest: +			smallest = sz +			blksize = bs + +	Cs = [tuple(x) for x in chunks(xs, blksize)] +	with open('lib/unicode/prop/uprop_get_scx.c', 'w') as f: +		sys.stdout = f +		genfile(Cs, blksize) + +	report_size(len(xs), smallest) + +if __name__ == '__main__': +	main() |