diff options
| author | Thomas Voss <mail@thomasvoss.com> | 2024-05-04 20:48:57 +0200 | 
|---|---|---|
| committer | Thomas Voss <mail@thomasvoss.com> | 2024-05-04 20:48:57 +0200 | 
| commit | 3c6ca49b23fd6a2df735e0eaf93432bfef3cba97 (patch) | |
| tree | 12a0f4ebb8d774af1b4f6f2a41b2367e99567943 /gen/prop/wb | |
| parent | 10fe179c3d4b8ca2fe3a09c40aff73d3dfe585ee (diff) | |
More 2-stage lookup tables
Diffstat (limited to 'gen/prop/wb')
| -rwxr-xr-x | gen/prop/wb | 207 | 
1 file changed, 125 insertions, 82 deletions
diff --git a/gen/prop/wb b/gen/prop/wb index a6b47f2..f6621f5 100755 --- a/gen/prop/wb +++ b/gen/prop/wb @@ -1,86 +1,129 @@ -#!/bin/sh - -set -e -cd "${0%/*}/../.." -exec >lib/unicode/prop/uprop_get_wb.c - -gawk ' -BEGIN { -	FS = " *(; *|#.*)" - -	map["ALetter"]            = "LE" -	map["CR"]                 = "CR" -	map["Double_Quote"]       = "DQ" -	map["E_Base"]             = "EB" -	map["E_Base_GAZ"]         = "EBG" -	map["E_Modifier"]         = "EM" -	map["Extend"]             = "EXTEND" -	map["ExtendNumLet"]       = "EX" -	map["Format"]             = "FO" -	map["Glue_After_Zwj"]     = "GAZ" -	map["Hebrew_Letter"]      = "HL" -	map["Katakana"]           = "KA" -	map["LF"]                 = "LF" -	map["MidLetter"]          = "ML" -	map["MidNumLet"]          = "MB" -	map["MidNum"]             = "MN" -	map["Newline"]            = "NL" -	map["Numeric"]            = "NU" -	map["Other"]              = "XX" -	map["Regional_Indicator"] = "RI" -	map["Single_Quote"]       = "SQ" -	map["WSegSpace"]          = "WSEGSPACE" -	map["ZWJ"]                = "ZWJ" - -	print "/* This file is autogenerated by gen/prop/wb; DO NOT EDIT. */" -	print "" -	print "#include \"_bsearch.h\"" -	print "#include \"macros.h\"" -	print "#include \"rune.h\"" -	print "#include \"unicode/prop.h\"" -	print "" -} +#!/usr/bin/python3 -/^[A-F0-9]/ { -	n = split($1, a, /\.\./) -	lo = strtonum("0X" a[1]) -	hi = strtonum("0X" a[n]) +import math -	for (i = lo; i <= hi; i++) -		props[i] = "WB_" map[$2] -} +from lib import * -END { -	print "static constexpr enum uprop_wb lookup_lat1[] = {" -	for (i = 0; i < 0x100; i++) { -		if (i % 4 == 0) -			printf "\t" -		printf "%-13s%s", (props[i] ? props[i] : "WB_XX") ",", \ -			i % 4 == 3 ? 
"\n" : " " -	} -	print "};" -	print "" -	print "static const struct {" -	print "\trune lo, hi;" -	print "\tenum uprop_wb val;" -	print "} lookup[] = {" - -	for (i = 0x100; i <= 0x10FFFF; i++) { -		if (!props[i]) -			continue -		for (lo = i; props[lo] == props[i + 1]; i++) -			; -		printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[i] -	} - -	print "};" -	print "" -	print "_MLIB_DEFINE_BSEARCH(enum uprop_wb, lookup, WB_XX)" -	print "" -	print "enum uprop_wb" -	print "uprop_get_wb(rune ch)" -	print "{" -	print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup(ch);" -	print "}" + +MAP = { +	'ALetter':            'LE', +	'CR':                 'CR', +	'Double_Quote':       'DQ', +	'E_Base':             'EB', +	'E_Base_GAZ':         'EBG', +	'E_Modifier':         'EM', +	'Extend':             'EXTEND', +	'ExtendNumLet':       'EX', +	'Format':             'FO', +	'Glue_After_Zwj':     'GAZ', +	'Hebrew_Letter':      'HL', +	'Katakana':           'KA', +	'LF':                 'LF', +	'MidLetter':          'ML', +	'MidNumLet':          'MB', +	'MidNum':             'MN', +	'Newline':            'NL', +	'Numeric':            'NU', +	'Other':              'XX', +	'Regional_Indicator': 'RI', +	'Single_Quote':       'SQ', +	'WSegSpace':          'WSEGSPACE', +	'ZWJ':                'ZWJ',  } -' data/WordBreakProperty | sed 's/\s*$//' + +longest = 0 + +def parse(file: str) -> list[bool]: +	global longest + +	xs = ['WB_XX'] * 0x110000 +	with open(file, 'r') as f: +		for line in f.readlines(): +			if len(line.strip()) == 0 or line[0] == '#': +				continue + +			parts = line.split(';') +			ranges = [int(x, 16) for x in parts[0].strip().split('..')] +			prop = 'WB_' + MAP[parts[1].split('#')[0].strip()] +			longest = max(longest, len(prop)) + +			for i in range(ranges[0], ranges[len(ranges) - 1] + 1): +				xs[i] = prop +	return xs + +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: +	Cs = cs +	cs = list(dict.fromkeys(Cs)) + +	print('''\ 
+/* This file is autogenerated by gen/prop/wb; DO NOT EDIT. */ + +#include <stdint.h> + +#include "unicode/prop.h" +''') + +	print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') +	for i, c in enumerate(Cs): +		print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') +		if i % 16 == 15: +			print() +	print('};') + +	print() + +	ppc = columns(blksize, longest + 1) +	print(f'static constexpr enum uprop_wb stage2[][{blksize}] = {{') +	for c in cs: +		for i in range(blksize // ppc): +			print('\t{' if i == 0 else '\t ', end='') +			for j in range(ppc): +				print(c[i*ppc + j], end='') +				if i < blksize // ppc - 1 or j < ppc - 1: +					print(',', end='') +				if j < ppc - 1: +					print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') +			if i < blksize // ppc - 1: +				print() +		print('},') +	print('};') + +	print() + +	print(f'''\ +enum uprop_wb +uprop_get_wb(rune ch) +{{ +	return stage2[stage1[ch / {blksize}]][ch % {blksize}]; +}}''') + +def main() -> None: +	cwd_init() +	xs = parse('data/WordBreakProperty') + +	blksize = -1 +	smallest = math.inf + +	for bs in powers_of_2(): +		if bs > len(xs): +			break +		Cs = [tuple(x) for x in chunks(xs, bs)] +		cs = set(Cs) + +		sz_s1 = len(Cs) * isize(len(cs) - 1) +		sz_s2 = len(cs) * bs +		sz = sz_s1 + sz_s2 + +		if sz < smallest: +			smallest = sz +			blksize = bs + +	Cs = [tuple(x) for x in chunks(xs, blksize)] +	with open('lib/unicode/prop/uprop_get_wb.c', 'w') as f: +		sys.stdout = f +		genfile(Cs, blksize) + +	report_size(len(xs), smallest) + +if __name__ == '__main__': +	main()  |