diff options
Diffstat (limited to 'gen/prop/ccc')
| -rwxr-xr-x | gen/prop/ccc | 269 | 
1 files changed, 158 insertions, 111 deletions
#!/usr/bin/python3

"""Generate lib/unicode/prop/uprop_get_ccc.c from data/UnicodeData.

The generated C file holds a two-stage lookup table: stage1 maps a block
number (code point / block size) to a row of stage2, and stage2 holds one
row of `enum uprop_ccc` values per *distinct* block.  The block size is
chosen to minimise the estimated combined size of both tables.

Helpers such as cwd_init, chunks, columns, isize, typename, powers_of_2
and report_size come from the project-local `lib` module.
"""

import contextlib
import math
import sys

from lib import *

# Field 3 of a UnicodeData line (the canonical combining class, as a decimal
# string) -> suffix of the corresponding CCC_* enumerator.
MAP = {
	'0'  : 'NR',
	'1'  : 'OV',
	'6'  : 'HANR',
	'7'  : 'NK',
	'8'  : 'KV',
	'9'  : 'VR',
	'10' : '10',
	'11' : '11',
	'12' : '12',
	'13' : '13',
	'14' : '14',
	'15' : '15',
	'16' : '16',
	'17' : '17',
	'18' : '18',
	'19' : '19',
	'20' : '20',
	'21' : '21',
	'22' : '22',
	'23' : '23',
	'24' : '24',
	'25' : '25',
	'26' : '26',
	'27' : '27',
	'28' : '28',
	'29' : '29',
	'30' : '30',
	'31' : '31',
	'32' : '32',
	'33' : '33',
	'34' : '34',
	'35' : '35',
	'36' : '36',
	'84' : '84',
	'91' : '91',
	'103': '103',
	'107': '107',
	'118': '118',
	'122': '122',
	'129': '129',
	'130': '130',
	'132': '132',
	'133': '133',
	'200': 'ATBL',
	'202': 'ATB',
	'214': 'ATA',
	'216': 'ATAR',
	'218': 'BL',
	'220': 'B',
	'222': 'BR',
	'224': 'L',
	'226': 'R',
	'228': 'AL',
	'230': 'A',
	'232': 'AR',
	'233': 'DB',
	'234': 'DA',
	'240': 'IS',
}

# Length of the longest enumerator name seen by parse(); genfile() uses it to
# align the columns of stage2.
longest = 0


def parse(file: str) -> list[str]:
	"""Read a UnicodeData-format file and return one enumerator per code point.

	The result has 0x110000 entries, indexed by code point.  Code points the
	file does not mention keep the default 'CCC_NR'.  A pair of lines whose
	names contain 'First' / 'Last' assigns the value to the whole inclusive
	range between them.

	Side effect: raises the module-level `longest` to the length of the
	longest enumerator name stored in the result.

	Raises:
		KeyError: a combining class in the file is not listed in MAP.
		ValueError: a 'Last' line appears without a preceding 'First' line.
	"""
	global longest

	xs = ['CCC_NR'] * 0x110000
	# The default filler is printed in stage2 as well, so it has to take
	# part in the column width.
	longest = max(longest, len('CCC_NR'))

	lo = None
	with open(file, 'r') as f:
		for line in f:
			parts = line.split(';')
			cp = int(parts[0], 16)
			name = parts[1]

			if 'First' in name:
				lo = cp
				continue

			val = f'CCC_{MAP[parts[3]]}'
			if 'Last' in name:
				if lo is None:
					raise ValueError(
						f'{file}: range end U+{cp:04X} without a range start'
					)
				xs[lo:cp + 1] = [val] * (cp - lo + 1)
				lo = None
			else:
				xs[cp] = val
			longest = max(longest, len(val))
	return xs


def genfile(cs: list[tuple[str, ...]], blksize: int) -> None:
	"""Print the generated C source to standard output.

	Args:
		cs: the per-code-point values split into consecutive blocks of
			`blksize` entries each.
		blksize: number of entries per block (second dimension of stage2).
	"""
	blocks = cs
	# Distinct blocks in first-seen order; a block's position here is the
	# value stored in stage1.
	uniq = list(dict.fromkeys(blocks))
	index_of = {blk: i for i, blk in enumerate(uniq)}
	last = len(uniq) - 1

	print('''\
/* This file is autogenerated by gen/prop/ccc; DO NOT EDIT. */

#include "unicode/prop.h"
''')

	width = len(str(last))
	print(f'static constexpr {typename(last)} stage1[] = {{')
	for i, blk in enumerate(blocks):
		lead = '\t' if i % 16 == 0 else ' '
		print(f'{lead}{index_of[blk]:{width}d},', end='')
		if i % 16 == 15:
			print()
	if len(blocks) % 16 != 0:
		# Finish a partially filled last line so '};' sits on its own line.
		print()
	print('};')

	print()

	# NOTE(review): columns() is presumably the number of cells of the given
	# width that fit on one output line -- confirm against lib.
	ppc = columns(blksize, longest + 1)
	rows = blksize // ppc
	print(f'static constexpr enum uprop_ccc stage2[][{blksize}] = {{')
	for blk in uniq:
		for i in range(rows):
			print('\t{' if i == 0 else '\t ', end='')
			for j in range(ppc):
				val = blk[i*ppc + j]
				print(val, end='')
				# No comma after the very last cell of the block.
				if i < rows - 1 or j < ppc - 1:
					print(',', end='')
				# Pad every cell except the last one on the line.
				if j < ppc - 1:
					print(' ' * (longest + 1 - len(val)), end='')
			if i < rows - 1:
				print()
		print('},')
	print('};')

	print()

	print(f'''\
enum uprop_ccc
uprop_get_ccc(rune ch)
{{
\treturn stage2[stage1[ch / {blksize}]][ch % {blksize}];
}}''')


def main() -> None:
	"""Pick the block size with the smallest estimated tables and emit the file."""
	cwd_init()
	with open('lib/unicode/prop/uprop_get_ccc.c', 'w') as out:
		# Everything below runs with stdout redirected into the output file,
		# exactly as when sys.stdout was reassigned up front; the file is now
		# closed and stdout restored on exit.
		with contextlib.redirect_stdout(out):
			xs = parse('data/UnicodeData')

			blksize = -1
			smallest = math.inf

			for bs in powers_of_2():
				if bs > len(xs):
					break
				blocks = [tuple(x) for x in chunks(xs, bs)]
				uniq = set(blocks)

				# Estimated table sizes.  NOTE(review): the factor 4 is
				# presumably sizeof(enum uprop_ccc) -- confirm.
				sz_s1 = len(blocks) * isize(len(uniq) - 1)
				sz_s2 = len(uniq) * bs * 4
				sz = sz_s1 + sz_s2

				if sz < smallest:
					smallest = sz
					blksize = bs

			genfile([tuple(x) for x in chunks(xs, blksize)], blksize)

			report_size(len(xs), smallest)


if __name__ == '__main__':
	main()