diff options
| author | Thomas Voss <mail@thomasvoss.com> | 2024-05-07 22:50:21 +0200 | 
|---|---|---|
| committer | Thomas Voss <mail@thomasvoss.com> | 2024-05-07 22:50:21 +0200 | 
| commit | 6b3518c84ad90b1b593ef4bb700b39465c79b50e (patch) | |
| tree | 54e9ae2e84b7660623659157466766a0e2538bc1 /gen | |
| parent | 6a3a62a41abc81a82d6cb6c5491d5ac778256150 (diff) | |
Use 2-stage lookups for u8gnext() and u8gcnt()
Diffstat (limited to 'gen')
| -rwxr-xr-x | gen/string/gbrk | 243 | ||||
| l--------- | gen/string/lib.py | 1 | 
2 files changed, 161 insertions, 83 deletions
diff --git a/gen/string/gbrk b/gen/string/gbrk index e0acca7..8397e90 100755 --- a/gen/string/gbrk +++ b/gen/string/gbrk @@ -1,97 +1,174 @@ -#!/bin/sh +#!/usr/bin/python3 + +import math + +from lib import * + + +MAP = { +	'Control':               1, +	'Extended_Pictographic': 2, +	'Extend':                3, +	'L':                     4, +	'LV':                    5, +	'LVT':                   6, +	'Prepend':               7, +	'Regional_Indicator':    8, +	'SpacingMark':           9, +	'T':                    10, +	'V':                    11, +	'ZWJ':                  12, + +	'InCB; Consonant': 0b0100_0000, +	'InCB; Extend':    0b1000_0000, +	'InCB; Linker':    0b1100_0000, +} + +longest = 3 + +def parse(*files: str) -> list[bool]: +	global longest + +	xs = [0] * 0x110000 + +	lines = [] +	for file in files: +		with open(file, 'r') as f: +			lines.extend(f.readlines()) + +	for line in lines: +		if len(line.strip()) == 0 or line[0] == '#': +			continue + +		parts = line.split(';') +		ranges = [int(x, 16) for x in parts[0].strip().split('..')] + +		if parts[1].strip() == 'InCB': +			p = 'InCB; ' + parts[2].split('#')[0].strip() +		else: +			p = parts[1].split('#')[0].strip() +		if p not in MAP: +			continue + +		for i in range(ranges[0], ranges[len(ranges) - 1] + 1): +			xs[i] |= MAP[p] +	return list(map(str, xs)) -set -e -cd "${0%/*}/../.." -exec >include/unicode/_gbrk.h +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: +	Cs = cs +	cs = list(dict.fromkeys(Cs)) -cat <<C +	print('''\  /* This file is autogenerated by gen/string/gbrk; DO NOT EDIT. 
*/  #ifndef MLIB_UNICODE__GBRK_H  #define MLIB_UNICODE__GBRK_H -/* clang-format off */ +#include <stdint.h> +#include "_attrs.h"  #include "_rune.h" -typedef enum { -	GBP_OTHER = 0, - -	GBP_CTRL = 1 << 0, /* Control */ -	GBP_EXT  = 1 << 1, /* Extend */ -	GBP_PIC  = 1 << 2, /* Extended_Pictographic */ -	GBP_PREP = 1 << 3, /* Prepend */ -	GBP_RI   = 1 << 4, /* Regional_Indicator */ -	GBP_SM   = 1 << 5, /* SpacingMark */ -	GBP_ZWJ  = 1 << 6, /* ZWJ */ - -	GBP_HNGL_L   = 1 <<  7, /* Hangul L */ -	GBP_HNGL_LV  = 1 <<  8, /* Hangul LV */ -	GBP_HNGL_LVT = 1 <<  9, /* Hangul LVT */ -	GBP_HNGL_T   = 1 << 10, /* Hangul T */ -	GBP_HNGL_V   = 1 << 11, /* Hangul V */ - -	GBP_INDC_CNSNT = 1 << 12, /* Indic Consonant */ -	GBP_INDC_EXT   = 1 << 13, /* Indic Extend */ -	GBP_INDC_LNK   = 1 << 14, /* Indic Linker */ -} gbrk_prop; - -static const struct { -	rune lo, hi; -	gbrk_prop val; -} gbrk_prop_tbl[] = { -C - -gawk ' -BEGIN { -	FS = "( *#.*| +; +)" -	map["Control"]               = "CTRL" -	map["Extend"]                = "EXT" -	map["Extended_Pictographic"] = "PIC" -	map["Prepend"]               = "PREP" -	map["Regional_Indicator"]    = "RI" -	map["SpacingMark"]           = "SM" -	map["ZWJ"]                   = "ZWJ" - -	map["L"]   = "HNGL_L" -	map["LV"]  = "HNGL_LV" -	map["LVT"] = "HNGL_LVT" -	map["T"]   = "HNGL_T" -	map["V"]   = "HNGL_V" - -	map["InCB; Consonant"] = "INDC_CNSNT" -	map["InCB; Extend"]    = "INDC_EXT" -	map["InCB; Linker"]    = "INDC_LNK" -} +#define GBRK_PROP_HI(x) ((x) >> 6) +#define GBRK_PROP_LO(x) ((x) & 63) -map[$2] { -	n = split($1, a, /\.\./) -	lo = strtonum("0X" a[1]) -	hi = strtonum("0X" a[n]) - -	for (i = lo; i <= hi; i++) { -		s = "GBP_" map[$2] -		props[i] = props[i] ? 
props[i] " | " s : s -	} -} - -END { -	for (i = 0; i <= 0x10FFFF; i++) { -		if (!props[i]) -			continue -		lo = i -		while (props[lo] == props[i + 1]) -			i++ -		printf "\t{0x%06X, 0x%06X, %s},\n", lo, i, props[lo] -	} -} -' data/GraphemeBreakProperty \ -  data/DerivedCoreProperties \ -  data/emoji-data \ -| sort +enum uprop_gbrk_indc { +	GBRK_INDC_CNSNT = 1, /* Consonant */ +	GBRK_INDC_EXT,       /* Extend */ +	GBRK_INDC_LNK,       /* Linker */ +}; -cat <<C +enum uprop_gbrk { +	GBRK_XX = 0,   /* Other */ +	GBRK_CN,       /* Control */ +	GBRK_EXT_PICT, /* Extended Pictographic */ +	GBRK_EX,       /* Extend */ +	GBRK_HST_L,    /* L */ +	GBRK_HST_LV,   /* LV */ +	GBRK_HST_LVT,  /* LVT */ +	GBRK_PP,       /* Prepend */ +	GBRK_RI,       /* Regional Indicator */ +	GBRK_SM,       /* SpacingMark */ +	GBRK_HST_T,    /* T */ +	GBRK_HST_V,    /* V */ +	GBRK_ZWJ,      /* ZWJ */ +	_GBRK_LO_CNT,  }; -#endif /* !MLIB_UNICODE__GBRK_H */ -C +static_assert(_GBRK_LO_CNT - 1 <= 0b0011'1111, +              "2 bits are required to pack Indic syllables"); +''') + +	print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') +	for i, c in enumerate(Cs): +		print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') +		if i % 16 == 15: +			print() +	print('};') + +	print() + +	ppc = columns(blksize, longest + 1) +	print(f'static constexpr uint8_t stage2[][{blksize}] = {{') +	for c in cs: +		for i in range(blksize // ppc): +			print('\t{' if i == 0 else '\t ', end='') +			for j in range(ppc): +				print(c[i*ppc + j], end='') +				if i < blksize // ppc - 1 or j < ppc - 1: +					print(',', end='') +				if j < ppc - 1: +					print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') +			if i < blksize // ppc - 1: +				print() +		print('},') +	print('};') + +	print() + +	print(f'''\ +[[_mlib_pure, _mlib_inline]] +static void +uprop_get_gbrk(enum uprop_gbrk *x, enum uprop_gbrk_indc *y, rune ch) +{{ +	uint8_t z = stage2[stage1[ch / {blksize}]][ch % {blksize}]; +	
*x = GBRK_PROP_LO(z); +	*y = GBRK_PROP_HI(z); +}} + +#endif /* !MLIB_UNICODE__GBRK_H */''') + +def main() -> None: +	cwd_init() +	xs = parse( +		'data/GraphemeBreakProperty', +		'data/DerivedCoreProperties', +		'data/emoji-data', +	) + +	blksize = -1 +	smallest = math.inf + +	for bs in powers_of_2(): +		if bs > len(xs): +			break +		Cs = [tuple(x) for x in chunks(xs, bs)] +		cs = set(Cs) + +		sz_s1 = len(Cs) * isize(len(cs) - 1) +		sz_s2 = len(cs) * bs +		sz = sz_s1 + sz_s2 + +		if sz < smallest: +			smallest = sz +			blksize = bs + +	Cs = [tuple(x) for x in chunks(xs, blksize)] +	with open('include/unicode/_gbrk.h', 'w') as f: +		sys.stdout = f +		genfile(Cs, blksize) + +	report_size(len(xs), smallest) + +if __name__ == '__main__': +	main() diff --git a/gen/string/lib.py b/gen/string/lib.py new file mode 120000 index 0000000..33218f3 --- /dev/null +++ b/gen/string/lib.py @@ -0,0 +1 @@ +../prop/lib.py
\ No newline at end of file  |