From 3c6ca49b23fd6a2df735e0eaf93432bfef3cba97 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Sat, 4 May 2024 20:48:57 +0200 Subject: More 2-stage lookup tables --- gen/prop/age | 2 + gen/prop/lib.py | 13 +- gen/prop/nfkc_Xcf | 184 +++++++++++++------- gen/prop/nt | 2 + gen/prop/nv | 169 +++++++++++-------- gen/prop/sb | 191 +++++++++++++-------- gen/prop/sc | 492 +++++++++++++++++++++++++++++------------------------- gen/prop/scf | 172 ++++++++++++------- gen/prop/slc | 157 +++++++++++------ gen/prop/stc | 147 +++++++++++----- gen/prop/suc | 157 +++++++++++------ gen/prop/vo | 169 +++++++++++-------- gen/prop/wb | 207 ++++++++++++++--------- 13 files changed, 1265 insertions(+), 797 deletions(-) (limited to 'gen/prop') diff --git a/gen/prop/age b/gen/prop/age index a565021..b36a6f5 100755 --- a/gen/prop/age +++ b/gen/prop/age @@ -32,6 +32,8 @@ def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: print('''\ /* This file is autogenerated by gen/prop/age; DO NOT EDIT. */ +#include + #include "unicode/prop.h" ''') diff --git a/gen/prop/lib.py b/gen/prop/lib.py index 5f4f061..a363374 100644 --- a/gen/prop/lib.py +++ b/gen/prop/lib.py @@ -24,8 +24,7 @@ def columns(n: int, m: int) -> int: y += x - 1 if y <= 80: return x - - raise ValueError + return 1 def isize(x: int) -> int: if x < 256: @@ -54,14 +53,8 @@ def cwd_init() -> None: os.chdir(dir / '..' / '..') def report_size(before: int, after: int) -> None: - def btokib(n: int) -> str: - s = str(round(n / 1024, 2)) - if s.endswith('.0'): - s = s[:-2] - return s + ' KiB' - prefix = sys.argv[0].split('/')[-1].ljust(len('id_compat_math_continue') + 2) change = round((after - before) / before * 100, 1) - before = btokib(before) - after = btokib(after) + before = '%d KiB' % round(before / 1024, 2) + after = ('%.2f KiB' % round(after / 1024, 2)).rjust(len('XXX.XX KiB')) print(f'%s%s%%, %s → %s' % (prefix, change, before, after), file=sys.stderr) diff --git a/gen/prop/nfkc_Xcf b/gen/prop/nfkc_Xcf index ba5a905..58c3abc 100755 --- a/gen/prop/nfkc_Xcf +++ b/gen/prop/nfkc_Xcf @@ -1,65 +1,121 @@ -#!/bin/sh - -set -e -cd "${0%/*}/../.." - -for x in cf scf -do - gawk -v s=$x ' - BEGIN { - FS = "( *; *| *#.*)" - - print "/* This file is autogenerated by gen/prop/nfkc_Xcf; DO NOT EDIT. */" - print "" - print "#include \"_bsearch.h\"" - print "#include \"macros.h\"" - print "#include \"rune.h\"" - print "#include \"unicode/prop.h\"" - print "" - print "#define M(...) ((struct rview)_(__VA_ARGS__))" - print "#define _(...) \\" - print "\t{(const rune []){__VA_ARGS__}, lengthof(((const rune []){__VA_ARGS__}))}" - print "" - print "static const struct {" - print "\trune lo, hi;" - print "\tstruct rview val;" - print "} lookup[] = {" - } - - $2 == "NFKC_" toupper(s) { - n = split($1, xs, /\.\./) - lo = strtonum("0X" xs[1]) - hi = strtonum("0X" xs[n]) - - for (i = lo; i <= hi; i++) - props[i] = $3 ? $3 : "-" - } - - END { - for (i = 0; i <= 0x10FFFD; i++) { - if (!props[i]) +#!/usr/bin/python3 + +import math + +from lib import * + + +longest = 0 +TYPES = ['cf', 'scf'] + +def parse(file: str, _type: str) -> list[bool]: + global longest + + _type = _type.upper() + + xs = ['_(SENTINAL)'] * 0x110000 + with open(file, 'r') as f: + for line in f.readlines(): + if ( + len(line.strip()) == 0 + or line[0] == '#' + or f'NFKC_{_type}' not in line + ): continue - for (lo = i; props[lo] == props[i + 1]; i++) - ; - printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), _(", lo, i - n = split(props[i] == "-" ? "" : props[i], xs, / /) - for (j = 1; j <= n; j++) { - printf "RUNE_C(0x%s)", xs[j] - if (j < n) - printf ", " - } - print ")}," - } - - print "};" - print "" - print "_MLIB_DEFINE_BSEARCH(struct rview, lookup, M(ch))" - print "" - print "struct rview" - print "uprop_get_nfkc_" s "(rune ch)" - print "{" - print "\treturn ch < lookup[0].lo ? M(ch) : mlib_lookup(ch);" - print "}" - } - ' data/DerivedNormalizationProps >lib/unicode/prop/uprop_get_nfkc_${x}.c -done + + parts = line.split(';') + ranges = [int(x, 16) for x in parts[0].strip().split('..')] + prop = ', '.join( + f'0x{x}' for x in parts[2].split('#')[0].strip().split() + ) + prop = f'_({prop})' + longest = max(longest, len(prop)) + + for i in range(ranges[0], ranges[len(ranges) - 1] + 1): + xs[i] = prop + return xs + +def genfile(cs: list[tuple[bool, ...]], blksize: int, _type: str) -> None: + Cs = cs + cs = list(dict.fromkeys(Cs)) + + print('''\ +/* This file is autogenerated by gen/prop/nfkc_Xcf; DO NOT EDIT. */ + +#include "macros.h" +#include "unicode/prop.h" + +#define M(...) ((struct rview)_(__VA_ARGS__)) +#define _(...) \\ + {(const rune []){__VA_ARGS__}, lengthof(((const rune []){__VA_ARGS__}))} + +constexpr rune SENTINAL = 0x110000; +''') + + print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') + for i, c in enumerate(Cs): + print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') + if i % 16 == 15: + print() + print('};') + + print() + + ppc = columns(blksize, longest + 1) + print(f'static const struct rview stage2[][{blksize}] = {{') + for c in cs: + for i in range(blksize // ppc): + print('\t{' if i == 0 else '\t ', end='') + for j in range(ppc): + print(c[i*ppc + j], end='') + if i < blksize // ppc - 1 or j < ppc - 1: + print(',', end='') + if j < ppc - 1: + print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') + if i < blksize // ppc - 1: + print() + print('},') + print('};') + + print() + + print(f'''\ +struct rview +uprop_get_nfkc_{_type}(rune ch) +{{ + struct rview rv = stage2[stage1[ch / {blksize}]][ch % {blksize}]; + return rv.len == 1 && rv.p[0] == SENTINAL ? M(ch) : rv; +}}''') + +def main(_type: str) -> None: + cwd_init() + xs = parse('data/DerivedNormalizationProps', _type) + + blksize = -1 + smallest = math.inf + + for bs in powers_of_2(): + if bs > len(xs): + break + Cs = [tuple(x) for x in chunks(xs, bs)] + cs = set(Cs) + + sz_s1 = len(Cs) * isize(len(cs) - 1) + sz_s2 = len(cs) * bs + sz = sz_s1 + sz_s2 + + if sz < smallest: + smallest = sz + blksize = bs + + Cs = [tuple(x) for x in chunks(xs, blksize)] + with open(f'lib/unicode/prop/uprop_get_nfkc_{_type}.c', 'w') as f: + sys.stdout = f + genfile(Cs, blksize, _type) + + report_size(len(xs), smallest) + +if __name__ == '__main__': + for _type in TYPES: + longest = 0 + main(_type) diff --git a/gen/prop/nt b/gen/prop/nt index 477789c..c799f7f 100755 --- a/gen/prop/nt +++ b/gen/prop/nt @@ -38,6 +38,8 @@ def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: print('''\ /* This file is autogenerated by gen/prop/nt; DO NOT EDIT. */ +#include + #include "unicode/prop.h" ''') diff --git a/gen/prop/nv b/gen/prop/nv index f8c3e31..68cbf0e 100755 --- a/gen/prop/nv +++ b/gen/prop/nv @@ -1,65 +1,104 @@ -#!/bin/sh - -set -e -cd "${0%/*}/../.." -exec >lib/unicode/prop/uprop_get_nv.c - -gawk ' -BEGIN { - FS = "( *#.*| +; +)" - - print "/* This file is autogenerated by gen/prop/nv; DO NOT EDIT. */" - print "" - print "#include " - print "" - print "#include \"_bsearch.h\"" - print "#include \"macros.h\"" - print "#include \"rune.h\"" - print "#include \"unicode/prop.h\"" - print "" -} - -/^[^#]/ { - n = split($1, a, /\.\./) - lo = strtonum("0X" a[1]) - hi = strtonum("0X" a[n]) - - for (i = lo; i <= hi; i++) { - gsub(/^; /, "", $3) - if ($3 ~ /[^.]\//) - sub(/\//, "./", $3) - props[i] = $3 - } -} - -END { - print "static constexpr double lookup_lat1[] = {" - for (i = 0; i < 0x100; i++) { - if (i % 8 == 0) - printf "\t" - printf "%4s,%s", props[i] ? props[i] : "NAN", i % 8 == 7 ? "\n" : " " - } - print "};" - print "" - print "static const struct {" - print "\trune k;" - print "\tdouble v;" - print "} lookup[] = {" - - for (i = 0x100; i <= 0x10FFFF; i++) { - if (!props[i]) - continue - printf "\t{RUNE_C(0x%06X), %s},\n", i, props[i] - } - - print "};" - print "" - print "_MLIB_DEFINE_BSEARCH_KV(double, lookup, NAN)" - print "" - print "double" - print "uprop_get_nv(rune ch)" - print "{" - print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup_kv(ch);" - print "}" -} -' data/DerivedNumericValues +#!/usr/bin/python3 + +import math + +from lib import * + + +longest = 0 + +def parse(file: str) -> list[bool]: + global longest + + xs = ['NAN'] * 0x110000 + with open(file, 'r') as f: + for line in f.readlines(): + if len(line.strip()) == 0 or line[0] == '#': + continue + + parts = line.split(';') + ranges = [int(x, 16) for x in parts[0].strip().split('..')] + prop = parts[3].split('#')[0].strip().replace('/', './') + longest = max(longest, len(prop)) + + for i in range(ranges[0], ranges[len(ranges) - 1] + 1): + xs[i] = prop + return xs + +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: + Cs = cs + cs = list(dict.fromkeys(Cs)) + + print('''\ +/* This file is autogenerated by gen/prop/nv; DO NOT EDIT. */ + +#include +#include + +#include "unicode/prop.h" +''') + + print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') + for i, c in enumerate(Cs): + print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') + if i % 16 == 15: + print() + print('};') + + print() + + ppc = columns(blksize, longest + 1) + print(f'static constexpr double stage2[][{blksize}] = {{') + for c in cs: + for i in range(blksize // ppc): + print('\t{' if i == 0 else '\t ', end='') + for j in range(ppc): + print(c[i*ppc + j], end='') + if i < blksize // ppc - 1 or j < ppc - 1: + print(',', end='') + if j < ppc - 1: + print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') + if i < blksize // ppc - 1: + print() + print('},') + print('};') + + print() + + print(f'''\ +double +uprop_get_nv(rune ch) +{{ + return stage2[stage1[ch / {blksize}]][ch % {blksize}]; +}}''') + +def main() -> None: + cwd_init() + xs = parse('data/DerivedNumericValues') + + blksize = -1 + smallest = math.inf + + for bs in powers_of_2(): + if bs > len(xs): + break + Cs = [tuple(x) for x in chunks(xs, bs)] + cs = set(Cs) + + sz_s1 = len(Cs) * isize(len(cs) - 1) + sz_s2 = len(cs) * bs * 8 + sz = sz_s1 + sz_s2 + + if sz < smallest: + smallest = sz + blksize = bs + + Cs = [tuple(x) for x in chunks(xs, blksize)] + with open('lib/unicode/prop/uprop_get_nv.c', 'w') as f: + sys.stdout = f + genfile(Cs, blksize) + + report_size(len(xs), smallest) + +if __name__ == '__main__': + main() diff --git a/gen/prop/sb b/gen/prop/sb index aff06fd..e40f9a5 100755 --- a/gen/prop/sb +++ b/gen/prop/sb @@ -1,78 +1,121 @@ -#!/bin/sh - -set -e -cd "${0%/*}/../.." -exec >lib/unicode/prop/uprop_get_sb.c - -gawk ' -BEGIN { - FS = " *(; *|#.*)" - - map["ATerm"] = "AT" - map["Close"] = "CL" - map["CR"] = "CR" - map["Extend"] = "EX" - map["Format"] = "FO" - map["LF"] = "LF" - map["Lower"] = "LO" - map["Numeric"] = "NU" - map["OLetter"] = "LE" - map["Other"] = "XX" - map["SContinue"] = "SC" - map["Sep"] = "SE" - map["Sp"] = "SP" - map["STerm"] = "ST" - map["Upper"] = "UP" - - print "/* This file is autogenerated by gen/prop/sb; DO NOT EDIT. */" - print "" - print "#include \"_bsearch.h\"" - print "#include \"macros.h\"" - print "#include \"rune.h\"" - print "#include \"unicode/prop.h\"" - print "" -} +#!/usr/bin/python3 -/^[A-F0-9]/ { - n = split($1, a, /\.\./) - lo = strtonum("0X" a[1]) - hi = strtonum("0X" a[n]) +import math - for (i = lo; i <= hi; i++) - props[i] = "SB_" map[$2] -} +from lib import * -END { - print "static constexpr enum uprop_sb lookup_lat1[] = {" - for (i = 0; i < 0x100; i++) { - if (i % 8 == 0) - printf "\t" - printf "%s%s", (props[i] ? props[i] : "SB_XX") ",", \ - i % 8 == 7 ? "\n" : " " - } - print "};" - print "" - print "static const struct {" - print "\trune lo, hi;" - print "\tenum uprop_sb val;" - print "} lookup[] = {" - - for (i = 0x100; i <= 0x10FFFF; i++) { - if (!props[i]) - continue - for (lo = i; props[lo] == props[i + 1]; i++) - ; - printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[i] - } - - print "};" - print "" - print "_MLIB_DEFINE_BSEARCH(enum uprop_sb, lookup, SB_XX)" - print "" - print "enum uprop_sb" - print "uprop_get_sb(rune ch)" - print "{" - print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup(ch);" - print "}" + +MAP = { + 'ATerm': 'AT', + 'Close': 'CL', + 'CR': 'CR', + 'Extend': 'EX', + 'Format': 'FO', + 'LF': 'LF', + 'Lower': 'LO', + 'Numeric': 'NU', + 'OLetter': 'LE', + 'Other': 'XX', + 'SContinue': 'SC', + 'Sep': 'SE', + 'Sp': 'SP', + 'STerm': 'ST', + 'Upper': 'UP', } -' data/SentenceBreakProperty | sed 's/\s*$//' + +longest = 0 + +def parse(file: str) -> list[bool]: + global longest + + xs = ['SB_XX'] * 0x110000 + with open(file, 'r') as f: + for line in f.readlines(): + if len(line.strip()) == 0 or line[0] == '#': + continue + + parts = line.split(';') + ranges = [int(x, 16) for x in parts[0].strip().split('..')] + prop = 'SB_' + MAP[parts[1].split('#')[0].strip()] + longest = max(longest, len(prop)) + + for i in range(ranges[0], ranges[len(ranges) - 1] + 1): + xs[i] = prop + return xs + +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: + Cs = cs + cs = list(dict.fromkeys(Cs)) + + print('''\ +/* This file is autogenerated by gen/prop/sb; DO NOT EDIT. */ + +#include + +#include "unicode/prop.h" +''') + + print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') + for i, c in enumerate(Cs): + print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') + if i % 16 == 15: + print() + print('};') + + print() + + ppc = columns(blksize, longest + 1) + print(f'static constexpr enum uprop_sb stage2[][{blksize}] = {{') + for c in cs: + for i in range(blksize // ppc): + print('\t{' if i == 0 else '\t ', end='') + for j in range(ppc): + print(c[i*ppc + j], end='') + if i < blksize // ppc - 1 or j < ppc - 1: + print(',', end='') + if j < ppc - 1: + print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') + if i < blksize // ppc - 1: + print() + print('},') + print('};') + + print() + + print(f'''\ +enum uprop_sb +uprop_get_sb(rune ch) +{{ + return stage2[stage1[ch / {blksize}]][ch % {blksize}]; +}}''') + +def main() -> None: + cwd_init() + xs = parse('data/SentenceBreakProperty') + + blksize = -1 + smallest = math.inf + + for bs in powers_of_2(): + if bs > len(xs): + break + Cs = [tuple(x) for x in chunks(xs, bs)] + cs = set(Cs) + + sz_s1 = len(Cs) * isize(len(cs) - 1) + sz_s2 = len(cs) * bs + sz = sz_s1 + sz_s2 + + if sz < smallest: + smallest = sz + blksize = bs + + Cs = [tuple(x) for x in chunks(xs, blksize)] + with open('lib/unicode/prop/uprop_get_sb.c', 'w') as f: + sys.stdout = f + genfile(Cs, blksize) + + report_size(len(xs), smallest) + +if __name__ == '__main__': + main() diff --git a/gen/prop/sc b/gen/prop/sc index 7eb219b..af8c316 100755 --- a/gen/prop/sc +++ b/gen/prop/sc @@ -1,230 +1,270 @@ -#!/bin/sh - -set -e -cd "${0%/*}/../.." -exec >lib/unicode/prop/uprop_get_sc.c - -gawk ' -BEGIN { - FS = " *(; *|#.*)" - - map["Adlam"] = "ADLM" - map["Caucasian_Albanian"] = "AGHB" - map["Ahom"] = "AHOM" - map["Arabic"] = "ARAB" - map["Imperial_Aramaic"] = "ARMI" - map["Armenian"] = "ARMN" - map["Avestan"] = "AVST" - map["Balinese"] = "BALI" - map["Bamum"] = "BAMU" - map["Bassa_Vah"] = "BASS" - map["Batak"] = "BATK" - map["Bengali"] = "BENG" - map["Bhaiksuki"] = "BHKS" - map["Bopomofo"] = "BOPO" - map["Brahmi"] = "BRAH" - map["Braille"] = "BRAI" - map["Buginese"] = "BUGI" - map["Buhid"] = "BUHD" - map["Chakma"] = "CAKM" - map["Canadian_Aboriginal"] = "CANS" - map["Carian"] = "CARI" - map["Cham"] = "CHAM" - map["Cherokee"] = "CHER" - map["Chorasmian"] = "CHRS" - map["Coptic"] = "COPT" - map["Cypro_Minoan"] = "CPMN" - map["Cypriot"] = "CPRT" - map["Cyrillic"] = "CYRL" - map["Devanagari"] = "DEVA" - map["Dives_Akuru"] = "DIAK" - map["Dogra"] = "DOGR" - map["Deseret"] = "DSRT" - map["Duployan"] = "DUPL" - map["Egyptian_Hieroglyphs"] = "EGYP" - map["Elbasan"] = "ELBA" - map["Elymaic"] = "ELYM" - map["Ethiopic"] = "ETHI" - map["Georgian"] = "GEOR" - map["Glagolitic"] = "GLAG" - map["Gunjala_Gondi"] = "GONG" - map["Masaram_Gondi"] = "GONM" - map["Gothic"] = "GOTH" - map["Grantha"] = "GRAN" - map["Greek"] = "GREK" - map["Gujarati"] = "GUJR" - map["Gurmukhi"] = "GURU" - map["Hangul"] = "HANG" - map["Han"] = "HANI" - map["Hanunoo"] = "HANO" - map["Hatran"] = "HATR" - map["Hebrew"] = "HEBR" - map["Hiragana"] = "HIRA" - map["Anatolian_Hieroglyphs"] = "HLUW" - map["Pahawh_Hmong"] = "HMNG" - map["Nyiakeng_Puachue_Hmong"] = "HMNP" - map["Katakana_Or_Hiragana"] = "HRKT" - map["Old_Hungarian"] = "HUNG" - map["Old_Italic"] = "ITAL" - map["Javanese"] = "JAVA" - map["Kayah_Li"] = "KALI" - map["Katakana"] = "KANA" - map["Kawi"] = "KAWI" - map["Kharoshthi"] = "KHAR" - map["Khmer"] = "KHMR" - map["Khojki"] = "KHOJ" - map["Khitan_Small_Script"] = "KITS" - map["Kannada"] = "KNDA" - map["Kaithi"] = "KTHI" - map["Tai_Tham"] = "LANA" - map["Lao"] = "LAOO" - map["Latin"] = "LATN" - map["Lepcha"] = "LEPC" - map["Limbu"] = "LIMB" - map["Linear_A"] = "LINA" - map["Linear_B"] = "LINB" - map["Lisu"] = "LISU" - map["Lycian"] = "LYCI" - map["Lydian"] = "LYDI" - map["Mahajani"] = "MAHJ" - map["Makasar"] = "MAKA" - map["Mandaic"] = "MAND" - map["Manichaean"] = "MANI" - map["Marchen"] = "MARC" - map["Medefaidrin"] = "MEDF" - map["Mende_Kikakui"] = "MEND" - map["Meroitic_Cursive"] = "MERC" - map["Meroitic_Hieroglyphs"] = "MERO" - map["Malayalam"] = "MLYM" - map["Modi"] = "MODI" - map["Mongolian"] = "MONG" - map["Mro"] = "MROO" - map["Meetei_Mayek"] = "MTEI" - map["Multani"] = "MULT" - map["Myanmar"] = "MYMR" - map["Nag_Mundari"] = "NAGM" - map["Nandinagari"] = "NAND" - map["Old_North_Arabian"] = "NARB" - map["Nabataean"] = "NBAT" - map["Newa"] = "NEWA" - map["Nko"] = "NKOO" - map["Nushu"] = "NSHU" - map["Ogham"] = "OGAM" - map["Ol_Chiki"] = "OLCK" - map["Old_Turkic"] = "ORKH" - map["Oriya"] = "ORYA" - map["Osage"] = "OSGE" - map["Osmanya"] = "OSMA" - map["Old_Uyghur"] = "OUGR" - map["Palmyrene"] = "PALM" - map["Pau_Cin_Hau"] = "PAUC" - map["Old_Permic"] = "PERM" - map["Phags_Pa"] = "PHAG" - map["Inscriptional_Pahlavi"] = "PHLI" - map["Psalter_Pahlavi"] = "PHLP" - map["Phoenician"] = "PHNX" - map["Miao"] = "PLRD" - map["Inscriptional_Parthian"] = "PRTI" - map["Rejang"] = "RJNG" - map["Hanifi_Rohingya"] = "ROHG" - map["Runic"] = "RUNR" - map["Samaritan"] = "SAMR" - map["Old_South_Arabian"] = "SARB" - map["Saurashtra"] = "SAUR" - map["SignWriting"] = "SGNW" - map["Shavian"] = "SHAW" - map["Sharada"] = "SHRD" - map["Siddham"] = "SIDD" - map["Khudawadi"] = "SIND" - map["Sinhala"] = "SINH" - map["Sogdian"] = "SOGD" - map["Old_Sogdian"] = "SOGO" - map["Sora_Sompeng"] = "SORA" - map["Soyombo"] = "SOYO" - map["Sundanese"] = "SUND" - map["Syloti_Nagri"] = "SYLO" - map["Syriac"] = "SYRC" - map["Tagbanwa"] = "TAGB" - map["Takri"] = "TAKR" - map["Tai_Le"] = "TALE" - map["New_Tai_Lue"] = "TALU" - map["Tamil"] = "TAML" - map["Tangut"] = "TANG" - map["Tai_Viet"] = "TAVT" - map["Telugu"] = "TELU" - map["Tifinagh"] = "TFNG" - map["Tagalog"] = "TGLG" - map["Thaana"] = "THAA" - map["Thai"] = "THAI" - map["Tibetan"] = "TIBT" - map["Tirhuta"] = "TIRH" - map["Tangsa"] = "TNSA" - map["Toto"] = "TOTO" - map["Ugaritic"] = "UGAR" - map["Vai"] = "VAII" - map["Vithkuqi"] = "VITH" - map["Warang_Citi"] = "WARA" - map["Wancho"] = "WCHO" - map["Old_Persian"] = "XPEO" - map["Cuneiform"] = "XSUX" - map["Yezidi"] = "YEZI" - map["Yi"] = "YIII" - map["Zanabazar_Square"] = "ZANB" - map["Inherited"] = "ZINH" - map["Common"] = "ZYYY" - - print "/* This file is autogenerated by gen/prop/sc; DO NOT EDIT. */" - print "" - print "#include \"_bsearch.h\"" - print "#include \"macros.h\"" - print "#include \"rune.h\"" - print "#include \"unicode/prop.h\"" - print "" -} +#!/usr/bin/python3 -/^[^#]/ { - n = split($1, a, /\.\./) - lo = strtonum("0X" a[1]) - hi = strtonum("0X" a[n]) +import math - for (i = lo; i <= hi; i++) { - gsub(/^; /, "", $2) - props[i] = "SC_" map[$2] - } -} +from lib import * -END { - print "static constexpr enum uprop_sc lookup_lat1[] = {" - for (i = 0; i < 0x100; i++) { - if (i % 8 == 0) - printf "\t" - printf "%-7s,%s", props[i] ? props[i] : 0, i % 8 == 7 ? "\n" : " " - } - print "};" - print "" - - print "static const struct {" - print "\trune lo, hi;" - print "\tenum uprop_sc val;" - print "} lookup[] = {" - - for (i = 0x100; i <= 0x10FFFF; i++) { - if (!props[i]) - continue - lo = i - while (props[lo] == props[i + 1]) - i++ - printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[i] - } - - print "};" - print "" - print "_MLIB_DEFINE_BSEARCH(enum uprop_sc, lookup, SC_ZZZZ)" - print "" - print "enum uprop_sc" - print "uprop_get_sc(rune ch)" - print "{" - print "\treturn ch <= lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup(ch);" - print "}" + +MAP = { + 'Adlam': 'ADLM', + 'Caucasian_Albanian': 'AGHB', + 'Ahom': 'AHOM', + 'Arabic': 'ARAB', + 'Imperial_Aramaic': 'ARMI', + 'Armenian': 'ARMN', + 'Avestan': 'AVST', + 'Balinese': 'BALI', + 'Bamum': 'BAMU', + 'Bassa_Vah': 'BASS', + 'Batak': 'BATK', + 'Bengali': 'BENG', + 'Bhaiksuki': 'BHKS', + 'Bopomofo': 'BOPO', + 'Brahmi': 'BRAH', + 'Braille': 'BRAI', + 'Buginese': 'BUGI', + 'Buhid': 'BUHD', + 'Chakma': 'CAKM', + 'Canadian_Aboriginal': 'CANS', + 'Carian': 'CARI', + 'Cham': 'CHAM', + 'Cherokee': 'CHER', + 'Chorasmian': 'CHRS', + 'Coptic': 'COPT', + 'Cypro_Minoan': 'CPMN', + 'Cypriot': 'CPRT', + 'Cyrillic': 'CYRL', + 'Devanagari': 'DEVA', + 'Dives_Akuru': 'DIAK', + 'Dogra': 'DOGR', + 'Deseret': 'DSRT', + 'Duployan': 'DUPL', + 'Egyptian_Hieroglyphs': 'EGYP', + 'Elbasan': 'ELBA', + 'Elymaic': 'ELYM', + 'Ethiopic': 'ETHI', + 'Georgian': 'GEOR', + 'Glagolitic': 'GLAG', + 'Gunjala_Gondi': 'GONG', + 'Masaram_Gondi': 'GONM', + 'Gothic': 'GOTH', + 'Grantha': 'GRAN', + 'Greek': 'GREK', + 'Gujarati': 'GUJR', + 'Gurmukhi': 'GURU', + 'Hangul': 'HANG', + 'Han': 'HANI', + 'Hanunoo': 'HANO', + 'Hatran': 'HATR', + 'Hebrew': 'HEBR', + 'Hiragana': 'HIRA', + 'Anatolian_Hieroglyphs': 'HLUW', + 'Pahawh_Hmong': 'HMNG', + 'Nyiakeng_Puachue_Hmong': 'HMNP', + 'Katakana_Or_Hiragana': 'HRKT', + 'Old_Hungarian': 'HUNG', + 'Old_Italic': 'ITAL', + 'Javanese': 'JAVA', + 'Kayah_Li': 'KALI', + 'Katakana': 'KANA', + 'Kawi': 'KAWI', + 'Kharoshthi': 'KHAR', + 'Khmer': 'KHMR', + 'Khojki': 'KHOJ', + 'Khitan_Small_Script': 'KITS', + 'Kannada': 'KNDA', + 'Kaithi': 'KTHI', + 'Tai_Tham': 'LANA', + 'Lao': 'LAOO', + 'Latin': 'LATN', + 'Lepcha': 'LEPC', + 'Limbu': 'LIMB', + 'Linear_A': 'LINA', + 'Linear_B': 'LINB', + 'Lisu': 'LISU', + 'Lycian': 'LYCI', + 'Lydian': 'LYDI', + 'Mahajani': 'MAHJ', + 'Makasar': 'MAKA', + 'Mandaic': 'MAND', + 'Manichaean': 'MANI', + 'Marchen': 'MARC', + 'Medefaidrin': 'MEDF', + 'Mende_Kikakui': 'MEND', + 'Meroitic_Cursive': 'MERC', + 'Meroitic_Hieroglyphs': 'MERO', + 'Malayalam': 'MLYM', + 'Modi': 'MODI', + 'Mongolian': 'MONG', + 'Mro': 'MROO', + 'Meetei_Mayek': 'MTEI', + 'Multani': 'MULT', + 'Myanmar': 'MYMR', + 'Nag_Mundari': 'NAGM', + 'Nandinagari': 'NAND', + 'Old_North_Arabian': 'NARB', + 'Nabataean': 'NBAT', + 'Newa': 'NEWA', + 'Nko': 'NKOO', + 'Nushu': 'NSHU', + 'Ogham': 'OGAM', + 'Ol_Chiki': 'OLCK', + 'Old_Turkic': 'ORKH', + 'Oriya': 'ORYA', + 'Osage': 'OSGE', + 'Osmanya': 'OSMA', + 'Old_Uyghur': 'OUGR', + 'Palmyrene': 'PALM', + 'Pau_Cin_Hau': 'PAUC', + 'Old_Permic': 'PERM', + 'Phags_Pa': 'PHAG', + 'Inscriptional_Pahlavi': 'PHLI', + 'Psalter_Pahlavi': 'PHLP', + 'Phoenician': 'PHNX', + 'Miao': 'PLRD', + 'Inscriptional_Parthian': 'PRTI', + 'Rejang': 'RJNG', + 'Hanifi_Rohingya': 'ROHG', + 'Runic': 'RUNR', + 'Samaritan': 'SAMR', + 'Old_South_Arabian': 'SARB', + 'Saurashtra': 'SAUR', + 'SignWriting': 'SGNW', + 'Shavian': 'SHAW', + 'Sharada': 'SHRD', + 'Siddham': 'SIDD', + 'Khudawadi': 'SIND', + 'Sinhala': 'SINH', + 'Sogdian': 'SOGD', + 'Old_Sogdian': 'SOGO', + 'Sora_Sompeng': 'SORA', + 'Soyombo': 'SOYO', + 'Sundanese': 'SUND', + 'Syloti_Nagri': 'SYLO', + 'Syriac': 'SYRC', + 'Tagbanwa': 'TAGB', + 'Takri': 'TAKR', + 'Tai_Le': 'TALE', + 'New_Tai_Lue': 'TALU', + 'Tamil': 'TAML', + 'Tangut': 'TANG', + 'Tai_Viet': 'TAVT', + 'Telugu': 'TELU', + 'Tifinagh': 'TFNG', + 'Tagalog': 'TGLG', + 'Thaana': 'THAA', + 'Thai': 'THAI', + 'Tibetan': 'TIBT', + 'Tirhuta': 'TIRH', + 'Tangsa': 'TNSA', + 'Toto': 'TOTO', + 'Ugaritic': 'UGAR', + 'Vai': 'VAII', + 'Vithkuqi': 'VITH', + 'Warang_Citi': 'WARA', + 'Wancho': 'WCHO', + 'Old_Persian': 'XPEO', + 'Cuneiform': 'XSUX', + 'Yezidi': 'YEZI', + 'Yi': 'YIII', + 'Zanabazar_Square': 'ZANB', + 'Inherited': 'ZINH', + 'Common': 'ZYYY', } -' data/Scripts | sed 's/\s*$//' + +longest = 0 + +def parse(file: str) -> list[bool]: + global longest + + xs = ['SC_ZZZZ'] * 0x110000 + with open(file, 'r') as f: + for line in f.readlines(): + if len(line.strip()) == 0 or line[0] == '#': + continue + + parts = line.split(';') + ranges = [int(x, 16) for x in parts[0].strip().split('..')] + prop = 'SC_' + MAP[parts[1].split('#')[0].strip()] + longest = max(longest, len(prop)) + + for i in range(ranges[0], ranges[len(ranges) - 1] + 1): + xs[i] = prop + return xs + +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: + Cs = cs + cs = list(dict.fromkeys(Cs)) + + print('''\ +/* This file is autogenerated by gen/prop/sc; DO NOT EDIT. */ + +#include + +#include "unicode/prop.h" +''') + + print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') + for i, c in enumerate(Cs): + print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') + if i % 16 == 15: + print() + print('};') + + print() + + ppc = columns(blksize, longest + 1) + print(f'static constexpr enum uprop_sc stage2[][{blksize}] = {{') + for c in cs: + for i in range(blksize // ppc): + print('\t{' if i == 0 else '\t ', end='') + for j in range(ppc): + print(c[i*ppc + j], end='') + if i < blksize // ppc - 1 or j < ppc - 1: + print(',', end='') + if j < ppc - 1: + print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') + if i < blksize // ppc - 1: + print() + print('},') + print('};') + + print() + + print(f'''\ +enum uprop_sc +uprop_get_sc(rune ch) +{{ + return stage2[stage1[ch / {blksize}]][ch % {blksize}]; +}}''') + +def main() -> None: + cwd_init() + xs = parse('data/Scripts') + + blksize = -1 + smallest = math.inf + + for bs in powers_of_2(): + if bs > len(xs): + break + Cs = [tuple(x) for x in chunks(xs, bs)] + cs = set(Cs) + + sz_s1 = len(Cs) * isize(len(cs) - 1) + sz_s2 = len(cs) * bs + sz = sz_s1 + sz_s2 + + if sz < smallest: + smallest = sz + blksize = bs + + Cs = [tuple(x) for x in chunks(xs, blksize)] + with open('lib/unicode/prop/uprop_get_sc.c', 'w') as f: + sys.stdout = f + genfile(Cs, blksize) + + report_size(len(xs), smallest) + +if __name__ == '__main__': + main() diff --git a/gen/prop/scf b/gen/prop/scf index 8dcfcec..47cfc0a 100755 --- a/gen/prop/scf +++ b/gen/prop/scf @@ -1,59 +1,113 @@ -#!/bin/sh - -set -e -cd "${0%/*}/../.." -exec >lib/unicode/prop/uprop_get_scf.c - -gawk ' -BEGIN { - FS = "[ ;]+" - - print "/* This file is autogenerated by gen/prop/scf; DO NOT EDIT. */" - print "" - print "#include \"_bsearch.h\"" - print "#include \"macros.h\"" - print "#include \"rune.h\"" - print "#include \"unicode/prop.h\"" - print "" -} - -$0 !~ /^#/ && $2 ~ /[CS]/ { - map[strtonum("0X" $1)] = strtonum("0X" $3) -} - -END { - print "static constexpr rune lookup_lat1[] = {" - for (i = 0; i < 0x100; i++) { - if (i % 8 == 0) - printf "\t" - printf "0x%03X,%s", map[i] ? map[i] : i, i % 8 == 7 ? "\n" : " " - } - print "};" - print "" - print "static const struct {" - print "\trune k, v;" - print "} lookup[] = {" - - for (i = 0x100; i <= 0x10FFFF; i++) { - if (!map[i]) - continue - printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X)},\n", i, map[i] - } - - print "};" - print "" - print "_MLIB_DEFINE_BSEARCH_KV(rune, lookup, ch)" - print "" - print "rune" - print "uprop_get_scf(rune ch, bool az_or_tr)" - print "{" - print "\tif (az_or_tr) {" - print "\t\tif (ch == \x27I\x27)" - print "\t\t\treturn U\x27ı\x27;" - print "\t\tif (ch == U\x27İ\x27)" - print "\t\t\treturn \x27i\x27;" - print "\t}" - print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup_kv(ch);" - print "}" -} -' data/CaseFolding +#!/usr/bin/python3 + +import math + +from lib import * + + +longest = 0 + +def parse(file: str) -> list[bool]: + global longest + + xs = ['0'] * 0x110000 + with open(file, 'r') as f: + for line in f.readlines(): + if len(line.strip()) == 0 or line[0] == '#': + continue + + parts = line.split(';') + if parts[1].strip() not in {'C', 'S'}: + continue + ranges = [int(x, 16) for x in parts[0].strip().split('..')] + prop = f'RUNE_C(0x{parts[2].split('#')[0].strip()})' + longest = max(longest, len(prop)) + + for i in range(ranges[0], ranges[len(ranges) - 1] + 1): + xs[i] = prop + return xs + +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: + Cs = cs + cs = list(dict.fromkeys(Cs)) + + print('''\ +/* This file is autogenerated by gen/prop/scf; DO NOT EDIT. */ + +#include + +#include "rune.h" +#include "unicode/prop.h" +''') + + print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') + for i, c in enumerate(Cs): + print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') + if i % 16 == 15: + print() + print('};') + + print() + + ppc = columns(blksize, longest + 1) + print(f'static constexpr rune stage2[][{blksize}] = {{') + for c in cs: + for i in range(blksize // ppc): + print('\t{' if i == 0 else '\t ', end='') + for j in range(ppc): + print(c[i*ppc + j], end='') + if i < blksize // ppc - 1 or j < ppc - 1: + print(',', end='') + if j < ppc - 1: + print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') + if i < blksize // ppc - 1: + print() + print('},') + print('};') + + print() + + print(f'''\ +rune +uprop_get_scf(rune ch, bool az_tr) +{{ + if (az_tr) {{ + if (ch == 'I') + return U'ı'; + if (ch == U'İ') + return 'i'; + }} + rune hc = stage2[stage1[ch / {blksize}]][ch % {blksize}]; + return hc == 0 ? ch : hc; +}}''') + +def main() -> None: + cwd_init() + xs = parse('data/CaseFolding') + + blksize = -1 + smallest = math.inf + + for bs in powers_of_2(): + if bs > len(xs): + break + Cs = [tuple(x) for x in chunks(xs, bs)] + cs = set(Cs) + + sz_s1 = len(Cs) * isize(len(cs) - 1) + sz_s2 = len(cs) * bs * 4 + sz = sz_s1 + sz_s2 + + if sz < smallest: + smallest = sz + blksize = bs + + Cs = [tuple(x) for x in chunks(xs, blksize)] + with open('lib/unicode/prop/uprop_get_scf.c', 'w') as f: + sys.stdout = f + genfile(Cs, blksize) + + report_size(len(xs), smallest) + +if __name__ == '__main__': + main() diff --git a/gen/prop/slc b/gen/prop/slc index 8142be8..3bb08b8 100755 --- a/gen/prop/slc +++ b/gen/prop/slc @@ -1,53 +1,104 @@ -#!/bin/sh - -set -e -cd "${0%/*}/../.." -exec >lib/unicode/prop/uprop_get_slc.c - -gawk ' -BEGIN { - FS = ";" - - print "/* This file is autogenerated by gen/prop/slc; DO NOT EDIT. */" - print "" - print "#include \"_bsearch.h\"" - print "#include \"macros.h\"" - print "#include \"rune.h\"" - print "#include \"unicode/prop.h\"" - print "" -} - -length($14) > 0 { - map[strtonum("0X" $1)] = strtonum("0X" $14) -} - -END { - print "static constexpr rune lookup_lat1[] = {" - for (i = 0; i < 0x100; i++) { - if (i % 8 == 0) - printf "\t" - printf "0x%03X,%s", map[i] ? map[i] : i, i % 8 == 7 ? "\n" : " " - } - print "};" - print "" - print "static const struct {" - print "\trune k, v;" - print "} lookup[] = {" - - for (i = 0x100; i <= 0x10FFFF; i++) { - if (!map[i]) - continue - printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X)},\n", i, map[i] - } - - print "};" - print "" - print "_MLIB_DEFINE_BSEARCH_KV(rune, lookup, ch)" - print "" - print "rune" - print "uprop_get_slc(rune ch)" - print "{" - print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup_kv(ch);" - print "}" -} -' data/UnicodeData +#!/usr/bin/python3 + +import math + +from lib import * + + +longest = 0 + +def parse(file: str) -> list[bool]: + global longest + + xs = ['0'] * 0x110000 + with open(file, 'r') as f: + for line in f.readlines(): + if len(line.strip()) == 0 or line[0] == '#': + continue + + parts = line.split(';') + if parts[13] == '': + continue + n = int(parts[0], 16) + xs[n] = f'RUNE_C(0x{parts[13]})' + longest = max(longest, len(xs[n])) + return xs + +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: + Cs = cs + cs = list(dict.fromkeys(Cs)) + + print('''\ +/* This file is autogenerated by gen/prop/slc; DO NOT EDIT. */ + +#include + +#include "rune.h" +#include "unicode/prop.h" +''') + + print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') + for i, c in enumerate(Cs): + print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') + if i % 16 == 15: + print() + print('};') + + print() + + ppc = columns(blksize, longest + 1) + print(f'static constexpr rune stage2[][{blksize}] = {{') + for c in cs: + for i in range(blksize // ppc): + print('\t{' if i == 0 else '\t ', end='') + for j in range(ppc): + print(c[i*ppc + j], end='') + if i < blksize // ppc - 1 or j < ppc - 1: + print(',', end='') + if j < ppc - 1: + print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') + if i < blksize // ppc - 1: + print() + print('},') + print('};') + + print() + + print(f'''\ +rune +uprop_get_slc(rune ch) +{{ + rune hc = stage2[stage1[ch / {blksize}]][ch % {blksize}]; + return hc == 0 ? ch : hc; +}}''') + +def main() -> None: + cwd_init() + xs = parse('data/UnicodeData') + + blksize = -1 + smallest = math.inf + + for bs in powers_of_2(): + if bs > len(xs): + break + Cs = [tuple(x) for x in chunks(xs, bs)] + cs = set(Cs) + + sz_s1 = len(Cs) * isize(len(cs) - 1) + sz_s2 = len(cs) * bs * 4 + sz = sz_s1 + sz_s2 + + if sz < smallest: + smallest = sz + blksize = bs + + Cs = [tuple(x) for x in chunks(xs, blksize)] + with open('lib/unicode/prop/uprop_get_slc.c', 'w') as f: + sys.stdout = f + genfile(Cs, blksize) + + report_size(len(xs), smallest) + +if __name__ == '__main__': + main() diff --git a/gen/prop/stc b/gen/prop/stc index eb65d07..3df2004 100755 --- a/gen/prop/stc +++ b/gen/prop/stc @@ -1,45 +1,102 @@ -#!/bin/sh - -set -e -cd "${0%/*}/../.." -exec >lib/unicode/prop/uprop_get_stc.c - -gawk ' -BEGIN { - FS = ";" - - print "/* This file is autogenerated by gen/prop/stc; DO NOT EDIT. */" - print "" - print "#include \"_bsearch.h\"" - print "#include \"macros.h\"" - print "#include \"rune.h\"" - print "#include \"unicode/prop.h\"" - print "" -} - -length($15) > 0 && $13 != $15 && $1 != $15 { - map[strtonum("0X" $1)] = strtonum("0X" $15) -} - -END { - print "static const struct {" - print "\trune k, v;" - print "} lookup[] = {" - - for (i = 0x100; i <= 0x10FFFF; i++) { - if (!map[i]) - continue - printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X)},\n", i, map[i] - } - - print "};" - print "" - print "_MLIB_DEFINE_BSEARCH_KV(rune, lookup, uprop_get_suc(ch))" - print "" - print "rune" - print "uprop_get_stc(rune ch)" - print "{" - print "\treturn mlib_lookup_kv(ch);" - print "}" -} -' data/UnicodeData +#!/usr/bin/python3 + +import math + +from lib import * + + +longest = 0 + +def parse(file: str) -> list[bool]: + global longest + + xs = ['0'] * 0x110000 + with open(file, 'r') as f: + for line in f.readlines(): + if len(line.strip()) == 0 or line[0] == '#': + continue + + parts = line.split(';') + parts[14] = parts[14].strip() + if ( + parts[14] == '' or + parts[12] == parts[14] or + parts[00] == parts[14] + ): + continue + n = int(parts[0], 16) + xs[n] = f'RUNE_C(0x{parts[14]})' + longest = max(longest, len(xs[n])) + return xs + +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: + Cs = cs + cs = list(dict.fromkeys(Cs)) + + assert len(cs) == 2, f'{len(cs)=}, need a 2-stage lookup' + + print('''\ +/* This file is autogenerated by gen/prop/stc; DO NOT EDIT. */ + +#include + +#include "rune.h" +#include "unicode/prop.h" +''') + + ppc = columns(blksize, longest + 1) + print(f'static constexpr rune lookup[][{blksize}] = {{') + for c in cs: + for i in range(blksize // ppc): + print('\t{' if i == 0 else '\t ', end='') + for j in range(ppc): + print(c[i*ppc + j], end='') + if i < blksize // ppc - 1 or j < ppc - 1: + print(',', end='') + if j < ppc - 1: + print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') + if i < blksize // ppc - 1: + print() + print('},') + print('};') + + print() + + print(f'''\ +rune +uprop_get_stc(rune ch) +{{ + rune hc = lookup[ch / {blksize} != 0][ch % {blksize}]; + return hc == 0 ? uprop_get_suc(ch) : hc; +}}''') + +def main() -> None: + cwd_init() + xs = parse('data/UnicodeData') + + blksize = -1 + smallest = math.inf + + for bs in powers_of_2(): + if bs > len(xs): + break + Cs = [tuple(x) for x in chunks(xs, bs)] + cs = set(Cs) + + sz_s1 = len(Cs) * isize(len(cs) - 1) + sz_s2 = len(cs) * bs * 4 + sz = sz_s1 + sz_s2 + + if sz < smallest: + smallest = sz + blksize = bs + + Cs = [tuple(x) for x in chunks(xs, blksize)] + with open('lib/unicode/prop/uprop_get_stc.c', 'w') as f: + sys.stdout = f + genfile(Cs, blksize) + + report_size(len(xs), smallest) + +if __name__ == '__main__': + main() diff --git a/gen/prop/suc b/gen/prop/suc index 9448dbc..84174b0 100755 --- a/gen/prop/suc +++ b/gen/prop/suc @@ -1,53 +1,104 @@ -#!/bin/sh - -set -e -cd "${0%/*}/../.." -exec >lib/unicode/prop/uprop_get_suc.c - -gawk ' -BEGIN { - FS = ";" - - print "/* This file is autogenerated by gen/prop/suc; DO NOT EDIT. */" - print "" - print "#include \"_bsearch.h\"" - print "#include \"macros.h\"" - print "#include \"rune.h\"" - print "#include \"unicode/prop.h\"" - print "" -} - -length($13) > 0 { - map[strtonum("0X" $1)] = strtonum("0X" $13) -} - -END { - print "static constexpr rune lookup_lat1[] = {" - for (i = 0; i < 0x100; i++) { - if (i % 8 == 0) - printf "\t" - printf "0x%03X,%s", map[i] ? map[i] : i, i % 8 == 7 ? "\n" : " " - } - print "};" - print "" - print "static const struct {" - print "\trune k, v;" - print "} lookup[] = {" - - for (i = 0x100; i <= 0x10FFFF; i++) { - if (!map[i]) - continue - printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X)},\n", i, map[i] - } - - print "};" - print "" - print "_MLIB_DEFINE_BSEARCH_KV(rune, lookup, ch)" - print "" - print "rune" - print "uprop_get_suc(rune ch)" - print "{" - print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup_kv(ch);" - print "}" -} -' data/UnicodeData +#!/usr/bin/python3 + +import math + +from lib import * + + +longest = 0 + +def parse(file: str) -> list[bool]: + global longest + + xs = ['0'] * 0x110000 + with open(file, 'r') as f: + for line in f.readlines(): + if len(line.strip()) == 0 or line[0] == '#': + continue + + parts = line.split(';') + if parts[12] == '': + continue + n = int(parts[0], 16) + xs[n] = f'RUNE_C(0x{parts[12]})' + longest = max(longest, len(xs[n])) + return xs + +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: + Cs = cs + cs = list(dict.fromkeys(Cs)) + + print('''\ +/* This file is autogenerated by gen/prop/suc; DO NOT EDIT. */ + +#include + +#include "rune.h" +#include "unicode/prop.h" +''') + + print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') + for i, c in enumerate(Cs): + print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') + if i % 16 == 15: + print() + print('};') + + print() + + ppc = columns(blksize, longest + 1) + print(f'static constexpr rune stage2[][{blksize}] = {{') + for c in cs: + for i in range(blksize // ppc): + print('\t{' if i == 0 else '\t ', end='') + for j in range(ppc): + print(c[i*ppc + j], end='') + if i < blksize // ppc - 1 or j < ppc - 1: + print(',', end='') + if j < ppc - 1: + print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') + if i < blksize // ppc - 1: + print() + print('},') + print('};') + + print() + + print(f'''\ +rune +uprop_get_suc(rune ch) +{{ + rune hc = stage2[stage1[ch / {blksize}]][ch % {blksize}]; + return hc == 0 ? ch : hc; +}}''') + +def main() -> None: + cwd_init() + xs = parse('data/UnicodeData') + + blksize = -1 + smallest = math.inf + + for bs in powers_of_2(): + if bs > len(xs): + break + Cs = [tuple(x) for x in chunks(xs, bs)] + cs = set(Cs) + + sz_s1 = len(Cs) * isize(len(cs) - 1) + sz_s2 = len(cs) * bs * 4 + sz = sz_s1 + sz_s2 + + if sz < smallest: + smallest = sz + blksize = bs + + Cs = [tuple(x) for x in chunks(xs, blksize)] + with open('lib/unicode/prop/uprop_get_suc.c', 'w') as f: + sys.stdout = f + genfile(Cs, blksize) + + report_size(len(xs), smallest) + +if __name__ == '__main__': + main() diff --git a/gen/prop/vo b/gen/prop/vo index b390c67..7b94691 100755 --- a/gen/prop/vo +++ b/gen/prop/vo @@ -1,66 +1,103 @@ -#!/bin/sh - -set -e -cd "${0%/*}/../.." -exec >lib/unicode/prop/uprop_get_vo.c - -gawk ' -BEGIN { - FS = "( *#.*| +; +)" - - print "/* This file is autogenerated by gen/prop/vo; DO NOT EDIT. */" - print "" - print "#include \"_bsearch.h\"" - print "#include \"macros.h\"" - print "#include \"rune.h\"" - print "#include \"unicode/prop.h\"" - print "" -} - -/^[^#]/ { - n = split($1, a, /\.\./) - lo = strtonum("0X" a[1]) - hi = strtonum("0X" a[n]) - - for (i = lo; i <= hi; i++) { - gsub(/^; /, "", $2) - props[i] = "VO_" toupper($2) - } -} - -END { - print "static constexpr enum uprop_vo lookup_lat1[] = {" - for (i = 0; i < 0x100; i++) { - if (i % 8 == 0) - printf "\t" - printf "%-5s%s", (props[i] ? props[i] : "VO_R") ",", \ - i % 8 == 7 ? "\n" : " " - } - print "};" - print "" - print "static const struct {" - print "\trune lo, hi;" - print "\tenum uprop_vo val;" - print "} lookup[] = {" - - for (i = 0x100; i <= 0x10FFFF; i++) { - if (!props[i]) - continue - lo = i - while (props[lo] == props[i + 1]) - i++ - if (props[i] != "VO_R") - printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[i] - } - - print "};" - print "" - print "_MLIB_DEFINE_BSEARCH(enum uprop_vo, lookup, VO_R)" - print "" - print "enum uprop_vo" - print "uprop_get_vo(rune ch)" - print "{" - print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup(ch);" - print "}" -} -' data/VerticalOrientation | sed 's/\s*$//' +#!/usr/bin/python3 + +import math + +from lib import * + + +longest = 0 + +def parse(file: str) -> list[bool]: + global longest + + xs = ['VO_R'] * 0x110000 + with open(file, 'r') as f: + for line in f.readlines(): + if len(line.strip()) == 0 or line[0] == '#': + continue + + parts = line.split(';') + ranges = [int(x, 16) for x in parts[0].strip().split('..')] + prop = 'VO_' + parts[1].split('#')[0].strip().upper() + longest = max(longest, len(prop)) + + for i in range(ranges[0], ranges[len(ranges) - 1] + 1): + xs[i] = prop + return xs + +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: + Cs = cs + cs = list(dict.fromkeys(Cs)) + + print('''\ +/* This file is autogenerated by gen/prop/vo; DO NOT EDIT. */ + +#include + +#include "unicode/prop.h" +''') + + print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') + for i, c in enumerate(Cs): + print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') + if i % 16 == 15: + print() + print('};') + + print() + + ppc = columns(blksize, longest + 1) + print(f'static constexpr enum uprop_vo stage2[][{blksize}] = {{') + for c in cs: + for i in range(blksize // ppc): + print('\t{' if i == 0 else '\t ', end='') + for j in range(ppc): + print(c[i*ppc + j], end='') + if i < blksize // ppc - 1 or j < ppc - 1: + print(',', end='') + if j < ppc - 1: + print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') + if i < blksize // ppc - 1: + print() + print('},') + print('};') + + print() + + print(f'''\ +enum uprop_vo +uprop_get_vo(rune ch) +{{ + return stage2[stage1[ch / {blksize}]][ch % {blksize}]; +}}''') + +def main() -> None: + cwd_init() + xs = parse('data/VerticalOrientation') + + blksize = -1 + smallest = math.inf + + for bs in powers_of_2(): + if bs > len(xs): + break + Cs = [tuple(x) for x in chunks(xs, bs)] + cs = set(Cs) + + sz_s1 = len(Cs) * isize(len(cs) - 1) + sz_s2 = len(cs) * bs + sz = sz_s1 + sz_s2 + + if sz < smallest: + smallest = sz + blksize = bs + + Cs = [tuple(x) for x in chunks(xs, blksize)] + with open('lib/unicode/prop/uprop_get_vo.c', 'w') as f: + sys.stdout = f + genfile(Cs, blksize) + + report_size(len(xs), smallest) + +if __name__ == '__main__': + main() diff --git a/gen/prop/wb b/gen/prop/wb index a6b47f2..f6621f5 100755 --- a/gen/prop/wb +++ b/gen/prop/wb @@ -1,86 +1,129 @@ -#!/bin/sh - -set -e -cd "${0%/*}/../.." -exec >lib/unicode/prop/uprop_get_wb.c - -gawk ' -BEGIN { - FS = " *(; *|#.*)" - - map["ALetter"] = "LE" - map["CR"] = "CR" - map["Double_Quote"] = "DQ" - map["E_Base"] = "EB" - map["E_Base_GAZ"] = "EBG" - map["E_Modifier"] = "EM" - map["Extend"] = "EXTEND" - map["ExtendNumLet"] = "EX" - map["Format"] = "FO" - map["Glue_After_Zwj"] = "GAZ" - map["Hebrew_Letter"] = "HL" - map["Katakana"] = "KA" - map["LF"] = "LF" - map["MidLetter"] = "ML" - map["MidNumLet"] = "MB" - map["MidNum"] = "MN" - map["Newline"] = "NL" - map["Numeric"] = "NU" - map["Other"] = "XX" - map["Regional_Indicator"] = "RI" - map["Single_Quote"] = "SQ" - map["WSegSpace"] = "WSEGSPACE" - map["ZWJ"] = "ZWJ" - - print "/* This file is autogenerated by gen/prop/wb; DO NOT EDIT. */" - print "" - print "#include \"_bsearch.h\"" - print "#include \"macros.h\"" - print "#include \"rune.h\"" - print "#include \"unicode/prop.h\"" - print "" -} +#!/usr/bin/python3 -/^[A-F0-9]/ { - n = split($1, a, /\.\./) - lo = strtonum("0X" a[1]) - hi = strtonum("0X" a[n]) +import math - for (i = lo; i <= hi; i++) - props[i] = "WB_" map[$2] -} +from lib import * -END { - print "static constexpr enum uprop_wb lookup_lat1[] = {" - for (i = 0; i < 0x100; i++) { - if (i % 4 == 0) - printf "\t" - printf "%-13s%s", (props[i] ? props[i] : "WB_XX") ",", \ - i % 4 == 3 ? "\n" : " " - } - print "};" - print "" - print "static const struct {" - print "\trune lo, hi;" - print "\tenum uprop_wb val;" - print "} lookup[] = {" - - for (i = 0x100; i <= 0x10FFFF; i++) { - if (!props[i]) - continue - for (lo = i; props[lo] == props[i + 1]; i++) - ; - printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[i] - } - - print "};" - print "" - print "_MLIB_DEFINE_BSEARCH(enum uprop_wb, lookup, WB_XX)" - print "" - print "enum uprop_wb" - print "uprop_get_wb(rune ch)" - print "{" - print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup(ch);" - print "}" + +MAP = { + 'ALetter': 'LE', + 'CR': 'CR', + 'Double_Quote': 'DQ', + 'E_Base': 'EB', + 'E_Base_GAZ': 'EBG', + 'E_Modifier': 'EM', + 'Extend': 'EXTEND', + 'ExtendNumLet': 'EX', + 'Format': 'FO', + 'Glue_After_Zwj': 'GAZ', + 'Hebrew_Letter': 'HL', + 'Katakana': 'KA', + 'LF': 'LF', + 'MidLetter': 'ML', + 'MidNumLet': 'MB', + 'MidNum': 'MN', + 'Newline': 'NL', + 'Numeric': 'NU', + 'Other': 'XX', + 'Regional_Indicator': 'RI', + 'Single_Quote': 'SQ', + 'WSegSpace': 'WSEGSPACE', + 'ZWJ': 'ZWJ', } -' data/WordBreakProperty | sed 's/\s*$//' + +longest = 0 + +def parse(file: str) -> list[bool]: + global longest + + xs = ['WB_XX'] * 0x110000 + with open(file, 'r') as f: + for line in f.readlines(): + if len(line.strip()) == 0 or line[0] == '#': + continue + + parts = line.split(';') + ranges = [int(x, 16) for x in parts[0].strip().split('..')] + prop = 'WB_' + MAP[parts[1].split('#')[0].strip()] + longest = max(longest, len(prop)) + + for i in range(ranges[0], ranges[len(ranges) - 1] + 1): + xs[i] = prop + return xs + +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: + Cs = cs + cs = list(dict.fromkeys(Cs)) + + print('''\ +/* This file is autogenerated by gen/prop/wb; DO NOT EDIT. */ + +#include + +#include "unicode/prop.h" +''') + + print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') + for i, c in enumerate(Cs): + print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') + if i % 16 == 15: + print() + print('};') + + print() + + ppc = columns(blksize, longest + 1) + print(f'static constexpr enum uprop_wb stage2[][{blksize}] = {{') + for c in cs: + for i in range(blksize // ppc): + print('\t{' if i == 0 else '\t ', end='') + for j in range(ppc): + print(c[i*ppc + j], end='') + if i < blksize // ppc - 1 or j < ppc - 1: + print(',', end='') + if j < ppc - 1: + print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') + if i < blksize // ppc - 1: + print() + print('},') + print('};') + + print() + + print(f'''\ +enum uprop_wb +uprop_get_wb(rune ch) +{{ + return stage2[stage1[ch / {blksize}]][ch % {blksize}]; +}}''') + +def main() -> None: + cwd_init() + xs = parse('data/WordBreakProperty') + + blksize = -1 + smallest = math.inf + + for bs in powers_of_2(): + if bs > len(xs): + break + Cs = [tuple(x) for x in chunks(xs, bs)] + cs = set(Cs) + + sz_s1 = len(Cs) * isize(len(cs) - 1) + sz_s2 = len(cs) * bs + sz = sz_s1 + sz_s2 + + if sz < smallest: + smallest = sz + blksize = bs + + Cs = [tuple(x) for x in chunks(xs, blksize)] + with open('lib/unicode/prop/uprop_get_wb.c', 'w') as f: + sys.stdout = f + genfile(Cs, blksize) + + report_size(len(xs), smallest) + +if __name__ == '__main__': + main() -- cgit v1.2.3