aboutsummaryrefslogtreecommitdiff
path: root/gen/prop
diff options
context:
space:
mode:
authorThomas Voss <mail@thomasvoss.com> 2024-05-04 20:48:57 +0200
committerThomas Voss <mail@thomasvoss.com> 2024-05-04 20:48:57 +0200
commit3c6ca49b23fd6a2df735e0eaf93432bfef3cba97 (patch)
tree12a0f4ebb8d774af1b4f6f2a41b2367e99567943 /gen/prop
parent10fe179c3d4b8ca2fe3a09c40aff73d3dfe585ee (diff)
More 2-stage lookup tables
Diffstat (limited to 'gen/prop')
-rwxr-xr-xgen/prop/age2
-rw-r--r--gen/prop/lib.py13
-rwxr-xr-xgen/prop/nfkc_Xcf184
-rwxr-xr-xgen/prop/nt2
-rwxr-xr-xgen/prop/nv169
-rwxr-xr-xgen/prop/sb191
-rwxr-xr-xgen/prop/sc492
-rwxr-xr-xgen/prop/scf172
-rwxr-xr-xgen/prop/slc157
-rwxr-xr-xgen/prop/stc147
-rwxr-xr-xgen/prop/suc157
-rwxr-xr-xgen/prop/vo169
-rwxr-xr-xgen/prop/wb207
13 files changed, 1265 insertions, 797 deletions
diff --git a/gen/prop/age b/gen/prop/age
index a565021..b36a6f5 100755
--- a/gen/prop/age
+++ b/gen/prop/age
@@ -32,6 +32,8 @@ def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None:
print('''\
/* This file is autogenerated by gen/prop/age; DO NOT EDIT. */
+#include <stdint.h>
+
#include "unicode/prop.h"
''')
diff --git a/gen/prop/lib.py b/gen/prop/lib.py
index 5f4f061..a363374 100644
--- a/gen/prop/lib.py
+++ b/gen/prop/lib.py
@@ -24,8 +24,7 @@ def columns(n: int, m: int) -> int:
y += x - 1
if y <= 80:
return x
-
- raise ValueError
+ return 1
def isize(x: int) -> int:
if x < 256:
@@ -54,14 +53,8 @@ def cwd_init() -> None:
os.chdir(dir / '..' / '..')
def report_size(before: int, after: int) -> None:
- def btokib(n: int) -> str:
- s = str(round(n / 1024, 2))
- if s.endswith('.0'):
- s = s[:-2]
- return s + ' KiB'
-
prefix = sys.argv[0].split('/')[-1].ljust(len('id_compat_math_continue') + 2)
change = round((after - before) / before * 100, 1)
- before = btokib(before)
- after = btokib(after)
+ before = '%d KiB' % round(before / 1024, 2)
+ after = ('%.2f KiB' % round(after / 1024, 2)).rjust(len('XXX.XX KiB'))
print(f'%s%s%%, %s → %s' % (prefix, change, before, after), file=sys.stderr)
diff --git a/gen/prop/nfkc_Xcf b/gen/prop/nfkc_Xcf
index ba5a905..58c3abc 100755
--- a/gen/prop/nfkc_Xcf
+++ b/gen/prop/nfkc_Xcf
@@ -1,65 +1,121 @@
-#!/bin/sh
-
-set -e
-cd "${0%/*}/../.."
-
-for x in cf scf
-do
- gawk -v s=$x '
- BEGIN {
- FS = "( *; *| *#.*)"
-
- print "/* This file is autogenerated by gen/prop/nfkc_Xcf; DO NOT EDIT. */"
- print ""
- print "#include \"_bsearch.h\""
- print "#include \"macros.h\""
- print "#include \"rune.h\""
- print "#include \"unicode/prop.h\""
- print ""
- print "#define M(...) ((struct rview)_(__VA_ARGS__))"
- print "#define _(...) \\"
- print "\t{(const rune []){__VA_ARGS__}, lengthof(((const rune []){__VA_ARGS__}))}"
- print ""
- print "static const struct {"
- print "\trune lo, hi;"
- print "\tstruct rview val;"
- print "} lookup[] = {"
- }
-
- $2 == "NFKC_" toupper(s) {
- n = split($1, xs, /\.\./)
- lo = strtonum("0X" xs[1])
- hi = strtonum("0X" xs[n])
-
- for (i = lo; i <= hi; i++)
- props[i] = $3 ? $3 : "-"
- }
-
- END {
- for (i = 0; i <= 0x10FFFD; i++) {
- if (!props[i])
+#!/usr/bin/python3
+
+import math
+
+from lib import *
+
+
+longest = 0
+TYPES = ['cf', 'scf']
+
+def parse(file: str, _type: str) -> list[bool]:
+ global longest
+
+ _type = _type.upper()
+
+ xs = ['_(SENTINAL)'] * 0x110000
+ with open(file, 'r') as f:
+ for line in f.readlines():
+ if (
+ len(line.strip()) == 0
+ or line[0] == '#'
+ or f'NFKC_{_type}' not in line
+ ):
continue
- for (lo = i; props[lo] == props[i + 1]; i++)
- ;
- printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), _(", lo, i
- n = split(props[i] == "-" ? "" : props[i], xs, / /)
- for (j = 1; j <= n; j++) {
- printf "RUNE_C(0x%s)", xs[j]
- if (j < n)
- printf ", "
- }
- print ")},"
- }
-
- print "};"
- print ""
- print "_MLIB_DEFINE_BSEARCH(struct rview, lookup, M(ch))"
- print ""
- print "struct rview"
- print "uprop_get_nfkc_" s "(rune ch)"
- print "{"
- print "\treturn ch < lookup[0].lo ? M(ch) : mlib_lookup(ch);"
- print "}"
- }
- ' data/DerivedNormalizationProps >lib/unicode/prop/uprop_get_nfkc_${x}.c
-done
+
+ parts = line.split(';')
+ ranges = [int(x, 16) for x in parts[0].strip().split('..')]
+ prop = ', '.join(
+ f'0x{x}' for x in parts[2].split('#')[0].strip().split()
+ )
+ prop = f'_({prop})'
+ longest = max(longest, len(prop))
+
+ for i in range(ranges[0], ranges[len(ranges) - 1] + 1):
+ xs[i] = prop
+ return xs
+
+def genfile(cs: list[tuple[bool, ...]], blksize: int, _type: str) -> None:
+ Cs = cs
+ cs = list(dict.fromkeys(Cs))
+
+ print('''\
+/* This file is autogenerated by gen/prop/nfkc_Xcf; DO NOT EDIT. */
+
+#include "macros.h"
+#include "unicode/prop.h"
+
+#define M(...) ((struct rview)_(__VA_ARGS__))
+#define _(...) \\
+ {(const rune []){__VA_ARGS__}, lengthof(((const rune []){__VA_ARGS__}))}
+
+constexpr rune SENTINAL = 0x110000;
+''')
+
+ print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{')
+ for i, c in enumerate(Cs):
+ print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='')
+ if i % 16 == 15:
+ print()
+ print('};')
+
+ print()
+
+ ppc = columns(blksize, longest + 1)
+ print(f'static const struct rview stage2[][{blksize}] = {{')
+ for c in cs:
+ for i in range(blksize // ppc):
+ print('\t{' if i == 0 else '\t ', end='')
+ for j in range(ppc):
+ print(c[i*ppc + j], end='')
+ if i < blksize // ppc - 1 or j < ppc - 1:
+ print(',', end='')
+ if j < ppc - 1:
+ print(' ' * (longest + 1 - len(c[i*ppc + j])), end='')
+ if i < blksize // ppc - 1:
+ print()
+ print('},')
+ print('};')
+
+ print()
+
+ print(f'''\
+struct rview
+uprop_get_nfkc_{_type}(rune ch)
+{{
+ struct rview rv = stage2[stage1[ch / {blksize}]][ch % {blksize}];
+ return rv.len == 1 && rv.p[0] == SENTINAL ? M(ch) : rv;
+}}''')
+
+def main(_type: str) -> None:
+ cwd_init()
+ xs = parse('data/DerivedNormalizationProps', _type)
+
+ blksize = -1
+ smallest = math.inf
+
+ for bs in powers_of_2():
+ if bs > len(xs):
+ break
+ Cs = [tuple(x) for x in chunks(xs, bs)]
+ cs = set(Cs)
+
+ sz_s1 = len(Cs) * isize(len(cs) - 1)
+ sz_s2 = len(cs) * bs
+ sz = sz_s1 + sz_s2
+
+ if sz < smallest:
+ smallest = sz
+ blksize = bs
+
+ Cs = [tuple(x) for x in chunks(xs, blksize)]
+ with open(f'lib/unicode/prop/uprop_get_nfkc_{_type}.c', 'w') as f:
+ sys.stdout = f
+ genfile(Cs, blksize, _type)
+
+ report_size(len(xs), smallest)
+
+if __name__ == '__main__':
+ for _type in TYPES:
+ longest = 0
+ main(_type)
diff --git a/gen/prop/nt b/gen/prop/nt
index 477789c..c799f7f 100755
--- a/gen/prop/nt
+++ b/gen/prop/nt
@@ -38,6 +38,8 @@ def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None:
print('''\
/* This file is autogenerated by gen/prop/nt; DO NOT EDIT. */
+#include <stdint.h>
+
#include "unicode/prop.h"
''')
diff --git a/gen/prop/nv b/gen/prop/nv
index f8c3e31..68cbf0e 100755
--- a/gen/prop/nv
+++ b/gen/prop/nv
@@ -1,65 +1,104 @@
-#!/bin/sh
-
-set -e
-cd "${0%/*}/../.."
-exec >lib/unicode/prop/uprop_get_nv.c
-
-gawk '
-BEGIN {
- FS = "( *#.*| +; +)"
-
- print "/* This file is autogenerated by gen/prop/nv; DO NOT EDIT. */"
- print ""
- print "#include <float.h>"
- print ""
- print "#include \"_bsearch.h\""
- print "#include \"macros.h\""
- print "#include \"rune.h\""
- print "#include \"unicode/prop.h\""
- print ""
-}
-
-/^[^#]/ {
- n = split($1, a, /\.\./)
- lo = strtonum("0X" a[1])
- hi = strtonum("0X" a[n])
-
- for (i = lo; i <= hi; i++) {
- gsub(/^; /, "", $3)
- if ($3 ~ /[^.]\//)
- sub(/\//, "./", $3)
- props[i] = $3
- }
-}
-
-END {
- print "static constexpr double lookup_lat1[] = {"
- for (i = 0; i < 0x100; i++) {
- if (i % 8 == 0)
- printf "\t"
- printf "%4s,%s", props[i] ? props[i] : "NAN", i % 8 == 7 ? "\n" : " "
- }
- print "};"
- print ""
- print "static const struct {"
- print "\trune k;"
- print "\tdouble v;"
- print "} lookup[] = {"
-
- for (i = 0x100; i <= 0x10FFFF; i++) {
- if (!props[i])
- continue
- printf "\t{RUNE_C(0x%06X), %s},\n", i, props[i]
- }
-
- print "};"
- print ""
- print "_MLIB_DEFINE_BSEARCH_KV(double, lookup, NAN)"
- print ""
- print "double"
- print "uprop_get_nv(rune ch)"
- print "{"
- print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup_kv(ch);"
- print "}"
-}
-' data/DerivedNumericValues
+#!/usr/bin/python3
+
+import math
+
+from lib import *
+
+
+longest = 0
+
+def parse(file: str) -> list[bool]:
+ global longest
+
+ xs = ['NAN'] * 0x110000
+ with open(file, 'r') as f:
+ for line in f.readlines():
+ if len(line.strip()) == 0 or line[0] == '#':
+ continue
+
+ parts = line.split(';')
+ ranges = [int(x, 16) for x in parts[0].strip().split('..')]
+ prop = parts[3].split('#')[0].strip().replace('/', './')
+ longest = max(longest, len(prop))
+
+ for i in range(ranges[0], ranges[len(ranges) - 1] + 1):
+ xs[i] = prop
+ return xs
+
+def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None:
+ Cs = cs
+ cs = list(dict.fromkeys(Cs))
+
+ print('''\
+/* This file is autogenerated by gen/prop/nv; DO NOT EDIT. */
+
+#include <float.h>
+#include <stdint.h>
+
+#include "unicode/prop.h"
+''')
+
+ print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{')
+ for i, c in enumerate(Cs):
+ print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='')
+ if i % 16 == 15:
+ print()
+ print('};')
+
+ print()
+
+ ppc = columns(blksize, longest + 1)
+ print(f'static constexpr double stage2[][{blksize}] = {{')
+ for c in cs:
+ for i in range(blksize // ppc):
+ print('\t{' if i == 0 else '\t ', end='')
+ for j in range(ppc):
+ print(c[i*ppc + j], end='')
+ if i < blksize // ppc - 1 or j < ppc - 1:
+ print(',', end='')
+ if j < ppc - 1:
+ print(' ' * (longest + 1 - len(c[i*ppc + j])), end='')
+ if i < blksize // ppc - 1:
+ print()
+ print('},')
+ print('};')
+
+ print()
+
+ print(f'''\
+double
+uprop_get_nv(rune ch)
+{{
+ return stage2[stage1[ch / {blksize}]][ch % {blksize}];
+}}''')
+
+def main() -> None:
+ cwd_init()
+ xs = parse('data/DerivedNumericValues')
+
+ blksize = -1
+ smallest = math.inf
+
+ for bs in powers_of_2():
+ if bs > len(xs):
+ break
+ Cs = [tuple(x) for x in chunks(xs, bs)]
+ cs = set(Cs)
+
+ sz_s1 = len(Cs) * isize(len(cs) - 1)
+ sz_s2 = len(cs) * bs * 8
+ sz = sz_s1 + sz_s2
+
+ if sz < smallest:
+ smallest = sz
+ blksize = bs
+
+ Cs = [tuple(x) for x in chunks(xs, blksize)]
+ with open('lib/unicode/prop/uprop_get_nv.c', 'w') as f:
+ sys.stdout = f
+ genfile(Cs, blksize)
+
+ report_size(len(xs), smallest)
+
+if __name__ == '__main__':
+ main()
diff --git a/gen/prop/sb b/gen/prop/sb
index aff06fd..e40f9a5 100755
--- a/gen/prop/sb
+++ b/gen/prop/sb
@@ -1,78 +1,121 @@
-#!/bin/sh
-
-set -e
-cd "${0%/*}/../.."
-exec >lib/unicode/prop/uprop_get_sb.c
-
-gawk '
-BEGIN {
- FS = " *(; *|#.*)"
-
- map["ATerm"] = "AT"
- map["Close"] = "CL"
- map["CR"] = "CR"
- map["Extend"] = "EX"
- map["Format"] = "FO"
- map["LF"] = "LF"
- map["Lower"] = "LO"
- map["Numeric"] = "NU"
- map["OLetter"] = "LE"
- map["Other"] = "XX"
- map["SContinue"] = "SC"
- map["Sep"] = "SE"
- map["Sp"] = "SP"
- map["STerm"] = "ST"
- map["Upper"] = "UP"
-
- print "/* This file is autogenerated by gen/prop/sb; DO NOT EDIT. */"
- print ""
- print "#include \"_bsearch.h\""
- print "#include \"macros.h\""
- print "#include \"rune.h\""
- print "#include \"unicode/prop.h\""
- print ""
-}
+#!/usr/bin/python3
-/^[A-F0-9]/ {
- n = split($1, a, /\.\./)
- lo = strtonum("0X" a[1])
- hi = strtonum("0X" a[n])
+import math
- for (i = lo; i <= hi; i++)
- props[i] = "SB_" map[$2]
-}
+from lib import *
-END {
- print "static constexpr enum uprop_sb lookup_lat1[] = {"
- for (i = 0; i < 0x100; i++) {
- if (i % 8 == 0)
- printf "\t"
- printf "%s%s", (props[i] ? props[i] : "SB_XX") ",", \
- i % 8 == 7 ? "\n" : " "
- }
- print "};"
- print ""
- print "static const struct {"
- print "\trune lo, hi;"
- print "\tenum uprop_sb val;"
- print "} lookup[] = {"
-
- for (i = 0x100; i <= 0x10FFFF; i++) {
- if (!props[i])
- continue
- for (lo = i; props[lo] == props[i + 1]; i++)
- ;
- printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[i]
- }
-
- print "};"
- print ""
- print "_MLIB_DEFINE_BSEARCH(enum uprop_sb, lookup, SB_XX)"
- print ""
- print "enum uprop_sb"
- print "uprop_get_sb(rune ch)"
- print "{"
- print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup(ch);"
- print "}"
+
+MAP = {
+ 'ATerm': 'AT',
+ 'Close': 'CL',
+ 'CR': 'CR',
+ 'Extend': 'EX',
+ 'Format': 'FO',
+ 'LF': 'LF',
+ 'Lower': 'LO',
+ 'Numeric': 'NU',
+ 'OLetter': 'LE',
+ 'Other': 'XX',
+ 'SContinue': 'SC',
+ 'Sep': 'SE',
+ 'Sp': 'SP',
+ 'STerm': 'ST',
+ 'Upper': 'UP',
}
-' data/SentenceBreakProperty | sed 's/\s*$//'
+
+longest = 0
+
+def parse(file: str) -> list[bool]:
+ global longest
+
+ xs = ['SB_XX'] * 0x110000
+ with open(file, 'r') as f:
+ for line in f.readlines():
+ if len(line.strip()) == 0 or line[0] == '#':
+ continue
+
+ parts = line.split(';')
+ ranges = [int(x, 16) for x in parts[0].strip().split('..')]
+ prop = 'SB_' + MAP[parts[1].split('#')[0].strip()]
+ longest = max(longest, len(prop))
+
+ for i in range(ranges[0], ranges[len(ranges) - 1] + 1):
+ xs[i] = prop
+ return xs
+
+def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None:
+ Cs = cs
+ cs = list(dict.fromkeys(Cs))
+
+ print('''\
+/* This file is autogenerated by gen/prop/sb; DO NOT EDIT. */
+
+#include <stdint.h>
+
+#include "unicode/prop.h"
+''')
+
+ print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{')
+ for i, c in enumerate(Cs):
+ print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='')
+ if i % 16 == 15:
+ print()
+ print('};')
+
+ print()
+
+ ppc = columns(blksize, longest + 1)
+ print(f'static constexpr enum uprop_sb stage2[][{blksize}] = {{')
+ for c in cs:
+ for i in range(blksize // ppc):
+ print('\t{' if i == 0 else '\t ', end='')
+ for j in range(ppc):
+ print(c[i*ppc + j], end='')
+ if i < blksize // ppc - 1 or j < ppc - 1:
+ print(',', end='')
+ if j < ppc - 1:
+ print(' ' * (longest + 1 - len(c[i*ppc + j])), end='')
+ if i < blksize // ppc - 1:
+ print()
+ print('},')
+ print('};')
+
+ print()
+
+ print(f'''\
+enum uprop_sb
+uprop_get_sb(rune ch)
+{{
+ return stage2[stage1[ch / {blksize}]][ch % {blksize}];
+}}''')
+
+def main() -> None:
+ cwd_init()
+ xs = parse('data/SentenceBreakProperty')
+
+ blksize = -1
+ smallest = math.inf
+
+ for bs in powers_of_2():
+ if bs > len(xs):
+ break
+ Cs = [tuple(x) for x in chunks(xs, bs)]
+ cs = set(Cs)
+
+ sz_s1 = len(Cs) * isize(len(cs) - 1)
+ sz_s2 = len(cs) * bs
+ sz = sz_s1 + sz_s2
+
+ if sz < smallest:
+ smallest = sz
+ blksize = bs
+
+ Cs = [tuple(x) for x in chunks(xs, blksize)]
+ with open('lib/unicode/prop/uprop_get_sb.c', 'w') as f:
+ sys.stdout = f
+ genfile(Cs, blksize)
+
+ report_size(len(xs), smallest)
+
+if __name__ == '__main__':
+ main()
diff --git a/gen/prop/sc b/gen/prop/sc
index 7eb219b..af8c316 100755
--- a/gen/prop/sc
+++ b/gen/prop/sc
@@ -1,230 +1,270 @@
-#!/bin/sh
-
-set -e
-cd "${0%/*}/../.."
-exec >lib/unicode/prop/uprop_get_sc.c
-
-gawk '
-BEGIN {
- FS = " *(; *|#.*)"
-
- map["Adlam"] = "ADLM"
- map["Caucasian_Albanian"] = "AGHB"
- map["Ahom"] = "AHOM"
- map["Arabic"] = "ARAB"
- map["Imperial_Aramaic"] = "ARMI"
- map["Armenian"] = "ARMN"
- map["Avestan"] = "AVST"
- map["Balinese"] = "BALI"
- map["Bamum"] = "BAMU"
- map["Bassa_Vah"] = "BASS"
- map["Batak"] = "BATK"
- map["Bengali"] = "BENG"
- map["Bhaiksuki"] = "BHKS"
- map["Bopomofo"] = "BOPO"
- map["Brahmi"] = "BRAH"
- map["Braille"] = "BRAI"
- map["Buginese"] = "BUGI"
- map["Buhid"] = "BUHD"
- map["Chakma"] = "CAKM"
- map["Canadian_Aboriginal"] = "CANS"
- map["Carian"] = "CARI"
- map["Cham"] = "CHAM"
- map["Cherokee"] = "CHER"
- map["Chorasmian"] = "CHRS"
- map["Coptic"] = "COPT"
- map["Cypro_Minoan"] = "CPMN"
- map["Cypriot"] = "CPRT"
- map["Cyrillic"] = "CYRL"
- map["Devanagari"] = "DEVA"
- map["Dives_Akuru"] = "DIAK"
- map["Dogra"] = "DOGR"
- map["Deseret"] = "DSRT"
- map["Duployan"] = "DUPL"
- map["Egyptian_Hieroglyphs"] = "EGYP"
- map["Elbasan"] = "ELBA"
- map["Elymaic"] = "ELYM"
- map["Ethiopic"] = "ETHI"
- map["Georgian"] = "GEOR"
- map["Glagolitic"] = "GLAG"
- map["Gunjala_Gondi"] = "GONG"
- map["Masaram_Gondi"] = "GONM"
- map["Gothic"] = "GOTH"
- map["Grantha"] = "GRAN"
- map["Greek"] = "GREK"
- map["Gujarati"] = "GUJR"
- map["Gurmukhi"] = "GURU"
- map["Hangul"] = "HANG"
- map["Han"] = "HANI"
- map["Hanunoo"] = "HANO"
- map["Hatran"] = "HATR"
- map["Hebrew"] = "HEBR"
- map["Hiragana"] = "HIRA"
- map["Anatolian_Hieroglyphs"] = "HLUW"
- map["Pahawh_Hmong"] = "HMNG"
- map["Nyiakeng_Puachue_Hmong"] = "HMNP"
- map["Katakana_Or_Hiragana"] = "HRKT"
- map["Old_Hungarian"] = "HUNG"
- map["Old_Italic"] = "ITAL"
- map["Javanese"] = "JAVA"
- map["Kayah_Li"] = "KALI"
- map["Katakana"] = "KANA"
- map["Kawi"] = "KAWI"
- map["Kharoshthi"] = "KHAR"
- map["Khmer"] = "KHMR"
- map["Khojki"] = "KHOJ"
- map["Khitan_Small_Script"] = "KITS"
- map["Kannada"] = "KNDA"
- map["Kaithi"] = "KTHI"
- map["Tai_Tham"] = "LANA"
- map["Lao"] = "LAOO"
- map["Latin"] = "LATN"
- map["Lepcha"] = "LEPC"
- map["Limbu"] = "LIMB"
- map["Linear_A"] = "LINA"
- map["Linear_B"] = "LINB"
- map["Lisu"] = "LISU"
- map["Lycian"] = "LYCI"
- map["Lydian"] = "LYDI"
- map["Mahajani"] = "MAHJ"
- map["Makasar"] = "MAKA"
- map["Mandaic"] = "MAND"
- map["Manichaean"] = "MANI"
- map["Marchen"] = "MARC"
- map["Medefaidrin"] = "MEDF"
- map["Mende_Kikakui"] = "MEND"
- map["Meroitic_Cursive"] = "MERC"
- map["Meroitic_Hieroglyphs"] = "MERO"
- map["Malayalam"] = "MLYM"
- map["Modi"] = "MODI"
- map["Mongolian"] = "MONG"
- map["Mro"] = "MROO"
- map["Meetei_Mayek"] = "MTEI"
- map["Multani"] = "MULT"
- map["Myanmar"] = "MYMR"
- map["Nag_Mundari"] = "NAGM"
- map["Nandinagari"] = "NAND"
- map["Old_North_Arabian"] = "NARB"
- map["Nabataean"] = "NBAT"
- map["Newa"] = "NEWA"
- map["Nko"] = "NKOO"
- map["Nushu"] = "NSHU"
- map["Ogham"] = "OGAM"
- map["Ol_Chiki"] = "OLCK"
- map["Old_Turkic"] = "ORKH"
- map["Oriya"] = "ORYA"
- map["Osage"] = "OSGE"
- map["Osmanya"] = "OSMA"
- map["Old_Uyghur"] = "OUGR"
- map["Palmyrene"] = "PALM"
- map["Pau_Cin_Hau"] = "PAUC"
- map["Old_Permic"] = "PERM"
- map["Phags_Pa"] = "PHAG"
- map["Inscriptional_Pahlavi"] = "PHLI"
- map["Psalter_Pahlavi"] = "PHLP"
- map["Phoenician"] = "PHNX"
- map["Miao"] = "PLRD"
- map["Inscriptional_Parthian"] = "PRTI"
- map["Rejang"] = "RJNG"
- map["Hanifi_Rohingya"] = "ROHG"
- map["Runic"] = "RUNR"
- map["Samaritan"] = "SAMR"
- map["Old_South_Arabian"] = "SARB"
- map["Saurashtra"] = "SAUR"
- map["SignWriting"] = "SGNW"
- map["Shavian"] = "SHAW"
- map["Sharada"] = "SHRD"
- map["Siddham"] = "SIDD"
- map["Khudawadi"] = "SIND"
- map["Sinhala"] = "SINH"
- map["Sogdian"] = "SOGD"
- map["Old_Sogdian"] = "SOGO"
- map["Sora_Sompeng"] = "SORA"
- map["Soyombo"] = "SOYO"
- map["Sundanese"] = "SUND"
- map["Syloti_Nagri"] = "SYLO"
- map["Syriac"] = "SYRC"
- map["Tagbanwa"] = "TAGB"
- map["Takri"] = "TAKR"
- map["Tai_Le"] = "TALE"
- map["New_Tai_Lue"] = "TALU"
- map["Tamil"] = "TAML"
- map["Tangut"] = "TANG"
- map["Tai_Viet"] = "TAVT"
- map["Telugu"] = "TELU"
- map["Tifinagh"] = "TFNG"
- map["Tagalog"] = "TGLG"
- map["Thaana"] = "THAA"
- map["Thai"] = "THAI"
- map["Tibetan"] = "TIBT"
- map["Tirhuta"] = "TIRH"
- map["Tangsa"] = "TNSA"
- map["Toto"] = "TOTO"
- map["Ugaritic"] = "UGAR"
- map["Vai"] = "VAII"
- map["Vithkuqi"] = "VITH"
- map["Warang_Citi"] = "WARA"
- map["Wancho"] = "WCHO"
- map["Old_Persian"] = "XPEO"
- map["Cuneiform"] = "XSUX"
- map["Yezidi"] = "YEZI"
- map["Yi"] = "YIII"
- map["Zanabazar_Square"] = "ZANB"
- map["Inherited"] = "ZINH"
- map["Common"] = "ZYYY"
-
- print "/* This file is autogenerated by gen/prop/sc; DO NOT EDIT. */"
- print ""
- print "#include \"_bsearch.h\""
- print "#include \"macros.h\""
- print "#include \"rune.h\""
- print "#include \"unicode/prop.h\""
- print ""
-}
+#!/usr/bin/python3
-/^[^#]/ {
- n = split($1, a, /\.\./)
- lo = strtonum("0X" a[1])
- hi = strtonum("0X" a[n])
+import math
- for (i = lo; i <= hi; i++) {
- gsub(/^; /, "", $2)
- props[i] = "SC_" map[$2]
- }
-}
+from lib import *
-END {
- print "static constexpr enum uprop_sc lookup_lat1[] = {"
- for (i = 0; i < 0x100; i++) {
- if (i % 8 == 0)
- printf "\t"
- printf "%-7s,%s", props[i] ? props[i] : 0, i % 8 == 7 ? "\n" : " "
- }
- print "};"
- print ""
-
- print "static const struct {"
- print "\trune lo, hi;"
- print "\tenum uprop_sc val;"
- print "} lookup[] = {"
-
- for (i = 0x100; i <= 0x10FFFF; i++) {
- if (!props[i])
- continue
- lo = i
- while (props[lo] == props[i + 1])
- i++
- printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[i]
- }
-
- print "};"
- print ""
- print "_MLIB_DEFINE_BSEARCH(enum uprop_sc, lookup, SC_ZZZZ)"
- print ""
- print "enum uprop_sc"
- print "uprop_get_sc(rune ch)"
- print "{"
- print "\treturn ch <= lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup(ch);"
- print "}"
+
+MAP = {
+ 'Adlam': 'ADLM',
+ 'Caucasian_Albanian': 'AGHB',
+ 'Ahom': 'AHOM',
+ 'Arabic': 'ARAB',
+ 'Imperial_Aramaic': 'ARMI',
+ 'Armenian': 'ARMN',
+ 'Avestan': 'AVST',
+ 'Balinese': 'BALI',
+ 'Bamum': 'BAMU',
+ 'Bassa_Vah': 'BASS',
+ 'Batak': 'BATK',
+ 'Bengali': 'BENG',
+ 'Bhaiksuki': 'BHKS',
+ 'Bopomofo': 'BOPO',
+ 'Brahmi': 'BRAH',
+ 'Braille': 'BRAI',
+ 'Buginese': 'BUGI',
+ 'Buhid': 'BUHD',
+ 'Chakma': 'CAKM',
+ 'Canadian_Aboriginal': 'CANS',
+ 'Carian': 'CARI',
+ 'Cham': 'CHAM',
+ 'Cherokee': 'CHER',
+ 'Chorasmian': 'CHRS',
+ 'Coptic': 'COPT',
+ 'Cypro_Minoan': 'CPMN',
+ 'Cypriot': 'CPRT',
+ 'Cyrillic': 'CYRL',
+ 'Devanagari': 'DEVA',
+ 'Dives_Akuru': 'DIAK',
+ 'Dogra': 'DOGR',
+ 'Deseret': 'DSRT',
+ 'Duployan': 'DUPL',
+ 'Egyptian_Hieroglyphs': 'EGYP',
+ 'Elbasan': 'ELBA',
+ 'Elymaic': 'ELYM',
+ 'Ethiopic': 'ETHI',
+ 'Georgian': 'GEOR',
+ 'Glagolitic': 'GLAG',
+ 'Gunjala_Gondi': 'GONG',
+ 'Masaram_Gondi': 'GONM',
+ 'Gothic': 'GOTH',
+ 'Grantha': 'GRAN',
+ 'Greek': 'GREK',
+ 'Gujarati': 'GUJR',
+ 'Gurmukhi': 'GURU',
+ 'Hangul': 'HANG',
+ 'Han': 'HANI',
+ 'Hanunoo': 'HANO',
+ 'Hatran': 'HATR',
+ 'Hebrew': 'HEBR',
+ 'Hiragana': 'HIRA',
+ 'Anatolian_Hieroglyphs': 'HLUW',
+ 'Pahawh_Hmong': 'HMNG',
+ 'Nyiakeng_Puachue_Hmong': 'HMNP',
+ 'Katakana_Or_Hiragana': 'HRKT',
+ 'Old_Hungarian': 'HUNG',
+ 'Old_Italic': 'ITAL',
+ 'Javanese': 'JAVA',
+ 'Kayah_Li': 'KALI',
+ 'Katakana': 'KANA',
+ 'Kawi': 'KAWI',
+ 'Kharoshthi': 'KHAR',
+ 'Khmer': 'KHMR',
+ 'Khojki': 'KHOJ',
+ 'Khitan_Small_Script': 'KITS',
+ 'Kannada': 'KNDA',
+ 'Kaithi': 'KTHI',
+ 'Tai_Tham': 'LANA',
+ 'Lao': 'LAOO',
+ 'Latin': 'LATN',
+ 'Lepcha': 'LEPC',
+ 'Limbu': 'LIMB',
+ 'Linear_A': 'LINA',
+ 'Linear_B': 'LINB',
+ 'Lisu': 'LISU',
+ 'Lycian': 'LYCI',
+ 'Lydian': 'LYDI',
+ 'Mahajani': 'MAHJ',
+ 'Makasar': 'MAKA',
+ 'Mandaic': 'MAND',
+ 'Manichaean': 'MANI',
+ 'Marchen': 'MARC',
+ 'Medefaidrin': 'MEDF',
+ 'Mende_Kikakui': 'MEND',
+ 'Meroitic_Cursive': 'MERC',
+ 'Meroitic_Hieroglyphs': 'MERO',
+ 'Malayalam': 'MLYM',
+ 'Modi': 'MODI',
+ 'Mongolian': 'MONG',
+ 'Mro': 'MROO',
+ 'Meetei_Mayek': 'MTEI',
+ 'Multani': 'MULT',
+ 'Myanmar': 'MYMR',
+ 'Nag_Mundari': 'NAGM',
+ 'Nandinagari': 'NAND',
+ 'Old_North_Arabian': 'NARB',
+ 'Nabataean': 'NBAT',
+ 'Newa': 'NEWA',
+ 'Nko': 'NKOO',
+ 'Nushu': 'NSHU',
+ 'Ogham': 'OGAM',
+ 'Ol_Chiki': 'OLCK',
+ 'Old_Turkic': 'ORKH',
+ 'Oriya': 'ORYA',
+ 'Osage': 'OSGE',
+ 'Osmanya': 'OSMA',
+ 'Old_Uyghur': 'OUGR',
+ 'Palmyrene': 'PALM',
+ 'Pau_Cin_Hau': 'PAUC',
+ 'Old_Permic': 'PERM',
+ 'Phags_Pa': 'PHAG',
+ 'Inscriptional_Pahlavi': 'PHLI',
+ 'Psalter_Pahlavi': 'PHLP',
+ 'Phoenician': 'PHNX',
+ 'Miao': 'PLRD',
+ 'Inscriptional_Parthian': 'PRTI',
+ 'Rejang': 'RJNG',
+ 'Hanifi_Rohingya': 'ROHG',
+ 'Runic': 'RUNR',
+ 'Samaritan': 'SAMR',
+ 'Old_South_Arabian': 'SARB',
+ 'Saurashtra': 'SAUR',
+ 'SignWriting': 'SGNW',
+ 'Shavian': 'SHAW',
+ 'Sharada': 'SHRD',
+ 'Siddham': 'SIDD',
+ 'Khudawadi': 'SIND',
+ 'Sinhala': 'SINH',
+ 'Sogdian': 'SOGD',
+ 'Old_Sogdian': 'SOGO',
+ 'Sora_Sompeng': 'SORA',
+ 'Soyombo': 'SOYO',
+ 'Sundanese': 'SUND',
+ 'Syloti_Nagri': 'SYLO',
+ 'Syriac': 'SYRC',
+ 'Tagbanwa': 'TAGB',
+ 'Takri': 'TAKR',
+ 'Tai_Le': 'TALE',
+ 'New_Tai_Lue': 'TALU',
+ 'Tamil': 'TAML',
+ 'Tangut': 'TANG',
+ 'Tai_Viet': 'TAVT',
+ 'Telugu': 'TELU',
+ 'Tifinagh': 'TFNG',
+ 'Tagalog': 'TGLG',
+ 'Thaana': 'THAA',
+ 'Thai': 'THAI',
+ 'Tibetan': 'TIBT',
+ 'Tirhuta': 'TIRH',
+ 'Tangsa': 'TNSA',
+ 'Toto': 'TOTO',
+ 'Ugaritic': 'UGAR',
+ 'Vai': 'VAII',
+ 'Vithkuqi': 'VITH',
+ 'Warang_Citi': 'WARA',
+ 'Wancho': 'WCHO',
+ 'Old_Persian': 'XPEO',
+ 'Cuneiform': 'XSUX',
+ 'Yezidi': 'YEZI',
+ 'Yi': 'YIII',
+ 'Zanabazar_Square': 'ZANB',
+ 'Inherited': 'ZINH',
+ 'Common': 'ZYYY',
}
-' data/Scripts | sed 's/\s*$//'
+
+longest = 0
+
+def parse(file: str) -> list[bool]:
+ global longest
+
+ xs = ['SC_ZZZZ'] * 0x110000
+ with open(file, 'r') as f:
+ for line in f.readlines():
+ if len(line.strip()) == 0 or line[0] == '#':
+ continue
+
+ parts = line.split(';')
+ ranges = [int(x, 16) for x in parts[0].strip().split('..')]
+ prop = 'SC_' + MAP[parts[1].split('#')[0].strip()]
+ longest = max(longest, len(prop))
+
+ for i in range(ranges[0], ranges[len(ranges) - 1] + 1):
+ xs[i] = prop
+ return xs
+
+def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None:
+ Cs = cs
+ cs = list(dict.fromkeys(Cs))
+
+ print('''\
+/* This file is autogenerated by gen/prop/sc; DO NOT EDIT. */
+
+#include <stdint.h>
+
+#include "unicode/prop.h"
+''')
+
+ print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{')
+ for i, c in enumerate(Cs):
+ print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='')
+ if i % 16 == 15:
+ print()
+ print('};')
+
+ print()
+
+ ppc = columns(blksize, longest + 1)
+ print(f'static constexpr enum uprop_sc stage2[][{blksize}] = {{')
+ for c in cs:
+ for i in range(blksize // ppc):
+ print('\t{' if i == 0 else '\t ', end='')
+ for j in range(ppc):
+ print(c[i*ppc + j], end='')
+ if i < blksize // ppc - 1 or j < ppc - 1:
+ print(',', end='')
+ if j < ppc - 1:
+ print(' ' * (longest + 1 - len(c[i*ppc + j])), end='')
+ if i < blksize // ppc - 1:
+ print()
+ print('},')
+ print('};')
+
+ print()
+
+ print(f'''\
+enum uprop_sc
+uprop_get_sc(rune ch)
+{{
+ return stage2[stage1[ch / {blksize}]][ch % {blksize}];
+}}''')
+
+def main() -> None:
+ cwd_init()
+ xs = parse('data/Scripts')
+
+ blksize = -1
+ smallest = math.inf
+
+ for bs in powers_of_2():
+ if bs > len(xs):
+ break
+ Cs = [tuple(x) for x in chunks(xs, bs)]
+ cs = set(Cs)
+
+ sz_s1 = len(Cs) * isize(len(cs) - 1)
+ sz_s2 = len(cs) * bs
+ sz = sz_s1 + sz_s2
+
+ if sz < smallest:
+ smallest = sz
+ blksize = bs
+
+ Cs = [tuple(x) for x in chunks(xs, blksize)]
+ with open('lib/unicode/prop/uprop_get_sc.c', 'w') as f:
+ sys.stdout = f
+ genfile(Cs, blksize)
+
+ report_size(len(xs), smallest)
+
+if __name__ == '__main__':
+ main()
diff --git a/gen/prop/scf b/gen/prop/scf
index 8dcfcec..47cfc0a 100755
--- a/gen/prop/scf
+++ b/gen/prop/scf
@@ -1,59 +1,113 @@
-#!/bin/sh
-
-set -e
-cd "${0%/*}/../.."
-exec >lib/unicode/prop/uprop_get_scf.c
-
-gawk '
-BEGIN {
- FS = "[ ;]+"
-
- print "/* This file is autogenerated by gen/prop/scf; DO NOT EDIT. */"
- print ""
- print "#include \"_bsearch.h\""
- print "#include \"macros.h\""
- print "#include \"rune.h\""
- print "#include \"unicode/prop.h\""
- print ""
-}
-
-$0 !~ /^#/ && $2 ~ /[CS]/ {
- map[strtonum("0X" $1)] = strtonum("0X" $3)
-}
-
-END {
- print "static constexpr rune lookup_lat1[] = {"
- for (i = 0; i < 0x100; i++) {
- if (i % 8 == 0)
- printf "\t"
- printf "0x%03X,%s", map[i] ? map[i] : i, i % 8 == 7 ? "\n" : " "
- }
- print "};"
- print ""
- print "static const struct {"
- print "\trune k, v;"
- print "} lookup[] = {"
-
- for (i = 0x100; i <= 0x10FFFF; i++) {
- if (!map[i])
- continue
- printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X)},\n", i, map[i]
- }
-
- print "};"
- print ""
- print "_MLIB_DEFINE_BSEARCH_KV(rune, lookup, ch)"
- print ""
- print "rune"
- print "uprop_get_scf(rune ch, bool az_or_tr)"
- print "{"
- print "\tif (az_or_tr) {"
- print "\t\tif (ch == \x27I\x27)"
- print "\t\t\treturn U\x27ı\x27;"
- print "\t\tif (ch == U\x27İ\x27)"
- print "\t\t\treturn \x27i\x27;"
- print "\t}"
- print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup_kv(ch);"
- print "}"
-}
-' data/CaseFolding
+#!/usr/bin/python3
+
+import math
+
+from lib import *
+
+
+longest = 0
+
+def parse(file: str) -> list[bool]:
+ global longest
+
+ xs = ['0'] * 0x110000
+ with open(file, 'r') as f:
+ for line in f.readlines():
+ if len(line.strip()) == 0 or line[0] == '#':
+ continue
+
+ parts = line.split(';')
+ if parts[1].strip() not in {'C', 'S'}:
+ continue
+ ranges = [int(x, 16) for x in parts[0].strip().split('..')]
+ prop = f'RUNE_C(0x{parts[2].split('#')[0].strip()})'
+ longest = max(longest, len(prop))
+
+ for i in range(ranges[0], ranges[len(ranges) - 1] + 1):
+ xs[i] = prop
+ return xs
+
+def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None:
+ Cs = cs
+ cs = list(dict.fromkeys(Cs))
+
+ print('''\
+/* This file is autogenerated by gen/prop/scf; DO NOT EDIT. */
+
+#include <stdint.h>
+
+#include "rune.h"
+#include "unicode/prop.h"
+''')
+
+ print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{')
+ for i, c in enumerate(Cs):
+ print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='')
+ if i % 16 == 15:
+ print()
+ print('};')
+
+ print()
+
+ ppc = columns(blksize, longest + 1)
+ print(f'static constexpr rune stage2[][{blksize}] = {{')
+ for c in cs:
+ for i in range(blksize // ppc):
+ print('\t{' if i == 0 else '\t ', end='')
+ for j in range(ppc):
+ print(c[i*ppc + j], end='')
+ if i < blksize // ppc - 1 or j < ppc - 1:
+ print(',', end='')
+ if j < ppc - 1:
+ print(' ' * (longest + 1 - len(c[i*ppc + j])), end='')
+ if i < blksize // ppc - 1:
+ print()
+ print('},')
+ print('};')
+
+ print()
+
+ print(f'''\
+rune
+uprop_get_scf(rune ch, bool az_tr)
+{{
+ if (az_tr) {{
+ if (ch == 'I')
+ return U'ı';
+ if (ch == U'İ')
+ return 'i';
+ }}
+ rune hc = stage2[stage1[ch / {blksize}]][ch % {blksize}];
+ return hc == 0 ? ch : hc;
+}}''')
+
+def main() -> None:
+ cwd_init()
+ xs = parse('data/CaseFolding')
+
+ blksize = -1
+ smallest = math.inf
+
+ for bs in powers_of_2():
+ if bs > len(xs):
+ break
+ Cs = [tuple(x) for x in chunks(xs, bs)]
+ cs = set(Cs)
+
+ sz_s1 = len(Cs) * isize(len(cs) - 1)
+ sz_s2 = len(cs) * bs * 4
+ sz = sz_s1 + sz_s2
+
+ if sz < smallest:
+ smallest = sz
+ blksize = bs
+
+ Cs = [tuple(x) for x in chunks(xs, blksize)]
+ with open('lib/unicode/prop/uprop_get_scf.c', 'w') as f:
+ sys.stdout = f
+ genfile(Cs, blksize)
+
+ report_size(len(xs), smallest)
+
+if __name__ == '__main__':
+ main()
diff --git a/gen/prop/slc b/gen/prop/slc
index 8142be8..3bb08b8 100755
--- a/gen/prop/slc
+++ b/gen/prop/slc
@@ -1,53 +1,104 @@
-#!/bin/sh
-
-set -e
-cd "${0%/*}/../.."
-exec >lib/unicode/prop/uprop_get_slc.c
-
-gawk '
-BEGIN {
- FS = ";"
-
- print "/* This file is autogenerated by gen/prop/slc; DO NOT EDIT. */"
- print ""
- print "#include \"_bsearch.h\""
- print "#include \"macros.h\""
- print "#include \"rune.h\""
- print "#include \"unicode/prop.h\""
- print ""
-}
-
-length($14) > 0 {
- map[strtonum("0X" $1)] = strtonum("0X" $14)
-}
-
-END {
- print "static constexpr rune lookup_lat1[] = {"
- for (i = 0; i < 0x100; i++) {
- if (i % 8 == 0)
- printf "\t"
- printf "0x%03X,%s", map[i] ? map[i] : i, i % 8 == 7 ? "\n" : " "
- }
- print "};"
- print ""
- print "static const struct {"
- print "\trune k, v;"
- print "} lookup[] = {"
-
- for (i = 0x100; i <= 0x10FFFF; i++) {
- if (!map[i])
- continue
- printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X)},\n", i, map[i]
- }
-
- print "};"
- print ""
- print "_MLIB_DEFINE_BSEARCH_KV(rune, lookup, ch)"
- print ""
- print "rune"
- print "uprop_get_slc(rune ch)"
- print "{"
- print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup_kv(ch);"
- print "}"
-}
-' data/UnicodeData
+#!/usr/bin/python3
+
+import math
+
+from lib import *
+
+
# Length of the longest table entry produced by parse(); genfile() reads it
# to align the columns of the generated table.
longest = 0


def parse(file: str) -> list[str]:
    """Read simple lowercase mappings from a UnicodeData-format file.

    Returns a list with one entry per code point (0 .. 0x10FFFF).  An entry
    is ``'RUNE_C(0x....)'`` when field 13 of the record is non-empty, and
    ``'0'`` otherwise.  The module-level ``longest`` is raised to the length
    of the longest entry written.
    """
    global longest

    xs = ['0'] * 0x110000
    with open(file, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line up front.
        for line in f:
            if not line.strip() or line[0] == '#':
                continue

            parts = line.split(';')
            if parts[13] == '':
                continue
            n = int(parts[0], 16)
            xs[n] = f'RUNE_C(0x{parts[13]})'
            longest = max(longest, len(xs[n]))
    return xs
+
def genfile(cs: list[tuple[str, ...]], blksize: int) -> None:
    """Print the C source of the two-stage ``uprop_get_slc`` lookup to stdout.

    ``cs`` is the whole table cut into consecutive blocks of ``blksize``
    entries, each entry being the C expression to emit for one code point.
    Identical blocks are written once to ``stage2``; ``stage1`` maps every
    block number to its row in ``stage2``.  The module-level ``longest`` is
    read to align the ``stage2`` columns.
    """
    blocks = cs
    # dict.fromkeys() drops duplicates but keeps first-seen order, so row
    # numbers are stable between runs.
    uniq = list(dict.fromkeys(blocks))
    # Block -> stage2 row, built once instead of calling list.index() for
    # every one of the stage1 entries.
    row_of = {blk: n for n, blk in enumerate(uniq)}

    print('''\
/* This file is autogenerated by gen/prop/slc; DO NOT EDIT. */

#include <stdint.h>

#include "rune.h"
#include "unicode/prop.h"
''')

    # Right-align every index to the width of the largest one.
    width = len(str(len(uniq) - 1))
    print(f'static constexpr {typename(len(uniq) - 1)} stage1[] = {{')
    for i, blk in enumerate(blocks):
        lead = '\t' if i % 16 == 0 else ' '
        print(f'{lead}{row_of[blk]:{width}d},', end='\n' if i % 16 == 15 else '')
    if len(blocks) % 16 != 0:
        # Terminate a partial last row so '};' starts on its own line.
        print()
    print('};')

    print()

    # columns() comes from lib; presumably the number of entries that fit on
    # one output line -- confirm against lib.py.
    ppc = columns(blksize, longest + 1)
    nrows = blksize // ppc
    print(f'static constexpr rune stage2[][{blksize}] = {{')
    for blk in uniq:
        lines = []
        for r in range(nrows):
            cells = blk[r * ppc:(r + 1) * ppc]
            # Every cell but the last in a row is "value," padded to a fixed
            # column width of longest + 2.
            text = ''.join((cell + ',').ljust(longest + 2) for cell in cells[:-1])
            text += cells[-1]
            if r < nrows - 1:
                text += ','
            lines.append(('\t{' if r == 0 else '\t ') + text)
        print('\n'.join(lines) + '},')
    print('};')

    print()

    # A stage2 entry of 0 means "no mapping": the rune maps to itself.
    print(f'''\
rune
uprop_get_slc(rune ch)
{{
\trune hc = stage2[stage1[ch / {blksize}]][ch % {blksize}];
\treturn hc == 0 ? ch : hc;
}}''')
+
def main() -> None:
    """Generate ``lib/unicode/prop/uprop_get_slc.c`` from ``data/UnicodeData``.

    Every block size yielded by ``powers_of_2()`` that does not exceed the
    table length is tried; the one whose stage1 + stage2 tables are estimated
    to need the fewest bytes is used for the generated file.
    """
    # Local import: keeps this function independent of whatever names
    # ``from lib import *`` happens to re-export.
    from contextlib import redirect_stdout

    cwd_init()
    xs = parse('data/UnicodeData')

    blksize = -1
    smallest = math.inf

    for bs in powers_of_2():
        if bs > len(xs):
            break
        blocks = [tuple(x) for x in chunks(xs, bs)]
        uniq = set(blocks)

        # stage1: one index per block; isize() (from lib) is taken to be the
        # byte width of the largest index -- confirm against lib.py.
        sz_s1 = len(blocks) * isize(len(uniq) - 1)
        # stage2: one row per distinct block, 4 bytes per entry (assumes a
        # 32-bit rune).
        sz_s2 = len(uniq) * bs * 4
        sz = sz_s1 + sz_s2

        if sz < smallest:
            smallest, blksize = sz, bs

    Cs = [tuple(x) for x in chunks(xs, blksize)]
    # redirect_stdout() restores sys.stdout afterwards instead of leaving it
    # bound to a closed file.
    with open('lib/unicode/prop/uprop_get_slc.c', 'w', encoding='utf-8') as f, \
            redirect_stdout(f):
        genfile(Cs, blksize)

    report_size(len(xs), smallest)


if __name__ == '__main__':
    main()
diff --git a/gen/prop/stc b/gen/prop/stc
index eb65d07..3df2004 100755
--- a/gen/prop/stc
+++ b/gen/prop/stc
@@ -1,45 +1,102 @@
-#!/bin/sh
-
-set -e
-cd "${0%/*}/../.."
-exec >lib/unicode/prop/uprop_get_stc.c
-
-gawk '
-BEGIN {
- FS = ";"
-
- print "/* This file is autogenerated by gen/prop/stc; DO NOT EDIT. */"
- print ""
- print "#include \"_bsearch.h\""
- print "#include \"macros.h\""
- print "#include \"rune.h\""
- print "#include \"unicode/prop.h\""
- print ""
-}
-
-length($15) > 0 && $13 != $15 && $1 != $15 {
- map[strtonum("0X" $1)] = strtonum("0X" $15)
-}
-
-END {
- print "static const struct {"
- print "\trune k, v;"
- print "} lookup[] = {"
-
- for (i = 0x100; i <= 0x10FFFF; i++) {
- if (!map[i])
- continue
- printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X)},\n", i, map[i]
- }
-
- print "};"
- print ""
- print "_MLIB_DEFINE_BSEARCH_KV(rune, lookup, uprop_get_suc(ch))"
- print ""
- print "rune"
- print "uprop_get_stc(rune ch)"
- print "{"
- print "\treturn mlib_lookup_kv(ch);"
- print "}"
-}
-' data/UnicodeData
+#!/usr/bin/python3
+
+import math
+
+from lib import *
+
+
# Length of the longest table entry produced by parse(); genfile() reads it
# to align the columns of the generated table.
longest = 0


def parse(file: str) -> list[str]:
    """Read the titlecase mappings that need their own table entry.

    Returns a list with one entry per code point (0 .. 0x10FFFF).  An entry
    is ``'RUNE_C(0x....)'`` built from field 14 of the record, and ``'0'``
    when that field is empty, equals field 12, or equals the code point
    itself (field 0).  The module-level ``longest`` is raised to the length
    of the longest entry written.
    """
    global longest

    xs = ['0'] * 0x110000
    with open(file, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line up front.
        for line in f:
            if not line.strip() or line[0] == '#':
                continue

            parts = line.split(';')
            # Field 14 is the last one on the line and carries the newline.
            title = parts[14].strip()
            if title in ('', parts[12], parts[0]):
                continue
            n = int(parts[0], 16)
            xs[n] = f'RUNE_C(0x{title})'
            longest = max(longest, len(xs[n]))
    return xs
+
def genfile(cs: list[tuple[str, ...]], blksize: int) -> None:
    """Print the C source of the ``uprop_get_stc`` lookup to stdout.

    ``cs`` is the whole table cut into consecutive blocks of ``blksize``
    entries.  The generated function has no stage1 table: it picks row 0 of
    ``lookup`` for block 0 and row 1 for every other block, so exactly two
    distinct blocks are required and all blocks after the first must be
    identical.  The module-level ``longest`` is read to align the columns.

    Raises:
        AssertionError: if the blocks do not have that shape.
    """
    Cs = cs
    # dict.fromkeys() drops duplicates but keeps first-seen order, so cs[0]
    # is always block 0.
    cs = list(dict.fromkeys(Cs))

    # Raised explicitly (not via assert) so the checks survive `python -O`.
    if len(cs) != 2:
        raise AssertionError(f'{len(cs)=}, need a 2-stage lookup')
    # `ch / blksize != 0` in the generated C sends every block but the first
    # to row 1; a table such as A, B, A would otherwise be emitted wrongly.
    if any(c != cs[1] for c in Cs[1:]):
        raise AssertionError(
            'blocks after the first are not all identical, need a 2-stage lookup')

    print('''\
/* This file is autogenerated by gen/prop/stc; DO NOT EDIT. */

#include <stdint.h>

#include "rune.h"
#include "unicode/prop.h"
''')

    # columns() comes from lib; presumably the number of entries that fit on
    # one output line -- confirm against lib.py.
    ppc = columns(blksize, longest + 1)
    nrows = blksize // ppc
    print(f'static constexpr rune lookup[][{blksize}] = {{')
    for blk in cs:
        lines = []
        for r in range(nrows):
            cells = blk[r * ppc:(r + 1) * ppc]
            # Every cell but the last in a row is "value," padded to a fixed
            # column width of longest + 2.
            text = ''.join((cell + ',').ljust(longest + 2) for cell in cells[:-1])
            text += cells[-1]
            if r < nrows - 1:
                text += ','
            lines.append(('\t{' if r == 0 else '\t ') + text)
        print('\n'.join(lines) + '},')
    print('};')

    print()

    # A lookup entry of 0 means "no special titlecase": defer to
    # uprop_get_suc().
    print(f'''\
rune
uprop_get_stc(rune ch)
{{
\trune hc = lookup[ch / {blksize} != 0][ch % {blksize}];
\treturn hc == 0 ? uprop_get_suc(ch) : hc;
}}''')
+
def main() -> None:
    """Generate ``lib/unicode/prop/uprop_get_stc.c`` from ``data/UnicodeData``.

    Every block size yielded by ``powers_of_2()`` that does not exceed the
    table length is tried; the one whose estimated table size is smallest is
    used for the generated file.
    """
    # Local import: keeps this function independent of whatever names
    # ``from lib import *`` happens to re-export.
    from contextlib import redirect_stdout

    cwd_init()
    xs = parse('data/UnicodeData')

    blksize = -1
    smallest = math.inf

    for bs in powers_of_2():
        if bs > len(xs):
            break
        blocks = [tuple(x) for x in chunks(xs, bs)]
        uniq = set(blocks)

        # One index per block; isize() (from lib) is taken to be the byte
        # width of the largest index -- confirm against lib.py.
        sz_s1 = len(blocks) * isize(len(uniq) - 1)
        # One row per distinct block, 4 bytes per entry (assumes a 32-bit
        # rune).
        sz_s2 = len(uniq) * bs * 4
        sz = sz_s1 + sz_s2

        if sz < smallest:
            smallest, blksize = sz, bs

    Cs = [tuple(x) for x in chunks(xs, blksize)]
    # redirect_stdout() restores sys.stdout afterwards instead of leaving it
    # bound to a closed file.
    with open('lib/unicode/prop/uprop_get_stc.c', 'w', encoding='utf-8') as f, \
            redirect_stdout(f):
        genfile(Cs, blksize)

    report_size(len(xs), smallest)


if __name__ == '__main__':
    main()
diff --git a/gen/prop/suc b/gen/prop/suc
index 9448dbc..84174b0 100755
--- a/gen/prop/suc
+++ b/gen/prop/suc
@@ -1,53 +1,104 @@
-#!/bin/sh
-
-set -e
-cd "${0%/*}/../.."
-exec >lib/unicode/prop/uprop_get_suc.c
-
-gawk '
-BEGIN {
- FS = ";"
-
- print "/* This file is autogenerated by gen/prop/suc; DO NOT EDIT. */"
- print ""
- print "#include \"_bsearch.h\""
- print "#include \"macros.h\""
- print "#include \"rune.h\""
- print "#include \"unicode/prop.h\""
- print ""
-}
-
-length($13) > 0 {
- map[strtonum("0X" $1)] = strtonum("0X" $13)
-}
-
-END {
- print "static constexpr rune lookup_lat1[] = {"
- for (i = 0; i < 0x100; i++) {
- if (i % 8 == 0)
- printf "\t"
- printf "0x%03X,%s", map[i] ? map[i] : i, i % 8 == 7 ? "\n" : " "
- }
- print "};"
- print ""
- print "static const struct {"
- print "\trune k, v;"
- print "} lookup[] = {"
-
- for (i = 0x100; i <= 0x10FFFF; i++) {
- if (!map[i])
- continue
- printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X)},\n", i, map[i]
- }
-
- print "};"
- print ""
- print "_MLIB_DEFINE_BSEARCH_KV(rune, lookup, ch)"
- print ""
- print "rune"
- print "uprop_get_suc(rune ch)"
- print "{"
- print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup_kv(ch);"
- print "}"
-}
-' data/UnicodeData
+#!/usr/bin/python3
+
+import math
+
+from lib import *
+
+
# Length of the longest table entry produced by parse(); genfile() reads it
# to align the columns of the generated table.
longest = 0


def parse(file: str) -> list[str]:
    """Read simple uppercase mappings from a UnicodeData-format file.

    Returns a list with one entry per code point (0 .. 0x10FFFF).  An entry
    is ``'RUNE_C(0x....)'`` when field 12 of the record is non-empty, and
    ``'0'`` otherwise.  The module-level ``longest`` is raised to the length
    of the longest entry written.
    """
    global longest

    xs = ['0'] * 0x110000
    with open(file, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line up front.
        for line in f:
            if not line.strip() or line[0] == '#':
                continue

            parts = line.split(';')
            if parts[12] == '':
                continue
            n = int(parts[0], 16)
            xs[n] = f'RUNE_C(0x{parts[12]})'
            longest = max(longest, len(xs[n]))
    return xs
+
def genfile(cs: list[tuple[str, ...]], blksize: int) -> None:
    """Print the C source of the two-stage ``uprop_get_suc`` lookup to stdout.

    ``cs`` is the whole table cut into consecutive blocks of ``blksize``
    entries, each entry being the C expression to emit for one code point.
    Identical blocks are written once to ``stage2``; ``stage1`` maps every
    block number to its row in ``stage2``.  The module-level ``longest`` is
    read to align the ``stage2`` columns.
    """
    blocks = cs
    # dict.fromkeys() drops duplicates but keeps first-seen order, so row
    # numbers are stable between runs.
    uniq = list(dict.fromkeys(blocks))
    # Block -> stage2 row, built once instead of calling list.index() for
    # every one of the stage1 entries.
    row_of = {blk: n for n, blk in enumerate(uniq)}

    print('''\
/* This file is autogenerated by gen/prop/suc; DO NOT EDIT. */

#include <stdint.h>

#include "rune.h"
#include "unicode/prop.h"
''')

    # Right-align every index to the width of the largest one.
    width = len(str(len(uniq) - 1))
    print(f'static constexpr {typename(len(uniq) - 1)} stage1[] = {{')
    for i, blk in enumerate(blocks):
        lead = '\t' if i % 16 == 0 else ' '
        print(f'{lead}{row_of[blk]:{width}d},', end='\n' if i % 16 == 15 else '')
    if len(blocks) % 16 != 0:
        # Terminate a partial last row so '};' starts on its own line.
        print()
    print('};')

    print()

    # columns() comes from lib; presumably the number of entries that fit on
    # one output line -- confirm against lib.py.
    ppc = columns(blksize, longest + 1)
    nrows = blksize // ppc
    print(f'static constexpr rune stage2[][{blksize}] = {{')
    for blk in uniq:
        lines = []
        for r in range(nrows):
            cells = blk[r * ppc:(r + 1) * ppc]
            # Every cell but the last in a row is "value," padded to a fixed
            # column width of longest + 2.
            text = ''.join((cell + ',').ljust(longest + 2) for cell in cells[:-1])
            text += cells[-1]
            if r < nrows - 1:
                text += ','
            lines.append(('\t{' if r == 0 else '\t ') + text)
        print('\n'.join(lines) + '},')
    print('};')

    print()

    # A stage2 entry of 0 means "no mapping": the rune maps to itself.
    print(f'''\
rune
uprop_get_suc(rune ch)
{{
\trune hc = stage2[stage1[ch / {blksize}]][ch % {blksize}];
\treturn hc == 0 ? ch : hc;
}}''')
+
def main() -> None:
    """Generate ``lib/unicode/prop/uprop_get_suc.c`` from ``data/UnicodeData``.

    Every block size yielded by ``powers_of_2()`` that does not exceed the
    table length is tried; the one whose stage1 + stage2 tables are estimated
    to need the fewest bytes is used for the generated file.
    """
    # Local import: keeps this function independent of whatever names
    # ``from lib import *`` happens to re-export.
    from contextlib import redirect_stdout

    cwd_init()
    xs = parse('data/UnicodeData')

    blksize = -1
    smallest = math.inf

    for bs in powers_of_2():
        if bs > len(xs):
            break
        blocks = [tuple(x) for x in chunks(xs, bs)]
        uniq = set(blocks)

        # stage1: one index per block; isize() (from lib) is taken to be the
        # byte width of the largest index -- confirm against lib.py.
        sz_s1 = len(blocks) * isize(len(uniq) - 1)
        # stage2: one row per distinct block, 4 bytes per entry (assumes a
        # 32-bit rune).
        sz_s2 = len(uniq) * bs * 4
        sz = sz_s1 + sz_s2

        if sz < smallest:
            smallest, blksize = sz, bs

    Cs = [tuple(x) for x in chunks(xs, blksize)]
    # redirect_stdout() restores sys.stdout afterwards instead of leaving it
    # bound to a closed file.
    with open('lib/unicode/prop/uprop_get_suc.c', 'w', encoding='utf-8') as f, \
            redirect_stdout(f):
        genfile(Cs, blksize)

    report_size(len(xs), smallest)


if __name__ == '__main__':
    main()
diff --git a/gen/prop/vo b/gen/prop/vo
index b390c67..7b94691 100755
--- a/gen/prop/vo
+++ b/gen/prop/vo
@@ -1,66 +1,103 @@
-#!/bin/sh
-
-set -e
-cd "${0%/*}/../.."
-exec >lib/unicode/prop/uprop_get_vo.c
-
-gawk '
-BEGIN {
- FS = "( *#.*| +; +)"
-
- print "/* This file is autogenerated by gen/prop/vo; DO NOT EDIT. */"
- print ""
- print "#include \"_bsearch.h\""
- print "#include \"macros.h\""
- print "#include \"rune.h\""
- print "#include \"unicode/prop.h\""
- print ""
-}
-
-/^[^#]/ {
- n = split($1, a, /\.\./)
- lo = strtonum("0X" a[1])
- hi = strtonum("0X" a[n])
-
- for (i = lo; i <= hi; i++) {
- gsub(/^; /, "", $2)
- props[i] = "VO_" toupper($2)
- }
-}
-
-END {
- print "static constexpr enum uprop_vo lookup_lat1[] = {"
- for (i = 0; i < 0x100; i++) {
- if (i % 8 == 0)
- printf "\t"
- printf "%-5s%s", (props[i] ? props[i] : "VO_R") ",", \
- i % 8 == 7 ? "\n" : " "
- }
- print "};"
- print ""
- print "static const struct {"
- print "\trune lo, hi;"
- print "\tenum uprop_vo val;"
- print "} lookup[] = {"
-
- for (i = 0x100; i <= 0x10FFFF; i++) {
- if (!props[i])
- continue
- lo = i
- while (props[lo] == props[i + 1])
- i++
- if (props[i] != "VO_R")
- printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[i]
- }
-
- print "};"
- print ""
- print "_MLIB_DEFINE_BSEARCH(enum uprop_vo, lookup, VO_R)"
- print ""
- print "enum uprop_vo"
- print "uprop_get_vo(rune ch)"
- print "{"
- print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup(ch);"
- print "}"
-}
-' data/VerticalOrientation | sed 's/\s*$//'
+#!/usr/bin/python3
+
+import math
+
+from lib import *
+
+
# Length of the longest property name seen by parse(); genfile() reads it to
# align the columns of the generated table.
longest = 0


def parse(file: str) -> list[str]:
    """Read a ``range ; value # comment`` property file into a flat table.

    Returns a list with one entry per code point (0 .. 0x10FFFF).  Each
    entry is ``'VO_'`` plus the upper-cased property value of the line
    covering that code point; code points no line covers keep ``'VO_R'``.
    The module-level ``longest`` is raised to the length of the longest
    entry parsed.
    """
    global longest

    xs = ['VO_R'] * 0x110000
    with open(file, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line up front.
        for line in f:
            if not line.strip() or line[0] == '#':
                continue

            parts = line.split(';')
            # "XXXX" or "XXXX..YYYY"; a single code point yields one bound
            # that serves as both ends of the range.
            bounds = [int(x, 16) for x in parts[0].strip().split('..')]
            prop = 'VO_' + parts[1].split('#')[0].strip().upper()
            longest = max(longest, len(prop))

            for i in range(bounds[0], bounds[-1] + 1):
                xs[i] = prop
    return xs
+
def genfile(cs: list[tuple[str, ...]], blksize: int) -> None:
    """Print the C source of the two-stage ``uprop_get_vo`` lookup to stdout.

    ``cs`` is the whole table cut into consecutive blocks of ``blksize``
    entries, each entry being the enumerator name to emit for one code
    point.  Identical blocks are written once to ``stage2``; ``stage1`` maps
    every block number to its row in ``stage2``.  The module-level
    ``longest`` is read to align the ``stage2`` columns.
    """
    blocks = cs
    # dict.fromkeys() drops duplicates but keeps first-seen order, so row
    # numbers are stable between runs.
    uniq = list(dict.fromkeys(blocks))
    # Block -> stage2 row, built once instead of calling list.index() for
    # every one of the stage1 entries.
    row_of = {blk: n for n, blk in enumerate(uniq)}

    print('''\
/* This file is autogenerated by gen/prop/vo; DO NOT EDIT. */

#include <stdint.h>

#include "unicode/prop.h"
''')

    # Right-align every index to the width of the largest one.
    width = len(str(len(uniq) - 1))
    print(f'static constexpr {typename(len(uniq) - 1)} stage1[] = {{')
    for i, blk in enumerate(blocks):
        lead = '\t' if i % 16 == 0 else ' '
        print(f'{lead}{row_of[blk]:{width}d},', end='\n' if i % 16 == 15 else '')
    if len(blocks) % 16 != 0:
        # Terminate a partial last row so '};' starts on its own line.
        print()
    print('};')

    print()

    # columns() comes from lib; presumably the number of entries that fit on
    # one output line -- confirm against lib.py.
    ppc = columns(blksize, longest + 1)
    nrows = blksize // ppc
    print(f'static constexpr enum uprop_vo stage2[][{blksize}] = {{')
    for blk in uniq:
        lines = []
        for r in range(nrows):
            cells = blk[r * ppc:(r + 1) * ppc]
            # Every cell but the last in a row is "value," padded to a fixed
            # column width of longest + 2.
            text = ''.join((cell + ',').ljust(longest + 2) for cell in cells[:-1])
            text += cells[-1]
            if r < nrows - 1:
                text += ','
            lines.append(('\t{' if r == 0 else '\t ') + text)
        print('\n'.join(lines) + '},')
    print('};')

    print()

    print(f'''\
enum uprop_vo
uprop_get_vo(rune ch)
{{
\treturn stage2[stage1[ch / {blksize}]][ch % {blksize}];
}}''')
+
def main() -> None:
    """Generate ``lib/unicode/prop/uprop_get_vo.c`` from ``data/VerticalOrientation``.

    Every block size yielded by ``powers_of_2()`` that does not exceed the
    table length is tried; the one whose stage1 + stage2 tables are estimated
    to need the fewest bytes is used for the generated file.
    """
    # Local import: keeps this function independent of whatever names
    # ``from lib import *`` happens to re-export.
    from contextlib import redirect_stdout

    cwd_init()
    xs = parse('data/VerticalOrientation')

    blksize = -1
    smallest = math.inf

    for bs in powers_of_2():
        if bs > len(xs):
            break
        blocks = [tuple(x) for x in chunks(xs, bs)]
        uniq = set(blocks)

        # stage1: one index per block; isize() (from lib) is taken to be the
        # byte width of the largest index -- confirm against lib.py.
        sz_s1 = len(blocks) * isize(len(uniq) - 1)
        # stage2: one row per distinct block, counted at one byte per entry
        # (assumes the enum is stored in a single byte -- confirm).
        sz_s2 = len(uniq) * bs
        sz = sz_s1 + sz_s2

        if sz < smallest:
            smallest, blksize = sz, bs

    Cs = [tuple(x) for x in chunks(xs, blksize)]
    # redirect_stdout() restores sys.stdout afterwards instead of leaving it
    # bound to a closed file.
    with open('lib/unicode/prop/uprop_get_vo.c', 'w', encoding='utf-8') as f, \
            redirect_stdout(f):
        genfile(Cs, blksize)

    report_size(len(xs), smallest)


if __name__ == '__main__':
    main()
diff --git a/gen/prop/wb b/gen/prop/wb
index a6b47f2..f6621f5 100755
--- a/gen/prop/wb
+++ b/gen/prop/wb
@@ -1,86 +1,129 @@
-#!/bin/sh
-
-set -e
-cd "${0%/*}/../.."
-exec >lib/unicode/prop/uprop_get_wb.c
-
-gawk '
-BEGIN {
- FS = " *(; *|#.*)"
-
- map["ALetter"] = "LE"
- map["CR"] = "CR"
- map["Double_Quote"] = "DQ"
- map["E_Base"] = "EB"
- map["E_Base_GAZ"] = "EBG"
- map["E_Modifier"] = "EM"
- map["Extend"] = "EXTEND"
- map["ExtendNumLet"] = "EX"
- map["Format"] = "FO"
- map["Glue_After_Zwj"] = "GAZ"
- map["Hebrew_Letter"] = "HL"
- map["Katakana"] = "KA"
- map["LF"] = "LF"
- map["MidLetter"] = "ML"
- map["MidNumLet"] = "MB"
- map["MidNum"] = "MN"
- map["Newline"] = "NL"
- map["Numeric"] = "NU"
- map["Other"] = "XX"
- map["Regional_Indicator"] = "RI"
- map["Single_Quote"] = "SQ"
- map["WSegSpace"] = "WSEGSPACE"
- map["ZWJ"] = "ZWJ"
-
- print "/* This file is autogenerated by gen/prop/wb; DO NOT EDIT. */"
- print ""
- print "#include \"_bsearch.h\""
- print "#include \"macros.h\""
- print "#include \"rune.h\""
- print "#include \"unicode/prop.h\""
- print ""
-}
+#!/usr/bin/python3
-/^[A-F0-9]/ {
- n = split($1, a, /\.\./)
- lo = strtonum("0X" a[1])
- hi = strtonum("0X" a[n])
+import math
- for (i = lo; i <= hi; i++)
- props[i] = "WB_" map[$2]
-}
+from lib import *
-END {
- print "static constexpr enum uprop_wb lookup_lat1[] = {"
- for (i = 0; i < 0x100; i++) {
- if (i % 4 == 0)
- printf "\t"
- printf "%-13s%s", (props[i] ? props[i] : "WB_XX") ",", \
- i % 4 == 3 ? "\n" : " "
- }
- print "};"
- print ""
- print "static const struct {"
- print "\trune lo, hi;"
- print "\tenum uprop_wb val;"
- print "} lookup[] = {"
-
- for (i = 0x100; i <= 0x10FFFF; i++) {
- if (!props[i])
- continue
- for (lo = i; props[lo] == props[i + 1]; i++)
- ;
- printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[i]
- }
-
- print "};"
- print ""
- print "_MLIB_DEFINE_BSEARCH(enum uprop_wb, lookup, WB_XX)"
- print ""
- print "enum uprop_wb"
- print "uprop_get_wb(rune ch)"
- print "{"
- print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup(ch);"
- print "}"
+
# Property value as spelled in data/WordBreakProperty -> suffix of the
# matching WB_* enumerator emitted into the generated C table.
MAP = {
    'ALetter':            'LE',
    'CR':                 'CR',
    'Double_Quote':       'DQ',
    'E_Base':             'EB',
    'E_Base_GAZ':         'EBG',
    'E_Modifier':         'EM',
    'Extend':             'EXTEND',
    'ExtendNumLet':       'EX',
    'Format':             'FO',
    'Glue_After_Zwj':     'GAZ',
    'Hebrew_Letter':      'HL',
    'Katakana':           'KA',
    'LF':                 'LF',
    'MidLetter':          'ML',
    'MidNumLet':          'MB',
    'MidNum':             'MN',
    'Newline':            'NL',
    'Numeric':            'NU',
    'Other':              'XX',
    'Regional_Indicator': 'RI',
    'Single_Quote':       'SQ',
    'WSegSpace':          'WSEGSPACE',
    'ZWJ':                'ZWJ',
}
-' data/WordBreakProperty | sed 's/\s*$//'
+
# Length of the longest property name seen by parse(); genfile() reads it to
# align the columns of the generated table.
longest = 0


def parse(file: str) -> list[str]:
    """Read a ``range ; value # comment`` property file into a flat table.

    Returns a list with one entry per code point (0 .. 0x10FFFF).  Each
    entry is ``'WB_'`` plus the ``MAP`` abbreviation of the property value
    of the line covering that code point; code points no line covers keep
    ``'WB_XX'``.  The module-level ``longest`` is raised to the length of
    the longest entry parsed.

    Raises:
        KeyError: if a property value is not a key of ``MAP``.
    """
    global longest

    xs = ['WB_XX'] * 0x110000
    with open(file, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line up front.
        for line in f:
            if not line.strip() or line[0] == '#':
                continue

            parts = line.split(';')
            # "XXXX" or "XXXX..YYYY"; a single code point yields one bound
            # that serves as both ends of the range.
            bounds = [int(x, 16) for x in parts[0].strip().split('..')]
            prop = 'WB_' + MAP[parts[1].split('#')[0].strip()]
            longest = max(longest, len(prop))

            for i in range(bounds[0], bounds[-1] + 1):
                xs[i] = prop
    return xs
+
def genfile(cs: list[tuple[str, ...]], blksize: int) -> None:
    """Print the C source of the two-stage ``uprop_get_wb`` lookup to stdout.

    ``cs`` is the whole table cut into consecutive blocks of ``blksize``
    entries, each entry being the enumerator name to emit for one code
    point.  Identical blocks are written once to ``stage2``; ``stage1`` maps
    every block number to its row in ``stage2``.  The module-level
    ``longest`` is read to align the ``stage2`` columns.
    """
    blocks = cs
    # dict.fromkeys() drops duplicates but keeps first-seen order, so row
    # numbers are stable between runs.
    uniq = list(dict.fromkeys(blocks))
    # Block -> stage2 row, built once instead of calling list.index() for
    # every one of the stage1 entries.
    row_of = {blk: n for n, blk in enumerate(uniq)}

    print('''\
/* This file is autogenerated by gen/prop/wb; DO NOT EDIT. */

#include <stdint.h>

#include "unicode/prop.h"
''')

    # Right-align every index to the width of the largest one.
    width = len(str(len(uniq) - 1))
    print(f'static constexpr {typename(len(uniq) - 1)} stage1[] = {{')
    for i, blk in enumerate(blocks):
        lead = '\t' if i % 16 == 0 else ' '
        print(f'{lead}{row_of[blk]:{width}d},', end='\n' if i % 16 == 15 else '')
    if len(blocks) % 16 != 0:
        # Terminate a partial last row so '};' starts on its own line.
        print()
    print('};')

    print()

    # columns() comes from lib; presumably the number of entries that fit on
    # one output line -- confirm against lib.py.
    ppc = columns(blksize, longest + 1)
    nrows = blksize // ppc
    print(f'static constexpr enum uprop_wb stage2[][{blksize}] = {{')
    for blk in uniq:
        lines = []
        for r in range(nrows):
            cells = blk[r * ppc:(r + 1) * ppc]
            # Every cell but the last in a row is "value," padded to a fixed
            # column width of longest + 2.
            text = ''.join((cell + ',').ljust(longest + 2) for cell in cells[:-1])
            text += cells[-1]
            if r < nrows - 1:
                text += ','
            lines.append(('\t{' if r == 0 else '\t ') + text)
        print('\n'.join(lines) + '},')
    print('};')

    print()

    print(f'''\
enum uprop_wb
uprop_get_wb(rune ch)
{{
\treturn stage2[stage1[ch / {blksize}]][ch % {blksize}];
}}''')
+
def main() -> None:
    """Generate ``lib/unicode/prop/uprop_get_wb.c`` from ``data/WordBreakProperty``.

    Every block size yielded by ``powers_of_2()`` that does not exceed the
    table length is tried; the one whose stage1 + stage2 tables are estimated
    to need the fewest bytes is used for the generated file.
    """
    # Local import: keeps this function independent of whatever names
    # ``from lib import *`` happens to re-export.
    from contextlib import redirect_stdout

    cwd_init()
    xs = parse('data/WordBreakProperty')

    blksize = -1
    smallest = math.inf

    for bs in powers_of_2():
        if bs > len(xs):
            break
        blocks = [tuple(x) for x in chunks(xs, bs)]
        uniq = set(blocks)

        # stage1: one index per block; isize() (from lib) is taken to be the
        # byte width of the largest index -- confirm against lib.py.
        sz_s1 = len(blocks) * isize(len(uniq) - 1)
        # stage2: one row per distinct block, counted at one byte per entry
        # (assumes the enum is stored in a single byte -- confirm).
        sz_s2 = len(uniq) * bs
        sz = sz_s1 + sz_s2

        if sz < smallest:
            smallest, blksize = sz, bs

    Cs = [tuple(x) for x in chunks(xs, blksize)]
    # redirect_stdout() restores sys.stdout afterwards instead of leaving it
    # bound to a closed file.
    with open('lib/unicode/prop/uprop_get_wb.c', 'w', encoding='utf-8') as f, \
            redirect_stdout(f):
        genfile(Cs, blksize)

    report_size(len(xs), smallest)


if __name__ == '__main__':
    main()