From 6b3518c84ad90b1b593ef4bb700b39465c79b50e Mon Sep 17 00:00:00 2001
From: Thomas Voss <mail@thomasvoss.com>
Date: Tue, 7 May 2024 22:50:21 +0200
Subject: Use 2-stage lookups for u8gnext() and u8gcnt()

---
 gen/string/gbrk   | 243 +++++++++++++++++++++++++++++++++++-------------------
 gen/string/lib.py |   1 +
 2 files changed, 161 insertions(+), 83 deletions(-)
 create mode 120000 gen/string/lib.py

(limited to 'gen/string')
diff --git a/gen/string/gbrk b/gen/string/gbrk
index e0acca7..8397e90 100755
--- a/gen/string/gbrk
+++ b/gen/string/gbrk
@@ -1,97 +1,174 @@
-#!/bin/sh
+#!/usr/bin/python3
+
+import math
+
+from lib import *
+
+
+MAP = {  # UCD property name -> value packed into one stage2 byte per code point
+	'Control':               1,  # 1..12 mirror enum uprop_gbrk in genfile()'s template
+	'Extended_Pictographic': 2,
+	'Extend':                3,
+	'L':                     4,
+	'LV':                    5,
+	'LVT':                   6,
+	'Prepend':               7,
+	'Regional_Indicator':    8,
+	'SpacingMark':           9,
+	'T':                    10,
+	'V':                    11,
+	'ZWJ':                  12,
+	# InCB classes 1..3 sit in bits 6-7, the part GBRK_PROP_HI() extracts in C
+	'InCB; Consonant': 0b0100_0000,
+	'InCB; Extend':    0b1000_0000,
+	'InCB; Linker':    0b1100_0000,
+}
+
+longest = 3  # assumed max decimal width of a stage2 entry; genfile() pads to longest + 1
+
+def parse(*files: str) -> list[str]:
+	"""Return the packed MAP value of every code point, as decimal strings.
+
+	Rows are expected as 'LO[..HI] ; PROP [; VALUE] # comment'; rows whose
+	property is not a MAP key are skipped, the rest are OR-ed over their range.
+	"""
+	xs = [0] * 0x110000
+	lines = []
+	for file in files:
+		# UCD data files are published as UTF-8; do not depend on the locale
+		with open(file, 'r', encoding='utf-8') as f:
+			lines.extend(f)
+
+	for line in lines:
+		if not line.strip() or line[0] == '#':
+			continue
+		parts = line.split(';')
+		ranges = [int(x, 16) for x in parts[0].strip().split('..')]
+		if parts[1].strip() == 'InCB':
+			p = 'InCB; ' + parts[2].split('#')[0].strip()
+		else:
+			p = parts[1].split('#')[0].strip()
+		if p not in MAP:
+			continue
+		for i in range(ranges[0], ranges[-1] + 1):
+			xs[i] |= MAP[p]
+	return list(map(str, xs))
 
-set -e
-cd "${0%/*}/../.."
-exec >include/unicode/_gbrk.h
+def genfile(cs: list[tuple[str, ...]], blksize: int) -> None:
+	"""Print the C header to stdout: enums, the stage1/stage2 tables and the
+	lookup function.  cs holds every blksize-long block, in code point order."""
+	Cs, cs = cs, list(dict.fromkeys(cs))  # every block / first occurrence of each
 
-cat <<C
+	print('''\
 /* This file is autogenerated by gen/string/gbrk; DO NOT EDIT. */
 
 #ifndef MLIB_UNICODE__GBRK_H
 #define MLIB_UNICODE__GBRK_H
 
-/* clang-format off */
+#include <stdint.h>
 
+#include "_attrs.h"
 #include "_rune.h"
 
-typedef enum {
-	GBP_OTHER = 0,
-
-	GBP_CTRL = 1 << 0, /* Control */
-	GBP_EXT  = 1 << 1, /* Extend */
-	GBP_PIC  = 1 << 2, /* Extended_Pictographic */
-	GBP_PREP = 1 << 3, /* Prepend */
-	GBP_RI   = 1 << 4, /* Regional_Indicator */
-	GBP_SM   = 1 << 5, /* SpacingMark */
-	GBP_ZWJ  = 1 << 6, /* ZWJ */
-
-	GBP_HNGL_L   = 1 <<  7, /* Hangul L */
-	GBP_HNGL_LV  = 1 <<  8, /* Hangul LV */
-	GBP_HNGL_LVT = 1 <<  9, /* Hangul LVT */
-	GBP_HNGL_T   = 1 << 10, /* Hangul T */
-	GBP_HNGL_V   = 1 << 11, /* Hangul V */
-
-	GBP_INDC_CNSNT = 1 << 12, /* Indic Consonant */
-	GBP_INDC_EXT   = 1 << 13, /* Indic Extend */
-	GBP_INDC_LNK   = 1 << 14, /* Indic Linker */
-} gbrk_prop;
-
-static const struct {
-	rune lo, hi;
-	gbrk_prop val;
-} gbrk_prop_tbl[] = {
-C
-
-gawk '
-BEGIN {
-	FS = "( *#.*| +; +)"
-	map["Control"]               = "CTRL"
-	map["Extend"]                = "EXT"
-	map["Extended_Pictographic"] = "PIC"
-	map["Prepend"]               = "PREP"
-	map["Regional_Indicator"]    = "RI"
-	map["SpacingMark"]           = "SM"
-	map["ZWJ"]                   = "ZWJ"
-
-	map["L"]   = "HNGL_L"
-	map["LV"]  = "HNGL_LV"
-	map["LVT"] = "HNGL_LVT"
-	map["T"]   = "HNGL_T"
-	map["V"]   = "HNGL_V"
-
-	map["InCB; Consonant"] = "INDC_CNSNT"
-	map["InCB; Extend"]    = "INDC_EXT"
-	map["InCB; Linker"]    = "INDC_LNK"
-}
+#define GBRK_PROP_HI(x) ((x) >> 6)
+#define GBRK_PROP_LO(x) ((x) & 63)
 
-map[$2] {
-	n = split($1, a, /\.\./)
-	lo = strtonum("0X" a[1])
-	hi = strtonum("0X" a[n])
-
-	for (i = lo; i <= hi; i++) {
-		s = "GBP_" map[$2]
-		props[i] = props[i] ? props[i] " | " s : s
-	}
-}
-
-END {
-	for (i = 0; i <= 0x10FFFF; i++) {
-		if (!props[i])
-			continue
-		lo = i
-		while (props[lo] == props[i + 1])
-			i++
-		printf "\t{0x%06X, 0x%06X, %s},\n", lo, i, props[lo]
-	}
-}
-' data/GraphemeBreakProperty \
-  data/DerivedCoreProperties \
-  data/emoji-data \
-| sort
+enum uprop_gbrk_indc {
+	GBRK_INDC_CNSNT = 1, /* Consonant */
+	GBRK_INDC_EXT,       /* Extend */
+	GBRK_INDC_LNK,       /* Linker */
+};
 
-cat <<C
+enum uprop_gbrk {
+	GBRK_XX = 0,   /* Other */
+	GBRK_CN,       /* Control */
+	GBRK_EXT_PICT, /* Extended Pictographic */
+	GBRK_EX,       /* Extend */
+	GBRK_HST_L,    /* L */
+	GBRK_HST_LV,   /* LV */
+	GBRK_HST_LVT,  /* LVT */
+	GBRK_PP,       /* Prepend */
+	GBRK_RI,       /* Regional Indicator */
+	GBRK_SM,       /* SpacingMark */
+	GBRK_HST_T,    /* T */
+	GBRK_HST_V,    /* V */
+	GBRK_ZWJ,      /* ZWJ */
+	_GBRK_LO_CNT,
 };
 
-#endif /* !MLIB_UNICODE__GBRK_H */
-C
+static_assert(_GBRK_LO_CNT - 1 <= 0b0011'1111,
+              "2 bits are required to pack Indic syllables");
+''')
+
+	idx = {c: i for i, c in enumerate(cs)}  # O(1) stand-in for cs.index(c) per block
+	print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{')
+	for i, c in enumerate(Cs):
+		print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', idx[c]), end='')
+		if i % 16 == 15:
+			print()
+	print('};')
+
+	print()
+	ppc = columns(blksize, longest + 1)  # presumably cells per output row; see lib.py
+	print(f'static constexpr uint8_t stage2[][{blksize}] = {{')
+	for c in cs:
+		for i in range(blksize // ppc):
+			print('\t{' if i == 0 else '\t ', end='')
+			for j in range(ppc):
+				print(c[i*ppc + j], end='')
+				if i < blksize // ppc - 1 or j < ppc - 1:  # no comma after a block's last cell
+					print(',', end='')
+				if j < ppc - 1:  # pad so the cells of every row line up
+					print(' ' * (longest + 1 - len(c[i*ppc + j])), end='')
+			if i < blksize // ppc - 1:
+				print()
+		print('},')
+	print('};')
+
+	print()
+
+	print(f'''\
+[[_mlib_pure, _mlib_inline]]
+static void
+uprop_get_gbrk(enum uprop_gbrk *x, enum uprop_gbrk_indc *y, rune ch)
+{{
+	uint8_t z = stage2[stage1[ch / {blksize}]][ch % {blksize}];
+	*x = GBRK_PROP_LO(z);
+	*y = GBRK_PROP_HI(z);
+}}
+
+#endif /* !MLIB_UNICODE__GBRK_H */''')
+
+def main() -> None:
+	"""Pick the block size that minimises both tables, then write the header."""
+	from contextlib import redirect_stdout
+
+	cwd_init()
+	xs = parse(
+		'data/GraphemeBreakProperty',
+		'data/DerivedCoreProperties',
+		'data/emoji-data',
+	)
+
+	blksize = -1
+	smallest = math.inf
+	for bs in powers_of_2():
+		if bs > len(xs):
+			break
+		Cs = [tuple(x) for x in chunks(xs, bs)]
+		cs = set(Cs)
+
+		# stage1 index per block (isize() is presumably its byte width) + 1 byte per stage2 cell
+		sz = len(Cs) * isize(len(cs) - 1) + len(cs) * bs
+		if sz < smallest:
+			smallest = sz
+			blksize = bs
+
+	Cs = [tuple(x) for x in chunks(xs, blksize)]
+	# genfile() print()s to stdout; redirect only around the call so stdout is
+	# restored, not left bound to the closed file, when report_size() runs
+	with open('include/unicode/_gbrk.h', 'w') as f, redirect_stdout(f):
+		genfile(Cs, blksize)
+	report_size(len(xs), smallest)
+
+if __name__ == '__main__':
+	main()
diff --git a/gen/string/lib.py b/gen/string/lib.py
new file mode 120000
index 0000000..33218f3
--- /dev/null
+++ b/gen/string/lib.py
@@ -0,0 +1 @@
+../prop/lib.py
\ No newline at end of file
-- 
cgit v1.2.3