From 6b3518c84ad90b1b593ef4bb700b39465c79b50e Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Tue, 7 May 2024 22:50:21 +0200 Subject: Use a 2-stage lookups for u8gnext() and u8gcnt() --- gen/string/gbrk | 243 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 160 insertions(+), 83 deletions(-) (limited to 'gen/string/gbrk') diff --git a/gen/string/gbrk b/gen/string/gbrk index e0acca7..8397e90 100755 --- a/gen/string/gbrk +++ b/gen/string/gbrk @@ -1,97 +1,174 @@ -#!/bin/sh +#!/usr/bin/python3 + +import math + +from lib import * + + +MAP = { + 'Control': 1, + 'Extended_Pictographic': 2, + 'Extend': 3, + 'L': 4, + 'LV': 5, + 'LVT': 6, + 'Prepend': 7, + 'Regional_Indicator': 8, + 'SpacingMark': 9, + 'T': 10, + 'V': 11, + 'ZWJ': 12, + + 'InCB; Consonant': 0b0100_0000, + 'InCB; Extend': 0b1000_0000, + 'InCB; Linker': 0b1100_0000, +} + +longest = 3 + +def parse(*files: str) -> list[bool]: + global longest + + xs = [0] * 0x110000 + + lines = [] + for file in files: + with open(file, 'r') as f: + lines.extend(f.readlines()) + + for line in lines: + if len(line.strip()) == 0 or line[0] == '#': + continue + + parts = line.split(';') + ranges = [int(x, 16) for x in parts[0].strip().split('..')] + + if parts[1].strip() == 'InCB': + p = 'InCB; ' + parts[2].split('#')[0].strip() + else: + p = parts[1].split('#')[0].strip() + if p not in MAP: + continue + + for i in range(ranges[0], ranges[len(ranges) - 1] + 1): + xs[i] |= MAP[p] + return list(map(str, xs)) -set -e -cd "${0%/*}/../.." -exec >include/unicode/_gbrk.h +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: + Cs = cs + cs = list(dict.fromkeys(Cs)) -cat < +#include "_attrs.h" #include "_rune.h" -typedef enum { - GBP_OTHER = 0, - - GBP_CTRL = 1 << 0, /* Control */ - GBP_EXT = 1 << 1, /* Extend */ - GBP_PIC = 1 << 2, /* Extended_Pictographic */ - GBP_PREP = 1 << 3, /* Prepend */ - GBP_RI = 1 << 4, /* Regional_Indicator */ - GBP_SM = 1 << 5, /* SpacingMark */ - GBP_ZWJ = 1 << 6, /* ZWJ */ - - GBP_HNGL_L = 1 << 7, /* Hangul L */ - GBP_HNGL_LV = 1 << 8, /* Hangul LV */ - GBP_HNGL_LVT = 1 << 9, /* Hangul LVT */ - GBP_HNGL_T = 1 << 10, /* Hangul T */ - GBP_HNGL_V = 1 << 11, /* Hangul V */ - - GBP_INDC_CNSNT = 1 << 12, /* Indic Consonant */ - GBP_INDC_EXT = 1 << 13, /* Indic Extend */ - GBP_INDC_LNK = 1 << 14, /* Indic Linker */ -} gbrk_prop; - -static const struct { - rune lo, hi; - gbrk_prop val; -} gbrk_prop_tbl[] = { -C - -gawk ' -BEGIN { - FS = "( *#.*| +; +)" - map["Control"] = "CTRL" - map["Extend"] = "EXT" - map["Extended_Pictographic"] = "PIC" - map["Prepend"] = "PREP" - map["Regional_Indicator"] = "RI" - map["SpacingMark"] = "SM" - map["ZWJ"] = "ZWJ" - - map["L"] = "HNGL_L" - map["LV"] = "HNGL_LV" - map["LVT"] = "HNGL_LVT" - map["T"] = "HNGL_T" - map["V"] = "HNGL_V" - - map["InCB; Consonant"] = "INDC_CNSNT" - map["InCB; Extend"] = "INDC_EXT" - map["InCB; Linker"] = "INDC_LNK" -} +#define GBRK_PROP_HI(x) ((x) >> 6) +#define GBRK_PROP_LO(x) ((x) & 63) -map[$2] { - n = split($1, a, /\.\./) - lo = strtonum("0X" a[1]) - hi = strtonum("0X" a[n]) - - for (i = lo; i <= hi; i++) { - s = "GBP_" map[$2] - props[i] = props[i] ? props[i] " | " s : s - } -} - -END { - for (i = 0; i <= 0x10FFFF; i++) { - if (!props[i]) - continue - lo = i - while (props[lo] == props[i + 1]) - i++ - printf "\t{0x%06X, 0x%06X, %s},\n", lo, i, props[lo] - } -} -' data/GraphemeBreakProperty \ - data/DerivedCoreProperties \ - data/emoji-data \ -| sort +enum uprop_gbrk_indc { + GBRK_INDC_CNSNT = 1, /* Consonant */ + GBRK_INDC_EXT, /* Extend */ + GBRK_INDC_LNK, /* Linker */ +}; -cat < None: + cwd_init() + xs = parse( + 'data/GraphemeBreakProperty', + 'data/DerivedCoreProperties', + 'data/emoji-data', + ) + + blksize = -1 + smallest = math.inf + + for bs in powers_of_2(): + if bs > len(xs): + break + Cs = [tuple(x) for x in chunks(xs, bs)] + cs = set(Cs) + + sz_s1 = len(Cs) * isize(len(cs) - 1) + sz_s2 = len(cs) * bs + sz = sz_s1 + sz_s2 + + if sz < smallest: + smallest = sz + blksize = bs + + Cs = [tuple(x) for x in chunks(xs, blksize)] + with open('include/unicode/_gbrk.h', 'w') as f: + sys.stdout = f + genfile(Cs, blksize) + + report_size(len(xs), smallest) + +if __name__ == '__main__': + main() -- cgit v1.2.3