#!/usr/bin/python3
import math
from lib import *
# Maps a property name, as spelled in the Unicode data files, to the value
# OR-ed into that code point's table byte.
MAP = {
    # Low six bits: grapheme break property.  The values count up from 1 in
    # the same order as the members of the generated `enum uprop_gbrk`
    # (0 is GBRK_XX, i.e. "Other").
    **{
        name: value
        for value, name in enumerate(
            (
                'Control',
                'Extended_Pictographic',
                'Extend',
                'L',
                'LV',
                'LVT',
                'Prepend',
                'Regional_Indicator',
                'SpacingMark',
                'T',
                'V',
                'ZWJ',
            ),
            start=1,
        )
    },
    # High two bits: Indic_Conjunct_Break, matching `enum uprop_gbrk_indc`
    # once shifted down by GBRK_PROP_HI().
    'InCB; Consonant': 1 << 6,
    'InCB; Extend': 2 << 6,
    'InCB; Linker': 3 << 6,
}

# Width in characters reserved for one stage2 entry when genfile() pads the
# table columns; entries are emitted as uint8_t, so three digits suffice.
longest = 3
def parse(*files: str) -> list[str]:
    """Build the per-code-point property table from Unicode data files.

    Each data line has the form ``XXXX[..YYYY] ; Property [; Value] # comment``.
    Blank lines and comment lines are ignored, as are properties that have no
    entry in ``MAP``.  ``InCB`` lines are looked up under the key
    ``'InCB; <Value>'``; every other line is looked up under its property
    name.  The mapped value is OR-ed into every code point of the range, so
    a code point listed under several properties accumulates all of them.

    Args:
        *files: Paths of the data files, processed in the order given.

    Returns:
        A list with 0x110000 entries (one per code point), each the decimal
        string of that code point's combined value ('0' when nothing applied).

    Raises:
        OSError: If a file cannot be opened.
        ValueError: If a code point field is not valid hexadecimal.
        IndexError: If a data line has no ';'-separated property field.
    """
    xs = [0] * 0x110000
    for file in files:
        # The Unicode data files are UTF-8 and carry non-ASCII characters in
        # their comments, so do not depend on the locale's default encoding.
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                # Dropping the trailing comment first also takes care of
                # blank lines and (possibly indented) comment-only lines.
                data = line.split('#', 1)[0].strip()
                if not data:
                    continue
                fields = [field.strip() for field in data.split(';')]
                if fields[1] == 'InCB':
                    # 'InCB' + value -> 'InCB; Consonant' etc.; a line with
                    # no value yields plain 'InCB', which MAP does not have.
                    prop = '; '.join(fields[1:3])
                else:
                    prop = fields[1]
                if prop not in MAP:
                    continue
                value = MAP[prop]
                # A lone code point is a range whose ends coincide.
                bounds = [int(x, 16) for x in fields[0].split('..')]
                for cp in range(bounds[0], bounds[-1] + 1):
                    xs[cp] |= value
    return list(map(str, xs))
def genfile(cs: list[tuple[str, ...]], blksize: int) -> None:
    """Print the generated C header with the two-stage lookup table.

    Everything is written with print(), i.e. to whatever sys.stdout
    currently is.  ``stage1`` holds, for every block of ``blksize`` code
    points, the index of that block's contents within ``stage2``; ``stage2``
    holds each distinct block once, in order of first appearance.

    Args:
        cs: One tuple per block, in code point order, each holding
            ``blksize`` table entries already formatted as decimal strings.
        blksize: Number of code points per block.
    """
    blocks = cs
    # Deduplicate while keeping first-appearance order, and remember each
    # unique block's position so stage1 needs no linear search per block.
    unique = list(dict.fromkeys(blocks))
    index_of = {block: i for i, block in enumerate(unique)}

    print('''\
/* This file is autogenerated by gen/string/gbrk; DO NOT EDIT. */
#ifndef MLIB_UNICODE__GBRK_H
#define MLIB_UNICODE__GBRK_H
#include <stdint.h>
#include "_attrs.h"
#include "_rune.h"
#define GBRK_PROP_HI(x) ((x) >> 6)
#define GBRK_PROP_LO(x) ((x) & 63)
enum uprop_gbrk_indc {
GBRK_INDC_CNSNT = 1, /* Consonant */
GBRK_INDC_EXT, /* Extend */
GBRK_INDC_LNK, /* Linker */
};
enum uprop_gbrk {
GBRK_XX = 0, /* Other */
GBRK_CN, /* Control */
GBRK_EXT_PICT, /* Extended Pictographic */
GBRK_EX, /* Extend */
GBRK_HST_L, /* L */
GBRK_HST_LV, /* LV */
GBRK_HST_LVT, /* LVT */
GBRK_PP, /* Prepend */
GBRK_RI, /* Regional Indicator */
GBRK_SM, /* SpacingMark */
GBRK_HST_T, /* T */
GBRK_HST_V, /* V */
GBRK_ZWJ, /* ZWJ */
_GBRK_LO_CNT,
};
static_assert(_GBRK_LO_CNT - 1 <= 0b0011'1111,
"2 bits are required to pack Indic syllables");
''')

    # NOTE(review): typename() comes from lib; presumably it names a C
    # integer type able to hold the largest stage2 index -- confirm.
    print(f'static constexpr {typename(len(unique) - 1)} stage1[] = {{')
    width = len(str(len(unique) - 1))
    for i, block in enumerate(blocks):
        # Sixteen right-aligned indices per line, the line led by a tab.
        lead = '\t' if i % 16 == 0 else ' '
        print(f'{lead}{index_of[block]:{width}d},', end='')
        if i % 16 == 15:
            print()
    if len(blocks) % 16 != 0:
        # Terminate a final, partially filled row before the closing brace.
        print()
    print('};')
    print()

    # NOTE(review): columns() comes from lib; presumably it returns how many
    # entries of the given width to put on one line -- confirm.
    ppc = columns(blksize, longest + 1)
    rows = blksize // ppc
    print(f'static constexpr uint8_t stage2[][{blksize}] = {{')
    for block in unique:
        for r in range(rows):
            last_row = r == rows - 1
            print('\t{' if r == 0 else '\t ', end='')
            for j in range(ppc):
                last_col = j == ppc - 1
                item = block[r * ppc + j]
                print(item, end='')
                # Every entry but the block's very last gets a comma ...
                if not (last_row and last_col):
                    print(',', end='')
                # ... and every entry but a row's last is padded so that
                # the columns line up.
                if not last_col:
                    print(' ' * (longest + 1 - len(item)), end='')
            if not last_row:
                print()
        print('},')
    print('};')
    print()

    print(f'''\
[[_mlib_pure, _mlib_inline]]
static void
uprop_get_gbrk(enum uprop_gbrk *x, enum uprop_gbrk_indc *y, rune ch)
{{
uint8_t z = stage2[stage1[ch / {blksize}]][ch % {blksize}];
*x = GBRK_PROP_LO(z);
*y = GBRK_PROP_HI(z);
}}
#endif /* !MLIB_UNICODE__GBRK_H */''')
def main() -> None:
    """Generate include/unicode/_gbrk.h from the Unicode data files.

    Parses the data files, tries every candidate block size, keeps the one
    that gives the smallest estimated table size, and writes the header for
    that block size.
    """
    # Imported locally so this function does not depend on a `sys` name
    # happening to leak out of `from lib import *`.
    from contextlib import redirect_stdout

    cwd_init()
    xs = parse(
        'data/GraphemeBreakProperty',
        'data/DerivedCoreProperties',
        'data/emoji-data',
    )

    blksize = -1
    smallest = math.inf
    for bs in powers_of_2():
        if bs > len(xs):
            break
        blocks = [tuple(x) for x in chunks(xs, bs)]
        unique = set(blocks)
        # Estimated cost: one stage1 index per block plus one byte per
        # stage2 entry.
        # NOTE(review): isize() comes from lib; presumably the byte size of
        # an integer able to hold its argument -- confirm.
        sz = len(blocks) * isize(len(unique) - 1) + len(unique) * bs
        if sz < smallest:
            smallest = sz
            blksize = bs

    Cs = [tuple(x) for x in chunks(xs, blksize)]
    # genfile() print()s the header; redirect stdout into the file only for
    # the duration of that call so stdout is restored (not left pointing at
    # a closed file) by the time report_size() runs.
    with open('include/unicode/_gbrk.h', 'w') as f, redirect_stdout(f):
        genfile(Cs, blksize)
    report_size(len(xs), smallest)
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()