aboutsummaryrefslogblamecommitdiff
path: root/gen/string/gbrk
blob: 8397e902c6ddf59574f8f792f8e1c79414739891 (plain) (tree)





















































                                                                           
 

                                                              
 
                  
                                                                 
 
                            
 
                   
 
                   
                  
 
                                  
 



                                            
 













                                                  
  











































































                                                                                                            
#!/usr/bin/python3

import math

from lib import *


# Values that parse() ORs into a code point's table entry, keyed by the
# property name as it appears in the data files.
#
# The first group is numbered 1..12 in the same order as the members of
# `enum uprop_gbrk' in the header emitted by genfile() (0 is GBRK_XX, i.e. no
# property at all), so the order of this tuple must not change.
MAP = {
	name: value
	for value, name in enumerate(
		(
			'Control',
			'Extended_Pictographic',
			'Extend',
			'L',
			'LV',
			'LVT',
			'Prepend',
			'Regional_Indicator',
			'SpacingMark',
			'T',
			'V',
			'ZWJ',
		),
		start=1,
	)
}

# The InCB values live in the top two bits of the entry, which is what the
# generated GBRK_PROP_HI() macro (x >> 6) extracts; 1, 2 and 3 line up with
# the members of `enum uprop_gbrk_indc'.
MAP.update({
	'InCB; Consonant': 1 << 6,
	'InCB; Extend':    2 << 6,
	'InCB; Linker':    3 << 6,
})

# Number of characters in the widest stage-2 entry when printed in decimal;
# genfile() uses it to align the columns of the table.
longest = 3

def parse(*files: str) -> list[str]:
	"""Build the per-code-point property table from the given data files.

	Blank lines and lines starting with `#' are ignored.  Every other line
	is split on `;' and is expected to have one of these shapes:

		<hex>[..<hex>] ; <property> [# comment]
		<hex>[..<hex>] ; InCB ; <value> [# comment]

	The property (looked up as `InCB; <value>' for the second shape) is
	mapped through MAP and OR-ed into the entry of every code point in the
	inclusive range.  Lines whose property is not in MAP are skipped.

	Returns a list of 0x110000 entries indexed by code point, each entry
	being the decimal string of the accumulated value (`0' when nothing
	matched).
	"""
	xs = [0] * 0x110000

	for file in files:
		# The Unicode data files are UTF-8; do not depend on whatever the
		# locale's default encoding happens to be.
		with open(file, 'r', encoding='utf-8') as f:
			for line in f:
				if not line.strip() or line.startswith('#'):
					continue

				parts = line.split(';')
				ranges = [int(x, 16) for x in parts[0].strip().split('..')]

				if parts[1].strip() == 'InCB':
					p = 'InCB; ' + parts[2].split('#')[0].strip()
				else:
					p = parts[1].split('#')[0].strip()
				if p not in MAP:
					continue

				# A single code point yields a one-element list, so the
				# first and last element are the same and the range covers
				# just that code point.
				for i in range(ranges[0], ranges[-1] + 1):
					xs[i] |= MAP[p]

	# genfile() prints the entries and measures their printed width, so
	# hand them over as strings.
	return list(map(str, xs))

def genfile(cs: list[tuple[str, ...]], blksize: int) -> None:
	"""Print the generated C header to standard output.

	cs is the full property table cut into consecutive blocks of blksize
	entries each (entries are decimal strings).  The header contains:

	- the property enums and the GBRK_PROP_HI/GBRK_PROP_LO macros;
	- stage1[], which maps a block number to an index into stage2;
	- stage2[][blksize], the distinct blocks in order of first appearance;
	- uprop_get_gbrk(), which performs the two-stage lookup.

	typename() and columns() come from lib; presumably they pick the C
	integer type able to hold the largest stage-1 index and the number of
	entries to print per line -- confirm against lib.
	"""
	blocks = cs
	# Deduplicate while keeping first-seen order, and remember where each
	# distinct block ended up so the stage-1 loop does not have to rescan
	# the list for every entry.
	index = {blk: i for i, blk in enumerate(dict.fromkeys(blocks))}
	uniq = list(index)

	print('''\
/* This file is autogenerated by gen/string/gbrk; DO NOT EDIT. */

#ifndef MLIB_UNICODE__GBRK_H
#define MLIB_UNICODE__GBRK_H

#include <stdint.h>

#include "_attrs.h"
#include "_rune.h"

#define GBRK_PROP_HI(x) ((x) >> 6)
#define GBRK_PROP_LO(x) ((x) & 63)

enum uprop_gbrk_indc {
	GBRK_INDC_CNSNT = 1, /* Consonant */
	GBRK_INDC_EXT,       /* Extend */
	GBRK_INDC_LNK,       /* Linker */
};

enum uprop_gbrk {
	GBRK_XX = 0,   /* Other */
	GBRK_CN,       /* Control */
	GBRK_EXT_PICT, /* Extended Pictographic */
	GBRK_EX,       /* Extend */
	GBRK_HST_L,    /* L */
	GBRK_HST_LV,   /* LV */
	GBRK_HST_LVT,  /* LVT */
	GBRK_PP,       /* Prepend */
	GBRK_RI,       /* Regional Indicator */
	GBRK_SM,       /* SpacingMark */
	GBRK_HST_T,    /* T */
	GBRK_HST_V,    /* V */
	GBRK_ZWJ,      /* ZWJ */
	_GBRK_LO_CNT,
};

static_assert(_GBRK_LO_CNT - 1 <= 0b0011'1111,
              "2 bits are required to pack Indic syllables");
''')

	# Stage 1: sixteen right-aligned indices per line.
	width = len(str(len(uniq) - 1))
	print(f'static constexpr {typename(len(uniq) - 1)} stage1[] = {{')
	for i, blk in enumerate(blocks):
		lead = '\t' if i % 16 == 0 else ' '
		print(f'{lead}{index[blk]:{width}d},', end='')
		if i % 16 == 15:
			print()
	# Terminate a partially filled last line so that the closing brace
	# does not end up glued to the data.
	if len(blocks) % 16 != 0:
		print()
	print('};')

	print()

	# Stage 2: every distinct block as a brace-enclosed initializer, ppc
	# entries per line, padded so that the columns line up.
	ppc = columns(blksize, longest + 1)
	rows = blksize // ppc
	print(f'static constexpr uint8_t stage2[][{blksize}] = {{')
	for blk in uniq:
		for i in range(rows):
			print('\t{' if i == 0 else '\t ', end='')
			for j in range(ppc):
				cell = blk[i*ppc + j]
				print(cell, end='')
				# No comma after the very last entry of the block.
				if i < rows - 1 or j < ppc - 1:
					print(',', end='')
				# No trailing padding at the end of a line.
				if j < ppc - 1:
					print(' ' * (longest + 1 - len(cell)), end='')
			if i < rows - 1:
				print()
		print('},')
	print('};')

	print()

	print(f'''\
[[_mlib_pure, _mlib_inline]]
static void
uprop_get_gbrk(enum uprop_gbrk *x, enum uprop_gbrk_indc *y, rune ch)
{{
	uint8_t z = stage2[stage1[ch / {blksize}]][ch % {blksize}];
	*x = GBRK_PROP_LO(z);
	*y = GBRK_PROP_HI(z);
}}

#endif /* !MLIB_UNICODE__GBRK_H */''')

def main() -> None:
	"""Generate include/unicode/_gbrk.h from the Unicode data files.

	Parses the data files into one flat table, tries every block size
	produced by powers_of_2() that does not exceed the table length, and
	keeps the one for which the estimated size of the two-stage table
	(stage 1 plus stage 2) is smallest.  The header is then written with
	that block size and the size figures are passed to report_size().
	"""
	# Imported here because the file does not import contextlib at the top.
	from contextlib import redirect_stdout

	cwd_init()
	xs = parse(
		'data/GraphemeBreakProperty',
		'data/DerivedCoreProperties',
		'data/emoji-data',
	)

	blksize = -1
	smallest = math.inf

	for bs in powers_of_2():
		if bs > len(xs):
			break
		Cs = [tuple(x) for x in chunks(xs, bs)]
		cs = set(Cs)

		# Stage 1 has one index per block; isize() presumably gives the
		# size in bytes of an integer able to hold the largest index.
		# Stage 2 has bs one-byte entries per distinct block.
		sz_s1 = len(Cs) * isize(len(cs) - 1)
		sz_s2 = len(cs) * bs
		sz = sz_s1 + sz_s2

		if sz < smallest:
			smallest = sz
			blksize = bs

	Cs = [tuple(x) for x in chunks(xs, blksize)]
	# genfile() prints to standard output.  Redirect it only for the
	# duration of the call, so that stdout is not left pointing at a closed
	# file afterwards.
	with open('include/unicode/_gbrk.h', 'w') as f, redirect_stdout(f):
		genfile(Cs, blksize)

	report_size(len(xs), smallest)

# Run the generator only when this file is executed as a script, not when it
# is imported.
if __name__ == '__main__':
	main()