aboutsummaryrefslogtreecommitdiff
path: root/gen/prop/bool-props.py
blob: a913904300b7bfbd979e3b4373d467eb17ea0d3b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/python3

import functools
import math
import sys
from typing import Generator


def chunks[T](xs: list[T], n: int) -> Generator[list[T], None, None]:
	for i in range(0, len(xs), n):
		yield xs[i:i + n]

def powers_of_2() -> Generator[int, None, None]:
	"""Yield 1, 2, 4, 8, ... without end; the caller decides when to stop."""
	power = 1
	while True:
		yield power
		power *= 2

def bytes_per_col(n: int) -> int:
	"""Return the largest divisor of n whose row estimate fits 80 columns.

	A row of x bytes is estimated at 5 + 5 * x + (x - 1) columns.  Only
	divisors of n are considered, so that n bytes split into rows of equal
	length.

	Raises:
		ValueError: if n is smaller than 1, in which case it has no
			usable divisor.
	"""
	if n < 1:
		raise ValueError(f'cannot split {n} bytes into rows')

	# Collect the divisors in pairs (d, n // d); math.isqrt() keeps the
	# upper bound exact where a float square root could be off by one.
	divisors: set[int] = set()
	for d in range(1, math.isqrt(n) + 1):
		if n % d == 0:
			divisors.update((d, n // d))

	for x in sorted(divisors, reverse=True):
		width = 5 + x * 5 + (x - 1)
		if width <= 80:
			return x

	# Not reachable for n >= 1, since the divisor 1 always fits; kept so
	# that the function never falls through and returns None.
	raise ValueError(f'no divisor of {n} fits in 80 columns')

def isize(x: int) -> int:
	"""Return the size in bytes of the narrowest of the 8-, 16-, 32- and
	64-bit unsigned integer types that can hold x.

	The result is 1, 2, 4 or 8, so it can be multiplied by an element
	count to get a table size in bytes.

	Raises:
		ValueError: if x does not fit in 64 bits.
	"""
	# Each bound is the first value that no longer fits, so the maximum
	# of every type (255, 65535, ...) stays in its own tier.
	if x < 1 << 8:
		return 1
	if x < 1 << 16:
		return 2
	if x < 1 << 32:
		return 4
	if x < 1 << 64:
		return 8
	raise ValueError

def typename(x: int) -> str:
	"""Return the name of the narrowest of uint8_t, uint16_t, uint32_t and
	uint64_t that can hold x.

	Raises:
		ValueError: if x does not fit in 64 bits.
	"""
	# Each bound is the first value that no longer fits, so the maximum
	# of every type (255, 65535, ...) maps to that type and not the next.
	if x < 1 << 8:
		return "uint8_t"
	if x < 1 << 16:
		return "uint16_t"
	if x < 1 << 32:
		return "uint32_t"
	if x < 1 << 64:
		return "uint64_t"
	raise ValueError

def parse(file: str) -> list[bool]:
	"""Read a property data file into one flag per code point.

	The result has 0x110000 entries, all False except those covered by a
	matching line.  A line matches when it is not empty, does not start
	with '#', and contains sys.argv[1] as a substring.  The text before
	the first ';' of a matching line is read as a single hexadecimal
	code point or as an inclusive 'first..last' range.

	As a side effect, a sys.argv[1] of 'Indic_Conjunct_Break' is replaced
	in place by 'InCB;' before any line is examined.
	"""
	if sys.argv[1] == 'Indic_Conjunct_Break':
		sys.argv[1] = 'InCB;'
	# NOTE(review): this is a plain substring test against the whole line,
	# trailing comment included, so a name that is contained in another
	# name also matches that one's lines -- confirm the callers rely on it.
	needle = sys.argv[1]

	flags = [False] * 0x110000
	with open(file, 'r') as fp:
		for row in fp:
			if not row or row.startswith('#') or needle not in row:
				continue
			field = row.split(';')[0].strip()
			bounds = [int(tok, 16) for tok in field.split('..')]
			first, last = bounds[0], bounds[-1]
			for cp in range(first, last + 1):
				flags[cp] = True
	return flags

def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None:
	"""Print the C source of a two-stage lookup table to standard output.

	cs holds one tuple of flags per block of blksize code points, in
	code-point order.  The output consists of:

	- stage1, which maps every block number to a row of stage2;
	- stage2, which holds each distinct block once, as blksize // 8 bytes
	  where flag i of the block is bit i % 8 of byte i // 8;
	- uprop_is_<sys.argv[2]>(), which tests a code point against both.

	Raises:
		ValueError: from bytes_per_col() when blksize is smaller than 8.
	"""
	blocks = cs
	uniq = list(dict.fromkeys(blocks))
	# Row number of every distinct block, so that filling stage1 does not
	# need a linear search through uniq for each block.
	row_of = {blk: row for row, blk in enumerate(uniq)}

	print('''\
/* This file is autogenerated by gen/prop/bool-props; DO NOT EDIT. */

#include "bitset.h"
#include "unicode/prop.h"
''')

	print(f'static constexpr {typename(len(uniq) - 1)} stage1[] = {{')
	# All indices are right-aligned to the width of the largest one.
	width = len(str(len(uniq) - 1))
	for i, blk in enumerate(blocks):
		lead = '\t' if i % 16 == 0 else ' '
		print(f'{lead}{row_of[blk]:{width}d},', end='')
		if i % 16 == 15:
			print()
	if len(blocks) % 16 != 0:
		# End a partial last row so the closing brace gets its own line.
		print()
	print('};')

	print()

	bcnt = blksize // 8
	bpc = bytes_per_col(bcnt)
	print(f'static constexpr unsigned char stage2[][{bcnt}] = {{')
	for blk in uniq:
		bits = sum(bit << pos for pos, bit in enumerate(blk))
		# Keep the low bcnt bytes, least significant byte first.
		data = (bits & ((1 << (8 * bcnt)) - 1)).to_bytes(bcnt, 'little')
		rows = [
			', '.join(f'0x{b:02X}' for b in data[k:k + bpc])
			for k in range(0, bcnt, bpc)
		]
		# Continuation rows are indented by one extra space so that their
		# bytes line up with those after the opening brace.
		print('\t{' + ',\n\t '.join(rows) + '},')
	print('};')

	print()

	print(f'''\
bool
uprop_is_{sys.argv[2]}(rune ch)
{{
	return TESTBIT(stage2[stage1[ch / {blksize}]], ch % {blksize});
}}''')

def main() -> None:
	"""Parse the command line, pick a block size and print the table.

	Expects exactly three arguments: the property name, the short name
	used in the generated function name, and the path of the data file.
	Every power-of-two block size from 8 up to the number of code points
	is tried, and the one with the smallest combined table size in bytes
	is passed to genfile().  On a tie the smaller block size wins.
	"""
	if len(sys.argv) != 4:
		print('Usage: bool-props.py name shortname file', file=sys.stderr)
		# sys.exit() rather than the exit() helper, which is only
		# installed by the site module and may be absent.
		sys.exit(1)

	xs = parse(sys.argv[3])

	blksize = -1
	smallest = math.inf

	for bs in powers_of_2():
		if bs > len(xs):
			break
		if bs < 8:
			# genfile() stores blksize // 8 bytes per block, so a block
			# has to cover at least one whole byte.
			continue
		blocks = [tuple(x) for x in chunks(xs, bs)]
		uniq = dict.fromkeys(blocks)

		# Both sizes are in bytes: one index per block in stage1, and
		# bs // 8 bytes of bitset per distinct block in stage2.
		sz_s1 = len(blocks) * isize(len(uniq) - 1)
		sz_s2 = len(uniq) * (bs // 8)
		sz = sz_s1 + sz_s2

		if sz < smallest:
			smallest = sz
			blksize = bs

	genfile([tuple(x) for x in chunks(xs, blksize)], blksize)

# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
	main()