1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
|
#!/usr/bin/python3
import functools
import math
import sys
from typing import Generator
def chunks[T](xs: list[T], n: int) -> Generator[list[T], None, None]:
for i in range(0, len(xs), n):
yield xs[i:i + n]
def powers_of_2() -> Generator[int, None, None]:
    """Yield 1, 2, 4, 8, ... indefinitely; the caller decides when to stop."""
    value = 1
    while True:
        yield value
        value *= 2
def bytes_per_col(n: int) -> int:
    """Return the largest divisor of *n* whose output row fits in 80 columns.

    A row holding ``x`` bytes is costed as ``5 + 5 * x + (x - 1)`` columns:
    five columns per byte, ``x - 1`` separators, plus a fixed five
    (presumably the leading indent and brace -- confirm against the
    emitter).  Picking a divisor of *n* means every row of a table with
    *n* bytes has the same length.

    Raises:
        ValueError: if *n* is not a positive integer, or no divisor fits.
    """
    if n < 1:
        # The divisor search below is only meaningful for positive n.
        raise ValueError(f'n must be a positive integer, got {n!r}')

    # Collect divisors in pairs (i, n // i); isqrt avoids float rounding.
    divisors: set[int] = set()
    for i in range(1, math.isqrt(n) + 1):
        if n % i == 0:
            divisors.add(i)
            divisors.add(n // i)

    for x in sorted(divisors, reverse=True):
        if 5 + x * 5 + (x - 1) <= 80:
            return x
    raise ValueError(f'no divisor of {n} fits in 80 columns')
def isize(x: int) -> int:
    """Return the size in bytes of the narrowest unsigned type holding *x*.

    The candidate widths are 1, 2, 4 and 8 bytes (uint8_t, uint16_t,
    uint32_t, uint64_t).  A value equal to a type's maximum (for example
    65535) still fits in that type.

    Raises:
        ValueError: if *x* does not fit in 64 bits.
    """
    for nbytes in (1, 2, 4, 8):
        if x < 1 << (8 * nbytes):
            return nbytes
    raise ValueError(f'{x} does not fit in 64 bits')
def typename(x: int) -> str:
    """Return the name of the narrowest C unsigned type able to hold *x*.

    The result is one of ``uint8_t``, ``uint16_t``, ``uint32_t`` or
    ``uint64_t``.  A value equal to a type's maximum (for example 65535)
    still maps to that type.

    Raises:
        ValueError: if *x* does not fit in 64 bits.
    """
    for bits in (8, 16, 32, 64):
        if x < 1 << bits:
            return f'uint{bits}_t'
    raise ValueError(f'{x} does not fit in 64 bits')
def parse(file: str) -> list[bool]:
    """Read a property data file and flag every code point that has the property.

    The property name comes from ``sys.argv[1]``.  Each data line is
    expected to look like ``XXXX[..YYYY] ; field [; field ...] # comment``
    with hexadecimal code points.  A line contributes when the property
    name equals one of the ``;``-separated fields after the first, compared
    as a whole field with the trailing ``#`` comment removed.  Whole-field
    matching keeps a name such as ``ID_Start`` from also selecting
    ``XID_Start`` lines.

    ``Indic_Conjunct_Break`` is looked up under the name ``InCB``, which
    selects every ``InCB`` line regardless of its value field.

    Returns:
        A list of 0x110000 booleans indexed by code point.
    """
    # Tolerate stray separators/whitespace around the name on the command
    # line (e.g. a caller passing 'InCB;').
    name = sys.argv[1].strip('; \t')
    if name == 'Indic_Conjunct_Break':
        name = 'InCB'

    xs = [False] * 0x110000
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Drop the trailing comment; comment-only and blank lines end
            # up with no property fields and are skipped below.
            data = line.split('#', 1)[0]
            fields = [field.strip() for field in data.split(';')]
            if name not in fields[1:]:
                continue
            bounds = [int(x, 16) for x in fields[0].split('..')]
            # A single code point is a range whose ends coincide.
            for i in range(bounds[0], bounds[-1] + 1):
                xs[i] = True
    return xs
def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None:
    """Print the generated C source for a two-stage boolean lookup table.

    Args:
        cs: the property flags split into consecutive blocks, one tuple of
            booleans per block, in code point order.
        blksize: number of flags per block; must be a positive multiple of
            8 because each block is emitted as ``blksize // 8`` bytes.

    ``stage1`` maps each block to the index of its distinct bit pattern and
    ``stage2`` holds those patterns packed least-significant bit first.
    The generated function name is taken from ``sys.argv[2]``.  Everything
    is written to standard output.

    Raises:
        ValueError: if *blksize* is not a positive multiple of 8.
    """
    if blksize < 8 or blksize % 8 != 0:
        # Anything else would silently drop bits or emit zero-width rows.
        raise ValueError(
            f'blksize must be a positive multiple of 8, got {blksize!r}')

    blocks = cs
    # Distinct blocks in first-seen order, plus a reverse map so the
    # stage1 loop does not rescan the list for every entry.
    uniq = list(dict.fromkeys(blocks))
    index_of = {blk: k for k, blk in enumerate(uniq)}

    print('''\
/* This file is autogenerated by gen/prop/bool-props; DO NOT EDIT. */
#include "bitset.h"
#include "unicode/prop.h"
''')

    # Right-align indices to the width of the largest one; 16 per row.
    width = len(str(len(uniq) - 1))
    print(f'static constexpr {typename(len(uniq) - 1)} stage1[] = {{')
    for i, blk in enumerate(blocks):
        lead = '\t' if i % 16 == 0 else ' '
        print(f'{lead}{index_of[blk]:{width}d},', end='')
        if i % 16 == 15:
            print()
    if len(blocks) % 16 != 0:
        # Terminate a partially filled last row before the closing brace.
        print()
    print('};')
    print()

    bcnt = blksize // 8
    bpc = bytes_per_col(bcnt)
    rows = bcnt // bpc
    print(f'static constexpr unsigned char stage2[][{bcnt}] = {{')
    for blk in uniq:
        # Pack the block into one integer, flag k at bit k, then peel it
        # off a byte at a time from the low end.
        bits = sum(flag << pos for pos, flag in enumerate(blk))
        for i in range(rows):
            print('\t{' if i == 0 else '\t ', end='')
            for j in range(bpc):
                print('0x%02X' % (bits & 0xFF), end='')
                bits >>= 8
                # No comma after the very last byte of the block.
                if i < rows - 1 or j < bpc - 1:
                    print(',', end='')
                if j < bpc - 1:
                    print(' ', end='')
            if i < rows - 1:
                print()
        print('},')
    print('};')
    print()

    print(f'''\
bool
uprop_is_{sys.argv[2]}(rune ch)
{{
return TESTBIT(stage2[stage1[ch / {blksize}]], ch % {blksize});
}}''')
def main() -> None:
    """Parse the property file, pick a block size and print the C tables.

    Expects exactly three command-line arguments (name, shortname, file);
    otherwise prints a usage message to standard error and exits with
    status 1.  Every power-of-two block size from 8 up to the number of
    flags is tried and the one with the smallest estimated table size is
    used; on a tie the smaller block size wins.
    """
    if len(sys.argv) != 4:
        print('Usage: bool-props.py name shortname file', file=sys.stderr)
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist (e.g. under ``python -S``).
        sys.exit(1)

    xs = parse(sys.argv[3])

    blksize = -1
    smallest = math.inf
    for bs in powers_of_2():
        if bs > len(xs):
            break
        if bs < 8:
            # genfile() emits ``blksize // 8`` bytes per block, so blocks
            # narrower than one byte cannot be represented.
            continue
        blocks = [tuple(x) for x in chunks(xs, bs)]
        distinct = len(set(blocks))
        # NOTE(review): sz_s1 is scaled by isize() while sz_s2 counts one
        # unit per flag, although stage2 is emitted as bits -- confirm
        # whether the mixed units are intentional before changing, since
        # it affects which block size is chosen.
        sz_s1 = len(blocks) * isize(distinct - 1)
        sz_s2 = distinct * bs
        sz = sz_s1 + sz_s2
        if sz < smallest:
            smallest = sz
            blksize = bs

    genfile([tuple(x) for x in chunks(xs, blksize)], blksize)
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|