lib/unicode/string/u8norm.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192

#include <string.h>

#include "macros.h"
#include "mbstring.h"
#include "unicode/_cm.h"
#include "unicode/prop.h"
#include "unicode/string.h"

#define BETWEEN(x, y, z) ((x) <= (y) && (y) <= (z))

static void decomp(char8_t *, size_t *, size_t, rune, enum normform);
static void compbuf(char8_t *, size_t *);

/* Computed using a gen/scale-norm.c */
constexpr int NFD_SCALE = 3;
constexpr int NFKD_SCALE = 11;

/* For Hangul syllable decomposition */
constexpr rune SBASE = 0xAC00;
constexpr rune LBASE = 0x1100;
constexpr rune VBASE = 0x1161;
constexpr rune TBASE = 0x11A7;
constexpr int LCNT = 19;
constexpr int VCNT = 21;
constexpr int TCNT = 28;
constexpr int NCNT = VCNT * TCNT;
constexpr int SCNT = LCNT * NCNT;

char8_t *
u8norm(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx,
       enum normform nf)
{
	ASSUME(dstn != nullptr);
	ASSUME(alloc != nullptr);
	ASSUME(BETWEEN(0, nf, 4));

	/* Pre-allocate a buffer with some initial capacity; there is no need to
	   check for overflow when computing bufsz because alloc() will handle the
	   overflow error for us. */
	int scale = (nf & 0b10) ? NFKD_SCALE : NFD_SCALE;
	size_t bufsz = src.len * scale;
	char8_t *dst = alloc(ctx, nullptr, 0, src.len, scale, alignof(char8_t));

	*dstn = 0;
	for (rune ch; ucsnext(&ch, &src) != 0; decomp(dst, dstn, bufsz, ch, nf))
		;
	if (nf & 0b01)
		compbuf(dst, dstn);
	return alloc(ctx, dst, src.len, *dstn, 1, alignof(char8_t));
}

#define WRITE(ch) *dstn += rtoucs(dst + *dstn, bufsz - *dstn, (ch))

void
decomp(char8_t *dst, size_t *dstn, size_t bufsz, rune ch, enum normform nf)
{
	if (uprop_get_hst(ch) != HST_NA) {
		int si = ch - SBASE;
		if (si < 0 || si > SCNT) {
			WRITE(ch);
			return;
		}
		rune l, v, t;
		l = LBASE + si / NCNT;
		v = VBASE + (si % NCNT) / TCNT;
		t = TBASE + si % TCNT;
		WRITE(l);
		WRITE(v);
		if (t != TBASE)
			WRITE(t);
	} else if (((nf & 0b10) && uprop_get_dt(ch) != DT_NONE)
	           || ((nf & 0b10) == 0 && uprop_get_dt(ch) == DT_CAN))
	{
		struct rview rv = uprop_get_dm(ch);
		for (size_t i = 0; i < rv.len; i++)
			decomp(dst, dstn, bufsz, rv.p[i], nf);
	} else {
		enum uprop_ccc ccc = uprop_get_ccc(ch);
		if (ccc == CCC_NR) {
			WRITE(ch);
			return;
		}

		int w;
		rune hc;
		char8_t *p = dst + *dstn;
		while (w = ucsprev(&hc, (const char8_t **)&p, dst)) {
			enum uprop_ccc ccc2 = uprop_get_ccc(hc);
			if (ccc2 == CCC_NR || ccc2 <= ccc) {
out:
				char8_t tmp[U8_LEN_MAX];
				int w2 = rtoucs(tmp, sizeof(tmp), ch);
				p += w;
				memmove(p + w2, p, dst + *dstn - p);
				memcpy(p, tmp, w2);
				*dstn += w2;
				return;
			}
		}

		/* Loop didn’t early-return; append to the start */
		goto out;
	}
}

#undef WRITE

/* The following implements the canonical composition algorithm, and it may be
   useful to read it to understand what’s going on.  It can be found under
   §3.11 Normalization Forms of the Unicode standard, subsection ‘Canonical
   Composition Algorithm’. */

void
compbuf(char8_t *dst, size_t *dstn)
{
	int wC, wL;
	rune C, L;
	struct u8view sv = {dst, *dstn};

	while ((wL = ucsnext(&L, &sv)) != 0) {
		if (uprop_get_ccc(L) != CCC_NR)
			continue;
		char8_t *after_L = (char8_t *)sv.p;

		enum uprop_ccc prevcc = 0;
		struct u8view sv_ = sv;

		while ((wC = ucsnext(&C, &sv_)) != 0) {
			enum uprop_ccc curcc = uprop_get_ccc(C);
			bool blocked = curcc <= prevcc;

			if (blocked) {
				if (curcc != CCC_NR)
					continue;
				if (curcc != prevcc)
					break;
			}

			prevcc = curcc;
			rune comp = uprop_get_cm(L, C);

			/* Try Hangul composition */
			if (comp == 0) {
				if (BETWEEN(LBASE, L, LBASE + LCNT - 1)
				    && BETWEEN(VBASE, C, VBASE + VCNT - 1))
				{
					comp = SBASE + ((L - LBASE) * NCNT + (C - VBASE) * TCNT);
				} else if (BETWEEN(TBASE, C, TBASE + TCNT - 1)
				           && BETWEEN(SBASE, L, SBASE + SCNT - 1)
				           && ((L - SBASE) % TCNT) == 0)
				{
					comp = L + (C - TBASE);
				}
			}

			if (comp != 0) {
				char8_t *after_C = (char8_t *)sv_.p;

				/* Shift bytes between L & C so that they’re contiguous with the
				   bytes after C */
				memmove(after_L + wC, after_L, after_C - wC - after_L);

				/* Write the composition into where L was */
				int w = rtoucs(after_L - wL, wL + wC, comp);

				/* Shift the bytes after L & C to be right after the new
				   composition */
				memmove(after_L - wL + w, after_L + wC,
				        *dstn - (after_L + wC - dst));

				/* Correct *dstn */
				int shift = wL + wC - w;
				*dstn -= shift;

				/* Fix the inner string view */
				sv_.p = after_C - shift;
				sv_.len = *dstn - (sv_.p - dst);

				/* Fix outer string view */
				sv.p = sv.p - wL + w;
				sv.len = *dstn - (sv.p - dst);
				after_L = (char8_t *)sv.p;

				/* Update the value of L */
				L = comp;
				wL = w;
				prevcc = CCC_NR;
			} else if (blocked)
				break;
		}
	}
}