aboutsummaryrefslogtreecommitdiff
path: root/lib/unicode/string/u8norm_nfd.c
blob: 8f142e1728f27c1329b994d1ad374d36419eb50a (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#include <string.h>

#include "macros.h"
#include "mbstring.h"
#include "unicode/prop.h"
#include "unicode/string.h"

#include <stdio.h>

/* Returns the length in bytes of the leading part of the given view that
   can be copied to the output unchanged */
static size_t quickchk_spn(struct u8view);
/* Appends the canonical decomposition of a rune to a buffer; the arguments
   are the buffer, a pointer to the number of bytes it currently holds, its
   capacity in bytes, and the rune to decompose */
static void decomp(char8_t *, size_t *, size_t, rune);

/* Factor by which the source length is multiplied to size the output buffer
   in u8norm_nfd(); computed using gen/scale-norm.c */
constexpr int NFD_SCALE = 3;

/* For Hangul syllable decomposition; a precomposed syllable at index
   si = ch - SBASE is split arithmetically into a leading consonant, a vowel,
   and an optional trailing consonant (see decomp()) */
constexpr rune SBASE = 0xAC00; /* First precomposed syllable */
constexpr rune LBASE = 0x1100; /* First leading consonant */
constexpr rune VBASE = 0x1161; /* First vowel */
constexpr rune TBASE = 0x11A7; /* One below the first trailing consonant; a
                                  trailing index of 0 means ‘no trailing
                                  consonant’ */
constexpr int LCNT = 19;          /* Number of leading consonants */
constexpr int VCNT = 21;          /* Number of vowels */
constexpr int TCNT = 28;          /* Number of trailing slots (including none) */
constexpr int NCNT = VCNT * TCNT; /* Syllables per leading consonant (588) */
constexpr int SCNT = LCNT * NCNT; /* Total precomposed syllables (11172) */

/* Normalize the UTF-8 string SRC to Normalization Form D.

   The output buffer is obtained from ALLOC, which is always invoked with CTX
   as its first argument.  The buffer is returned and its length in bytes is
   stored in *DSTN.  Neither DSTN nor ALLOC may be null. */
char8_t *
u8norm_nfd(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx)
{
	ASSUME(dstn != nullptr);
	ASSUME(alloc != nullptr);

	/* Pre-allocate a buffer with some initial capacity; there is no need to
	   check for overflow when computing bufsz because alloc() will handle the
	   overflow error for us. */
	size_t bufsz = src.len * NFD_SCALE;
	char8_t *dst = alloc(ctx, nullptr, 0, src.len, NFD_SCALE, alignof(char8_t));

	/* Copy over the initial codepoints that are already in NFD; if the entire
	   string is in NFD then just return it immediately */
	size_t spn = quickchk_spn(src);
	memcpy(dst, src.p, spn);
	*dstn = spn;
	if (spn == src.len)
		return dst;
	VSHFT(&src, spn);

	/* Decompose the remainder one rune at a time; decomp() appends to dst
	   and advances *dstn */
	rune ch;
	while (u8next(&ch, &src) != 0)
		decomp(dst, dstn, bufsz, ch);

	/* Shrink the buffer to fit.  The old size handed to alloc() must be that
	   of the original allocation (bufsz elements of size 1); src.len cannot
	   be used here because src has been advanced by VSHFT() and u8next()
	   and no longer describes the input. */
	return alloc(ctx, dst, bufsz, *dstn, 1, alignof(char8_t));
}

/* Append the UTF-8 encoding of the rune CH to the output buffer and advance
   *dstn by the number of bytes rtou8() reports; expects dst, dstn, and bufsz
   to be in scope */
#define WRITE(ch) *dstn += rtou8(dst + *dstn, bufsz - *dstn, (ch))

/* Append the canonical decomposition of CH to DST, a buffer with a capacity
   of BUFSZ bytes of which the first *DSTN are in use.  *DSTN is advanced by
   the number of bytes added.

   - Precomposed Hangul syllables are decomposed arithmetically.
   - Runes with a canonical decomposition mapping are decomposed recursively.
   - All other runes are emitted as-is.  If such a rune has a canonical
     combining class other than CCC_NR, it is inserted in front of any
     directly preceding runes with a higher class so that the output stays in
     canonical order. */
void
decomp(char8_t *dst, size_t *dstn, size_t bufsz, rune ch)
{
	if (uprop_get_hst(ch) != HST_NA) {
		/* Only the precomposed syllables in [SBASE, SBASE + SCNT) are
		   decomposed; any other rune with a Hangul syllable type is copied
		   verbatim.  The casts keep the subtraction signed so that runes
		   below SBASE give a negative index. */
		int si = (int)ch - (int)SBASE;
		if (si < 0 || si >= SCNT) {
			WRITE(ch);
			return;
		}

		rune l = LBASE + si / NCNT;
		rune v = VBASE + (si % NCNT) / TCNT;
		rune t = TBASE + si % TCNT;
		WRITE(l);
		WRITE(v);
		/* A trailing index of 0 means there is no trailing consonant */
		if (t != TBASE)
			WRITE(t);
	} else if (uprop_get_dt(ch) == DT_CAN) {
		/* The mapping may itself contain decomposable runes */
		struct rview rv = uprop_get_dm(ch);
		for (size_t i = 0; i < rv.len; i++)
			decomp(dst, dstn, bufsz, rv.p[i]);
	} else {
		enum uprop_ccc ccc = uprop_get_ccc(ch);
		if (ccc == CCC_NR) {
			WRITE(ch);
			return;
		}

		/* Walk backwards from the end of the output to find where CH
		   belongs: directly after the nearest preceding rune whose class is
		   CCC_NR or not greater than that of CH, or at the very start of the
		   buffer if no such rune exists */
		char8_t *ins = dst + *dstn;
		for (;;) {
			rune prev;
			const char8_t *q = ins;
			if (u8prev(&prev, &q, dst) == 0)
				break;
			enum uprop_ccc pccc = uprop_get_ccc(prev);
			if (pccc == CCC_NR || pccc <= ccc)
				break;
			ins = dst + (q - dst);
		}

		/* Open a gap at the insertion point and drop the encoded rune in */
		char8_t tmp[U8_LEN_MAX];
		int w = rtou8(tmp, sizeof(tmp), ch);
		memmove(ins + w, ins, (size_t)(dst + *dstn - ins));
		memcpy(ins, tmp, w);
		*dstn += w;
	}
}

#undef WRITE

size_t
quickchk_spn(struct u8view src)
{
	rune ch;
	size_t spn = 0;
	enum uprop_ccc prv, cur;

	prv = cur = CCC_NR;

	for (int w; w = u8next(&ch, &src); spn += w) {
		if (uprop_get_nfd_qc(ch) == NFD_QC_N)
			break;
		if ((cur = uprop_get_ccc(ch)) < prv)
			break;
		prv = cur;
	}

	return spn;
}