Implement NFD string normalization

author: Thomas Voss <mail@thomasvoss.com> 2024-05-14 20:53:13 +0200
committer: Thomas Voss <mail@thomasvoss.com> 2024-05-14 20:53:22 +0200
commit: a39a4797730a09ec4fbb41c11b7dc1f7d245bc15 (patch)
tree: 13f2a025d52535d66cc8e1c4be0aa502ffaeab7b /lib/unicode/string
parent: 4e88af1babd6555c389b1e14316c29b78146f8f0 (diff)
1 files changed, 126 insertions, 0 deletions
diff --git a/lib/unicode/string/u8norm_nfd.c b/lib/unicode/string/u8norm_nfd.c
new file mode 100644
index 0000000..8f142e1
--- /dev/null
+++ b/lib/unicode/string/u8norm_nfd.c
@@ -0,0 +1,126 @@
+#include <string.h>
+
+#include "macros.h"
+#include "mbstring.h"
+#include "unicode/prop.h"
+#include "unicode/string.h"
+
+#include <stdio.h>
+
+/* File-local helpers; both are defined further down in this file. */
+static size_t quickchk_spn(struct u8view src);
+static void decomp(char8_t *dst, size_t *dstn, size_t bufsz, rune ch);
+
+/* Computed using gen/scale-norm.c.  u8norm_nfd() multiplies the source
+   length by this factor to size its output buffer; presumably this is the
+   worst-case growth in bytes of a string under NFD (confirm against the
+   generator). */
+constexpr int NFD_SCALE = 3;
+
+/* For Hangul syllable decomposition.  decomp() splits a precomposed syllable
+   arithmetically: with SI = CH - SBASE, the leading, vowel and trailing parts
+   are LBASE + SI / NCNT, VBASE + (SI % NCNT) / TCNT and TBASE + SI % TCNT.
+   A trailing part equal to TBASE itself means ‘no trailing part’ and is not
+   emitted. */
+constexpr rune SBASE = 0xAC00; /* Offset subtracted to obtain the syllable index */
+constexpr rune LBASE = 0x1100; /* Base of the leading parts */
+constexpr rune VBASE = 0x1161; /* Base of the vowel parts */
+constexpr rune TBASE = 0x11A7; /* Base of the trailing parts; itself means ‘none’ */
+constexpr int LCNT = 19;          /* Number of leading parts */
+constexpr int VCNT = 21;          /* Number of vowel parts */
+constexpr int TCNT = 28;          /* Number of trailing parts, including ‘none’ */
+constexpr int NCNT = VCNT * TCNT; /* Syllables per leading part */
+constexpr int SCNT = LCNT * NCNT; /* Total number of precomposed syllables */
+
+/* Return a newly allocated copy of SRC converted to Unicode Normalization
+   Form D, and store its length in bytes in *DSTN.
+
+   All memory is obtained through ALLOC, which is always called with CTX as
+   its first argument.  Judging by the calls below its remaining arguments
+   are (old pointer, old element count, new element count, element size,
+   alignment) — confirm against the alloc_fn declaration.
+
+   DSTN and ALLOC must not be null.  When SRC is already entirely in NFD the
+   returned buffer is a plain copy and keeps its initial capacity of
+   SRC.len * NFD_SCALE bytes; otherwise it is resized to exactly *DSTN
+   bytes before being returned. */
+char8_t *
+u8norm_nfd(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx)
+{
+	ASSUME(dstn != nullptr);
+	ASSUME(alloc != nullptr);
+
+	/* Pre-allocate a buffer with some initial capacity; there is no need to
+	   check for overflow when computing bufsz because alloc() will handle the
+	   overflow error for us. */
+	size_t bufsz = src.len * NFD_SCALE;
+	char8_t *dst = alloc(ctx, nullptr, 0, src.len, NFD_SCALE, alignof(char8_t));
+
+	/* Copy over the initial codepoints that are already in NFD; if the entire
+	   string is in NFD then just return it immediately */
+	size_t spn = quickchk_spn(src);
+	memcpy(dst, src.p, spn);
+	*dstn = spn;
+	if (spn == src.len)
+		return dst;
+	VSHFT(&src, spn);
+
+	/* Decompose whatever follows the already-normalized prefix, one codepoint
+	   at a time.  decomp() may look back into the copied prefix when it has
+	   to reorder combining marks. */
+	rune ch;
+	while (u8next(&ch, &src) != 0)
+		decomp(dst, dstn, bufsz, ch);
+
+	/* Shrink the buffer down to the bytes actually written.  The old size
+	   handed to alloc() has to be the capacity we allocated above (bufsz
+	   elements of size 1); src.len is of no use here because src has been
+	   advanced by VSHFT() and u8next() in the meantime. */
+	return alloc(ctx, dst, bufsz, *dstn, 1, alignof(char8_t));
+}
+
+/* Encode CH as UTF-8 at the current end of the output (DST + *DSTN) and
+   advance *DSTN by the value rtou8() returns.  Expands in terms of the
+   caller’s DST, DSTN and BUFSZ, so it is only usable inside decomp(). */
+#define WRITE(ch) (*dstn += rtou8(dst + *dstn, bufsz - *dstn, (ch)))
+
+/* Append the canonical decomposition of CH to the output buffer DST, which
+   currently holds *DSTN bytes and has a capacity of BUFSZ bytes.  *DSTN is
+   updated to the new length.
+
+   Three cases are handled:
+
+     1. Hangul: precomposed syllables are split arithmetically into their
+        two or three parts; any other codepoint with a Hangul syllable type
+        is written unchanged.
+     2. Codepoints with a canonical decomposition mapping: every rune of the
+        mapping is decomposed recursively.
+     3. Everything else: codepoints of class CCC_NR are appended as-is, while
+        any other codepoint is inserted into the already-written output at
+        the position dictated by its canonical combining class. */
+void
+decomp(char8_t *dst, size_t *dstn, size_t bufsz, rune ch)
+{
+	if (uprop_get_hst(ch) != HST_NA) {
+		/* Only the SCNT precomposed syllables starting at SBASE decompose;
+		   the valid syllable indices are 0 through SCNT - 1 inclusive. */
+		if (ch < SBASE || ch >= SBASE + SCNT) {
+			WRITE(ch);
+			return;
+		}
+
+		int si = ch - SBASE;
+		rune l, v, t;
+		l = LBASE + si / NCNT;
+		v = VBASE + (si % NCNT) / TCNT;
+		t = TBASE + si % TCNT;
+		WRITE(l);
+		WRITE(v);
+		/* t == TBASE means the syllable has no trailing part */
+		if (t != TBASE)
+			WRITE(t);
+	} else if (uprop_get_dt(ch) == DT_CAN) {
+		/* The mapping may itself contain decomposable runes, so recurse */
+		struct rview rv = uprop_get_dm(ch);
+		for (size_t i = 0; i < rv.len; i++)
+			decomp(dst, dstn, bufsz, rv.p[i]);
+	} else {
+		enum uprop_ccc ccc = uprop_get_ccc(ch);
+		if (ccc == CCC_NR) {
+			WRITE(ch);
+			return;
+		}
+
+		/* Walk backwards over the codepoints written so far until we meet
+		   one that CH must not move in front of: either one of class CCC_NR
+		   or one whose class is not greater than CH’s.  Comparing with ‘<=’
+		   keeps codepoints of equal class in their original order. */
+		int w;
+		rune hc;
+		char8_t *p = dst + *dstn;
+		while ((w = u8prev(&hc, (const char8_t **)&p, dst)) != 0) {
+			enum uprop_ccc ccc2 = uprop_get_ccc(hc);
+			if (ccc2 == CCC_NR || ccc2 <= ccc) {
+				/* u8prev() stepped over HC; move back to just after it */
+				p += w;
+				break;
+			}
+		}
+
+		/* P is now the insertion point.  If the loop ran out of codepoints
+		   without breaking, P is wherever u8prev() left it, i.e. CH goes in
+		   front of everything that was scanned.  Shift the tail of the
+		   output up to make room and drop the encoded CH into the gap. */
+		char8_t tmp[U8_LEN_MAX];
+		int w2 = rtou8(tmp, sizeof(tmp), ch);
+		memmove(p + w2, p, dst + *dstn - p);
+		memcpy(p, tmp, w2);
+		*dstn += w2;
+	}
+}
+
+#undef WRITE
+
+/* Return the length in bytes of the longest prefix of SRC that can be shown
+   to be in NFD already without decomposing anything.
+
+   The scan stops at the first codepoint whose NFD quick-check property is
+   ‘no’, or at the first codepoint of a class other than CCC_NR whose
+   canonical combining class is lower than that of the codepoint before it
+   (i.e. the marks are out of canonical order).  A codepoint of class CCC_NR
+   is always in order, no matter what precedes it, so it never ends the span
+   on its own. */
+size_t
+quickchk_spn(struct u8view src)
+{
+	rune ch;
+	size_t spn = 0;
+	enum uprop_ccc prv = CCC_NR;
+
+	for (int w; (w = u8next(&ch, &src)) != 0; spn += w) {
+		if (uprop_get_nfd_qc(ch) == NFD_QC_N)
+			break;
+
+		enum uprop_ccc cur = uprop_get_ccc(ch);
+		if (cur != CCC_NR && cur < prv)
+			break;
+		prv = cur;
+	}
+
+	return spn;
+}
author	Thomas Voss <mail@thomasvoss.com>	2024-05-14 20:53:13 +0200
committer	Thomas Voss <mail@thomasvoss.com>	2024-05-14 20:53:22 +0200
commit	a39a4797730a09ec4fbb41c11b7dc1f7d245bc15 (patch)
tree	13f2a025d52535d66cc8e1c4be0aa502ffaeab7b /lib/unicode/string
parent	4e88af1babd6555c389b1e14316c29b78146f8f0 (diff)