Support the 4 forms of Unicode string normalization

author: Thomas Voss <mail@thomasvoss.com> 2024-05-20 17:56:55 +0200
committer: Thomas Voss <mail@thomasvoss.com> 2024-05-20 17:56:55 +0200
commit: 2e125c1c7e75db14a88f0b8b09e61a132977c63e (patch)
tree: 30c37263315e07f983c2b05b69c17e47c827b849 /lib/unicode
parent: d6b1db5c14ca1e731db299748d2df9eb955c9f7c (diff)
3 files changed, 192 insertions, 188 deletions
diff --git a/lib/unicode/string/u8norm.c b/lib/unicode/string/u8norm.c
new file mode 100644
index 0000000..a918479
--- /dev/null
+++ b/lib/unicode/string/u8norm.c
@@ -0,0 +1,192 @@
+#include <string.h>
+
+#include "macros.h"
+#include "mbstring.h"
+#include "unicode/_cm.h"
+#include "unicode/prop.h"
+#include "unicode/string.h"
+
+#define BETWEEN(x, y, z) ((x) <= (y) && (y) <= (z)) /* inclusive bounds */
+
+static void decomp(char8_t *, size_t *, size_t, rune, enum normtype);
+static void compbuf(char8_t *, size_t *);
+
+/* Output-buffer size multipliers; computed using gen/scale-norm.c */
+constexpr int NFD_SCALE = 3;
+constexpr int NFKD_SCALE = 11;
+
+/* For Hangul syllable decomposition and composition */
+constexpr rune SBASE = 0xAC00; /* base of the syllable index (ch - SBASE) */
+constexpr rune LBASE = 0x1100; /* base added to the leading-jamo index */
+constexpr rune VBASE = 0x1161; /* base added to the vowel-jamo index */
+constexpr rune TBASE = 0x11A7; /* trailing-jamo base; TBASE itself is ‘none’ */
+constexpr int LCNT = 19;
+constexpr int VCNT = 21;
+constexpr int TCNT = 28;
+constexpr int NCNT = VCNT * TCNT; /* syllable indices per leading jamo */
+constexpr int SCNT = LCNT * NCNT; /* total number of syllable indices */
+
+/* Normalize src to the form selected by nt.  The result is allocated with
+   alloc(ctx, …) and returned; its length in bytes is stored in *dstn. */
+char8_t *
+u8norm(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx,
+       enum normtype nt)
+{
+	ASSUME(dstn != nullptr);
+	ASSUME(alloc != nullptr);
+	ASSUME(BETWEEN(0, nt, 4)); /* NOTE(review): should the bound be 3? */
+	/* Pre-allocate the output buffer; bufsz needs no overflow check because
+	   alloc() will handle the overflow error for us. */
+	int scale = (nt & 0b10) ? NFKD_SCALE : NFD_SCALE;
+	size_t bufsz = src.len * scale;
+	char8_t *dst = alloc(ctx, nullptr, 0, src.len, scale, alignof(char8_t));
+	*dstn = 0;
+	for (rune ch; ucsnext(&ch, &src) != 0; decomp(dst, dstn, bufsz, ch, nt))
+		;
+	if (nt & 0b01) /* bit 0b01 of nt requests composition */
+		compbuf(dst, dstn);
+	/* dst holds bufsz bytes; src.len is stale as ucsnext() consumed src */
+	return alloc(ctx, dst, bufsz, *dstn, 1, alignof(char8_t));
+}
+
+/* Append ch to dst, using the dst, dstn and bufsz of the enclosing scope */
+#define WRITE(ch) *dstn += rtoucs(dst + *dstn, bufsz - *dstn, (ch))
+
+/* Append the decomposition of ch to dst[0, *dstn) according to nt, keeping
+   runs of combining runes ordered by their combining class. */
+void
+decomp(char8_t *dst, size_t *dstn, size_t bufsz, rune ch, enum normtype nt)
+{
+	if (uprop_get_hst(ch) != HST_NA) {
+		int si = ch - SBASE;
+		if (si < 0 || si >= SCNT) { /* valid indices are 0…SCNT-1 */
+			WRITE(ch);
+			return;
+		}
+		rune l, v, t;
+		l = LBASE + si / NCNT;
+		v = VBASE + (si % NCNT) / TCNT;
+		t = TBASE + si % TCNT;
+		WRITE(l);
+		WRITE(v);
+		if (t != TBASE) /* si % TCNT == 0: no trailing jamo to write */
+			WRITE(t);
+	} else if (((nt & 0b10) && uprop_get_dt(ch) != DT_NONE)
+	           || ((nt & 0b10) == 0 && uprop_get_dt(ch) == DT_CAN))
+	{
+		/* Bit 0b10 of nt admits every mapping, else only DT_CAN ones */
+		struct rview rv = uprop_get_dm(ch);
+		for (size_t i = 0; i < rv.len; i++)
+			decomp(dst, dstn, bufsz, rv.p[i], nt);
+	} else {
+		enum uprop_ccc ccc = uprop_get_ccc(ch);
+		if (ccc == CCC_NR) {
+			WRITE(ch);
+			return;
+		}
+		/* Scan backwards for the rune that ch has to be placed after */
+		int w;
+		rune hc;
+		char8_t *p = dst + *dstn;
+		while ((w = ucsprev(&hc, (const char8_t **)&p, dst)) != 0) {
+			enum uprop_ccc ccc2 = uprop_get_ccc(hc);
+			if (ccc2 == CCC_NR || ccc2 <= ccc)
+				break;
+		}
+		/* Insert after that rune; w is 0 if the loop ran to completion */
+		char8_t tmp[U8_LEN_MAX];
+		int w2 = rtoucs(tmp, sizeof(tmp), ch);
+		p += w;
+		memmove(p + w2, p, dst + *dstn - p);
+		memcpy(p, tmp, w2);
+		*dstn += w2;
+	}
+}
+
+#undef WRITE
+
+/* Compose dst[0, *dstn) in place, updating *dstn to the new length.  This is
+   the ‘Canonical Composition Algorithm’ from §3.11 Normalization Forms of the
+   Unicode standard, which is useful to read to understand what’s going on.
+   L is the rune being composed onto and C the candidate to merge into it. */
+
+void
+compbuf(char8_t *dst, size_t *dstn)
+{
+	int wC, wL;
+	rune C, L;
+	struct u8view sv = {dst, *dstn};
+
+	while ((wL = ucsnext(&L, &sv)) != 0) {
+		if (uprop_get_ccc(L) != CCC_NR) /* L must have class CCC_NR */
+			continue;
+		char8_t *after_L = (char8_t *)sv.p; /* ucsnext() advanced sv past L */
+
+		enum uprop_ccc prevcc = 0;
+		struct u8view sv_ = sv; /* inner cursor over the runes after L */
+
+		while ((wC = ucsnext(&C, &sv_)) != 0) {
+			enum uprop_ccc curcc = uprop_get_ccc(C);
+			bool blocked = curcc <= prevcc;
+
+			if (blocked) {
+				if (curcc != CCC_NR) /* skip C, keep scanning */
+					continue;
+				if (curcc != prevcc) /* prevcc > CCC_NR: stop for this L */
+					break;
+			}
+
+			prevcc = curcc;
+			rune comp = uprop_get_cm(L, C);
+
+			/* Try Hangul composition */
+			if (comp == 0) {
+				if (BETWEEN(LBASE, L, LBASE + LCNT - 1)
+				    && BETWEEN(VBASE, C, VBASE + VCNT - 1))
+				{
+					comp = SBASE + ((L - LBASE) * NCNT + (C - VBASE) * TCNT);
+				} else if (BETWEEN(TBASE, C, TBASE + TCNT - 1)
+				           && BETWEEN(SBASE, L, SBASE + SCNT - 1)
+				           && ((L - SBASE) % TCNT) == 0)
+				{
+					comp = L + (C - TBASE);
+				}
+			}
+
+			if (comp != 0) {
+				char8_t *after_C = (char8_t *)sv_.p;
+
+				/* Shift bytes between L & C so that they’re contiguous with the
+				   bytes after C */
+				memmove(after_L + wC, after_L, after_C - wC - after_L);
+
+				/* Write the composition into where L was */
+				int w = rtoucs(after_L - wL, wL + wC, comp);
+
+				/* Shift the bytes after L & C to be right after the new
+				   composition */
+				memmove(after_L - wL + w, after_L + wC,
+				        *dstn - (after_L + wC - dst));
+
+				/* Correct *dstn */
+				int shift = wL + wC - w;
+				*dstn -= shift;
+
+				/* Fix the inner string view */
+				sv_.p = after_C - shift;
+				sv_.len = *dstn - (sv_.p - dst);
+
+				/* Fix outer string view */
+				sv.p = sv.p - wL + w;
+				sv.len = *dstn - (sv.p - dst);
+				after_L = (char8_t *)sv.p;
+
+				/* Update the value of L */
+				L = comp;
+				wL = w;
+				prevcc = CCC_NR;
+			} else if (blocked) /* uncomposable CCC_NR rune */
+				break;
+		}
+	}
+}
diff --git a/lib/unicode/string/u8norm_nfd.c b/lib/unicode/string/u8norm_nfd.c
deleted file mode 100644
index a89a1b5..0000000
--- a/lib/unicode/string/u8norm_nfd.c
+++ /dev/null
@@ -1,94 +0,0 @@
-#include <string.h>
-
-#include "macros.h"
-#include "mbstring.h"
-#include "unicode/prop.h"
-#include "unicode/string.h"
-
-static void decomp(char8_t *, size_t *, size_t, rune);
-
-/* Computed using a gen/scale-norm.c */
-constexpr int NFD_SCALE = 3;
-
-/* For Hangul syllable decomposition */
-constexpr rune SBASE = 0xAC00;
-constexpr rune LBASE = 0x1100;
-constexpr rune VBASE = 0x1161;
-constexpr rune TBASE = 0x11A7;
-constexpr int LCNT = 19;
-constexpr int VCNT = 21;
-constexpr int TCNT = 28;
-constexpr int NCNT = VCNT * TCNT;
-constexpr int SCNT = LCNT * NCNT;
-
-char8_t *
-u8norm_nfd(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx)
-{
-	ASSUME(dstn != nullptr);
-	ASSUME(alloc != nullptr);
-
-	/* Pre-allocate a buffer with some initial capacity; there is no need to
-	   check for overflow when computing bufsz because alloc() will handle the
-	   overflow error for us. */
-	size_t bufsz = src.len * NFD_SCALE;
-	char8_t *dst = alloc(ctx, nullptr, 0, src.len, NFD_SCALE, alignof(char8_t));
-
-	*dstn = 0;
-	for (rune ch; ucsnext(&ch, &src) != 0; decomp(dst, dstn, bufsz, ch))
-		;
-	return alloc(ctx, dst, src.len, *dstn, 1, alignof(char8_t));
-}
-
-#define WRITE(ch) *dstn += rtoucs(dst + *dstn, bufsz - *dstn, (ch))
-
-void
-decomp(char8_t *dst, size_t *dstn, size_t bufsz, rune ch)
-{
-	if (uprop_get_hst(ch) != HST_NA) {
-		int si = ch - SBASE;
-		if (si < 0 || si > SCNT) {
-			WRITE(ch);
-			return;
-		}
-		rune l, v, t;
-		l = LBASE + si / NCNT;
-		v = VBASE + (si % NCNT) / TCNT;
-		t = TBASE + si % TCNT;
-		WRITE(l);
-		WRITE(v);
-		if (t != TBASE)
-			WRITE(t);
-	} else if (uprop_get_dt(ch) == DT_CAN) {
-		struct rview rv = uprop_get_dm(ch);
-		for (size_t i = 0; i < rv.len; i++)
-			decomp(dst, dstn, bufsz, rv.p[i]);
-	} else {
-		enum uprop_ccc ccc = uprop_get_ccc(ch);
-		if (ccc == CCC_NR) {
-			WRITE(ch);
-			return;
-		}
-
-		int w;
-		rune hc;
-		char8_t *p = dst + *dstn;
-		while (w = ucsprev(&hc, (const char8_t **)&p, dst)) {
-			enum uprop_ccc ccc2 = uprop_get_ccc(hc);
-			if (ccc2 == CCC_NR || ccc2 <= ccc) {
-out:
-				char8_t tmp[U8_LEN_MAX];
-				int w2 = rtoucs(tmp, sizeof(tmp), ch);
-				p += w;
-				memmove(p + w2, p, dst + *dstn - p);
-				memcpy(p, tmp, w2);
-				*dstn += w2;
-				return;
-			}
-		}
-
-		/* Loop didn’t early-return; append to the start */
-		goto out;
-	}
-}
-
-#undef WRITE
diff --git a/lib/unicode/string/u8norm_nfkd.c b/lib/unicode/string/u8norm_nfkd.c
deleted file mode 100644
index 898b650..0000000
--- a/lib/unicode/string/u8norm_nfkd.c
+++ /dev/null
@@ -1,94 +0,0 @@
-#include <string.h>
-
-#include "macros.h"
-#include "mbstring.h"
-#include "unicode/prop.h"
-#include "unicode/string.h"
-
-static void decomp(char8_t *, size_t *, size_t, rune);
-
-/* Computed using a gen/scale-norm.c */
-constexpr int NFKD_SCALE = 11;
-
-/* For Hangul syllable decomposition */
-constexpr rune SBASE = 0xAC00;
-constexpr rune LBASE = 0x1100;
-constexpr rune VBASE = 0x1161;
-constexpr rune TBASE = 0x11A7;
-constexpr int LCNT = 19;
-constexpr int VCNT = 21;
-constexpr int TCNT = 28;
-constexpr int NCNT = VCNT * TCNT;
-constexpr int SCNT = LCNT * NCNT;
-
-char8_t *
-u8norm_nfkd(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx)
-{
-	ASSUME(dstn != nullptr);
-	ASSUME(alloc != nullptr);
-
-	/* Pre-allocate a buffer with some initial capacity; there is no need to
-	   check for overflow when computing bufsz because alloc() will handle the
-	   overflow error for us. */
-	size_t bufsz = src.len * NFKD_SCALE;
-	char8_t *dst = alloc(ctx, nullptr, 0, src.len, NFKD_SCALE, alignof(char8_t));
-
-	*dstn = 0;
-	for (rune ch; ucsnext(&ch, &src) != 0; decomp(dst, dstn, bufsz, ch))
-		;
-	return alloc(ctx, dst, src.len, *dstn, 1, alignof(char8_t));
-}
-
-#define WRITE(ch) *dstn += rtoucs(dst + *dstn, bufsz - *dstn, (ch))
-
-void
-decomp(char8_t *dst, size_t *dstn, size_t bufsz, rune ch)
-{
-	if (uprop_get_hst(ch) != HST_NA) {
-		int si = ch - SBASE;
-		if (si < 0 || si > SCNT) {
-			WRITE(ch);
-			return;
-		}
-		rune l, v, t;
-		l = LBASE + si / NCNT;
-		v = VBASE + (si % NCNT) / TCNT;
-		t = TBASE + si % TCNT;
-		WRITE(l);
-		WRITE(v);
-		if (t != TBASE)
-			WRITE(t);
-	} else if (uprop_get_dt(ch) != DT_NONE) {
-		struct rview rv = uprop_get_dm(ch);
-		for (size_t i = 0; i < rv.len; i++)
-			decomp(dst, dstn, bufsz, rv.p[i]);
-	} else {
-		enum uprop_ccc ccc = uprop_get_ccc(ch);
-		if (ccc == CCC_NR) {
-			WRITE(ch);
-			return;
-		}
-
-		int w;
-		rune hc;
-		char8_t *p = dst + *dstn;
-		while (w = ucsprev(&hc, (const char8_t **)&p, dst)) {
-			enum uprop_ccc ccc2 = uprop_get_ccc(hc);
-			if (ccc2 == CCC_NR || ccc2 <= ccc) {
-out:
-				char8_t tmp[U8_LEN_MAX];
-				int w2 = rtoucs(tmp, sizeof(tmp), ch);
-				p += w;
-				memmove(p + w2, p, dst + *dstn - p);
-				memcpy(p, tmp, w2);
-				*dstn += w2;
-				return;
-			}
-		}
-
-		/* Loop didn’t early-return; append to the start */
-		goto out;
-	}
-}
-
-#undef WRITE
author	Thomas Voss <mail@thomasvoss.com>	2024-05-20 17:56:55 +0200
committer	Thomas Voss <mail@thomasvoss.com>	2024-05-20 17:56:55 +0200
commit	2e125c1c7e75db14a88f0b8b09e61a132977c63e (patch)
tree	30c37263315e07f983c2b05b69c17e47c827b849 /lib/unicode
parent	d6b1db5c14ca1e731db299748d2df9eb955c9f7c (diff)