Add ucsnorm_nfkd()

author: Thomas Voss <mail@thomasvoss.com> 2024-05-15 00:43:54 +0200
committer: Thomas Voss <mail@thomasvoss.com> 2024-05-15 00:45:17 +0200
commit: 5498793a56b19da99b7b6856c953933e50b8d572 (patch)
tree: 708166215910ef89e9d4133a805f6e12b3ba4ab3
parent: d7ba894d2af0e0c5a8d5db9cbadd7ea9a277100b (diff)
7 files changed, 213 insertions, 106 deletions
diff --git a/README b/README
index b56105f..0a92230 100644
--- a/README
+++ b/README
@@ -121,16 +121,16 @@ FEATURES:
           Azeri, Dutch, German, Lithuanian, and Turkish.
         • Iteration and counting of graphemes, words, and human-precieved
           words in a string
-        • NFD string normalization
+        • NFD- and NFKD string normalization
         • Unicode-aware case-mapping of strings with custom allocator
           support
 
 
 PLANNED FEATURES:
 
-    • Line- and sentence segmentation            (unicode/string.h)
-    • String collation                           (unicode/string.h)
-    • NFC-, NFKC-, and NFKD string normalization (unicode/string.h)
+    • Line- and sentence segmentation     (unicode/string.h)
+    • String collation                    (unicode/string.h)
+    • NFC- and NFKC string normalization  (unicode/string.h)
 
 
 BUGS:
diff --git a/include/unicode/string.h b/include/unicode/string.h
index a5b1cdb..06edb6c 100644
--- a/include/unicode/string.h
+++ b/include/unicode/string.h
@@ -34,8 +34,8 @@ size_t u8wnext_human(struct u8view *, struct u8view *);
                                alloc_fn, void *);
 [[nodiscard]] char8_t *u8upper(size_t *, struct u8view, enum caseflags,
                                alloc_fn, void *);
-[[nodiscard]] char8_t *u8norm_nfc(size_t *, struct u8view, alloc_fn, void *);
 [[nodiscard]] char8_t *u8norm_nfd(size_t *, struct u8view, alloc_fn, void *);
+[[nodiscard]] char8_t *u8norm_nfkd(size_t *, struct u8view, alloc_fn, void *);
 
 /* Encoding-generic macros */
 #define ucsgcnt(sv)       _Generic((sv), struct u8view: u8gcnt)((sv))
@@ -57,10 +57,10 @@ size_t u8wnext_human(struct u8view *, struct u8view *);
 #define ucsupper(dstn, sv, flags, alloc, ctx)                                  \
 	_Generic((sv), struct u8view: u8upper)((dstn), (sv), (flags), (alloc),     \
 	                                       (ctx))
-#define ucsnorm_nfc(dstn, sv, alloc, ctx)                                      \
-	_Generic((sv), struct u8view: u8norm_nfc)((dstn), (sv), (alloc), (ctx))
 #define ucsnorm_nfd(dstn, sv, alloc, ctx)                                      \
 	_Generic((sv), struct u8view: u8norm_nfd)((dstn), (sv), (alloc), (ctx))
+#define ucsnorm_nfkd(dstn, sv, alloc, ctx)                                     \
+	_Generic((sv), struct u8view: u8norm_nfkd)((dstn), (sv), (alloc), (ctx))
 
 constexpr double U8CASEFOLD_SCALE = 3;
 constexpr double U8LOWER_SCALE = 1.5;
diff --git a/lib/unicode/string/u8norm_nfkd.c b/lib/unicode/string/u8norm_nfkd.c
new file mode 100644
index 0000000..898b650
--- /dev/null
+++ b/lib/unicode/string/u8norm_nfkd.c
@@ -0,0 +1,94 @@
+#include <string.h>
+
+#include "macros.h"
+#include "mbstring.h"
+#include "unicode/prop.h"
+#include "unicode/string.h"
+
+static void decomp(char8_t *, size_t *, size_t, rune);
+
+/* Computed using gen/scale-norm.c */
+constexpr int NFKD_SCALE = 11;
+
+/* For Hangul syllable decomposition */
+constexpr rune SBASE = 0xAC00;
+constexpr rune LBASE = 0x1100;
+constexpr rune VBASE = 0x1161;
+constexpr rune TBASE = 0x11A7;
+constexpr int LCNT = 19;
+constexpr int VCNT = 21;
+constexpr int TCNT = 28;
+constexpr int NCNT = VCNT * TCNT;
+constexpr int SCNT = LCNT * NCNT;
+
+/* Decompose SRC to NFKD in a buffer from ALLOC; *DSTN receives its length */
+char8_t *
+u8norm_nfkd(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx)
+{
+	ASSUME(dstn != nullptr);
+	ASSUME(alloc != nullptr);
+
+	/* Pre-allocate a buffer with some initial capacity; bufsz is not checked
+	   for overflow because alloc() handles the overflow error for us. */
+	size_t bufsz = src.len * NFKD_SCALE;
+	char8_t *dst = alloc(ctx, nullptr, 0, src.len, NFKD_SCALE, alignof(char8_t));
+	*dstn = 0;
+	for (rune ch; ucsnext(&ch, &src) != 0; decomp(dst, dstn, bufsz, ch))
+		;
+	/* The loop advanced src, so only bufsz still gives the size of dst */
+	return alloc(ctx, dst, bufsz, *dstn, 1, alignof(char8_t));
+}
+
+#define WRITE(ch) *dstn += rtoucs(dst + *dstn, bufsz - *dstn, (ch))
+
+/* Write the NFKD decomposition of CH to DST, canonically ordering marks */
+void
+decomp(char8_t *dst, size_t *dstn, size_t bufsz, rune ch)
+{
+	if (uprop_get_hst(ch) != HST_NA) {
+		int si = ch - SBASE;
+		if (si < 0 || si >= SCNT) { /* valid indices are 0 .. SCNT - 1 */
+			WRITE(ch);
+			return;
+		}
+		rune l = LBASE + si / NCNT;
+		rune v = VBASE + (si % NCNT) / TCNT;
+		rune t = TBASE + si % TCNT;
+		WRITE(l);
+		WRITE(v);
+		if (t != TBASE) /* t == TBASE means no trailing consonant */
+			WRITE(t);
+	} else if (uprop_get_dt(ch) != DT_NONE) {
+		struct rview rv = uprop_get_dm(ch);
+		for (size_t i = 0; i < rv.len; i++)
+			decomp(dst, dstn, bufsz, rv.p[i]);
+	} else {
+		enum uprop_ccc ccc = uprop_get_ccc(ch);
+		if (ccc == CCC_NR) {
+			WRITE(ch);
+			return;
+		}
+		/* Walk back to the first rune whose class does not exceed ccc */
+		int w;
+		rune hc;
+		char8_t *p = dst + *dstn;
+		while ((w = ucsprev(&hc, (const char8_t **)&p, dst)) != 0) {
+			enum uprop_ccc ccc2 = uprop_get_ccc(hc);
+			if (ccc2 == CCC_NR || ccc2 <= ccc) {
+out:
+				char8_t tmp[U8_LEN_MAX];
+				int w2 = rtoucs(tmp, sizeof(tmp), ch);
+				p += w;
+				memmove(p + w2, p, dst + *dstn - p);
+				memcpy(p, tmp, w2);
+				*dstn += w2;
+				return;
+			}
+		}
+
+		/* Loop didn’t early-return; insert at the start */
+		goto out;
+	}
+}
+
+#undef WRITE
diff --git a/test/_norm-test.h b/test/_norm-test.h
new file mode 100644
index 0000000..68209f1
--- /dev/null
+++ b/test/_norm-test.h
@@ -0,0 +1,107 @@
+#if !defined(NORMTYPE)
+#	error "NORMTYPE must be defined"
+#endif
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <alloc.h>
+#include <dynarr.h>
+#include <errors.h>
+#include <macros.h>
+#include <mbstring.h>
+#include <rune.h>
+#include <unicode/string.h>
+
+#define TESTFILE "norm.in"
+#define FUNC     CONCAT(ucsnorm_, NORMTYPE)
+
+static bool test(struct u8view, int);
+
+int
+main(int, char **argv)
+{
+	int rv;
+	size_t n;
+	ssize_t nr;
+	char *line;
+	FILE *fp;
+	/* Run test() on every line of TESTFILE, stopping at the first failure */
+	rv = EXIT_SUCCESS;
+	line = nullptr;
+	mlib_setprogname(argv[0]);
+
+	if ((fp = fopen(TESTFILE, "r")) == nullptr)
+		err("fopen: %s:", TESTFILE);
+
+	for (int id = 1; (nr = getline(&line, &n, fp)) > 0; id++) {
+		if (line[nr - 1] == '\n')
+			line[--nr] = '\0'; /* drop the trailing newline, if any */
+
+		if (!test((struct u8view){line, (size_t)nr}, id)) {
+			rv = EXIT_FAILURE;
+			break;
+		}
+	}
+	if (ferror(fp))
+		err("getline: %s:", TESTFILE);
+
+	free(line);
+	fclose(fp);
+	return rv;
+}
+
+bool
+test(struct u8view sv, int id)
+{
+	bool rv = true;
+	arena a = mkarena(0);
+	struct arena_ctx ctx = {.a = &a};
+	/* One decoded u8view per ';'-separated column of the test line */
+	dynarr(struct u8view) columns = {
+		.alloc = alloc_arena,
+		.ctx = &ctx,
+	};
+	/* Each column is a space-separated list of runes read via SCNxRUNE */
+	struct u8view column;
+	while (ucscut(&column, &sv, U";", 1) != MBEND) {
+		dynarr(char8_t) s = {
+			.alloc = alloc_arena,
+			.ctx = &ctx,
+		};
+		/* NOTE(review): sscanf() is unchecked; ch is unset if parsing fails */
+		rune _;
+		struct u8view cp;
+		do {
+			rune ch;
+			_ = ucscut(&cp, &column, U" ", 1);
+			sscanf(cp.p, "%" SCNxRUNE, &ch);
+			char8_t buf[U8_LEN_MAX];
+			int w = rtoucs(buf, sizeof(buf), ch);
+			DAEXTEND(&s, buf, w);
+		} while (_ != MBEND);
+
+		DAPUSH(&columns, ((struct u8view){s.buf, s.len}));
+	}
+	/* nfkd: c1-c5 all map to c5; otherwise c1-c3 map to c3, c4-c5 to c5 */
+	for (size_t i = 0; i < 5; i++) { /* NOTE(review): assumes >= 5 columns */
+		size_t base;
+		if (streq(STR(NORMTYPE), "nfkd"))
+			base = 4;
+		else
+			base = i < 3 ? 2 : 4;
+		struct u8view normd = {};
+		normd.p = FUNC(&normd.len, columns.buf[i], alloc_arena, &ctx);
+		if (!ucseq(columns.buf[base], normd)) {
+			warn("case %d: expected c%zu to be ‘%.*s’ but got ‘%.*s’", id,
+			     i + 1, SV_PRI_ARGS(columns.buf[base]), SV_PRI_ARGS(normd));
+			rv = false;
+			goto out;
+		}
+	}
+
+out:
+	arena_free(&a); /* releases every column and result allocated above */
+	return rv;
+}
diff --git a/test/norm-nfd-test.c b/test/norm-nfd-test.c
index 95bc8d5..6067352 100644
--- a/test/norm-nfd-test.c
+++ b/test/norm-nfd-test.c
@@ -1,98 +1,2 @@
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <alloc.h>
-#include <dynarr.h>
-#include <errors.h>
-#include <macros.h>
-#include <mbstring.h>
-#include <rune.h>
-#include <unicode/string.h>
-
-#define TESTFILE "norm-nfd.in"
-
-static bool test(struct u8view, int);
-
-int
-main(int, char **argv)
-{
-	int rv;
-	size_t n;
-	ssize_t nr;
-	char *line;
-	FILE *fp;
-
-	rv = EXIT_SUCCESS;
-	line = nullptr;
-	mlib_setprogname(argv[0]);
-
-	if ((fp = fopen(TESTFILE, "r")) == nullptr)
-		err("fopen: %s:", TESTFILE);
-
-	for (int id = 1; (nr = getline(&line, &n, fp)) > 0; id++) {
-		if (line[nr - 1] == '\n')
-			line[--nr] = '\0';
-
-		if (!test((struct u8view){line, (size_t)nr}, id)) {
-			rv = EXIT_FAILURE;
-			break;
-		}
-	}
-	if (ferror(fp))
-		err("getline: %s:", TESTFILE);
-
-	free(line);
-	fclose(fp);
-	return rv;
-}
-
-bool
-test(struct u8view sv, int id)
-{
-	bool rv = true;
-	arena a = mkarena(0);
-	struct arena_ctx ctx = {.a = &a};
-
-	dynarr(struct u8view) columns = {
-		.alloc = alloc_arena,
-		.ctx = &ctx,
-	};
-
-	struct u8view column;
-	while (ucscut(&column, &sv, U";", 1) != MBEND) {
-		dynarr(char8_t) s = {
-			.alloc = alloc_arena,
-			.ctx = &ctx,
-		};
-
-		rune _;
-		struct u8view cp;
-		do {
-			rune ch;
-			_ = ucscut(&cp, &column, U" ", 1);
-			sscanf(cp.p, "%" SCNxRUNE, &ch);
-			char8_t buf[U8_LEN_MAX];
-			int w = rtoucs(buf, sizeof(buf), ch);
-			DAEXTEND(&s, buf, w);
-		} while (_ != MBEND);
-
-		DAPUSH(&columns, ((struct u8view){s.buf, s.len}));
-	}
-
-	for (size_t i = 0; i < 5; i++) {
-		size_t base = i < 3 ? 2 : 4;
-		struct u8view normd = {};
-		normd.p = ucsnorm_nfd(&normd.len, columns.buf[i], alloc_arena, &ctx);
-		if (!ucseq(columns.buf[base], normd)) {
-			warn("case %d: expected c%zu to be ‘%.*s’ but got ‘%.*s’", id,
-			     i + 1, SV_PRI_ARGS(columns.buf[base]), SV_PRI_ARGS(normd));
-			rv = false;
-			goto out;
-		}
-	}
-
-out:
-	arena_free(&a);
-	return rv;
-}
+#define NORMTYPE nfd
+#include "_norm-test.h"
diff --git a/test/norm-nfkd-test.c b/test/norm-nfkd-test.c
new file mode 100644
index 0000000..3fe8ff2
--- /dev/null
+++ b/test/norm-nfkd-test.c
@@ -0,0 +1,2 @@
+#define NORMTYPE nfkd
+#include "_norm-test.h"
diff --git a/test/run-tests b/test/run-tests
index 860d243..ae9c96e 100755
--- a/test/run-tests
+++ b/test/run-tests
@@ -32,7 +32,7 @@ grep '^[^#]'                           data/UppercaseTest      >upper.in
 grep '^[^#]'                           data/WordHumanBreakTest >wbrk-human.in
 sed -En 's/\s+//g; s/÷?#.*//g; /./p'   data/GraphemeBreakTest  >gbrk.in
 sed -En 's/\s+//g; s/÷?#.*//g; /./p'   data/WordBreakTest      >wbrk.in
-sed -En 's/(#|^@).*//; s/\s+$//; /./p' data/NormalizationTest  >norm-nfd.in
+sed -En 's/(#|^@).*//; s/\s+$//; /./p' data/NormalizationTest  >norm.in
 
 for src in *.c
 do
author	Thomas Voss <mail@thomasvoss.com>	2024-05-15 00:43:54 +0200
committer	Thomas Voss <mail@thomasvoss.com>	2024-05-15 00:45:17 +0200
commit	5498793a56b19da99b7b6856c953933e50b8d572 (patch)
tree	708166215910ef89e9d4133a805f6e12b3ba4ab3
parent	d7ba894d2af0e0c5a8d5db9cbadd7ea9a277100b (diff)