aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README8
-rw-r--r--include/unicode/string.h6
-rw-r--r--lib/unicode/string/u8norm_nfkd.c94
-rw-r--r--test/_norm-test.h107
-rw-r--r--test/norm-nfd-test.c100
-rw-r--r--test/norm-nfkd-test.c2
-rwxr-xr-xtest/run-tests2
7 files changed, 213 insertions, 106 deletions
diff --git a/README b/README
index b56105f..0a92230 100644
--- a/README
+++ b/README
@@ -121,16 +121,16 @@ FEATURES:
Azeri, Dutch, German, Lithuanian, and Turkish.
• Iteration and counting of graphemes, words, and human-perceived
words in a string
- • NFD string normalization
+ • NFD- and NFKD string normalization
• Unicode-aware case-mapping of strings with custom allocator
support
PLANNED FEATURES:
- • Line- and sentence segmentation (unicode/string.h)
- • String collation (unicode/string.h)
- • NFC-, NFKC-, and NFKD string normalization (unicode/string.h)
+ • Line- and sentence segmentation (unicode/string.h)
+ • String collation (unicode/string.h)
+ • NFC- and NFKC string normalization (unicode/string.h)
BUGS:
diff --git a/include/unicode/string.h b/include/unicode/string.h
index a5b1cdb..06edb6c 100644
--- a/include/unicode/string.h
+++ b/include/unicode/string.h
@@ -34,8 +34,8 @@ size_t u8wnext_human(struct u8view *, struct u8view *);
alloc_fn, void *);
[[nodiscard]] char8_t *u8upper(size_t *, struct u8view, enum caseflags,
alloc_fn, void *);
-[[nodiscard]] char8_t *u8norm_nfc(size_t *, struct u8view, alloc_fn, void *);
[[nodiscard]] char8_t *u8norm_nfd(size_t *, struct u8view, alloc_fn, void *);
+[[nodiscard]] char8_t *u8norm_nfkd(size_t *, struct u8view, alloc_fn, void *);
/* Encoding-generic macros */
#define ucsgcnt(sv) _Generic((sv), struct u8view: u8gcnt)((sv))
@@ -57,10 +57,10 @@ size_t u8wnext_human(struct u8view *, struct u8view *);
#define ucsupper(dstn, sv, flags, alloc, ctx) \
_Generic((sv), struct u8view: u8upper)((dstn), (sv), (flags), (alloc), \
(ctx))
-#define ucsnorm_nfc(dstn, sv, alloc, ctx) \
- _Generic((sv), struct u8view: u8norm_nfc)((dstn), (sv), (alloc), (ctx))
#define ucsnorm_nfd(dstn, sv, alloc, ctx) \
_Generic((sv), struct u8view: u8norm_nfd)((dstn), (sv), (alloc), (ctx))
+#define ucsnorm_nfkd(dstn, sv, alloc, ctx) \
+ _Generic((sv), struct u8view: u8norm_nfkd)((dstn), (sv), (alloc), (ctx))
constexpr double U8CASEFOLD_SCALE = 3;
constexpr double U8LOWER_SCALE = 1.5;
diff --git a/lib/unicode/string/u8norm_nfkd.c b/lib/unicode/string/u8norm_nfkd.c
new file mode 100644
index 0000000..898b650
--- /dev/null
+++ b/lib/unicode/string/u8norm_nfkd.c
@@ -0,0 +1,94 @@
+#include <string.h>
+
+#include "macros.h"
+#include "mbstring.h"
+#include "unicode/prop.h"
+#include "unicode/string.h"
+
+static void decomp(char8_t *, size_t *, size_t, rune); /* defined after u8norm_nfkd() */
+
+/* Factor by which the input length is multiplied when sizing the output
+   buffer.  Computed using gen/scale-norm.c */
+constexpr int NFKD_SCALE = 11;
+
+/* For Hangul syllable decomposition: a syllable’s index (rune - SBASE) is
+   split arithmetically into offsets that are added to LBASE, VBASE, and
+   TBASE */
+constexpr rune SBASE = 0xAC00;
+constexpr rune LBASE = 0x1100;
+constexpr rune VBASE = 0x1161;
+constexpr rune TBASE = 0x11A7; /* an offset of 0 from TBASE means ‘nothing to write’ */
+constexpr int LCNT = 19;
+constexpr int VCNT = 21;
+constexpr int TCNT = 28;
+constexpr int NCNT = VCNT * TCNT; /* syllable indices sharing one LBASE offset */
+constexpr int SCNT = LCNT * NCNT; /* total number of syllable indices */
+
+/* Decompose every rune of src with decomp() into a freshly allocated buffer
+   and return that buffer; the number of bytes written is stored in *dstn.
+
+   dstn and alloc must not be null.  All memory is obtained through alloc,
+   which is passed ctx on every call. */
+char8_t *
+u8norm_nfkd(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx)
+{
+	ASSUME(dstn != nullptr);
+	ASSUME(alloc != nullptr);
+
+	/* Pre-allocate a buffer with some initial capacity; there is no need to
+	   check for overflow when computing bufsz because alloc() will handle the
+	   overflow error for us. */
+	size_t bufsz = src.len * NFKD_SCALE;
+	char8_t *dst = alloc(ctx, nullptr, 0, src.len, NFKD_SCALE, alignof(char8_t));
+
+	*dstn = 0;
+	rune ch;
+	while (ucsnext(&ch, &src) != 0)
+		decomp(dst, dstn, bufsz, ch);
+
+	/* Resize the buffer to the bytes actually written.  The element size here
+	   is 1, so the old size must be the full byte capacity (bufsz) and not
+	   src.len; otherwise the allocator is told the buffer is NFKD_SCALE times
+	   smaller than it really is. */
+	return alloc(ctx, dst, bufsz, *dstn, 1, alignof(char8_t));
+}
+
+/* Encode ch at the current end of the output and advance *dstn by the number
+   of bytes rtoucs() reports.  Expects dst, dstn, and bufsz to be in scope. */
+#define WRITE(ch) *dstn += rtoucs(dst + *dstn, bufsz - *dstn, (ch))
+
+/* Write the decomposition of ch into dst, which currently holds *dstn bytes
+   and has a capacity of bufsz bytes; *dstn is updated to the new length.
+
+   Three cases are handled:
+     • runes with a Hangul syllable type are split arithmetically;
+     • runes with a decomposition type are replaced by their decomposition
+       mapping, each rune of which is decomposed recursively;
+     • all other runes are emitted as-is, with runes of non-zero combining
+       class being inserted so that the preceding run of combining marks
+       stays sorted by combining class. */
+void
+decomp(char8_t *dst, size_t *dstn, size_t bufsz, rune ch)
+{
+	if (uprop_get_hst(ch) != HST_NA) {
+		int si = ch - SBASE;
+		/* Valid syllable indices are 0 … SCNT - 1; anything else (e.g. a lone
+		   jamo) is copied through untouched */
+		if (si < 0 || si >= SCNT) {
+			WRITE(ch);
+			return;
+		}
+
+		rune l = LBASE + si / NCNT;
+		rune v = VBASE + (si % NCNT) / TCNT;
+		rune t = TBASE + si % TCNT;
+		WRITE(l);
+		WRITE(v);
+		/* t == TBASE means there is no third component to write */
+		if (t != TBASE)
+			WRITE(t);
+	} else if (uprop_get_dt(ch) != DT_NONE) {
+		/* The mapping’s runes may themselves decompose further, so recurse */
+		struct rview rv = uprop_get_dm(ch);
+		for (size_t i = 0; i < rv.len; i++)
+			decomp(dst, dstn, bufsz, rv.p[i]);
+	} else {
+		enum uprop_ccc ccc = uprop_get_ccc(ch);
+		if (ccc == CCC_NR) {
+			WRITE(ch);
+			return;
+		}
+
+		/* Walk backwards over the output to find the insertion point: stop
+		   after the first rune that is not reordered or whose combining class
+		   does not exceed that of ch.  Using ‘<=’ keeps marks of equal class
+		   in their original relative order. */
+		int w;
+		rune hc;
+		char8_t *p = dst + *dstn;
+		while ((w = ucsprev(&hc, (const char8_t **)&p, dst)) != 0) {
+			enum uprop_ccc ccc2 = uprop_get_ccc(hc);
+			if (ccc2 == CCC_NR || ccc2 <= ccc) {
+				/* ucsprev() moved p to the start of hc; step forward over it
+				   again so that ch lands directly after hc */
+				p += w;
+				break;
+			}
+		}
+
+		/* If the loop ran to completion then no such rune exists and p is
+		   where the backwards walk stopped, so ch is inserted there ahead of
+		   everything that was skipped.  Shift the tail up to make room. */
+		char8_t tmp[U8_LEN_MAX];
+		int w2 = rtoucs(tmp, sizeof(tmp), ch);
+		memmove(p + w2, p, dst + *dstn - p);
+		memcpy(p, tmp, w2);
+		*dstn += w2;
+	}
+}
+
+#undef WRITE
diff --git a/test/_norm-test.h b/test/_norm-test.h
new file mode 100644
index 0000000..68209f1
--- /dev/null
+++ b/test/_norm-test.h
@@ -0,0 +1,107 @@
+#if !defined(NORMTYPE)
+# error "NORMTYPE must be defined"
+#endif
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <alloc.h>
+#include <dynarr.h>
+#include <errors.h>
+#include <macros.h>
+#include <mbstring.h>
+#include <rune.h>
+#include <unicode/string.h>
+
+#define TESTFILE "norm.in"
+#define FUNC CONCAT(ucsnorm_, NORMTYPE)
+
+static bool test(struct u8view, int);
+
+/* Run test() on every line of TESTFILE, numbering the lines from 1, and stop
+   at the first line that fails.  Returns EXIT_SUCCESS if every line passed
+   and EXIT_FAILURE otherwise. */
+int
+main(int, char **argv)
+{
+	mlib_setprogname(argv[0]);
+
+	FILE *fp = fopen(TESTFILE, "r");
+	if (fp == nullptr)
+		err("fopen: %s:", TESTFILE);
+
+	int rv = EXIT_SUCCESS;
+	char *line = nullptr;
+	/* Start with a well-defined capacity so that getline() never inspects an
+	   indeterminate value */
+	size_t n = 0;
+	ssize_t nr;
+
+	for (int id = 1; (nr = getline(&line, &n, fp)) > 0; id++) {
+		/* Drop the trailing newline, if any, before handing the line over */
+		if (line[nr - 1] == '\n')
+			line[--nr] = '\0';
+
+		if (!test((struct u8view){line, (size_t)nr}, id)) {
+			rv = EXIT_FAILURE;
+			break;
+		}
+	}
+	if (ferror(fp))
+		err("getline: %s:", TESTFILE);
+
+	free(line);
+	fclose(fp);
+	return rv;
+}
+
+/* Check one line of the test file.  sv holds ‘;’-separated columns, each a
+   ‘ ’-separated list of hexadecimal code points; id is the line number used
+   in diagnostics.
+
+   Each of the first five columns is normalized with FUNC and compared with
+   the expected column: when NORMTYPE is nfkd everything must normalize to
+   column 5, otherwise columns 1–3 must normalize to column 3 and columns 4–5
+   to column 5.  Returns true if all comparisons succeed; on the first
+   failure a warning is printed and false is returned. */
+bool
+test(struct u8view sv, int id)
+{
+	constexpr size_t NCOLS = 5;
+
+	bool rv = true;
+	arena a = mkarena(0);
+	struct arena_ctx ctx = {.a = &a};
+
+	dynarr(struct u8view) columns = {
+		.alloc = alloc_arena,
+		.ctx = &ctx,
+	};
+
+	/* Convert every column from its textual form into an encoded string */
+	struct u8view column;
+	while (ucscut(&column, &sv, U";", 1) != MBEND) {
+		dynarr(char8_t) s = {
+			.alloc = alloc_arena,
+			.ctx = &ctx,
+		};
+
+		rune delim;
+		struct u8view cp;
+		do {
+			rune ch;
+			delim = ucscut(&cp, &column, U" ", 1);
+			/* Without this check a malformed field would leave ch
+			   uninitialized */
+			if (sscanf(cp.p, "%" SCNxRUNE, &ch) != 1) {
+				warn("case %d: malformed code point", id);
+				rv = false;
+				goto out;
+			}
+			char8_t buf[U8_LEN_MAX];
+			int w = rtoucs(buf, sizeof(buf), ch);
+			DAEXTEND(&s, buf, w);
+		} while (delim != MBEND);
+
+		DAPUSH(&columns, ((struct u8view){s.buf, s.len}));
+	}
+
+	/* The comparisons below index columns 0 … NCOLS - 1 unconditionally */
+	if (columns.len < NCOLS) {
+		warn("case %d: expected %zu columns but got %zu", id, NCOLS,
+		     (size_t)columns.len);
+		rv = false;
+		goto out;
+	}
+
+	bool compat = streq(STR(NORMTYPE), "nfkd");
+	for (size_t i = 0; i < NCOLS; i++) {
+		size_t base = compat || i >= 3 ? 4 : 2;
+		struct u8view normd = {};
+		normd.p = FUNC(&normd.len, columns.buf[i], alloc_arena, &ctx);
+		if (!ucseq(columns.buf[base], normd)) {
+			warn("case %d: expected c%zu to be ‘%.*s’ but got ‘%.*s’", id,
+			     i + 1, SV_PRI_ARGS(columns.buf[base]), SV_PRI_ARGS(normd));
+			rv = false;
+			goto out;
+		}
+	}
+
+out:
+	arena_free(&a);
+	return rv;
+}
diff --git a/test/norm-nfd-test.c b/test/norm-nfd-test.c
index 95bc8d5..6067352 100644
--- a/test/norm-nfd-test.c
+++ b/test/norm-nfd-test.c
@@ -1,98 +1,2 @@
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <alloc.h>
-#include <dynarr.h>
-#include <errors.h>
-#include <macros.h>
-#include <mbstring.h>
-#include <rune.h>
-#include <unicode/string.h>
-
-#define TESTFILE "norm-nfd.in"
-
-static bool test(struct u8view, int);
-
-int
-main(int, char **argv)
-{
- int rv;
- size_t n;
- ssize_t nr;
- char *line;
- FILE *fp;
-
- rv = EXIT_SUCCESS;
- line = nullptr;
- mlib_setprogname(argv[0]);
-
- if ((fp = fopen(TESTFILE, "r")) == nullptr)
- err("fopen: %s:", TESTFILE);
-
- for (int id = 1; (nr = getline(&line, &n, fp)) > 0; id++) {
- if (line[nr - 1] == '\n')
- line[--nr] = '\0';
-
- if (!test((struct u8view){line, (size_t)nr}, id)) {
- rv = EXIT_FAILURE;
- break;
- }
- }
- if (ferror(fp))
- err("getline: %s:", TESTFILE);
-
- free(line);
- fclose(fp);
- return rv;
-}
-
-bool
-test(struct u8view sv, int id)
-{
- bool rv = true;
- arena a = mkarena(0);
- struct arena_ctx ctx = {.a = &a};
-
- dynarr(struct u8view) columns = {
- .alloc = alloc_arena,
- .ctx = &ctx,
- };
-
- struct u8view column;
- while (ucscut(&column, &sv, U";", 1) != MBEND) {
- dynarr(char8_t) s = {
- .alloc = alloc_arena,
- .ctx = &ctx,
- };
-
- rune _;
- struct u8view cp;
- do {
- rune ch;
- _ = ucscut(&cp, &column, U" ", 1);
- sscanf(cp.p, "%" SCNxRUNE, &ch);
- char8_t buf[U8_LEN_MAX];
- int w = rtoucs(buf, sizeof(buf), ch);
- DAEXTEND(&s, buf, w);
- } while (_ != MBEND);
-
- DAPUSH(&columns, ((struct u8view){s.buf, s.len}));
- }
-
- for (size_t i = 0; i < 5; i++) {
- size_t base = i < 3 ? 2 : 4;
- struct u8view normd = {};
- normd.p = ucsnorm_nfd(&normd.len, columns.buf[i], alloc_arena, &ctx);
- if (!ucseq(columns.buf[base], normd)) {
- warn("case %d: expected c%zu to be ‘%.*s’ but got ‘%.*s’", id,
- i + 1, SV_PRI_ARGS(columns.buf[base]), SV_PRI_ARGS(normd));
- rv = false;
- goto out;
- }
- }
-
-out:
- arena_free(&a);
- return rv;
-}
+#define NORMTYPE nfd
+#include "_norm-test.h"
diff --git a/test/norm-nfkd-test.c b/test/norm-nfkd-test.c
new file mode 100644
index 0000000..3fe8ff2
--- /dev/null
+++ b/test/norm-nfkd-test.c
@@ -0,0 +1,2 @@
+#define NORMTYPE nfkd
+#include "_norm-test.h"
diff --git a/test/run-tests b/test/run-tests
index 860d243..ae9c96e 100755
--- a/test/run-tests
+++ b/test/run-tests
@@ -32,7 +32,7 @@ grep '^[^#]' data/UppercaseTest >upper.in
grep '^[^#]' data/WordHumanBreakTest >wbrk-human.in
sed -En 's/\s+//g; s/÷?#.*//g; /./p' data/GraphemeBreakTest >gbrk.in
sed -En 's/\s+//g; s/÷?#.*//g; /./p' data/WordBreakTest >wbrk.in
-sed -En 's/(#|^@).*//; s/\s+$//; /./p' data/NormalizationTest >norm-nfd.in
+sed -En 's/(#|^@).*//; s/\s+$//; /./p' data/NormalizationTest >norm.in
for src in *.c
do