From 5498793a56b19da99b7b6856c953933e50b8d572 Mon Sep 17 00:00:00 2001
From: Thomas Voss
Date: Wed, 15 May 2024 00:43:54 +0200
Subject: Add ucsnorm_nfkd()

---
 README                           |   8 +--
 include/unicode/string.h         |   6 +--
 lib/unicode/string/u8norm_nfkd.c | 114 ++++++++++++++++++++++++++++++++++++
 test/_norm-test.h                | 113 +++++++++++++++++++++++++++++++++++
 test/norm-nfd-test.c             | 100 +------------------------------
 test/norm-nfkd-test.c            |   2 +
 test/run-tests                   |   2 +-
 7 files changed, 239 insertions(+), 106 deletions(-)
 create mode 100644 lib/unicode/string/u8norm_nfkd.c
 create mode 100644 test/_norm-test.h
 create mode 100644 test/norm-nfkd-test.c

diff --git a/README b/README
index b56105f..0a92230 100644
--- a/README
+++ b/README
@@ -121,16 +121,16 @@ FEATURES:
     Azeri, Dutch, German, Lithuanian, and Turkish.
   • Iteration and counting of graphemes, words, and human-precieved words
     in a string
-  • NFD string normalization
+  • NFD- and NFKD string normalization
   • Unicode-aware case-mapping of strings with custom allocator support
 
 
 PLANNED FEATURES:
 
-  • Line- and sentence segmentation              (unicode/string.h)
-  • String collation                             (unicode/string.h)
-  • NFC-, NFKC-, and NFKD string normalization   (unicode/string.h)
+  • Line- and sentence segmentation       (unicode/string.h)
+  • String collation                      (unicode/string.h)
+  • NFC-, and NFKC string normalization   (unicode/string.h)
 
 
 BUGS:
 
diff --git a/include/unicode/string.h b/include/unicode/string.h
index a5b1cdb..06edb6c 100644
--- a/include/unicode/string.h
+++ b/include/unicode/string.h
@@ -34,8 +34,8 @@ size_t u8wnext_human(struct u8view *, struct u8view *);
                                alloc_fn, void *);
 [[nodiscard]] char8_t *u8upper(size_t *, struct u8view, enum caseflags,
                                alloc_fn, void *);
-[[nodiscard]] char8_t *u8norm_nfc(size_t *, struct u8view, alloc_fn, void *);
 [[nodiscard]] char8_t *u8norm_nfd(size_t *, struct u8view, alloc_fn, void *);
+[[nodiscard]] char8_t *u8norm_nfkd(size_t *, struct u8view, alloc_fn, void *);
 
 /* Encoding-generic macros */
 #define ucsgcnt(sv) _Generic((sv), struct u8view: u8gcnt)((sv))
@@ -57,10 +57,10 @@ size_t u8wnext_human(struct u8view *, struct u8view *);
 #define ucsupper(dstn, sv, flags, alloc, ctx) \
 	_Generic((sv), struct u8view: u8upper)((dstn), (sv), (flags), (alloc), \
 	                                       (ctx))
-#define ucsnorm_nfc(dstn, sv, alloc, ctx) \
-	_Generic((sv), struct u8view: u8norm_nfc)((dstn), (sv), (alloc), (ctx))
 #define ucsnorm_nfd(dstn, sv, alloc, ctx) \
 	_Generic((sv), struct u8view: u8norm_nfd)((dstn), (sv), (alloc), (ctx))
+#define ucsnorm_nfkd(dstn, sv, alloc, ctx) \
+	_Generic((sv), struct u8view: u8norm_nfkd)((dstn), (sv), (alloc), (ctx))
 
 constexpr double U8CASEFOLD_SCALE = 3;
 constexpr double U8LOWER_SCALE = 1.5;
diff --git a/lib/unicode/string/u8norm_nfkd.c b/lib/unicode/string/u8norm_nfkd.c
new file mode 100644
index 0000000..898b650
--- /dev/null
+++ b/lib/unicode/string/u8norm_nfkd.c
@@ -0,0 +1,114 @@
+#include <string.h>
+
+#include "macros.h"
+#include "mbstring.h"
+#include "unicode/prop.h"
+#include "unicode/string.h"
+
+static void decomp(char8_t *, size_t *, size_t, rune);
+
+/* Computed using gen/scale-norm.c */
+constexpr int NFKD_SCALE = 11;
+
+/* For Hangul syllable decomposition */
+constexpr rune SBASE = 0xAC00;
+constexpr rune LBASE = 0x1100;
+constexpr rune VBASE = 0x1161;
+constexpr rune TBASE = 0x11A7;
+constexpr int LCNT = 19;
+constexpr int VCNT = 21;
+constexpr int TCNT = 28;
+constexpr int NCNT = VCNT * TCNT;
+constexpr int SCNT = LCNT * NCNT;
+
+/* Decompose each rune of SRC into a buffer obtained from ALLOC, store the
+   number of bytes written in *DSTN, and return the buffer after resizing it
+   to that length.  Neither DSTN nor ALLOC may be null. */
+char8_t *
+u8norm_nfkd(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx)
+{
+	ASSUME(dstn != nullptr);
+	ASSUME(alloc != nullptr);
+
+	/* Pre-allocate a buffer with some initial capacity; there is no need to
+	   check for overflow when computing bufsz because alloc() will handle the
+	   overflow error for us. */
+	size_t bufsz = src.len * NFKD_SCALE;
+	char8_t *dst = alloc(ctx, nullptr, 0, src.len, NFKD_SCALE, alignof(char8_t));
+
+	*dstn = 0;
+	for (rune ch; ucsnext(&ch, &src) != 0; decomp(dst, dstn, bufsz, ch))
+		;
+
+	/* The buffer was requested as src.len elements of NFKD_SCALE bytes, so
+	   its old size counted in single bytes is bufsz rather than src.len.
+	   NOTE(review): assumes the third argument of alloc() is the old element
+	   count in units of the fifth argument; confirm against alloc_fn. */
+	return alloc(ctx, dst, bufsz, *dstn, 1, alignof(char8_t));
+}
+
+/* Encode CH at the end of DST and advance *DSTN by the bytes written */
+#define WRITE(ch) *dstn += rtoucs(dst + *dstn, bufsz - *dstn, (ch))
+
+/* Decompose CH into DST, advancing *DSTN by the number of bytes added.  Runes
+   with a Hangul syllable type are split arithmetically, runes with a
+   decomposition type are replaced by the recursive decomposition of their
+   mapping, and any other rune is appended or, if its combining class is not
+   CCC_NR, inserted behind the nearest preceding rune whose class is CCC_NR
+   or does not exceed its own. */
+void
+decomp(char8_t *dst, size_t *dstn, size_t bufsz, rune ch)
+{
+	if (uprop_get_hst(ch) != HST_NA) {
+		int si = ch - SBASE;
+		/* Precomposed syllable indices run from 0 to SCNT - 1; any other
+		   rune with a Hangul syllable type is copied through unchanged */
+		if (si < 0 || si >= SCNT) {
+			WRITE(ch);
+			return;
+		}
+		rune l, v, t;
+		l = LBASE + si / NCNT;
+		v = VBASE + (si % NCNT) / TCNT;
+		t = TBASE + si % TCNT;
+		WRITE(l);
+		WRITE(v);
+		if (t != TBASE)
+			WRITE(t);
+	} else if (uprop_get_dt(ch) != DT_NONE) {
+		struct rview rv = uprop_get_dm(ch);
+		for (size_t i = 0; i < rv.len; i++)
+			decomp(dst, dstn, bufsz, rv.p[i]);
+	} else {
+		enum uprop_ccc ccc = uprop_get_ccc(ch);
+		if (ccc == CCC_NR) {
+			WRITE(ch);
+			return;
+		}
+
+		/* Walk backwards from the end of the output to find where CH goes */
+		int w;
+		rune hc;
+		char8_t *p = dst + *dstn;
+		while ((w = ucsprev(&hc, (const char8_t **)&p, dst)) != 0) {
+			enum uprop_ccc ccc2 = uprop_get_ccc(hc);
+			if (ccc2 == CCC_NR || ccc2 <= ccc) {
+out:
+				char8_t tmp[U8_LEN_MAX];
+				int w2 = rtoucs(tmp, sizeof(tmp), ch);
+				/* p is presumed to sit in front of hc after ucsprev(); skip
+				   past hc again (w is 0 when arriving via the goto below) */
+				p += w;
+				memmove(p + w2, p, dst + *dstn - p);
+				memcpy(p, tmp, w2);
+				*dstn += w2;
+				return;
+			}
+		}
+
+		/* Loop didn’t early-return; append to the start */
+		goto out;
+	}
+}
+
+#undef WRITE
diff --git a/test/_norm-test.h b/test/_norm-test.h
new file mode 100644
index 0000000..68209f1
--- /dev/null
+++ b/test/_norm-test.h
@@ -0,0 +1,113 @@
+#if !defined(NORMTYPE)
+#	error "NORMTYPE must be defined"
+#endif
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <alloc.h>
+#include <dynarr.h>
+#include <errors.h>
+#include <macros.h>
+#include <mbstring.h>
+#include <rune.h>
+#include <unicode/string.h>
+
+#define TESTFILE "norm.in"
+#define FUNC     CONCAT(ucsnorm_, NORMTYPE)
+
+static bool test(struct u8view, int);
+
+/* Run test() on every line of TESTFILE, stopping at the first failure */
+int
+main(int, char **argv)
+{
+	int rv;
+	size_t n;
+	ssize_t nr;
+	char *line;
+	FILE *fp;
+
+	rv = EXIT_SUCCESS;
+	line = nullptr;
+	mlib_setprogname(argv[0]);
+
+	if ((fp = fopen(TESTFILE, "r")) == nullptr)
+		err("fopen: %s:", TESTFILE);
+
+	for (int id = 1; (nr = getline(&line, &n, fp)) > 0; id++) {
+		if (line[nr - 1] == '\n')
+			line[--nr] = '\0';
+
+		if (!test((struct u8view){line, (size_t)nr}, id)) {
+			rv = EXIT_FAILURE;
+			break;
+		}
+	}
+	if (ferror(fp))
+		err("getline: %s:", TESTFILE);
+
+	free(line);
+	fclose(fp);
+	return rv;
+}
+
+/* Split SV on ‘;’ into columns of space-separated codepoints read with
+   SCNxRUNE, normalize each of the first five columns with FUNC, and compare
+   the result with the expected column; return false on the first mismatch. */
+bool
+test(struct u8view sv, int id)
+{
+	bool rv = true;
+	arena a = mkarena(0);
+	struct arena_ctx ctx = {.a = &a};
+
+	dynarr(struct u8view) columns = {
+		.alloc = alloc_arena,
+		.ctx = &ctx,
+	};
+
+	struct u8view column;
+	while (ucscut(&column, &sv, U";", 1) != MBEND) {
+		dynarr(char8_t) s = {
+			.alloc = alloc_arena,
+			.ctx = &ctx,
+		};
+
+		rune _;
+		struct u8view cp;
+		do {
+			rune ch;
+			_ = ucscut(&cp, &column, U" ", 1);
+			sscanf(cp.p, "%" SCNxRUNE, &ch);
+			char8_t buf[U8_LEN_MAX];
+			int w = rtoucs(buf, sizeof(buf), ch);
+			DAEXTEND(&s, buf, w);
+		} while (_ != MBEND);
+
+		DAPUSH(&columns, ((struct u8view){s.buf, s.len}));
+	}
+
+	for (size_t i = 0; i < 5; i++) {
+		size_t base;
+		/* For nfkd every input must match column 5; otherwise inputs 1–3
+		   must match column 3 and inputs 4–5 must match column 5 */
+		if (streq(STR(NORMTYPE), "nfkd"))
+			base = 4;
+		else
+			base = i < 3 ? 2 : 4;
+		struct u8view normd = {};
+		normd.p = FUNC(&normd.len, columns.buf[i], alloc_arena, &ctx);
+		if (!ucseq(columns.buf[base], normd)) {
+			warn("case %d: expected c%zu to be ‘%.*s’ but got ‘%.*s’", id,
+			     i + 1, SV_PRI_ARGS(columns.buf[base]), SV_PRI_ARGS(normd));
+			rv = false;
+			goto out;
+		}
+	}
+
+out:
+	arena_free(&a);
+	return rv;
+}
diff --git a/test/norm-nfd-test.c b/test/norm-nfd-test.c
index 95bc8d5..6067352 100644
--- a/test/norm-nfd-test.c
+++ b/test/norm-nfd-test.c
@@ -1,98 +1,2 @@
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <alloc.h>
-#include <dynarr.h>
-#include <errors.h>
-#include <macros.h>
-#include <mbstring.h>
-#include <rune.h>
-#include <unicode/string.h>
-
-#define TESTFILE "norm-nfd.in"
-
-static bool test(struct u8view, int);
-
-int
-main(int, char **argv)
-{
-	int rv;
-	size_t n;
-	ssize_t nr;
-	char *line;
-	FILE *fp;
-
-	rv = EXIT_SUCCESS;
-	line = nullptr;
-	mlib_setprogname(argv[0]);
-
-	if ((fp = fopen(TESTFILE, "r")) == nullptr)
-		err("fopen: %s:", TESTFILE);
-
-	for (int id = 1; (nr = getline(&line, &n, fp)) > 0; id++) {
-		if (line[nr - 1] == '\n')
-			line[--nr] = '\0';
-
-		if (!test((struct u8view){line, (size_t)nr}, id)) {
-			rv = EXIT_FAILURE;
-			break;
-		}
-	}
-	if (ferror(fp))
-		err("getline: %s:", TESTFILE);
-
-	free(line);
-	fclose(fp);
-	return rv;
-}
-
-bool
-test(struct u8view sv, int id)
-{
-	bool rv = true;
-	arena a = mkarena(0);
-	struct arena_ctx ctx = {.a = &a};
-
-	dynarr(struct u8view) columns = {
-		.alloc = alloc_arena,
-		.ctx = &ctx,
-	};
-
-	struct u8view column;
-	while (ucscut(&column, &sv, U";", 1) != MBEND) {
-		dynarr(char8_t) s = {
-			.alloc = alloc_arena,
-			.ctx = &ctx,
-		};
-
-		rune _;
-		struct u8view cp;
-		do {
-			rune ch;
-			_ = ucscut(&cp, &column, U" ", 1);
-			sscanf(cp.p, "%" SCNxRUNE, &ch);
-			char8_t buf[U8_LEN_MAX];
-			int w = rtoucs(buf, sizeof(buf), ch);
-			DAEXTEND(&s, buf, w);
-		} while (_ != MBEND);
-
-		DAPUSH(&columns, ((struct u8view){s.buf, s.len}));
-	}
-
-	for (size_t i = 0; i < 5; i++) {
-		size_t base = i < 3 ? 2 : 4;
-		struct u8view normd = {};
-		normd.p = ucsnorm_nfd(&normd.len, columns.buf[i], alloc_arena, &ctx);
-		if (!ucseq(columns.buf[base], normd)) {
-			warn("case %d: expected c%zu to be ‘%.*s’ but got ‘%.*s’", id,
-			     i + 1, SV_PRI_ARGS(columns.buf[base]), SV_PRI_ARGS(normd));
-			rv = false;
-			goto out;
-		}
-	}
-
-out:
-	arena_free(&a);
-	return rv;
-}
+#define NORMTYPE nfd
+#include "_norm-test.h"
diff --git a/test/norm-nfkd-test.c b/test/norm-nfkd-test.c
new file mode 100644
index 0000000..3fe8ff2
--- /dev/null
+++ b/test/norm-nfkd-test.c
@@ -0,0 +1,2 @@
+#define NORMTYPE nfkd
+#include "_norm-test.h"
diff --git a/test/run-tests b/test/run-tests
index 860d243..ae9c96e 100755
--- a/test/run-tests
+++ b/test/run-tests
@@ -32,7 +32,7 @@ grep '^[^#]' data/UppercaseTest >upper.in
 grep '^[^#]' data/WordHumanBreakTest >wbrk-human.in
 sed -En 's/\s+//g; s/÷?#.*//g; /./p' data/GraphemeBreakTest >gbrk.in
 sed -En 's/\s+//g; s/÷?#.*//g; /./p' data/WordBreakTest >wbrk.in
-sed -En 's/(#|^@).*//; s/\s+$//; /./p' data/NormalizationTest >norm-nfd.in
+sed -En 's/(#|^@).*//; s/\s+$//; /./p' data/NormalizationTest >norm.in
 
 for src in *.c
 do
-- 
cgit v1.2.3