From 5498793a56b19da99b7b6856c953933e50b8d572 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Wed, 15 May 2024 00:43:54 +0200 Subject: Add ucsnorm_nfkd() --- lib/unicode/string/u8norm_nfkd.c | 94 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 lib/unicode/string/u8norm_nfkd.c (limited to 'lib/unicode') diff --git a/lib/unicode/string/u8norm_nfkd.c b/lib/unicode/string/u8norm_nfkd.c new file mode 100644 index 0000000..898b650 --- /dev/null +++ b/lib/unicode/string/u8norm_nfkd.c @@ -0,0 +1,94 @@ +#include + +#include "macros.h" +#include "mbstring.h" +#include "unicode/prop.h" +#include "unicode/string.h" + +static void decomp(char8_t *, size_t *, size_t, rune); + +/* Computed using a gen/scale-norm.c */ +constexpr int NFKD_SCALE = 11; + +/* For Hangul syllable decomposition */ +constexpr rune SBASE = 0xAC00; +constexpr rune LBASE = 0x1100; +constexpr rune VBASE = 0x1161; +constexpr rune TBASE = 0x11A7; +constexpr int LCNT = 19; +constexpr int VCNT = 21; +constexpr int TCNT = 28; +constexpr int NCNT = VCNT * TCNT; +constexpr int SCNT = LCNT * NCNT; + +char8_t * +u8norm_nfkd(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx) +{ + ASSUME(dstn != nullptr); + ASSUME(alloc != nullptr); + + /* Pre-allocate a buffer with some initial capacity; there is no need to + check for overflow when computing bufsz because alloc() will handle the + overflow error for us. */ + size_t bufsz = src.len * NFKD_SCALE; + char8_t *dst = alloc(ctx, nullptr, 0, src.len, NFKD_SCALE, alignof(char8_t)); + + *dstn = 0; + for (rune ch; ucsnext(&ch, &src) != 0; decomp(dst, dstn, bufsz, ch)) + ; + return alloc(ctx, dst, src.len, *dstn, 1, alignof(char8_t)); +} + +#define WRITE(ch) *dstn += rtoucs(dst + *dstn, bufsz - *dstn, (ch)) + +void +decomp(char8_t *dst, size_t *dstn, size_t bufsz, rune ch) +{ + if (uprop_get_hst(ch) != HST_NA) { + int si = ch - SBASE; + if (si < 0 || si > SCNT) { + WRITE(ch); + return; + } + rune l, v, t; + l = LBASE + si / NCNT; + v = VBASE + (si % NCNT) / TCNT; + t = TBASE + si % TCNT; + WRITE(l); + WRITE(v); + if (t != TBASE) + WRITE(t); + } else if (uprop_get_dt(ch) != DT_NONE) { + struct rview rv = uprop_get_dm(ch); + for (size_t i = 0; i < rv.len; i++) + decomp(dst, dstn, bufsz, rv.p[i]); + } else { + enum uprop_ccc ccc = uprop_get_ccc(ch); + if (ccc == CCC_NR) { + WRITE(ch); + return; + } + + int w; + rune hc; + char8_t *p = dst + *dstn; + while (w = ucsprev(&hc, (const char8_t **)&p, dst)) { + enum uprop_ccc ccc2 = uprop_get_ccc(hc); + if (ccc2 == CCC_NR || ccc2 <= ccc) { +out: + char8_t tmp[U8_LEN_MAX]; + int w2 = rtoucs(tmp, sizeof(tmp), ch); + p += w; + memmove(p + w2, p, dst + *dstn - p); + memcpy(p, tmp, w2); + *dstn += w2; + return; + } + } + + /* Loop didn’t early-return; append to the start */ + goto out; + } +} + +#undef WRITE -- cgit v1.2.3