diff options
author | Thomas Voss <mail@thomasvoss.com> | 2024-05-14 20:53:13 +0200 |
---|---|---|
committer | Thomas Voss <mail@thomasvoss.com> | 2024-05-14 20:53:22 +0200 |
commit | a39a4797730a09ec4fbb41c11b7dc1f7d245bc15 (patch) | |
tree | 13f2a025d52535d66cc8e1c4be0aa502ffaeab7b /lib/unicode/string | |
parent | 4e88af1babd6555c389b1e14316c29b78146f8f0 (diff) |
Implement NFD string normalization
Diffstat (limited to 'lib/unicode/string')
-rw-r--r-- | lib/unicode/string/u8norm_nfd.c | 126 |
1 files changed, 126 insertions, 0 deletions
diff --git a/lib/unicode/string/u8norm_nfd.c b/lib/unicode/string/u8norm_nfd.c new file mode 100644 index 0000000..8f142e1 --- /dev/null +++ b/lib/unicode/string/u8norm_nfd.c @@ -0,0 +1,126 @@ +#include <string.h> + +#include "macros.h" +#include "mbstring.h" +#include "unicode/prop.h" +#include "unicode/string.h" + +#include <stdio.h> + +static size_t quickchk_spn(struct u8view); +static void decomp(char8_t *, size_t *, size_t, rune); + +/* Computed using a gen/scale-norm.c */ +constexpr int NFD_SCALE = 3; + +/* For Hangul syllable decomposition */ +constexpr rune SBASE = 0xAC00; +constexpr rune LBASE = 0x1100; +constexpr rune VBASE = 0x1161; +constexpr rune TBASE = 0x11A7; +constexpr int LCNT = 19; +constexpr int VCNT = 21; +constexpr int TCNT = 28; +constexpr int NCNT = VCNT * TCNT; +constexpr int SCNT = LCNT * NCNT; + +char8_t * +u8norm_nfd(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx) +{ + ASSUME(dstn != nullptr); + ASSUME(alloc != nullptr); + + /* Pre-allocate a buffer with some initial capacity; there is no need to + check for overflow when computing bufsz because alloc() will handle the + overflow error for us. */ + size_t bufsz = src.len * NFD_SCALE; + uint8_t *dst = alloc(ctx, nullptr, 0, src.len, NFD_SCALE, alignof(char8_t)); + + /* Copy over the initial codepoints that are already in NFD; if the entire + string is in NFD then just return it immediately */ + size_t spn = quickchk_spn(src); + memcpy(dst, src.p, spn); + *dstn = spn; + if (spn == src.len) + return dst; + VSHFT(&src, spn); + + rune ch; + while (u8next(&ch, &src) != 0) + decomp(dst, dstn, bufsz, ch); + return alloc(ctx, dst, src.len, *dstn, 1, alignof(char8_t)); +} + +#define WRITE(ch) *dstn += rtou8(dst + *dstn, bufsz - *dstn, (ch)) + +void +decomp(char8_t *dst, size_t *dstn, size_t bufsz, rune ch) +{ + if (uprop_get_hst(ch) != HST_NA) { + int si = ch - SBASE; + if (si < 0 || si > SCNT) { + WRITE(ch); + return; + } + rune l, v, t; + l = LBASE + si / NCNT; + v = VBASE + (si % NCNT) / TCNT; + t = TBASE + si % TCNT; + WRITE(l); + WRITE(v); + if (t != TBASE) + WRITE(t); + } else if (uprop_get_dt(ch) == DT_CAN) { + struct rview rv = uprop_get_dm(ch); + for (size_t i = 0; i < rv.len; i++) + decomp(dst, dstn, bufsz, rv.p[i]); + } else { + enum uprop_ccc ccc = uprop_get_ccc(ch); + if (ccc == CCC_NR) { + WRITE(ch); + return; + } + + int w; + rune hc; + char8_t *p = dst + *dstn; + while (w = u8prev(&hc, (const char8_t **)&p, dst)) { + enum uprop_ccc ccc2 = uprop_get_ccc(hc); + if (ccc2 == CCC_NR || ccc2 <= ccc) { +out: + char8_t tmp[U8_LEN_MAX]; + int w2 = rtou8(tmp, sizeof(tmp), ch); + p += w; + memmove(p + w2, p, dst + *dstn - p); + memcpy(p, tmp, w2); + *dstn += w2; + return; + } + } + + /* Loop didn’t early-return; append to the start */ + goto out; + } +} + +#undef WRITE + +size_t +quickchk_spn(struct u8view src) +{ + rune ch; + size_t spn = 0; + enum uprop_ccc prv, cur; + + prv = cur = CCC_NR; + + for (int w; w = u8next(&ch, &src); spn += w) { + if (uprop_get_nfd_qc(ch) == NFD_QC_N) + break; + if ((cur = uprop_get_ccc(ch)) < prv) + break; + prv = cur; + } + + return spn; +} |