From 2e125c1c7e75db14a88f0b8b09e61a132977c63e Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Mon, 20 May 2024 17:56:55 +0200 Subject: Support the 4 forms of Unicode string normalization --- lib/unicode/string/u8norm.c | 192 +++++++++++++++++++++++++++++++++++++++ lib/unicode/string/u8norm_nfd.c | 94 ------------------- lib/unicode/string/u8norm_nfkd.c | 94 ------------------- 3 files changed, 192 insertions(+), 188 deletions(-) create mode 100644 lib/unicode/string/u8norm.c delete mode 100644 lib/unicode/string/u8norm_nfd.c delete mode 100644 lib/unicode/string/u8norm_nfkd.c (limited to 'lib/unicode') diff --git a/lib/unicode/string/u8norm.c b/lib/unicode/string/u8norm.c new file mode 100644 index 0000000..a918479 --- /dev/null +++ b/lib/unicode/string/u8norm.c @@ -0,0 +1,192 @@ +#include + +#include "macros.h" +#include "mbstring.h" +#include "unicode/_cm.h" +#include "unicode/prop.h" +#include "unicode/string.h" + +#define BETWEEN(x, y, z) ((x) <= (y) && (y) <= (z)) + +static void decomp(char8_t *, size_t *, size_t, rune, enum normtype); +static void compbuf(char8_t *, size_t *); + +/* Computed using a gen/scale-norm.c */ +constexpr int NFD_SCALE = 3; +constexpr int NFKD_SCALE = 11; + +/* For Hangul syllable decomposition */ +constexpr rune SBASE = 0xAC00; +constexpr rune LBASE = 0x1100; +constexpr rune VBASE = 0x1161; +constexpr rune TBASE = 0x11A7; +constexpr int LCNT = 19; +constexpr int VCNT = 21; +constexpr int TCNT = 28; +constexpr int NCNT = VCNT * TCNT; +constexpr int SCNT = LCNT * NCNT; + +char8_t * +u8norm(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx, + enum normtype nt) +{ + ASSUME(dstn != nullptr); + ASSUME(alloc != nullptr); + ASSUME(BETWEEN(0, nt, 4)); + + /* Pre-allocate a buffer with some initial capacity; there is no need to + check for overflow when computing bufsz because alloc() will handle the + overflow error for us. */ + int scale = (nt & 0b10) ? NFKD_SCALE : NFD_SCALE; + size_t bufsz = src.len * scale; + char8_t *dst = alloc(ctx, nullptr, 0, src.len, scale, alignof(char8_t)); + + *dstn = 0; + for (rune ch; ucsnext(&ch, &src) != 0; decomp(dst, dstn, bufsz, ch, nt)) + ; + if (nt & 0b01) + compbuf(dst, dstn); + return alloc(ctx, dst, src.len, *dstn, 1, alignof(char8_t)); +} + +#define WRITE(ch) *dstn += rtoucs(dst + *dstn, bufsz - *dstn, (ch)) + +void +decomp(char8_t *dst, size_t *dstn, size_t bufsz, rune ch, enum normtype nt) +{ + if (uprop_get_hst(ch) != HST_NA) { + int si = ch - SBASE; + if (si < 0 || si > SCNT) { + WRITE(ch); + return; + } + rune l, v, t; + l = LBASE + si / NCNT; + v = VBASE + (si % NCNT) / TCNT; + t = TBASE + si % TCNT; + WRITE(l); + WRITE(v); + if (t != TBASE) + WRITE(t); + } else if (((nt & 0b10) && uprop_get_dt(ch) != DT_NONE) + || ((nt & 0b10) == 0 && uprop_get_dt(ch) == DT_CAN)) + { + struct rview rv = uprop_get_dm(ch); + for (size_t i = 0; i < rv.len; i++) + decomp(dst, dstn, bufsz, rv.p[i], nt); + } else { + enum uprop_ccc ccc = uprop_get_ccc(ch); + if (ccc == CCC_NR) { + WRITE(ch); + return; + } + + int w; + rune hc; + char8_t *p = dst + *dstn; + while (w = ucsprev(&hc, (const char8_t **)&p, dst)) { + enum uprop_ccc ccc2 = uprop_get_ccc(hc); + if (ccc2 == CCC_NR || ccc2 <= ccc) { +out: + char8_t tmp[U8_LEN_MAX]; + int w2 = rtoucs(tmp, sizeof(tmp), ch); + p += w; + memmove(p + w2, p, dst + *dstn - p); + memcpy(p, tmp, w2); + *dstn += w2; + return; + } + } + + /* Loop didn’t early-return; append to the start */ + goto out; + } +} + +#undef WRITE + +/* The following implements the canonical composition algorithm, and it may be + useful to read it to understand what’s going on. It can be found under + §3.11 Normalization Forms of the Unicode standard, subsection ‘Canonical + Composition Algorithm’. */ + +void +compbuf(char8_t *dst, size_t *dstn) +{ + int wC, wL; + rune C, L; + struct u8view sv = {dst, *dstn}; + + while ((wL = ucsnext(&L, &sv)) != 0) { + if (uprop_get_ccc(L) != CCC_NR) + continue; + char8_t *after_L = (char8_t *)sv.p; + + enum uprop_ccc prevcc = 0; + struct u8view sv_ = sv; + + while ((wC = ucsnext(&C, &sv_)) != 0) { + enum uprop_ccc curcc = uprop_get_ccc(C); + bool blocked = curcc <= prevcc; + + if (blocked) { + if (curcc != CCC_NR) + continue; + if (curcc != prevcc) + break; + } + + prevcc = curcc; + rune comp = uprop_get_cm(L, C); + + /* Try Hangul composition */ + if (comp == 0) { + if (BETWEEN(LBASE, L, LBASE + LCNT - 1) + && BETWEEN(VBASE, C, VBASE + VCNT - 1)) + { + comp = SBASE + ((L - LBASE) * NCNT + (C - VBASE) * TCNT); + } else if (BETWEEN(TBASE, C, TBASE + TCNT - 1) + && BETWEEN(SBASE, L, SBASE + SCNT - 1) + && ((L - SBASE) % TCNT) == 0) + { + comp = L + (C - TBASE); + } + } + + if (comp != 0) { + char8_t *after_C = (char8_t *)sv_.p; + + /* Shift bytes between L & C so that they’re contiguous with the + bytes after C */ + memmove(after_L + wC, after_L, after_C - wC - after_L); + + /* Write the composition into where L was */ + int w = rtoucs(after_L - wL, wL + wC, comp); + + /* Shift the bytes after L & C to be right after the new + composition */ + memmove(after_L - wL + w, after_L + wC, + *dstn - (after_L + wC - dst)); + + /* Correct *dstn */ + int shift = wL + wC - w; + *dstn -= shift; + + /* Fix the inner string view */ + sv_.p = after_C - shift; + sv_.len = *dstn - (sv_.p - dst); + + /* Fix outer string view */ + sv.p = sv.p - wL + w; + sv.len = *dstn - (sv.p - dst); + after_L = (char8_t *)sv.p; + + /* Update the value of L */ + L = comp; + wL = w; + prevcc = CCC_NR; + } else if (blocked) + break; + } + } +} diff --git a/lib/unicode/string/u8norm_nfd.c b/lib/unicode/string/u8norm_nfd.c deleted file mode 100644 index a89a1b5..0000000 --- a/lib/unicode/string/u8norm_nfd.c +++ /dev/null @@ -1,94 +0,0 @@ -#include - -#include "macros.h" -#include "mbstring.h" -#include "unicode/prop.h" -#include "unicode/string.h" - -static void decomp(char8_t *, size_t *, size_t, rune); - -/* Computed using a gen/scale-norm.c */ -constexpr int NFD_SCALE = 3; - -/* For Hangul syllable decomposition */ -constexpr rune SBASE = 0xAC00; -constexpr rune LBASE = 0x1100; -constexpr rune VBASE = 0x1161; -constexpr rune TBASE = 0x11A7; -constexpr int LCNT = 19; -constexpr int VCNT = 21; -constexpr int TCNT = 28; -constexpr int NCNT = VCNT * TCNT; -constexpr int SCNT = LCNT * NCNT; - -char8_t * -u8norm_nfd(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx) -{ - ASSUME(dstn != nullptr); - ASSUME(alloc != nullptr); - - /* Pre-allocate a buffer with some initial capacity; there is no need to - check for overflow when computing bufsz because alloc() will handle the - overflow error for us. */ - size_t bufsz = src.len * NFD_SCALE; - char8_t *dst = alloc(ctx, nullptr, 0, src.len, NFD_SCALE, alignof(char8_t)); - - *dstn = 0; - for (rune ch; ucsnext(&ch, &src) != 0; decomp(dst, dstn, bufsz, ch)) - ; - return alloc(ctx, dst, src.len, *dstn, 1, alignof(char8_t)); -} - -#define WRITE(ch) *dstn += rtoucs(dst + *dstn, bufsz - *dstn, (ch)) - -void -decomp(char8_t *dst, size_t *dstn, size_t bufsz, rune ch) -{ - if (uprop_get_hst(ch) != HST_NA) { - int si = ch - SBASE; - if (si < 0 || si > SCNT) { - WRITE(ch); - return; - } - rune l, v, t; - l = LBASE + si / NCNT; - v = VBASE + (si % NCNT) / TCNT; - t = TBASE + si % TCNT; - WRITE(l); - WRITE(v); - if (t != TBASE) - WRITE(t); - } else if (uprop_get_dt(ch) == DT_CAN) { - struct rview rv = uprop_get_dm(ch); - for (size_t i = 0; i < rv.len; i++) - decomp(dst, dstn, bufsz, rv.p[i]); - } else { - enum uprop_ccc ccc = uprop_get_ccc(ch); - if (ccc == CCC_NR) { - WRITE(ch); - return; - } - - int w; - rune hc; - char8_t *p = dst + *dstn; - while (w = ucsprev(&hc, (const char8_t **)&p, dst)) { - enum uprop_ccc ccc2 = uprop_get_ccc(hc); - if (ccc2 == CCC_NR || ccc2 <= ccc) { -out: - char8_t tmp[U8_LEN_MAX]; - int w2 = rtoucs(tmp, sizeof(tmp), ch); - p += w; - memmove(p + w2, p, dst + *dstn - p); - memcpy(p, tmp, w2); - *dstn += w2; - return; - } - } - - /* Loop didn’t early-return; append to the start */ - goto out; - } -} - -#undef WRITE diff --git a/lib/unicode/string/u8norm_nfkd.c b/lib/unicode/string/u8norm_nfkd.c deleted file mode 100644 index 898b650..0000000 --- a/lib/unicode/string/u8norm_nfkd.c +++ /dev/null @@ -1,94 +0,0 @@ -#include - -#include "macros.h" -#include "mbstring.h" -#include "unicode/prop.h" -#include "unicode/string.h" - -static void decomp(char8_t *, size_t *, size_t, rune); - -/* Computed using a gen/scale-norm.c */ -constexpr int NFKD_SCALE = 11; - -/* For Hangul syllable decomposition */ -constexpr rune SBASE = 0xAC00; -constexpr rune LBASE = 0x1100; -constexpr rune VBASE = 0x1161; -constexpr rune TBASE = 0x11A7; -constexpr int LCNT = 19; -constexpr int VCNT = 21; -constexpr int TCNT = 28; -constexpr int NCNT = VCNT * TCNT; -constexpr int SCNT = LCNT * NCNT; - -char8_t * -u8norm_nfkd(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx) -{ - ASSUME(dstn != nullptr); - ASSUME(alloc != nullptr); - - /* Pre-allocate a buffer with some initial capacity; there is no need to - check for overflow when computing bufsz because alloc() will handle the - overflow error for us. */ - size_t bufsz = src.len * NFKD_SCALE; - char8_t *dst = alloc(ctx, nullptr, 0, src.len, NFKD_SCALE, alignof(char8_t)); - - *dstn = 0; - for (rune ch; ucsnext(&ch, &src) != 0; decomp(dst, dstn, bufsz, ch)) - ; - return alloc(ctx, dst, src.len, *dstn, 1, alignof(char8_t)); -} - -#define WRITE(ch) *dstn += rtoucs(dst + *dstn, bufsz - *dstn, (ch)) - -void -decomp(char8_t *dst, size_t *dstn, size_t bufsz, rune ch) -{ - if (uprop_get_hst(ch) != HST_NA) { - int si = ch - SBASE; - if (si < 0 || si > SCNT) { - WRITE(ch); - return; - } - rune l, v, t; - l = LBASE + si / NCNT; - v = VBASE + (si % NCNT) / TCNT; - t = TBASE + si % TCNT; - WRITE(l); - WRITE(v); - if (t != TBASE) - WRITE(t); - } else if (uprop_get_dt(ch) != DT_NONE) { - struct rview rv = uprop_get_dm(ch); - for (size_t i = 0; i < rv.len; i++) - decomp(dst, dstn, bufsz, rv.p[i]); - } else { - enum uprop_ccc ccc = uprop_get_ccc(ch); - if (ccc == CCC_NR) { - WRITE(ch); - return; - } - - int w; - rune hc; - char8_t *p = dst + *dstn; - while (w = ucsprev(&hc, (const char8_t **)&p, dst)) { - enum uprop_ccc ccc2 = uprop_get_ccc(hc); - if (ccc2 == CCC_NR || ccc2 <= ccc) { -out: - char8_t tmp[U8_LEN_MAX]; - int w2 = rtoucs(tmp, sizeof(tmp), ch); - p += w; - memmove(p + w2, p, dst + *dstn - p); - memcpy(p, tmp, w2); - *dstn += w2; - return; - } - } - - /* Loop didn’t early-return; append to the start */ - goto out; - } -} - -#undef WRITE -- cgit v1.2.3