aboutsummaryrefslogtreecommitdiff
path: root/lib/unicode
diff options
context:
space:
mode:
Diffstat (limited to 'lib/unicode')
-rw-r--r--lib/unicode/string/u8norm_nfkd.c94
1 files changed, 94 insertions, 0 deletions
diff --git a/lib/unicode/string/u8norm_nfkd.c b/lib/unicode/string/u8norm_nfkd.c
new file mode 100644
index 0000000..898b650
--- /dev/null
+++ b/lib/unicode/string/u8norm_nfkd.c
@@ -0,0 +1,94 @@
+#include <string.h>
+
+#include "macros.h"
+#include "mbstring.h"
+#include "unicode/prop.h"
+#include "unicode/string.h"
+
+static void decomp(char8_t *, size_t *, size_t, rune);
+
+/* Computed using a gen/scale-norm.c */
+constexpr int NFKD_SCALE = 11;
+
+/* For Hangul syllable decomposition */
+constexpr rune SBASE = 0xAC00;
+constexpr rune LBASE = 0x1100;
+constexpr rune VBASE = 0x1161;
+constexpr rune TBASE = 0x11A7;
+constexpr int LCNT = 19;
+constexpr int VCNT = 21;
+constexpr int TCNT = 28;
+constexpr int NCNT = VCNT * TCNT;
+constexpr int SCNT = LCNT * NCNT;
+
+char8_t *
+u8norm_nfkd(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx)
+{
+ ASSUME(dstn != nullptr);
+ ASSUME(alloc != nullptr);
+
+ /* Pre-allocate a buffer with some initial capacity; there is no need to
+ check for overflow when computing bufsz because alloc() will handle the
+ overflow error for us. */
+ size_t bufsz = src.len * NFKD_SCALE;
+ char8_t *dst = alloc(ctx, nullptr, 0, src.len, NFKD_SCALE, alignof(char8_t));
+
+ *dstn = 0;
+ for (rune ch; ucsnext(&ch, &src) != 0; decomp(dst, dstn, bufsz, ch))
+ ;
+ return alloc(ctx, dst, src.len, *dstn, 1, alignof(char8_t));
+}
+
+#define WRITE(ch) *dstn += rtoucs(dst + *dstn, bufsz - *dstn, (ch))
+
+void
+decomp(char8_t *dst, size_t *dstn, size_t bufsz, rune ch)
+{
+ if (uprop_get_hst(ch) != HST_NA) {
+ int si = ch - SBASE;
+ if (si < 0 || si > SCNT) {
+ WRITE(ch);
+ return;
+ }
+ rune l, v, t;
+ l = LBASE + si / NCNT;
+ v = VBASE + (si % NCNT) / TCNT;
+ t = TBASE + si % TCNT;
+ WRITE(l);
+ WRITE(v);
+ if (t != TBASE)
+ WRITE(t);
+ } else if (uprop_get_dt(ch) != DT_NONE) {
+ struct rview rv = uprop_get_dm(ch);
+ for (size_t i = 0; i < rv.len; i++)
+ decomp(dst, dstn, bufsz, rv.p[i]);
+ } else {
+ enum uprop_ccc ccc = uprop_get_ccc(ch);
+ if (ccc == CCC_NR) {
+ WRITE(ch);
+ return;
+ }
+
+ int w;
+ rune hc;
+ char8_t *p = dst + *dstn;
+ while (w = ucsprev(&hc, (const char8_t **)&p, dst)) {
+ enum uprop_ccc ccc2 = uprop_get_ccc(hc);
+ if (ccc2 == CCC_NR || ccc2 <= ccc) {
+out:
+ char8_t tmp[U8_LEN_MAX];
+ int w2 = rtoucs(tmp, sizeof(tmp), ch);
+ p += w;
+ memmove(p + w2, p, dst + *dstn - p);
+ memcpy(p, tmp, w2);
+ *dstn += w2;
+ return;
+ }
+ }
+
+ /* Loop didn’t early-return; append to the start */
+ goto out;
+ }
+}
+
+#undef WRITE