aboutsummaryrefslogtreecommitdiff
path: root/lib/unicode
diff options
context:
space:
mode:
authorThomas Voss <mail@thomasvoss.com> 2024-05-20 22:38:19 +0200
committerThomas Voss <mail@thomasvoss.com> 2024-05-20 22:38:19 +0200
commit8a90029b40c3cb159d0fda5ccd66a0d3984c8088 (patch)
treeec40fdfc7638fd450274c30e8b1c9760ba0a5d76 /lib/unicode
parent82a976c742ba88ecd2aa66074e388025f9af6231 (diff)
Don’t renormalize already normalized strings
Diffstat (limited to 'lib/unicode')
-rw-r--r--lib/unicode/string/u8norm.c32
1 files changed, 30 insertions, 2 deletions
diff --git a/lib/unicode/string/u8norm.c b/lib/unicode/string/u8norm.c
index 128a67a..91c6aa5 100644
--- a/lib/unicode/string/u8norm.c
+++ b/lib/unicode/string/u8norm.c
@@ -1,3 +1,4 @@
+#include <inttypes.h>
#include <string.h>
#include "macros.h"
@@ -8,8 +9,9 @@
#define BETWEEN(x, y, z) ((x) <= (y) && (y) <= (z))
-static void decomp(char8_t *, size_t *, size_t, rune, enum normform);
-static void compbuf(char8_t *, size_t *);
+typedef uint_least8_t (*qcfn)(rune);
+
+constexpr uint_least8_t YES = 1;
/* Computed using a gen/scale-norm.c */
constexpr int NFD_SCALE = 3;
@@ -26,6 +28,16 @@ constexpr int TCNT = 28;
constexpr int NCNT = VCNT * TCNT;
constexpr int SCNT = LCNT * NCNT;
+static void decomp(char8_t *, size_t *, size_t, rune, enum normform);
+static void compbuf(char8_t *, size_t *);
+
+static const qcfn qc_lookup[] = {
+ [NF_NFC] = (qcfn)uprop_get_nfc_qc,
+ [NF_NFD] = (qcfn)uprop_get_nfd_qc,
+ [NF_NFKC] = (qcfn)uprop_get_nfkc_qc,
+ [NF_NFKD] = (qcfn)uprop_get_nfkd_qc,
+};
+
char8_t *
u8norm(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx,
enum normform nf)
@@ -34,6 +46,22 @@ u8norm(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx,
ASSUME(alloc != nullptr);
ASSUME(BETWEEN(0, nf, 4));
+ {
+ qcfn f = qc_lookup[nf];
+ struct u8view sv = src;
+ enum uprop_ccc prvcc = 0, curcc;
+ for (rune ch; ucsnext(&ch, &sv) != 0; prvcc = curcc) {
+ curcc = uprop_get_ccc(ch);
+ if ((prvcc > curcc && curcc != CCC_NR) || (f(ch) != YES))
+ goto no;
+ }
+
+ *dstn = src.len;
+ char8_t *dst = alloc(ctx, nullptr, 0, src.len, 1, alignof(char8_t));
+ return memcpy(dst, src.p, src.len);
+ }
+
+no:
/* Pre-allocate a buffer with some initial capacity; there is no need to
check for overflow when computing bufsz because alloc() will handle the
overflow error for us. */