From 8a90029b40c3cb159d0fda5ccd66a0d3984c8088 Mon Sep 17 00:00:00 2001
From: Thomas Voss
Date: Mon, 20 May 2024 22:38:19 +0200
Subject: Don’t renormalize already normalized strings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 include/unicode/prop.h      |  4 ++--
 lib/unicode/string/u8norm.c | 32 ++++++++++++++++++++++++++++++--
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/include/unicode/prop.h b/include/unicode/prop.h
index 647f461..a7e90ad 100644
--- a/include/unicode/prop.h
+++ b/include/unicode/prop.h
@@ -864,9 +864,9 @@ enum uprop_lb : uint_least8_t {
 };
 
 enum uprop_nfc_qc : uint_least8_t {
-	NFC_QC_M, /* Maybe */
 	NFC_QC_N, /* No */
 	NFC_QC_Y, /* Yes */
+	NFC_QC_M, /* Maybe */
 };
 
 enum uprop_nfd_qc : uint_least8_t {
@@ -875,9 +875,9 @@ enum uprop_nfd_qc : uint_least8_t {
 };
 
 enum uprop_nfkc_qc : uint_least8_t {
-	NFKC_QC_M, /* Maybe */
 	NFKC_QC_N, /* No */
 	NFKC_QC_Y, /* Yes */
+	NFKC_QC_M, /* Maybe */
 };
 
 enum uprop_nfkd_qc : uint_least8_t {
diff --git a/lib/unicode/string/u8norm.c b/lib/unicode/string/u8norm.c
index 128a67a..91c6aa5 100644
--- a/lib/unicode/string/u8norm.c
+++ b/lib/unicode/string/u8norm.c
@@ -1,3 +1,4 @@
+#include
 #include
 
 #include "macros.h"
@@ -8,8 +9,9 @@
 
 #define BETWEEN(x, y, z) ((x) <= (y) && (y) <= (z))
 
-static void decomp(char8_t *, size_t *, size_t, rune, enum normform);
-static void compbuf(char8_t *, size_t *);
+typedef uint_least8_t (*qcfn)(rune);
+
+constexpr uint_least8_t YES = 1;
 
 /* Computed using a gen/scale-norm.c */
 constexpr int NFD_SCALE = 3;
@@ -26,6 +28,16 @@ constexpr int TCNT = 28;
 constexpr int NCNT = VCNT * TCNT;
 constexpr int SCNT = LCNT * NCNT;
 
+static void decomp(char8_t *, size_t *, size_t, rune, enum normform);
+static void compbuf(char8_t *, size_t *);
+
+static const qcfn qc_lookup[] = {
+	[NF_NFC] = (qcfn)uprop_get_nfc_qc,
+	[NF_NFD] = (qcfn)uprop_get_nfd_qc,
+	[NF_NFKC] = (qcfn)uprop_get_nfkc_qc,
+	[NF_NFKD] = (qcfn)uprop_get_nfkd_qc,
+};
+
 char8_t *
 u8norm(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx,
        enum normform nf)
@@ -34,6 +46,22 @@ u8norm(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx,
 	ASSUME(alloc != nullptr);
 	ASSUME(BETWEEN(0, nf, 4));
 
+	{
+		qcfn f = qc_lookup[nf];
+		struct u8view sv = src;
+		enum uprop_ccc prvcc = 0, curcc;
+		for (rune ch; ucsnext(&ch, &sv) != 0; prvcc = curcc) {
+			curcc = uprop_get_ccc(ch);
+			if ((prvcc > curcc && curcc != CCC_NR) || (f(ch) != YES))
+				goto no;
+		}
+
+		*dstn = src.len;
+		char8_t *dst = alloc(ctx, nullptr, 0, src.len, 1, alignof(char8_t));
+		return memcpy(dst, src.p, src.len);
+	}
+
+no:
 	/* Pre-allocate a buffer with some initial capacity; there is no need to
 	   check for overflow when computing bufsz because alloc() will handle the
 	   overflow error for us. */
-- 
cgit v1.2.3