diff options
author | Thomas Voss <mail@thomasvoss.com> | 2024-05-20 22:38:19 +0200 |
---|---|---|
committer | Thomas Voss <mail@thomasvoss.com> | 2024-05-20 22:38:19 +0200 |
commit | 8a90029b40c3cb159d0fda5ccd66a0d3984c8088 (patch) | |
tree | ec40fdfc7638fd450274c30e8b1c9760ba0a5d76 | |
parent | 82a976c742ba88ecd2aa66074e388025f9af6231 (diff) |
Don’t renormalize already normalized strings
-rw-r--r-- | include/unicode/prop.h | 4 | ||||
-rw-r--r-- | lib/unicode/string/u8norm.c | 32 |
2 files changed, 32 insertions, 4 deletions
```diff
diff --git a/include/unicode/prop.h b/include/unicode/prop.h
index 647f461..a7e90ad 100644
--- a/include/unicode/prop.h
+++ b/include/unicode/prop.h
@@ -864,9 +864,9 @@ enum uprop_lb : uint_least8_t {
 };
 
 enum uprop_nfc_qc : uint_least8_t {
-	NFC_QC_M, /* Maybe */
 	NFC_QC_N, /* No */
 	NFC_QC_Y, /* Yes */
+	NFC_QC_M, /* Maybe */
 };
 
 enum uprop_nfd_qc : uint_least8_t {
@@ -875,9 +875,9 @@ enum uprop_nfd_qc : uint_least8_t {
 };
 
 enum uprop_nfkc_qc : uint_least8_t {
-	NFKC_QC_M, /* Maybe */
 	NFKC_QC_N, /* No */
 	NFKC_QC_Y, /* Yes */
+	NFKC_QC_M, /* Maybe */
 };
 
 enum uprop_nfkd_qc : uint_least8_t {
diff --git a/lib/unicode/string/u8norm.c b/lib/unicode/string/u8norm.c
index 128a67a..91c6aa5 100644
--- a/lib/unicode/string/u8norm.c
+++ b/lib/unicode/string/u8norm.c
@@ -1,3 +1,4 @@
+#include <inttypes.h>
 #include <string.h>
 
 #include "macros.h"
@@ -8,8 +9,9 @@
 
 #define BETWEEN(x, y, z) ((x) <= (y) && (y) <= (z))
 
-static void decomp(char8_t *, size_t *, size_t, rune, enum normform);
-static void compbuf(char8_t *, size_t *);
+typedef uint_least8_t (*qcfn)(rune);
+
+constexpr uint_least8_t YES = 1;
 
 /* Computed using a gen/scale-norm.c */
 constexpr int NFD_SCALE = 3;
@@ -26,6 +28,16 @@ constexpr int TCNT = 28;
 constexpr int NCNT = VCNT * TCNT;
 constexpr int SCNT = LCNT * NCNT;
 
+static void decomp(char8_t *, size_t *, size_t, rune, enum normform);
+static void compbuf(char8_t *, size_t *);
+
+static const qcfn qc_lookup[] = {
+	[NF_NFC] = (qcfn)uprop_get_nfc_qc,
+	[NF_NFD] = (qcfn)uprop_get_nfd_qc,
+	[NF_NFKC] = (qcfn)uprop_get_nfkc_qc,
+	[NF_NFKD] = (qcfn)uprop_get_nfkd_qc,
+};
+
 char8_t *
 u8norm(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx,
        enum normform nf)
@@ -34,6 +46,22 @@ u8norm(size_t *dstn, struct u8view src, alloc_fn alloc, void *ctx,
 	ASSUME(alloc != nullptr);
 	ASSUME(BETWEEN(0, nf, 4));
 
+	{
+		qcfn f = qc_lookup[nf];
+		struct u8view sv = src;
+		enum uprop_ccc prvcc = 0, curcc;
+		for (rune ch; ucsnext(&ch, &sv) != 0; prvcc = curcc) {
+			curcc = uprop_get_ccc(ch);
+			if ((prvcc > curcc && curcc != CCC_NR) || (f(ch) != YES))
+				goto no;
+		}
+
+		*dstn = src.len;
+		char8_t *dst = alloc(ctx, nullptr, 0, src.len, 1, alignof(char8_t));
+		return memcpy(dst, src.p, src.len);
+	}
+
+no:
 	/* Pre-allocate a buffer with some initial capacity; there is no need
 	   to check for overflow when computing bufsz because alloc() will
 	   handle the overflow error for us. */
```