diff options
author | Thomas Voss <mail@thomasvoss.com> | 2024-05-14 20:53:13 +0200 |
---|---|---|
committer | Thomas Voss <mail@thomasvoss.com> | 2024-05-14 20:53:22 +0200 |
commit | a39a4797730a09ec4fbb41c11b7dc1f7d245bc15 (patch) | |
tree | 13f2a025d52535d66cc8e1c4be0aa502ffaeab7b /gen | |
parent | 4e88af1babd6555c389b1e14316c29b78146f8f0 (diff) |
Implement NFD string normalization
Diffstat (limited to 'gen')
-rwxr-xr-x | gen/string/scale-norm.c | 77 |
1 files changed, 77 insertions, 0 deletions
diff --git a/gen/string/scale-norm.c b/gen/string/scale-norm.c new file mode 100755 index 0000000..2b926fa --- /dev/null +++ b/gen/string/scale-norm.c @@ -0,0 +1,77 @@ +#if 0 +cd "${0%/*}/../.." +trap 'rm -f /tmp/scale-norm' EXIT +cc -Iinclude -std=c23 -Wno-attributes -fsanitize=address,undefined \ + -o /tmp/scale-norm gen/string/scale-norm.c libmlib.a +/tmp/scale-norm +exit 0 +#endif + +#include <stdio.h> + +#include <macros.h> +#include <rune.h> +#include <unicode/prop.h> + +static double scale(rune ch); +static int bcnt(rune ch); +static int hscnt(rune ch); +static int scale2(rune ch); + +int +main(void) +{ + double maxscale = 1; + for (rune ch = 0; ch <= RUNE_MAX; ch++) { + double n = scale(ch); + maxscale = MAX(n, maxscale); + } + printf("NFD\t%g\n", maxscale); +} + +double +scale(rune ch) +{ + int old, new; + old = bcnt(ch); + new = scale2(ch); + return (double)new / (double)old; +} + +int +scale2(rune ch) +{ + if (uprop_get_hst(ch) != HST_NA) + return hscnt(ch); + if (uprop_get_dt(ch) != DT_CAN) + return bcnt(ch); + int acc = 0; + struct rview rv = uprop_get_dm(ch); + for (size_t i = 0; i < rv.len; i++) + acc += scale2(rv.p[i]); + return acc; +} + +int +bcnt(rune ch) +{ + return ch < 0x80 ? 1 : ch < 0x800 ? 2 : ch < 0x10'000 ? 3 : 4; +} + +int +hscnt(rune s) +{ + const rune sbase = 0xAC00, lbase = 0x1100, vbase = 0x1161, tbase = 0x11A7; + const int lcnt = 19, vcnt = 21, tcnt = 28, ncnt = 588, scnt = 11172; + + int sidx = s - sbase; + if (sidx < 0 || sidx > scnt) + return bcnt(s); + rune l = lbase + sidx / ncnt; + rune v = vbase + (sidx % ncnt) / tcnt; + rune t = tbase + sidx % tcnt; + int acc = bcnt(l) + bcnt(v); + if (t != tbase) + acc += bcnt(t); + return acc; +} |