diff options
-rwxr-xr-x | gen/string/scale | 55 | ||||
-rw-r--r-- | include/unicode/string.h | 5 |
2 files changed, 60 insertions, 0 deletions
diff --git a/gen/string/scale b/gen/string/scale
new file mode 100755
index 0000000..0455447
--- /dev/null
+++ b/gen/string/scale
@@ -0,0 +1,55 @@
+#!/bin/sh
+
+# Usage: scale -v utf=X -v mapping=X [-v az=X] [-v lt=X]
+# Example: scale -v utf=8 -v mapping=title -v lt=1
+
+set -e
+cd "${0%/*}/../.."
+
+gawk "$@" '
+# Bytes needed to encode hex code point x: UTF-32 if utf is 32, UTF-16 if 16, else UTF-8.
+function bcnt(x)
+{
+	x = strtonum("0X" x)
+	if (utf == 32)
+		return 4
+	if (utf == 16)
+		return x < 0x10000 ? 2 : 4
+	return x < 0x00080 ? 1 \
+	     : x < 0x00800 ? 2 \
+	     : x < 0x10000 ? 3 \
+	     :               4
+}
+
+function max(x, y)
+{
+	return x > y ? x : y
+}
+
+BEGIN {
+	FS = " *; *"
+	if (mapping == "lower")
+		field = 2
+	else if (mapping == "title")
+		field = 3
+	else if (mapping == "upper")
+		field = 4
+}
+
+$5 ~ /^(az|tr)/ && !az { next }  # skip az/tr-conditioned rows unless az is set
+$5 ~ /^lt/      && !lt { next }  # skip lt-conditioned rows unless lt is set
+
+# Keep a running maximum of (mapped size / original size) so every data row counts.
+/^[A-F0-9]/ {
+	to = 0
+	from = bcnt($1)
+	split($field, xs, / /)
+	for (i in xs)
+		to += bcnt(xs[i])
+	n = max(n, to / from)
+}
+
+END {
+	print n
+}
+' data/SpecialCasing
diff --git a/include/unicode/string.h b/include/unicode/string.h
index 6363d9d..0cee934 100644
--- a/include/unicode/string.h
+++ b/include/unicode/string.h
@@ -29,4 +29,9 @@ size_t u8gnext(struct u8view *, const char8_t **, size_t *);
 	size_t, enum caseflags);
 #undef mlib_warn_trunc
 
+constexpr double U8LOWER_SCALE = 1.5;
+constexpr double U8LOWER_SCALE_LT = 3;
+constexpr double U8TITLE_SCALE = 3;
+constexpr double U8UPPER_SCALE = 3;
+
 #endif /* !MLIB_UNICODE_STRING_H */