diff options
author | Thomas Voss <mail@thomasvoss.com> | 2024-04-26 17:20:26 +0200 |
---|---|---|
committer | Thomas Voss <mail@thomasvoss.com> | 2024-04-26 17:20:26 +0200 |
commit | ef8a2910aa4bfc49973e63e1003b01f47a675249 (patch) | |
tree | e2ebe2a8c898f6570d69da89b00ee1b30161fffd /lib/unicode | |
parent | b5e7acf641d4ef3538803b746723b90a822ea1ad (diff) |
Fully comply with Unicode 15.1 § 3.13 in u8lower()
Diffstat (limited to 'lib/unicode')
-rw-r--r-- | lib/unicode/prop/uprop_get_lc.c | 25 | ||||
-rw-r--r-- | lib/unicode/string/u8lower.c | 68 | ||||
-rw-r--r-- | lib/unicode/string/u8title.c | 4 |
3 files changed, 72 insertions, 25 deletions
diff --git a/lib/unicode/prop/uprop_get_lc.c b/lib/unicode/prop/uprop_get_lc.c index 0ce1072..752ed71 100644 --- a/lib/unicode/prop/uprop_get_lc.c +++ b/lib/unicode/prop/uprop_get_lc.c @@ -8,35 +8,40 @@ struct rview uprop_get_lc(rune ch, struct lcctx ctx) { + constexpr rune COMB_GRAVE = 0x300; + constexpr rune COMB_ACUTE = 0x301; + constexpr rune COMB_TILDE = 0x303; + constexpr rune COMB_DOT_ABOVE = 0x307; + if (ch == U'Σ') - return ctx.eow ? M(U'ς') : M(U'σ'); + return ctx.final_sigma ? M(U'ς') : M(U'σ'); if (ch == U'İ') - return ctx.az_or_tr ? M('i') : M('i', 0x307); + return ctx.az_or_tr ? M('i') : M('i', COMB_DOT_ABOVE); if (ctx.lt) { - if (ctx.before_acc) { + if (ctx.more_above) { switch (ch) { case 'I': - return M('i', 0x307); + return M('i', COMB_DOT_ABOVE); case 'J': - return M('j', 0x307); + return M('j', COMB_DOT_ABOVE); case U'Į': - return M(U'į', 0x307); + return M(U'į', COMB_DOT_ABOVE); } } switch (ch) { case U'Ì': - return M('i', 0x307, 0x300); + return M('i', COMB_DOT_ABOVE, COMB_GRAVE); case U'Í': - return M('i', 0x307, 0x301); + return M('i', COMB_DOT_ABOVE, COMB_ACUTE); case U'Ĩ': - return M('i', 0x307, 0x303); + return M('i', COMB_DOT_ABOVE, COMB_TILDE); } } if (ctx.az_or_tr) { - if (ch == 0x307 && ctx.after_I) + if (ch == COMB_DOT_ABOVE && ctx.after_I) return M(); if (ch == 'I' && !ctx.before_dot) return M(U'ı'); diff --git a/lib/unicode/string/u8lower.c b/lib/unicode/string/u8lower.c index 77b0e18..f9ac78c 100644 --- a/lib/unicode/string/u8lower.c +++ b/lib/unicode/string/u8lower.c @@ -1,12 +1,17 @@ +#include "_attrs.h" #include "mbstring.h" #include "unicode/prop.h" #include "unicode/string.h" -constexpr rune COMB_GRAVE = 0x0300; -constexpr rune COMB_ACUTE = 0x0301; -constexpr rune COMB_TILDE = 0x0303; constexpr rune COMB_DOT_ABOVE = 0x0307; +[[unsequenced, _mlib_inline]] static inline bool +uprop_ccc_0_or_230(rune ch) +{ + enum uprop_ccc x = uprop_get_ccc(ch); + return x == 0 || x == 230; +} + size_t u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, enum caseflags flags) @@ -17,21 +22,54 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, }; rune ch; - size_t n = 0; - struct u8view word = {}, cpy = {src, srcn}; + size_t n, before_dot_cnt, more_above_cnt; + struct { + bool before; + size_t after; + } final_sigma = {}; + + n = before_dot_cnt = more_above_cnt = 0; while (u8next(&ch, &src, &srcn)) { rune next = 0; if (srcn > 0) u8tor(&next, src); - if (src > word.p + word.len) - u8wnext(&word, U8_ARGSP(cpy)); - ctx.eow = src == word.p + word.len; - ctx.before_dot = next == COMB_DOT_ABOVE; - ctx.before_acc = next == COMB_GRAVE - || next == COMB_ACUTE - || next == COMB_TILDE; + if (before_dot_cnt == 0 || more_above_cnt == 0) { + rune ch = 0; + before_dot_cnt = more_above_cnt = 0; + struct u8view cpy = {src, srcn}; + + do { + before_dot_cnt++; + more_above_cnt++; + } while (u8next(&ch ,U8_ARGSP(cpy)) && !uprop_ccc_0_or_230(ch)); + + if (ch != COMB_DOT_ABOVE) + before_dot_cnt = 0; + if (uprop_get_ccc(ch) != 230) + more_above_cnt = 0; + } else { + before_dot_cnt--; + more_above_cnt--; + } + + if (final_sigma.after == 0) { + rune ch; + struct u8view cpy = {src, srcn}; + + do + final_sigma.after++; + while (u8next(&ch, U8_ARGSP(cpy)) && uprop_is_ci(ch)); + + if (!uprop_is_cased(ch)) + final_sigma.after = 0; + } else + final_sigma.after--; + + ctx.before_dot = before_dot_cnt > 0; + ctx.more_above = more_above_cnt > 0; + ctx.final_sigma = final_sigma.before && final_sigma.after == 0; struct rview rv = uprop_get_lc(ch, ctx); for (size_t i = 0; i < rv.len; i++) { @@ -42,7 +80,11 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, n += rtou8(dst + n, dstn - n, rv.p[i]); } - ctx.after_I = ch == 'I'; + ctx.after_I = (ch == 'I') || (ctx.after_I && !uprop_ccc_0_or_230(ch)); + if (uprop_is_cased(ch)) + final_sigma.before = true; + else if (!uprop_is_ci(ch)) + final_sigma.before = false; } return n; diff --git a/lib/unicode/string/u8title.c b/lib/unicode/string/u8title.c index dcf0b2e..380e874 100644 --- a/lib/unicode/string/u8title.c +++ b/lib/unicode/string/u8title.c @@ -36,9 +36,9 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, u8wnext(&word, U8_ARGSP(cpy)); bool sow = src - w == word.p; - ctx_l.eow = src == word.p + word.len; + ctx_l.final_sigma = src == word.p + word.len; ctx_l.before_dot = next == COMB_DOT_ABOVE; - ctx_l.before_acc = + ctx_l.more_above = next == COMB_GRAVE || next == COMB_ACUTE || next == COMB_TILDE; struct rview rv; |