From ef8a2910aa4bfc49973e63e1003b01f47a675249 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Fri, 26 Apr 2024 17:20:26 +0200 Subject: Fully comply with Unicode 15.1 § 3.13 in u8lower() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/unicode/string/u8lower.c | 68 +++++++++++++++++++++++++++++++++++--------- lib/unicode/string/u8title.c | 4 +-- 2 files changed, 57 insertions(+), 15 deletions(-) (limited to 'lib/unicode/string') diff --git a/lib/unicode/string/u8lower.c b/lib/unicode/string/u8lower.c index 77b0e18..f9ac78c 100644 --- a/lib/unicode/string/u8lower.c +++ b/lib/unicode/string/u8lower.c @@ -1,12 +1,17 @@ +#include "_attrs.h" #include "mbstring.h" #include "unicode/prop.h" #include "unicode/string.h" -constexpr rune COMB_GRAVE = 0x0300; -constexpr rune COMB_ACUTE = 0x0301; -constexpr rune COMB_TILDE = 0x0303; constexpr rune COMB_DOT_ABOVE = 0x0307; +[[unsequenced, _mlib_inline]] static inline bool +uprop_ccc_0_or_230(rune ch) +{ + enum uprop_ccc x = uprop_get_ccc(ch); + return x == 0 || x == 230; +} + size_t u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, enum caseflags flags) @@ -17,21 +22,54 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, }; rune ch; - size_t n = 0; - struct u8view word = {}, cpy = {src, srcn}; + size_t n, before_dot_cnt, more_above_cnt; + struct { + bool before; + size_t after; + } final_sigma = {}; + + n = before_dot_cnt = more_above_cnt = 0; while (u8next(&ch, &src, &srcn)) { rune next = 0; if (srcn > 0) u8tor(&next, src); - if (src > word.p + word.len) - u8wnext(&word, U8_ARGSP(cpy)); - ctx.eow = src == word.p + word.len; - ctx.before_dot = next == COMB_DOT_ABOVE; - ctx.before_acc = next == COMB_GRAVE - || next == COMB_ACUTE - || next == COMB_TILDE; + if (before_dot_cnt == 0 || more_above_cnt == 0) { + rune ch = 0; + before_dot_cnt = more_above_cnt = 0; + struct u8view cpy = {src, srcn}; + + do { + before_dot_cnt++; + 
more_above_cnt++; + } while (u8next(&ch ,U8_ARGSP(cpy)) && !uprop_ccc_0_or_230(ch)); + + if (ch != COMB_DOT_ABOVE) + before_dot_cnt = 0; + if (uprop_get_ccc(ch) != 230) + more_above_cnt = 0; + } else { + before_dot_cnt--; + more_above_cnt--; + } + + if (final_sigma.after == 0) { + rune ch; + struct u8view cpy = {src, srcn}; + + do + final_sigma.after++; + while (u8next(&ch, U8_ARGSP(cpy)) && uprop_is_ci(ch)); + + if (!uprop_is_cased(ch)) + final_sigma.after = 0; + } else + final_sigma.after--; + + ctx.before_dot = before_dot_cnt > 0; + ctx.more_above = more_above_cnt > 0; + ctx.final_sigma = final_sigma.before && final_sigma.after == 0; struct rview rv = uprop_get_lc(ch, ctx); for (size_t i = 0; i < rv.len; i++) { @@ -42,7 +80,11 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, n += rtou8(dst + n, dstn - n, rv.p[i]); } - ctx.after_I = ch == 'I'; + ctx.after_I = (ch == 'I') || (ctx.after_I && !uprop_ccc_0_or_230(ch)); + if (uprop_is_cased(ch)) + final_sigma.before = true; + else if (!uprop_is_ci(ch)) + final_sigma.before = false; } return n; diff --git a/lib/unicode/string/u8title.c b/lib/unicode/string/u8title.c index dcf0b2e..380e874 100644 --- a/lib/unicode/string/u8title.c +++ b/lib/unicode/string/u8title.c @@ -36,9 +36,9 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, u8wnext(&word, U8_ARGSP(cpy)); bool sow = src - w == word.p; - ctx_l.eow = src == word.p + word.len; + ctx_l.final_sigma = src == word.p + word.len; ctx_l.before_dot = next == COMB_DOT_ABOVE; - ctx_l.before_acc = + ctx_l.more_above = next == COMB_GRAVE || next == COMB_ACUTE || next == COMB_TILDE; struct rview rv; -- cgit v1.2.3