diff options
author | Thomas Voss <mail@thomasvoss.com> | 2024-04-26 23:42:01 +0200 |
---|---|---|
committer | Thomas Voss <mail@thomasvoss.com> | 2024-04-26 23:42:01 +0200 |
commit | dc7c4289a4285453dbf9d427fa6da9c076f7f537 (patch) | |
tree | e1e90d3d8138ff2bd2d074ba3f3a5443237e584c | |
parent | fa95265eaa8c1d7f41dc537f44450b142247253e (diff) |
Fully comply with Unicode 15.1 § 3.13 in u8title()
-rw-r--r-- | README | 8 | ||||
-rw-r--r-- | lib/unicode/string/u8title.c | 143 |
2 files changed, 94 insertions, 57 deletions
@@ -108,14 +108,14 @@ FEATURES: • Iteration and counting of graphemes and words in a string • Unicode-aware case-mapping of strings with truncation checking • Case-mapping supports optional language-specific quirks for - Azeri, Dutch, German, Lithuanian, and Turkish. + Azeri, German, Lithuanian, and Turkish. PLANNED FEATURES: - • Titlecase Conversions (unicode/string.h) - • Unicode Normalization (unicode/string.h) - • Line- and Sentence Segmentation (unicode/string.h) + • Dutch titlecase tailorings (unicode/string.h) + • Unicode normalization (unicode/string.h) + • Line- and sentence segmentation (unicode/string.h) BUGS: diff --git a/lib/unicode/string/u8title.c b/lib/unicode/string/u8title.c index 380e874..536c4ed 100644 --- a/lib/unicode/string/u8title.c +++ b/lib/unicode/string/u8title.c @@ -1,52 +1,99 @@ +#include "_attrs.h" #include "mbstring.h" #include "unicode/prop.h" #include "unicode/string.h" -constexpr rune COMB_GRAVE = 0x0300; -constexpr rune COMB_ACUTE = 0x0301; -constexpr rune COMB_TILDE = 0x0303; constexpr rune COMB_DOT_ABOVE = 0x0307; +[[unsequenced, _mlib_inline]] +static inline bool +uprop_ccc_0_or_230(rune ch) +{ + enum uprop_ccc x = uprop_get_ccc(ch); + return x == 0 || x == 230; +} + size_t u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, enum caseflags flags) { - struct tcctx ctx_t = { - .az_or_tr = flags & CF_LANG_AZ, - .lt = flags & CF_LANG_LT, - }; - struct lcctx ctx_l = { - .az_or_tr = ctx_t.az_or_tr, - .lt = ctx_t.lt, - }; - - int w; + struct tcctx ctx_t; + struct lcctx ctx_l; + + ctx_t.az_or_tr = ctx_l.az_or_tr = flags & CF_LANG_AZ; + ctx_t.lt = ctx_l.lt = flags & CF_LANG_LT; + rune ch; - size_t n = 0; - bool lt_special, nl_special; - struct u8view word = {}, cpy = {src, srcn}; + size_t n, before_dot_cnt, more_above_cnt; + struct u8view word = {}, wcpy = {src, srcn}; + struct { + bool before; + size_t after; + } final_sigma = {}; + enum { + TITLE, + BETWEEN, + LOWER, + } state = 0; - lt_special = nl_special = false; + n = before_dot_cnt = more_above_cnt = 0; - while (w = u8next(&ch, &src, &srcn)) { + while (u8next(&ch, &src, &srcn)) { rune next = 0; if (srcn > 0) u8tor(&next, src); - if (src > word.p + word.len) - u8wnext(&word, U8_ARGSP(cpy)); - - bool sow = src - w == word.p; - ctx_l.final_sigma = src == word.p + word.len; - ctx_l.before_dot = next == COMB_DOT_ABOVE; - ctx_l.more_above = - next == COMB_GRAVE || next == COMB_ACUTE || next == COMB_TILDE; - - struct rview rv; - if (nl_special && (ch == 'j' || ch == 'J')) - rv = (struct rview){.p = U"J", .len = 1}; - else - rv = sow || lt_special ? uprop_get_tc(ch, ctx_t) - : uprop_get_lc(ch, ctx_l); + + if (src > word.p + word.len) { + u8wnext(&word, U8_ARGSP(wcpy)); + ctx_t.after_soft_dotted = false; + state = TITLE; + } + + if (ctx_l.az_or_tr || ctx_l.lt) { + if (before_dot_cnt == 0 || more_above_cnt == 0) { + rune ch = 0; + before_dot_cnt = more_above_cnt = 0; + struct u8view cpy = {src, srcn}; + + do { + before_dot_cnt++; + more_above_cnt++; + } while (u8next(&ch, U8_ARGSP(cpy)) && !uprop_ccc_0_or_230(ch)); + + if (ch != COMB_DOT_ABOVE) + before_dot_cnt = 0; + if (uprop_get_ccc(ch) != 230) + more_above_cnt = 0; + } else { + before_dot_cnt--; + more_above_cnt--; + } + } + + if (final_sigma.after == 0) { + rune ch; + struct u8view cpy = {src, srcn}; + + do + final_sigma.after++; + while (u8next(&ch, U8_ARGSP(cpy)) && uprop_is_ci(ch)); + + if (!uprop_is_cased(ch)) + final_sigma.after = 0; + } else + final_sigma.after--; + + ctx_l.before_dot = before_dot_cnt > 0; + ctx_l.more_above = more_above_cnt > 0; + ctx_l.final_sigma = final_sigma.before && final_sigma.after == 0; + + if (state == BETWEEN && uprop_is_cased(ch)) + state = LOWER; + struct rview rv = + state == LOWER ? uprop_get_lc(ch, ctx_l) : uprop_get_tc(ch, ctx_t); + if (state == TITLE && uprop_is_cased(ch)) + state = BETWEEN; + for (size_t i = 0; i < rv.len; i++) { if (n >= dstn) { char8_t buf[U8_LEN_MAX]; @@ -55,27 +102,17 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, n += rtou8(dst + n, dstn - n, rv.p[i]); } - if (flags & CF_LANG_NL) - nl_special = sow && (ch == 'i' || ch == 'I'); - if (ctx_t.lt) { - /* If the rune at SOW is Soft_Dotted, then the next rune should be - titlecased if it is U+0307 or if does not have ccc=0 and ccc=230. - If the current rune was titlecased as a result of the above rule, - then the rule should be applied again to the next rune. If the - current rune was titlecased and is U+0307, then lowercase until - the next word boundary. */ - enum uprop_ccc ccc; - if (lt_special || uprop_is_sd(ch)) { - ctx_t.after_soft_dotted = true; - lt_special = - (sow || lt_special) && ch != COMB_DOT_ABOVE - && (next == COMB_DOT_ABOVE - || ((ccc = uprop_get_ccc(next)) != 0 && ccc != 230)); - } else - ctx_t.after_soft_dotted = false; - } + ctx_l.after_I = + (ch == 'I') || (ctx_l.after_I && !uprop_ccc_0_or_230(ch)); + if (uprop_is_cased(ch)) + final_sigma.before = true; + else if (!uprop_is_ci(ch)) + final_sigma.before = false; - ctx_l.after_I = ch == 'I'; + if (uprop_is_sd(ch)) + ctx_t.after_soft_dotted = true; + else if (uprop_ccc_0_or_230(ch)) + ctx_t.after_soft_dotted = false; } return n; |