diff options
author | Thomas Voss <mail@thomasvoss.com> | 2024-04-24 00:09:05 +0200 |
---|---|---|
committer | Thomas Voss <mail@thomasvoss.com> | 2024-04-24 00:09:05 +0200 |
commit | 3b797a5f3ce1d77fa7d0ed991b52553c1b3e8757 (patch) | |
tree | e608ecc5b689afaebe1ac3ce112cb2a04e597448 | |
parent | 9cc2a0368fb0a3aa8b878d1795ed76734beadc02 (diff) |
Properly upper- and titlecase ‘i’ and ‘j’ in Lithuanian
-rw-r--r-- | include/unicode/prop.h | 44 | ||||
-rw-r--r-- | lib/unicode/prop/uprop_get_tc.c | 4 | ||||
-rw-r--r-- | lib/unicode/prop/uprop_get_uc.c | 4 | ||||
-rw-r--r-- | lib/unicode/string/u8title.c | 38 | ||||
-rw-r--r-- | lib/unicode/string/u8upper.c | 9 |
5 files changed, 74 insertions, 25 deletions
diff --git a/include/unicode/prop.h b/include/unicode/prop.h index d2d6cec..8cb50c5 100644 --- a/include/unicode/prop.h +++ b/include/unicode/prop.h @@ -13,31 +13,53 @@ struct rview { size_t len; }; -/* clang-format off */ +/* The structures lcctx, tcctx, and ucctx are used to provide context to the + casing property functions whose return values are context-dependent. Each + group of flags in a context structure is separated by a newline. + + The first group of flags are named using language codes. If one of these + flags is set, then language-specific tailorings for the given language are + enabled. For example, if the ‘az_or_tr’ flag is enabled in ucctx, then the + letter ‘i’ is uppercased to ‘İ’ as opposed to ‘I’. + + The second group of flags relate to context specified by the Unicode standard + and typically have to do with which characters surround the one being cased. + The description for these flags can be found in Table 3-17 of chapter 3 of + the Unicode standard[1]. + + The third group of flags are extensions provided by MLib, and are documented + above or beside the relevant option. 
+ + [1]: https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf#G54277 */ struct lcctx { - bool az_or_tr : 1; /* Azeri or Turkish */ - bool lt : 1; /* Lithuanian */ + bool az_or_tr : 1; + bool lt : 1; - bool after_I : 1; /* After ‘I’ */ + bool after_I : 1; /* After ‘I’ */ bool before_acc : 1; /* Before accent on ‘i’ or ‘j’ in Lithuanian */ bool before_dot : 1; /* Before U+0307 */ bool eow : 1; /* End of word */ }; struct tcctx { - bool az_or_tr : 1; /* Azeri or Turkish */ - bool lt : 1; /* Lithuanian */ + bool az_or_tr : 1; + bool lt : 1; - bool after_i : 1; /* After ‘i’ */ + bool after_soft_dotted : 1; }; struct ucctx { - bool az_or_tr : 1; /* Azeri or Turkish */ - bool lt : 1; /* Lithuanian */ + bool az_or_tr : 1; + bool lt : 1; + + bool after_soft_dotted : 1; - bool ẞ : 1; /* Uppercase ‘ß’ into ‘ẞ’ (instead of ‘SS’) */ - bool after_i : 1; /* After ‘i’ */ + /* Uppercase the German lowercase-eszett ‘ß’ into the uppercase-eszett ‘ẞ’ + instead of the typical ‘SS’. The uppercase-eszett was added to the + German orthography in 2017 but has not yet seen widespread adoption as of + writing (2024). 
*/ + bool ẞ : 1; }; /* clang-format on */ diff --git a/lib/unicode/prop/uprop_get_tc.c b/lib/unicode/prop/uprop_get_tc.c index 442db6e..83649e1 100644 --- a/lib/unicode/prop/uprop_get_tc.c +++ b/lib/unicode/prop/uprop_get_tc.c @@ -64,9 +64,11 @@ _MLIB_DEFINE_BSEARCH_KV(struct rview, lookup, M(ch)) struct rview uprop_get_tc(rune ch, struct tcctx ctx) { + constexpr rune COMB_DOT_ABOVE = 0x307; + if (ch == 'i' && ctx.az_or_tr) return M(U'İ'); - if (ch == 0x307 && ctx.lt && ctx.after_i) + if (ch == COMB_DOT_ABOVE && ctx.lt && ctx.after_soft_dotted) return M(); rune CH = uprop_get_stc(ch); diff --git a/lib/unicode/prop/uprop_get_uc.c b/lib/unicode/prop/uprop_get_uc.c index ecb0883..4563921 100644 --- a/lib/unicode/prop/uprop_get_uc.c +++ b/lib/unicode/prop/uprop_get_uc.c @@ -119,11 +119,13 @@ _MLIB_DEFINE_BSEARCH_KV(struct rview, lookup, M(ch)) struct rview uprop_get_uc(rune ch, struct ucctx ctx) { + constexpr rune COMB_DOT_ABOVE = 0x307; + if (ch == U'ß') return ctx.ẞ ? M(U'ẞ') : M('S', 'S'); if (ch == 'i' && ctx.az_or_tr) return M(U'İ'); - if (ch == 0x307 && ctx.lt && ctx.after_i) + if (ch == COMB_DOT_ABOVE && ctx.lt && ctx.after_soft_dotted) return M(); rune CH = uprop_get_suc(ch); diff --git a/lib/unicode/string/u8title.c b/lib/unicode/string/u8title.c index a462c4e..b704eef 100644 --- a/lib/unicode/string/u8title.c +++ b/lib/unicode/string/u8title.c @@ -2,9 +2,9 @@ #include "unicode/prop.h" #include "unicode/string.h" -constexpr rune COMB_GRAVE = 0x0300; -constexpr rune COMB_ACUTE = 0x0301; -constexpr rune COMB_TILDE = 0x0303; +constexpr rune COMB_GRAVE = 0x0300; +constexpr rune COMB_ACUTE = 0x0301; +constexpr rune COMB_TILDE = 0x0303; constexpr rune COMB_DOT_ABOVE = 0x0307; size_t @@ -22,8 +22,8 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, int w; rune ch; - bool sow; size_t n = 0; + bool lt_special = false; struct u8view word = {}, cpy = {src, srcn}; while (w = u8next(&ch, &src, &srcn)) { @@ -33,15 +33,14 @@ u8title(char8_t 
*restrict dst, size_t dstn, const char8_t *src, size_t srcn, if (src > word.p + word.len) u8wnext(&word, U8_ARGSP(cpy)); - sow = src - w == word.p; + bool sow = src - w == word.p; ctx_l.eow = src == word.p + word.len; ctx_l.before_dot = next == COMB_DOT_ABOVE; - ctx_l.before_acc = next == COMB_GRAVE - || next == COMB_ACUTE - || next == COMB_TILDE; + ctx_l.before_acc = + next == COMB_GRAVE || next == COMB_ACUTE || next == COMB_TILDE; - struct rview rv = sow ? uprop_get_tc(ch, ctx_t) - : uprop_get_lc(ch, ctx_l); + struct rview rv = sow || lt_special ? uprop_get_tc(ch, ctx_t) + : uprop_get_lc(ch, ctx_l); for (size_t i = 0; i < rv.len; i++) { if (n >= dstn) { char8_t buf[U8_LEN_MAX]; @@ -50,7 +49,24 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, n += rtou8(dst + n, dstn - n, rv.p[i]); } - ctx_t.after_i = ch == 'i'; + if (ctx_t.lt) { + /* If the rune at SOW is Soft_Dotted, then the next rune should be + titlecased if it is U+0307 or if its ccc is neither 0 nor 230. + If the current rune was titlecased as a result of the above rule, + then the rule should be applied again to the next rune. If the + current rune was titlecased and is U+0307, then lowercase until + the next word boundary. 
*/ + enum uprop_ccc ccc; + if (lt_special || uprop_is_sd(ch)) { + ctx_t.after_soft_dotted = true; + lt_special = + (sow || lt_special) && ch != COMB_DOT_ABOVE + && (next == COMB_DOT_ABOVE + || ((ccc = uprop_get_ccc(next)) != 0 && ccc != 230)); + } else + ctx_t.after_soft_dotted = false; + } + ctx_l.after_I = ch == 'I'; } diff --git a/lib/unicode/string/u8upper.c b/lib/unicode/string/u8upper.c index 6b041f6..086a160 100644 --- a/lib/unicode/string/u8upper.c +++ b/lib/unicode/string/u8upper.c @@ -24,7 +24,14 @@ u8upper(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, } else n += rtou8(dst + n, dstn - n, rv.p[i]); } - ctx.after_i = ch == 'i'; + + if (ctx.lt) { + enum uprop_ccc ccc; + if (uprop_is_sd(ch)) + ctx.after_soft_dotted = true; + else if ((ccc = uprop_get_ccc(ch)) == CCC_NR || ccc == CCC_L) + ctx.after_soft_dotted = false; + } } return n; |