aboutsummaryrefslogtreecommitdiff
path: root/lib/unicode
diff options
context:
space:
mode:
author Thomas Voss <mail@thomasvoss.com> 2024-04-24 00:09:05 +0200
committer Thomas Voss <mail@thomasvoss.com> 2024-04-24 00:09:05 +0200
commit 3b797a5f3ce1d77fa7d0ed991b52553c1b3e8757 (patch)
tree e608ecc5b689afaebe1ac3ce112cb2a04e597448 /lib/unicode
parent 9cc2a0368fb0a3aa8b878d1795ed76734beadc02 (diff)
Properly upper- and titlecase ‘i’ and ‘j’ in Lithuanian
Diffstat (limited to 'lib/unicode')
-rw-r--r-- lib/unicode/prop/uprop_get_tc.c 4
-rw-r--r-- lib/unicode/prop/uprop_get_uc.c 4
-rw-r--r-- lib/unicode/string/u8title.c 38
-rw-r--r-- lib/unicode/string/u8upper.c 9
4 files changed, 41 insertions, 14 deletions
diff --git a/lib/unicode/prop/uprop_get_tc.c b/lib/unicode/prop/uprop_get_tc.c
index 442db6e..83649e1 100644
--- a/lib/unicode/prop/uprop_get_tc.c
+++ b/lib/unicode/prop/uprop_get_tc.c
@@ -64,9 +64,11 @@ _MLIB_DEFINE_BSEARCH_KV(struct rview, lookup, M(ch))
struct rview
uprop_get_tc(rune ch, struct tcctx ctx)
{
+ constexpr rune COMB_DOT_ABOVE = 0x307;
+
if (ch == 'i' && ctx.az_or_tr)
return M(U'İ');
- if (ch == 0x307 && ctx.lt && ctx.after_i)
+ if (ch == COMB_DOT_ABOVE && ctx.lt && ctx.after_soft_dotted)
return M();
rune CH = uprop_get_stc(ch);
diff --git a/lib/unicode/prop/uprop_get_uc.c b/lib/unicode/prop/uprop_get_uc.c
index ecb0883..4563921 100644
--- a/lib/unicode/prop/uprop_get_uc.c
+++ b/lib/unicode/prop/uprop_get_uc.c
@@ -119,11 +119,13 @@ _MLIB_DEFINE_BSEARCH_KV(struct rview, lookup, M(ch))
struct rview
uprop_get_uc(rune ch, struct ucctx ctx)
{
+ constexpr rune COMB_DOT_ABOVE = 0x307;
+
if (ch == U'ß')
return ctx.ẞ ? M(U'ẞ') : M('S', 'S');
if (ch == 'i' && ctx.az_or_tr)
return M(U'İ');
- if (ch == 0x307 && ctx.lt && ctx.after_i)
+ if (ch == COMB_DOT_ABOVE && ctx.lt && ctx.after_soft_dotted)
return M();
rune CH = uprop_get_suc(ch);
diff --git a/lib/unicode/string/u8title.c b/lib/unicode/string/u8title.c
index a462c4e..b704eef 100644
--- a/lib/unicode/string/u8title.c
+++ b/lib/unicode/string/u8title.c
@@ -2,9 +2,9 @@
#include "unicode/prop.h"
#include "unicode/string.h"
-constexpr rune COMB_GRAVE = 0x0300;
-constexpr rune COMB_ACUTE = 0x0301;
-constexpr rune COMB_TILDE = 0x0303;
+constexpr rune COMB_GRAVE = 0x0300;
+constexpr rune COMB_ACUTE = 0x0301;
+constexpr rune COMB_TILDE = 0x0303;
constexpr rune COMB_DOT_ABOVE = 0x0307;
size_t
@@ -22,8 +22,8 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
int w;
rune ch;
- bool sow;
size_t n = 0;
+ bool lt_special = false;
struct u8view word = {}, cpy = {src, srcn};
while (w = u8next(&ch, &src, &srcn)) {
@@ -33,15 +33,14 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
if (src > word.p + word.len)
u8wnext(&word, U8_ARGSP(cpy));
- sow = src - w == word.p;
+ bool sow = src - w == word.p;
ctx_l.eow = src == word.p + word.len;
ctx_l.before_dot = next == COMB_DOT_ABOVE;
- ctx_l.before_acc = next == COMB_GRAVE
- || next == COMB_ACUTE
- || next == COMB_TILDE;
+ ctx_l.before_acc =
+ next == COMB_GRAVE || next == COMB_ACUTE || next == COMB_TILDE;
- struct rview rv = sow ? uprop_get_tc(ch, ctx_t)
- : uprop_get_lc(ch, ctx_l);
+ struct rview rv = sow || lt_special ? uprop_get_tc(ch, ctx_t)
+ : uprop_get_lc(ch, ctx_l);
for (size_t i = 0; i < rv.len; i++) {
if (n >= dstn) {
char8_t buf[U8_LEN_MAX];
@@ -50,7 +49,24 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
n += rtou8(dst + n, dstn - n, rv.p[i]);
}
- ctx_t.after_i = ch == 'i';
+ if (ctx_t.lt) {
+ /* If the rune at SOW is Soft_Dotted, then the next rune should be
titlecased if it is U+0307 or if its ccc is neither 0 nor 230.
+ If the current rune was titlecased as a result of the above rule,
+ then the rule should be applied again to the next rune. If the
+ current rune was titlecased and is U+0307, then lowercase until
+ the next word boundary. */
+ enum uprop_ccc ccc;
+ if (lt_special || uprop_is_sd(ch)) {
+ ctx_t.after_soft_dotted = true;
+ lt_special =
+ (sow || lt_special) && ch != COMB_DOT_ABOVE
+ && (next == COMB_DOT_ABOVE
+ || ((ccc = uprop_get_ccc(next)) != 0 && ccc != 230));
+ } else
+ ctx_t.after_soft_dotted = false;
+ }
+
ctx_l.after_I = ch == 'I';
}
diff --git a/lib/unicode/string/u8upper.c b/lib/unicode/string/u8upper.c
index 6b041f6..086a160 100644
--- a/lib/unicode/string/u8upper.c
+++ b/lib/unicode/string/u8upper.c
@@ -24,7 +24,14 @@ u8upper(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
} else
n += rtou8(dst + n, dstn - n, rv.p[i]);
}
- ctx.after_i = ch == 'i';
+
+ if (ctx.lt) {
+ enum uprop_ccc ccc;
+ if (uprop_is_sd(ch))
+ ctx.after_soft_dotted = true;
+ else if ((ccc = uprop_get_ccc(ch)) == CCC_NR || ccc == CCC_L)
+ ctx.after_soft_dotted = false;
+ }
}
return n;