Properly upper- and titlecase ‘i’ and ‘j’ in Lithuanian

author: Thomas Voss <mail@thomasvoss.com> 2024-04-24 00:09:05 +0200
committer: Thomas Voss <mail@thomasvoss.com> 2024-04-24 00:09:05 +0200
commit: 3b797a5f3ce1d77fa7d0ed991b52553c1b3e8757 (patch)
tree: e608ecc5b689afaebe1ac3ce112cb2a04e597448
parent: 9cc2a0368fb0a3aa8b878d1795ed76734beadc02 (diff)
5 files changed, 74 insertions, 25 deletions
diff --git a/include/unicode/prop.h b/include/unicode/prop.h
index d2d6cec..8cb50c5 100644
--- a/include/unicode/prop.h
+++ b/include/unicode/prop.h
@@ -13,31 +13,53 @@ struct rview {
 	size_t len;
 };
 
-/* clang-format off */
+/* The structures lcctx, tcctx, and ucctx are used to provide context to the
+   casing property functions whose return values are context-dependent.  Each
+   group of flags in a context structure is separated by a newline.
+
+   The first group of flags are named using language codes.  If one of these
+   flags is set, then language-specific tailorings for the given language are
+   enabled.  For example, if the ‘az_or_tr’ flag is set in ucctx, then the
+   letter ‘i’ is uppercased to ‘İ’ as opposed to ‘I’.
+
+   The second group of flags relate to context specified by the Unicode standard
+   and typically have to do with which characters surround the one being cased.
+   The description for these flags can be found in Table 3-17 of chapter 3 of
+   the Unicode standard[1].
+
+   The third group of flags are extensions provided by MLib, and are documented
+   above or beside the relevant option.
+
+   [1]: https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf#G54277 */
 
 struct lcctx {
-	bool az_or_tr : 1; /* Azeri or Turkish */
-	bool lt       : 1; /* Lithuanian */
+	bool az_or_tr : 1;
+	bool lt       : 1;
 
-	bool after_I    : 1; /* After ‘I’ */
+	bool after_I : 1;    /* After ‘I’ */
 	bool before_acc : 1; /* Before accent on ‘i’ or ‘j’ in Lithuanian */
 	bool before_dot : 1; /* Before U+0307 */
 	bool eow        : 1; /* End of word */
 };
 
 struct tcctx {
-	bool az_or_tr : 1; /* Azeri or Turkish */
-	bool lt       : 1; /* Lithuanian */
+	bool az_or_tr : 1;
+	bool lt       : 1;
 
-	bool after_i : 1; /* After ‘i’ */
+	bool after_soft_dotted : 1;
 };
 
 struct ucctx {
-	bool az_or_tr : 1; /* Azeri or Turkish */
-	bool lt       : 1; /* Lithuanian */
+	bool az_or_tr : 1;
+	bool lt       : 1;
+
+	bool after_soft_dotted : 1;
 
-	bool ẞ       : 1; /* Uppercase ‘ß’ into ‘ẞ’ (instead of ‘SS’) */
-	bool after_i : 1; /* After ‘i’ */
+	/* Uppercase the German lowercase-eszett ‘ß’ into the uppercase-eszett ‘ẞ’
+	   instead of the typical ‘SS’.  The uppercase-eszett was added to the
+	   German orthography in 2017 but has not yet seen widespread adoption as of
+	   writing (2024). */
+	bool ẞ : 1;
 };
 
 /* clang-format on */
diff --git a/lib/unicode/prop/uprop_get_tc.c b/lib/unicode/prop/uprop_get_tc.c
index 442db6e..83649e1 100644
--- a/lib/unicode/prop/uprop_get_tc.c
+++ b/lib/unicode/prop/uprop_get_tc.c
@@ -64,9 +64,11 @@ _MLIB_DEFINE_BSEARCH_KV(struct rview, lookup, M(ch))
 struct rview
 uprop_get_tc(rune ch, struct tcctx ctx)
 {
+	constexpr rune COMB_DOT_ABOVE = 0x307;
+
 	if (ch == 'i' && ctx.az_or_tr)
 		return M(U'İ');
-	if (ch == 0x307 && ctx.lt && ctx.after_i)
+	if (ch == COMB_DOT_ABOVE && ctx.lt && ctx.after_soft_dotted)
 		return M();
 
 	rune CH = uprop_get_stc(ch);
diff --git a/lib/unicode/prop/uprop_get_uc.c b/lib/unicode/prop/uprop_get_uc.c
index ecb0883..4563921 100644
--- a/lib/unicode/prop/uprop_get_uc.c
+++ b/lib/unicode/prop/uprop_get_uc.c
@@ -119,11 +119,13 @@ _MLIB_DEFINE_BSEARCH_KV(struct rview, lookup, M(ch))
 struct rview
 uprop_get_uc(rune ch, struct ucctx ctx)
 {
+	constexpr rune COMB_DOT_ABOVE = 0x307;
+
 	if (ch == U'ß')
 		return ctx.ẞ ? M(U'ẞ') : M('S', 'S');
 	if (ch == 'i' && ctx.az_or_tr)
 		return M(U'İ');
-	if (ch == 0x307 && ctx.lt && ctx.after_i)
+	if (ch == COMB_DOT_ABOVE && ctx.lt && ctx.after_soft_dotted)
 		return M();
 
 	rune CH = uprop_get_suc(ch);
diff --git a/lib/unicode/string/u8title.c b/lib/unicode/string/u8title.c
index a462c4e..b704eef 100644
--- a/lib/unicode/string/u8title.c
+++ b/lib/unicode/string/u8title.c
@@ -2,9 +2,9 @@
 #include "unicode/prop.h"
 #include "unicode/string.h"
 
-constexpr rune COMB_GRAVE     = 0x0300;
-constexpr rune COMB_ACUTE     = 0x0301;
-constexpr rune COMB_TILDE     = 0x0303;
+constexpr rune COMB_GRAVE = 0x0300;
+constexpr rune COMB_ACUTE = 0x0301;
+constexpr rune COMB_TILDE = 0x0303;
 constexpr rune COMB_DOT_ABOVE = 0x0307;
 
 size_t
@@ -22,8 +22,8 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
 
 	int w;
 	rune ch;
-	bool sow;
 	size_t n = 0;
+	bool lt_special = false;
 	struct u8view word = {}, cpy = {src, srcn};
 
 	while (w = u8next(&ch, &src, &srcn)) {
@@ -33,15 +33,14 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
 		if (src > word.p + word.len)
 			u8wnext(&word, U8_ARGSP(cpy));
 
-		sow = src - w == word.p;
+		bool sow = src - w == word.p;
 		ctx_l.eow = src == word.p + word.len;
 		ctx_l.before_dot = next == COMB_DOT_ABOVE;
-		ctx_l.before_acc = next == COMB_GRAVE
-		                || next == COMB_ACUTE
-		                || next == COMB_TILDE;
+		ctx_l.before_acc =
+			next == COMB_GRAVE || next == COMB_ACUTE || next == COMB_TILDE;
 
-		struct rview rv = sow ? uprop_get_tc(ch, ctx_t)
-		                      : uprop_get_lc(ch, ctx_l);
+		struct rview rv = sow || lt_special ? uprop_get_tc(ch, ctx_t)
+		                                    : uprop_get_lc(ch, ctx_l);
 		for (size_t i = 0; i < rv.len; i++) {
 			if (n >= dstn) {
 				char8_t buf[U8_LEN_MAX];
@@ -50,7 +49,24 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
 				n += rtou8(dst + n, dstn - n, rv.p[i]);
 		}
 
-		ctx_t.after_i = ch == 'i';
+		if (ctx_t.lt) {
+			/* If the rune at SOW is Soft_Dotted, then the next rune should be
+			   titlecased if it is U+0307 or if its ccc is neither 0 nor 230.
+			   If the current rune was titlecased as a result of the above rule,
+			   then the rule should be applied again to the next rune.  If the
+			   current rune was titlecased and is U+0307, then lowercase until
+			   the next word boundary. */
+			enum uprop_ccc ccc;
+			if (lt_special || uprop_is_sd(ch)) {
+				ctx_t.after_soft_dotted = true;
+				lt_special =
+					(sow || lt_special) && ch != COMB_DOT_ABOVE
+					&& (next == COMB_DOT_ABOVE
+				        || ((ccc = uprop_get_ccc(next)) != 0 && ccc != 230));
+			} else
+				ctx_t.after_soft_dotted = false;
+		}
+
 		ctx_l.after_I = ch == 'I';
 	}
 
diff --git a/lib/unicode/string/u8upper.c b/lib/unicode/string/u8upper.c
index 6b041f6..086a160 100644
--- a/lib/unicode/string/u8upper.c
+++ b/lib/unicode/string/u8upper.c
@@ -24,7 +24,14 @@ u8upper(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
 			} else
 				n += rtou8(dst + n, dstn - n, rv.p[i]);
 		}
-		ctx.after_i = ch == 'i';
+
+		if (ctx.lt) {
+			enum uprop_ccc ccc;
+			if (uprop_is_sd(ch))
+				ctx.after_soft_dotted = true;
+			else if ((ccc = uprop_get_ccc(ch)) == CCC_NR || ccc == CCC_L)
+				ctx.after_soft_dotted = false;
+		}
 	}
 
 	return n;
author	Thomas Voss <mail@thomasvoss.com>	2024-04-24 00:09:05 +0200
committer	Thomas Voss <mail@thomasvoss.com>	2024-04-24 00:09:05 +0200
commit	3b797a5f3ce1d77fa7d0ed991b52553c1b3e8757 (patch)
tree	e608ecc5b689afaebe1ac3ce112cb2a04e597448
parent	9cc2a0368fb0a3aa8b878d1795ed76734beadc02 (diff)