Fully comply with Unicode 15.1 § 3.13 in u8lower()

author: Thomas Voss <mail@thomasvoss.com> 2024-04-26 17:20:26 +0200
committer: Thomas Voss <mail@thomasvoss.com> 2024-04-26 17:20:26 +0200
commit: ef8a2910aa4bfc49973e63e1003b01f47a675249 (patch)
tree: e2ebe2a8c898f6570d69da89b00ee1b30161fffd
parent: b5e7acf641d4ef3538803b746723b90a822ea1ad (diff)
5 files changed, 78 insertions, 32 deletions
diff --git a/README b/README
index e34d14e..8514a86 100644
--- a/README
+++ b/README
@@ -107,8 +107,8 @@ FEATURES:
     • unicode/string.h
         • Iteration and counting of graphemes and words in a string
         • Unicode-aware case-mapping of strings with truncation checking
-        • Case-mapping supports optional language-specific quirks (Azeri,
-          Lithuanian, German, etc.)
+        • Case-mapping supports optional language-specific quirks for
+          Azeri, Dutch, German, Lithuanian, and Turkish.
 
 
 PLANNED FEATURES:
@@ -123,4 +123,3 @@ BUGS:
     • Unicode Name Aliases (Name_Alias property) is not supported
     • Unihan properties are not supported (e.g. Unicode_Radical_Stroke)
     • Casemapping is not fully up-to-spec
-    • Upper- and titlecasing don’t support CF_LANG_NL yet
diff --git a/include/unicode/prop.h b/include/unicode/prop.h
index 8cb50c5..062d121 100644
--- a/include/unicode/prop.h
+++ b/include/unicode/prop.h
@@ -36,10 +36,10 @@ struct lcctx {
 	bool az_or_tr : 1;
 	bool lt       : 1;
 
-	bool after_I : 1;    /* After ‘I’ */
-	bool before_acc : 1; /* Before accent on ‘i’ or ‘j’ in Lithuanian */
-	bool before_dot : 1; /* Before U+0307 */
-	bool eow        : 1; /* End of word */
+	bool after_I     : 1;
+	bool before_dot  : 1;
+	bool final_sigma : 1;
+	bool more_above  : 1;
 };
 
 struct tcctx {
diff --git a/lib/unicode/prop/uprop_get_lc.c b/lib/unicode/prop/uprop_get_lc.c
index 0ce1072..752ed71 100644
--- a/lib/unicode/prop/uprop_get_lc.c
+++ b/lib/unicode/prop/uprop_get_lc.c
@@ -8,35 +8,40 @@
 struct rview
 uprop_get_lc(rune ch, struct lcctx ctx)
 {
+	constexpr rune COMB_GRAVE     = 0x300;
+	constexpr rune COMB_ACUTE     = 0x301;
+	constexpr rune COMB_TILDE     = 0x303;
+	constexpr rune COMB_DOT_ABOVE = 0x307;
+
 	if (ch == U'Σ')
-		return ctx.eow ? M(U'ς') : M(U'σ');
+		return ctx.final_sigma ? M(U'ς') : M(U'σ');
 	if (ch == U'İ')
-		return ctx.az_or_tr ? M('i') : M('i', 0x307);
+		return ctx.az_or_tr ? M('i') : M('i', COMB_DOT_ABOVE);
 
 	if (ctx.lt) {
-		if (ctx.before_acc) {
+		if (ctx.more_above) {
 			switch (ch) {
 			case 'I':
-				return M('i', 0x307);
+				return M('i', COMB_DOT_ABOVE);
 			case 'J':
-				return M('j', 0x307);
+				return M('j', COMB_DOT_ABOVE);
 			case U'Į':
-				return M(U'į', 0x307);
+				return M(U'į', COMB_DOT_ABOVE);
 			}
 		}
 
 		switch (ch) {
 		case U'Ì':
-			return M('i', 0x307, 0x300);
+			return M('i', COMB_DOT_ABOVE, COMB_GRAVE);
 		case U'Í':
-			return M('i', 0x307, 0x301);
+			return M('i', COMB_DOT_ABOVE, COMB_ACUTE);
 		case U'Ĩ':
-			return M('i', 0x307, 0x303);
+			return M('i', COMB_DOT_ABOVE, COMB_TILDE);
 		}
 	}
 
 	if (ctx.az_or_tr) {
-		if (ch == 0x307 && ctx.after_I)
+		if (ch == COMB_DOT_ABOVE && ctx.after_I)
 			return M();
 		if (ch == 'I' && !ctx.before_dot)
 			return M(U'ı');
diff --git a/lib/unicode/string/u8lower.c b/lib/unicode/string/u8lower.c
index 77b0e18..f9ac78c 100644
--- a/lib/unicode/string/u8lower.c
+++ b/lib/unicode/string/u8lower.c
@@ -1,12 +1,17 @@
+#include "_attrs.h"
 #include "mbstring.h"
 #include "unicode/prop.h"
 #include "unicode/string.h"
 
-constexpr rune COMB_GRAVE     = 0x0300;
-constexpr rune COMB_ACUTE     = 0x0301;
-constexpr rune COMB_TILDE     = 0x0303;
 constexpr rune COMB_DOT_ABOVE = 0x0307;
 
+[[unsequenced, _mlib_inline]] static inline bool
+uprop_ccc_0_or_230(rune ch) /* Is uprop_get_ccc(ch) equal to 0 or 230? */
+{
+	enum uprop_ccc x = uprop_get_ccc(ch); /* presumably ch's Canonical_Combining_Class — confirm in unicode/prop.h */
+	return x == 0 || x == 230; /* u8lower() ends its before_dot/more_above lookahead at such a rune */
+}
+
 size_t
 u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
         enum caseflags flags)
@@ -17,21 +22,54 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
 	};
 
 	rune ch;
-	size_t n = 0;
-	struct u8view word = {}, cpy = {src, srcn};
+	size_t n, before_dot_cnt, more_above_cnt;
+	struct {
+		bool before;
+		size_t after;
+	} final_sigma = {};
+
+	n = before_dot_cnt = more_above_cnt = 0;
 
 	while (u8next(&ch, &src, &srcn)) {
 		rune next = 0;
 		if (srcn > 0)
 			u8tor(&next, src);
-		if (src > word.p + word.len)
-			u8wnext(&word, U8_ARGSP(cpy));
 
-		ctx.eow = src == word.p + word.len;
-		ctx.before_dot = next == COMB_DOT_ABOVE;
-		ctx.before_acc = next == COMB_GRAVE
-		              || next == COMB_ACUTE
-		              || next == COMB_TILDE;
+		if (before_dot_cnt == 0 || more_above_cnt == 0) {
+			rune ch = 0;
+			before_dot_cnt = more_above_cnt = 0;
+			struct u8view cpy = {src, srcn};
+
+			do {
+				before_dot_cnt++;
+				more_above_cnt++;
+			} while (u8next(&ch ,U8_ARGSP(cpy)) && !uprop_ccc_0_or_230(ch));
+
+			if (ch != COMB_DOT_ABOVE)
+				before_dot_cnt = 0;
+			if (uprop_get_ccc(ch) != 230)
+				more_above_cnt = 0;
+		} else {
+			before_dot_cnt--;
+			more_above_cnt--;
+		}
+
+		if (final_sigma.after == 0) {
+			rune ch;
+			struct u8view cpy = {src, srcn};
+
+			do
+				final_sigma.after++;
+			while (u8next(&ch, U8_ARGSP(cpy)) && uprop_is_ci(ch));
+
+			if (!uprop_is_cased(ch))
+				final_sigma.after = 0;
+		} else
+			final_sigma.after--;
+
+		ctx.before_dot = before_dot_cnt > 0;
+		ctx.more_above = more_above_cnt > 0;
+		ctx.final_sigma = final_sigma.before && final_sigma.after == 0;
 
 		struct rview rv = uprop_get_lc(ch, ctx);
 		for (size_t i = 0; i < rv.len; i++) {
@@ -42,7 +80,11 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
 				n += rtou8(dst + n, dstn - n, rv.p[i]);
 		}
 
-		ctx.after_I = ch == 'I';
+		ctx.after_I = (ch == 'I') || (ctx.after_I && !uprop_ccc_0_or_230(ch));
+		if (uprop_is_cased(ch))
+			final_sigma.before = true;
+		else if (!uprop_is_ci(ch))
+			final_sigma.before = false;
 	}
 
 	return n;
diff --git a/lib/unicode/string/u8title.c b/lib/unicode/string/u8title.c
index dcf0b2e..380e874 100644
--- a/lib/unicode/string/u8title.c
+++ b/lib/unicode/string/u8title.c
@@ -36,9 +36,9 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
 			u8wnext(&word, U8_ARGSP(cpy));
 
 		bool sow = src - w == word.p;
-		ctx_l.eow = src == word.p + word.len;
+		ctx_l.final_sigma = src == word.p + word.len;
 		ctx_l.before_dot = next == COMB_DOT_ABOVE;
-		ctx_l.before_acc =
+		ctx_l.more_above =
 			next == COMB_GRAVE || next == COMB_ACUTE || next == COMB_TILDE;
 
 		struct rview rv;
author	Thomas Voss <mail@thomasvoss.com>	2024-04-26 17:20:26 +0200
committer	Thomas Voss <mail@thomasvoss.com>	2024-04-26 17:20:26 +0200
commit	ef8a2910aa4bfc49973e63e1003b01f47a675249 (patch)
tree	e2ebe2a8c898f6570d69da89b00ee1b30161fffd
parent	b5e7acf641d4ef3538803b746723b90a822ea1ad (diff)