From ef8a2910aa4bfc49973e63e1003b01f47a675249 Mon Sep 17 00:00:00 2001
From: Thomas Voss <mail@thomasvoss.com>
Date: Fri, 26 Apr 2024 17:20:26 +0200
Subject: Fully comply with Unicode 15.1 § 3.13 in u8lower()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lib/unicode/string/u8lower.c | 68 +++++++++++++++++++++++++++++++++++---------
 lib/unicode/string/u8title.c |  4 +--
 2 files changed, 57 insertions(+), 15 deletions(-)

(limited to 'lib/unicode/string')

diff --git a/lib/unicode/string/u8lower.c b/lib/unicode/string/u8lower.c
index 77b0e18..f9ac78c 100644
--- a/lib/unicode/string/u8lower.c
+++ b/lib/unicode/string/u8lower.c
@@ -1,12 +1,17 @@
+#include "_attrs.h"
 #include "mbstring.h"
 #include "unicode/prop.h"
 #include "unicode/string.h"
 
-constexpr rune COMB_GRAVE     = 0x0300;
-constexpr rune COMB_ACUTE     = 0x0301;
-constexpr rune COMB_TILDE     = 0x0303;
 constexpr rune COMB_DOT_ABOVE = 0x0307;
 
+[[unsequenced, _mlib_inline]] static inline bool
+uprop_ccc_0_or_230(rune ch) /* true iff uprop_get_ccc(ch) is 0 or 230 */
+{
+	enum uprop_ccc x = uprop_get_ccc(ch); /* ccc: presumably Canonical_Combining_Class */
+	return x == 0 || x == 230; /* the two classes that terminate the combining-mark scans in u8lower() */
+}
+
 size_t
 u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
         enum caseflags flags)
@@ -17,21 +22,54 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
 	};
 
 	rune ch;
-	size_t n = 0;
-	struct u8view word = {}, cpy = {src, srcn};
+	size_t n, before_dot_cnt, more_above_cnt;
+	struct {
+		bool before;
+		size_t after;
+	} final_sigma = {};
+
+	n = before_dot_cnt = more_above_cnt = 0;
 
 	while (u8next(&ch, &src, &srcn)) {
 		rune next = 0;
 		if (srcn > 0)
 			u8tor(&next, src);
-		if (src > word.p + word.len)
-			u8wnext(&word, U8_ARGSP(cpy));
 
-		ctx.eow = src == word.p + word.len;
-		ctx.before_dot = next == COMB_DOT_ABOVE;
-		ctx.before_acc = next == COMB_GRAVE
-		              || next == COMB_ACUTE
-		              || next == COMB_TILDE;
+		if (before_dot_cnt == 0 || more_above_cnt == 0) { /* at least one counter has nothing cached: rescan */
+			rune ch = 0; /* shadows the outer ch; ends up holding the last rune read by the scan */
+			before_dot_cnt = more_above_cnt = 0;
+			struct u8view cpy = {src, srcn}; /* scan on a copy of the remaining input */
+
+			do { /* one increment per u8next() attempt */
+				before_dot_cnt++;
+				more_above_cnt++;
+			} while (u8next(&ch ,U8_ARGSP(cpy)) && !uprop_ccc_0_or_230(ch));
+
+			if (ch != COMB_DOT_ABOVE) /* scan did not end on U+0307 */
+				before_dot_cnt = 0;
+			if (uprop_get_ccc(ch) != 230) /* scan did not end on a ccc-230 rune */
+				more_above_cnt = 0;
+		} else { /* both cached: one rune closer to the rune found by the earlier scan */
+			before_dot_cnt--;
+			more_above_cnt--;
+		}
+
+		if (final_sigma.after > 0) /* one rune closer to the cased rune found by an earlier scan */
+			final_sigma.after--;
+		if (final_sigma.after == 0) { /* nothing cached, or the cached rune was just reached: rescan */
+			rune ch = 0; /* stays defined even if the first u8next() below fails */
+			struct u8view cpy = {src, srcn};
+
+			do
+				final_sigma.after++;
+			while (u8next(&ch, U8_ARGSP(cpy)) && uprop_is_ci(ch));
+			if (!uprop_is_cased(ch)) /* scan did not end on a cased rune */
+				final_sigma.after = 0;
+		}
+
+		ctx.before_dot = before_dot_cnt > 0;
+		ctx.more_above = more_above_cnt > 0;
+		ctx.final_sigma = final_sigma.before && final_sigma.after == 0;
 
 		struct rview rv = uprop_get_lc(ch, ctx);
 		for (size_t i = 0; i < rv.len; i++) {
@@ -42,7 +80,11 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
 				n += rtou8(dst + n, dstn - n, rv.p[i]);
 		}
 
-		ctx.after_I = ch == 'I';
+		ctx.after_I = (ch == 'I') || (ctx.after_I && !uprop_ccc_0_or_230(ch)); /* set by 'I', kept across runes whose ccc is neither 0 nor 230 */
+		if (uprop_is_cased(ch)) /* a cased rune makes final_sigma.before true */
+			final_sigma.before = true;
+		else if (!uprop_is_ci(ch)) /* neither cased nor ci (presumably "case-ignorable"): reset */
+			final_sigma.before = false;
 	}
 
 	return n;
diff --git a/lib/unicode/string/u8title.c b/lib/unicode/string/u8title.c
index dcf0b2e..380e874 100644
--- a/lib/unicode/string/u8title.c
+++ b/lib/unicode/string/u8title.c
@@ -36,9 +36,9 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
 			u8wnext(&word, U8_ARGSP(cpy));
 
 		bool sow = src - w == word.p;
-		ctx_l.eow = src == word.p + word.len;
+		ctx_l.final_sigma = src == word.p + word.len;
 		ctx_l.before_dot = next == COMB_DOT_ABOVE;
-		ctx_l.before_acc =
+		ctx_l.more_above =
 			next == COMB_GRAVE || next == COMB_ACUTE || next == COMB_TILDE;
 
 		struct rview rv;
-- 
cgit v1.2.3