about | summary | refs | log | tree | commit | diff
path: root/lib/unicode/string
diff options
context:
space:
mode:
author Thomas Voss <mail@thomasvoss.com> 2024-04-26 17:20:26 +0200
committer Thomas Voss <mail@thomasvoss.com> 2024-04-26 17:20:26 +0200
commit ef8a2910aa4bfc49973e63e1003b01f47a675249 (patch)
tree e2ebe2a8c898f6570d69da89b00ee1b30161fffd /lib/unicode/string
parent b5e7acf641d4ef3538803b746723b90a822ea1ad (diff)
Fully comply with Unicode 15.1 § 3.13 in u8lower()
Diffstat (limited to 'lib/unicode/string')
-rw-r--r-- lib/unicode/string/u8lower.c 68
-rw-r--r-- lib/unicode/string/u8title.c 4
2 files changed, 57 insertions, 15 deletions
diff --git a/lib/unicode/string/u8lower.c b/lib/unicode/string/u8lower.c
index 77b0e18..f9ac78c 100644
--- a/lib/unicode/string/u8lower.c
+++ b/lib/unicode/string/u8lower.c
@@ -1,12 +1,17 @@
+#include "_attrs.h"
#include "mbstring.h"
#include "unicode/prop.h"
#include "unicode/string.h"
-constexpr rune COMB_GRAVE = 0x0300;
-constexpr rune COMB_ACUTE = 0x0301;
-constexpr rune COMB_TILDE = 0x0303;
constexpr rune COMB_DOT_ABOVE = 0x0307;
+[[unsequenced, _mlib_inline]] static inline bool
+uprop_ccc_0_or_230(rune ch) /* true iff uprop_get_ccc(ch) yields 0 or 230 */
+{
+ enum uprop_ccc x = uprop_get_ccc(ch); /* fetch once; compared twice below */
+ return x == 0 || x == 230; /* NOTE(review): presumably canonical combining classes 0 (Not_Reordered) and 230 (Above), the classes that bound the context scans in Unicode § 3.13 — confirm against enum uprop_ccc */
+}
+
size_t
u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
enum caseflags flags)
@@ -17,21 +22,54 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
};
rune ch;
- size_t n = 0;
- struct u8view word = {}, cpy = {src, srcn};
+ size_t n, before_dot_cnt, more_above_cnt;
+ struct {
+ bool before;
+ size_t after;
+ } final_sigma = {};
+
+ n = before_dot_cnt = more_above_cnt = 0;
while (u8next(&ch, &src, &srcn)) {
rune next = 0;
if (srcn > 0)
u8tor(&next, src);
- if (src > word.p + word.len)
- u8wnext(&word, U8_ARGSP(cpy));
- ctx.eow = src == word.p + word.len;
- ctx.before_dot = next == COMB_DOT_ABOVE;
- ctx.before_acc = next == COMB_GRAVE
- || next == COMB_ACUTE
- || next == COMB_TILDE;
+ if (before_dot_cnt == 0 || more_above_cnt == 0) {
+ rune ch = 0;
+ before_dot_cnt = more_above_cnt = 0;
+ struct u8view cpy = {src, srcn};
+
+ do {
+ before_dot_cnt++;
+ more_above_cnt++;
+ } while (u8next(&ch ,U8_ARGSP(cpy)) && !uprop_ccc_0_or_230(ch));
+
+ if (ch != COMB_DOT_ABOVE)
+ before_dot_cnt = 0;
+ if (uprop_get_ccc(ch) != 230)
+ more_above_cnt = 0;
+ } else {
+ before_dot_cnt--;
+ more_above_cnt--;
+ }
+
+ if (final_sigma.after == 0) {
+ rune ch;
+ struct u8view cpy = {src, srcn};
+
+ do
+ final_sigma.after++;
+ while (u8next(&ch, U8_ARGSP(cpy)) && uprop_is_ci(ch));
+
+ if (!uprop_is_cased(ch))
+ final_sigma.after = 0;
+ } else
+ final_sigma.after--;
+
+ ctx.before_dot = before_dot_cnt > 0;
+ ctx.more_above = more_above_cnt > 0;
+ ctx.final_sigma = final_sigma.before && final_sigma.after == 0;
struct rview rv = uprop_get_lc(ch, ctx);
for (size_t i = 0; i < rv.len; i++) {
@@ -42,7 +80,11 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
n += rtou8(dst + n, dstn - n, rv.p[i]);
}
- ctx.after_I = ch == 'I';
+ ctx.after_I = (ch == 'I') || (ctx.after_I && !uprop_ccc_0_or_230(ch));
+ if (uprop_is_cased(ch))
+ final_sigma.before = true;
+ else if (!uprop_is_ci(ch))
+ final_sigma.before = false;
}
return n;
diff --git a/lib/unicode/string/u8title.c b/lib/unicode/string/u8title.c
index dcf0b2e..380e874 100644
--- a/lib/unicode/string/u8title.c
+++ b/lib/unicode/string/u8title.c
@@ -36,9 +36,9 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
u8wnext(&word, U8_ARGSP(cpy));
bool sow = src - w == word.p;
- ctx_l.eow = src == word.p + word.len;
+ ctx_l.final_sigma = src == word.p + word.len;
ctx_l.before_dot = next == COMB_DOT_ABOVE;
- ctx_l.before_acc =
+ ctx_l.more_above =
next == COMB_GRAVE || next == COMB_ACUTE || next == COMB_TILDE;
struct rview rv;