aboutsummaryrefslogtreecommitdiff
path: root/lib/unicode
diff options
context:
space:
mode:
author Thomas Voss <mail@thomasvoss.com> 2024-04-26 17:20:26 +0200
committer Thomas Voss <mail@thomasvoss.com> 2024-04-26 17:20:26 +0200
commit ef8a2910aa4bfc49973e63e1003b01f47a675249 (patch)
tree e2ebe2a8c898f6570d69da89b00ee1b30161fffd /lib/unicode
parent b5e7acf641d4ef3538803b746723b90a822ea1ad (diff)
Fully comply with Unicode 15.1 § 3.13 in u8lower()
Diffstat (limited to 'lib/unicode')
-rw-r--r-- lib/unicode/prop/uprop_get_lc.c 25
-rw-r--r-- lib/unicode/string/u8lower.c 68
-rw-r--r-- lib/unicode/string/u8title.c 4
3 files changed, 72 insertions, 25 deletions
diff --git a/lib/unicode/prop/uprop_get_lc.c b/lib/unicode/prop/uprop_get_lc.c
index 0ce1072..752ed71 100644
--- a/lib/unicode/prop/uprop_get_lc.c
+++ b/lib/unicode/prop/uprop_get_lc.c
@@ -8,35 +8,40 @@
struct rview
uprop_get_lc(rune ch, struct lcctx ctx)
{
+ constexpr rune COMB_GRAVE = 0x300;
+ constexpr rune COMB_ACUTE = 0x301;
+ constexpr rune COMB_TILDE = 0x303;
+ constexpr rune COMB_DOT_ABOVE = 0x307;
+
if (ch == U'Σ')
- return ctx.eow ? M(U'ς') : M(U'σ');
+ return ctx.final_sigma ? M(U'ς') : M(U'σ');
if (ch == U'İ')
- return ctx.az_or_tr ? M('i') : M('i', 0x307);
+ return ctx.az_or_tr ? M('i') : M('i', COMB_DOT_ABOVE);
if (ctx.lt) {
- if (ctx.before_acc) {
+ if (ctx.more_above) {
switch (ch) {
case 'I':
- return M('i', 0x307);
+ return M('i', COMB_DOT_ABOVE);
case 'J':
- return M('j', 0x307);
+ return M('j', COMB_DOT_ABOVE);
case U'Į':
- return M(U'į', 0x307);
+ return M(U'į', COMB_DOT_ABOVE);
}
}
switch (ch) {
case U'Ì':
- return M('i', 0x307, 0x300);
+ return M('i', COMB_DOT_ABOVE, COMB_GRAVE);
case U'Í':
- return M('i', 0x307, 0x301);
+ return M('i', COMB_DOT_ABOVE, COMB_ACUTE);
case U'Ĩ':
- return M('i', 0x307, 0x303);
+ return M('i', COMB_DOT_ABOVE, COMB_TILDE);
}
}
if (ctx.az_or_tr) {
- if (ch == 0x307 && ctx.after_I)
+ if (ch == COMB_DOT_ABOVE && ctx.after_I)
return M();
if (ch == 'I' && !ctx.before_dot)
return M(U'ı');
diff --git a/lib/unicode/string/u8lower.c b/lib/unicode/string/u8lower.c
index 77b0e18..f9ac78c 100644
--- a/lib/unicode/string/u8lower.c
+++ b/lib/unicode/string/u8lower.c
@@ -1,12 +1,17 @@
+#include "_attrs.h"
#include "mbstring.h"
#include "unicode/prop.h"
#include "unicode/string.h"
-constexpr rune COMB_GRAVE = 0x0300;
-constexpr rune COMB_ACUTE = 0x0301;
-constexpr rune COMB_TILDE = 0x0303;
constexpr rune COMB_DOT_ABOVE = 0x0307;
+[[unsequenced, _mlib_inline]] static inline bool
+uprop_ccc_0_or_230(rune ch)
+{
+ enum uprop_ccc x = uprop_get_ccc(ch);
+ return x == 0 || x == 230;
+}
+
size_t
u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
enum caseflags flags)
@@ -17,21 +22,54 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
};
rune ch;
- size_t n = 0;
- struct u8view word = {}, cpy = {src, srcn};
+ size_t n, before_dot_cnt, more_above_cnt;
+ struct {
+ bool before;
+ size_t after;
+ } final_sigma = {};
+
+ n = before_dot_cnt = more_above_cnt = 0;
while (u8next(&ch, &src, &srcn)) {
rune next = 0;
if (srcn > 0)
u8tor(&next, src);
- if (src > word.p + word.len)
- u8wnext(&word, U8_ARGSP(cpy));
- ctx.eow = src == word.p + word.len;
- ctx.before_dot = next == COMB_DOT_ABOVE;
- ctx.before_acc = next == COMB_GRAVE
- || next == COMB_ACUTE
- || next == COMB_TILDE;
+ if (before_dot_cnt == 0 || more_above_cnt == 0) {
+ rune ch = 0;
+ before_dot_cnt = more_above_cnt = 0;
+ struct u8view cpy = {src, srcn};
+
+ do {
+ before_dot_cnt++;
+ more_above_cnt++;
+ } while (u8next(&ch ,U8_ARGSP(cpy)) && !uprop_ccc_0_or_230(ch));
+
+ if (ch != COMB_DOT_ABOVE)
+ before_dot_cnt = 0;
+ if (uprop_get_ccc(ch) != 230)
+ more_above_cnt = 0;
+ } else {
+ before_dot_cnt--;
+ more_above_cnt--;
+ }
+
+ if (final_sigma.after == 0) {
+ rune ch;
+ struct u8view cpy = {src, srcn};
+
+ do
+ final_sigma.after++;
+ while (u8next(&ch, U8_ARGSP(cpy)) && uprop_is_ci(ch));
+
+ if (!uprop_is_cased(ch))
+ final_sigma.after = 0;
+ } else
+ final_sigma.after--;
+
+ ctx.before_dot = before_dot_cnt > 0;
+ ctx.more_above = more_above_cnt > 0;
+ ctx.final_sigma = final_sigma.before && final_sigma.after == 0;
struct rview rv = uprop_get_lc(ch, ctx);
for (size_t i = 0; i < rv.len; i++) {
@@ -42,7 +80,11 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
n += rtou8(dst + n, dstn - n, rv.p[i]);
}
- ctx.after_I = ch == 'I';
+ ctx.after_I = (ch == 'I') || (ctx.after_I && !uprop_ccc_0_or_230(ch));
+ if (uprop_is_cased(ch))
+ final_sigma.before = true;
+ else if (!uprop_is_ci(ch))
+ final_sigma.before = false;
}
return n;
diff --git a/lib/unicode/string/u8title.c b/lib/unicode/string/u8title.c
index dcf0b2e..380e874 100644
--- a/lib/unicode/string/u8title.c
+++ b/lib/unicode/string/u8title.c
@@ -36,9 +36,9 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
u8wnext(&word, U8_ARGSP(cpy));
bool sow = src - w == word.p;
- ctx_l.eow = src == word.p + word.len;
+ ctx_l.final_sigma = src == word.p + word.len;
ctx_l.before_dot = next == COMB_DOT_ABOVE;
- ctx_l.before_acc =
+ ctx_l.more_above =
next == COMB_GRAVE || next == COMB_ACUTE || next == COMB_TILDE;
struct rview rv;