about summary refs log tree commit diff
diff options
context:
space:
mode:
author Thomas Voss <mail@thomasvoss.com> 2024-04-26 17:20:26 +0200
committer Thomas Voss <mail@thomasvoss.com> 2024-04-26 17:20:26 +0200
commit ef8a2910aa4bfc49973e63e1003b01f47a675249 (patch)
tree e2ebe2a8c898f6570d69da89b00ee1b30161fffd
parent b5e7acf641d4ef3538803b746723b90a822ea1ad (diff)
Fully comply with Unicode 15.1 § 3.13 in u8lower()
-rw-r--r-- README 5
-rw-r--r-- include/unicode/prop.h 8
-rw-r--r-- lib/unicode/prop/uprop_get_lc.c 25
-rw-r--r-- lib/unicode/string/u8lower.c 68
-rw-r--r-- lib/unicode/string/u8title.c 4
5 files changed, 78 insertions, 32 deletions
diff --git a/README b/README
index e34d14e..8514a86 100644
--- a/README
+++ b/README
@@ -107,8 +107,8 @@ FEATURES:
• unicode/string.h
• Iteration and counting of graphemes and words in a string
• Unicode-aware case-mapping of strings with truncation checking
- • Case-mapping supports optional language-specific quirks (Azeri,
- Lithuanian, German, etc.)
+ • Case-mapping supports optional language-specific quirks for
+ Azeri, Dutch, German, Lithuanian, and Turkish.
PLANNED FEATURES:
@@ -123,4 +123,3 @@ BUGS:
• Unicode Name Aliases (Name_Alias property) is not supported
• Unihan properties are not supported (e.g. Unicode_Radical_Stroke)
• Casemapping is not fully up-to-spec
- • Upper- and titlecasing don’t support CF_LANG_NL yet
diff --git a/include/unicode/prop.h b/include/unicode/prop.h
index 8cb50c5..062d121 100644
--- a/include/unicode/prop.h
+++ b/include/unicode/prop.h
@@ -36,10 +36,10 @@ struct lcctx {
bool az_or_tr : 1;
bool lt : 1;
- bool after_I : 1; /* After ‘I’ */
- bool before_acc : 1; /* Before accent on ‘i’ or ‘j’ in Lithuanian */
- bool before_dot : 1; /* Before U+0307 */
- bool eow : 1; /* End of word */
+ bool after_I : 1;
+ bool before_dot : 1;
+ bool final_sigma : 1;
+ bool more_above : 1;
};
struct tcctx {
diff --git a/lib/unicode/prop/uprop_get_lc.c b/lib/unicode/prop/uprop_get_lc.c
index 0ce1072..752ed71 100644
--- a/lib/unicode/prop/uprop_get_lc.c
+++ b/lib/unicode/prop/uprop_get_lc.c
@@ -8,35 +8,40 @@
struct rview
uprop_get_lc(rune ch, struct lcctx ctx)
{
+ constexpr rune COMB_GRAVE = 0x300;
+ constexpr rune COMB_ACUTE = 0x301;
+ constexpr rune COMB_TILDE = 0x303;
+ constexpr rune COMB_DOT_ABOVE = 0x307;
+
if (ch == U'Σ')
- return ctx.eow ? M(U'ς') : M(U'σ');
+ return ctx.final_sigma ? M(U'ς') : M(U'σ');
if (ch == U'İ')
- return ctx.az_or_tr ? M('i') : M('i', 0x307);
+ return ctx.az_or_tr ? M('i') : M('i', COMB_DOT_ABOVE);
if (ctx.lt) {
- if (ctx.before_acc) {
+ if (ctx.more_above) {
switch (ch) {
case 'I':
- return M('i', 0x307);
+ return M('i', COMB_DOT_ABOVE);
case 'J':
- return M('j', 0x307);
+ return M('j', COMB_DOT_ABOVE);
case U'Į':
- return M(U'į', 0x307);
+ return M(U'į', COMB_DOT_ABOVE);
}
}
switch (ch) {
case U'Ì':
- return M('i', 0x307, 0x300);
+ return M('i', COMB_DOT_ABOVE, COMB_GRAVE);
case U'Í':
- return M('i', 0x307, 0x301);
+ return M('i', COMB_DOT_ABOVE, COMB_ACUTE);
case U'Ĩ':
- return M('i', 0x307, 0x303);
+ return M('i', COMB_DOT_ABOVE, COMB_TILDE);
}
}
if (ctx.az_or_tr) {
- if (ch == 0x307 && ctx.after_I)
+ if (ch == COMB_DOT_ABOVE && ctx.after_I)
return M();
if (ch == 'I' && !ctx.before_dot)
return M(U'ı');
diff --git a/lib/unicode/string/u8lower.c b/lib/unicode/string/u8lower.c
index 77b0e18..f9ac78c 100644
--- a/lib/unicode/string/u8lower.c
+++ b/lib/unicode/string/u8lower.c
@@ -1,12 +1,17 @@
+#include "_attrs.h"
#include "mbstring.h"
#include "unicode/prop.h"
#include "unicode/string.h"
-constexpr rune COMB_GRAVE = 0x0300;
-constexpr rune COMB_ACUTE = 0x0301;
-constexpr rune COMB_TILDE = 0x0303;
constexpr rune COMB_DOT_ABOVE = 0x0307;
+[[unsequenced, _mlib_inline]] static inline bool
+uprop_ccc_0_or_230(rune ch)
+{
+ enum uprop_ccc x = uprop_get_ccc(ch);
+ return x == 0 || x == 230;
+}
+
size_t
u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
enum caseflags flags)
@@ -17,21 +22,54 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
};
rune ch;
- size_t n = 0;
- struct u8view word = {}, cpy = {src, srcn};
+ size_t n, before_dot_cnt, more_above_cnt;
+ struct {
+ bool before;
+ size_t after;
+ } final_sigma = {};
+
+ n = before_dot_cnt = more_above_cnt = 0;
while (u8next(&ch, &src, &srcn)) {
rune next = 0;
if (srcn > 0)
u8tor(&next, src);
- if (src > word.p + word.len)
- u8wnext(&word, U8_ARGSP(cpy));
- ctx.eow = src == word.p + word.len;
- ctx.before_dot = next == COMB_DOT_ABOVE;
- ctx.before_acc = next == COMB_GRAVE
- || next == COMB_ACUTE
- || next == COMB_TILDE;
+ if (before_dot_cnt == 0 || more_above_cnt == 0) {
+ rune ch = 0;
+ before_dot_cnt = more_above_cnt = 0;
+ struct u8view cpy = {src, srcn};
+
+ do {
+ before_dot_cnt++;
+ more_above_cnt++;
+ } while (u8next(&ch ,U8_ARGSP(cpy)) && !uprop_ccc_0_or_230(ch));
+
+ if (ch != COMB_DOT_ABOVE)
+ before_dot_cnt = 0;
+ if (uprop_get_ccc(ch) != 230)
+ more_above_cnt = 0;
+ } else {
+ before_dot_cnt--;
+ more_above_cnt--;
+ }
+
+ if (final_sigma.after == 0) {
+ rune ch;
+ struct u8view cpy = {src, srcn};
+
+ do
+ final_sigma.after++;
+ while (u8next(&ch, U8_ARGSP(cpy)) && uprop_is_ci(ch));
+
+ if (!uprop_is_cased(ch))
+ final_sigma.after = 0;
+ } else
+ final_sigma.after--;
+
+ ctx.before_dot = before_dot_cnt > 0;
+ ctx.more_above = more_above_cnt > 0;
+ ctx.final_sigma = final_sigma.before && final_sigma.after == 0;
struct rview rv = uprop_get_lc(ch, ctx);
for (size_t i = 0; i < rv.len; i++) {
@@ -42,7 +80,11 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
n += rtou8(dst + n, dstn - n, rv.p[i]);
}
- ctx.after_I = ch == 'I';
+ ctx.after_I = (ch == 'I') || (ctx.after_I && !uprop_ccc_0_or_230(ch));
+ if (uprop_is_cased(ch))
+ final_sigma.before = true;
+ else if (!uprop_is_ci(ch))
+ final_sigma.before = false;
}
return n;
diff --git a/lib/unicode/string/u8title.c b/lib/unicode/string/u8title.c
index dcf0b2e..380e874 100644
--- a/lib/unicode/string/u8title.c
+++ b/lib/unicode/string/u8title.c
@@ -36,9 +36,9 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
u8wnext(&word, U8_ARGSP(cpy));
bool sow = src - w == word.p;
- ctx_l.eow = src == word.p + word.len;
+ ctx_l.final_sigma = src == word.p + word.len;
ctx_l.before_dot = next == COMB_DOT_ABOVE;
- ctx_l.before_acc =
+ ctx_l.more_above =
next == COMB_GRAVE || next == COMB_ACUTE || next == COMB_TILDE;
struct rview rv;