From f8727410e6c83a8390eb9a4812bd8831d07d49e4 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Tue, 23 Apr 2024 01:13:43 +0200 Subject: Add u8title() --- include/unicode/string.h | 14 +++++++---- lib/unicode/string/u8title.c | 58 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 5 deletions(-) create mode 100644 lib/unicode/string/u8title.c diff --git a/include/unicode/string.h b/include/unicode/string.h index 887a216..cb19821 100644 --- a/include/unicode/string.h +++ b/include/unicode/string.h @@ -6,6 +6,8 @@ #include "_charN_t.h" #include "_u8view.h" +#define mlib_warn_trunc nodiscard("don’t forget to check for truncation") + /* clang-format off */ enum [[clang::flag_enum]] caseflags { @@ -25,18 +27,20 @@ size_t u8gnext(struct u8view *, const char8_t **, size_t *); size_t u8wnext(struct u8view *, const char8_t **, size_t *); size_t u8wnext_human(struct u8view *, const char8_t **, size_t *); -#define mlib_warn_trunc nodiscard("don’t forget to check for truncation") [[mlib_warn_trunc]] size_t u8casefold(char8_t *restrict, size_t, const char8_t *, size_t, enum caseflags); [[mlib_warn_trunc]] size_t u8lower(char8_t *restrict, size_t, const char8_t *, size_t, enum caseflags); +[[mlib_warn_trunc]] size_t u8title(char8_t *restrict, size_t, const char8_t *, + size_t, enum caseflags); [[mlib_warn_trunc]] size_t u8upper(char8_t *restrict, size_t, const char8_t *, size_t, enum caseflags); -#undef mlib_warn_trunc -constexpr double U8LOWER_SCALE = 1.5; +constexpr double U8LOWER_SCALE = 1.5; constexpr double U8LOWER_SCALE_LT = 3; -constexpr double U8TITLE_SCALE = 3; -constexpr double U8UPPER_SCALE = 3; +constexpr double U8TITLE_SCALE = 3; +constexpr double U8UPPER_SCALE = 3; + +#undef mlib_warn_trunc #endif /* !MLIB_UNICODE_STRING_H */ diff --git a/lib/unicode/string/u8title.c b/lib/unicode/string/u8title.c new file mode 100644 index 0000000..a462c4e --- /dev/null +++ b/lib/unicode/string/u8title.c @@ -0,0 +1,58 @@ +#include "mbstring.h" +#include "unicode/prop.h" +#include "unicode/string.h" + +constexpr rune COMB_GRAVE = 0x0300; +constexpr rune COMB_ACUTE = 0x0301; +constexpr rune COMB_TILDE = 0x0303; +constexpr rune COMB_DOT_ABOVE = 0x0307; + +size_t +u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, + enum caseflags flags) +{ + struct tcctx ctx_t = { + .az_or_tr = flags & CF_LANG_AZ, + .lt = flags & CF_LANG_LT, + }; + struct lcctx ctx_l = { + .az_or_tr = ctx_t.az_or_tr, + .lt = ctx_t.lt, + }; + + int w; + rune ch; + bool sow; + size_t n = 0; + struct u8view word = {}, cpy = {src, srcn}; + + while (w = u8next(&ch, &src, &srcn)) { + rune next = 0; + if (srcn > 0) + u8tor(&next, src); + if (src > word.p + word.len) + u8wnext(&word, U8_ARGSP(cpy)); + + sow = src - w == word.p; + ctx_l.eow = src == word.p + word.len; + ctx_l.before_dot = next == COMB_DOT_ABOVE; + ctx_l.before_acc = next == COMB_GRAVE + || next == COMB_ACUTE + || next == COMB_TILDE; + + struct rview rv = sow ? uprop_get_tc(ch, ctx_t) + : uprop_get_lc(ch, ctx_l); + for (size_t i = 0; i < rv.len; i++) { + if (n >= dstn) { + char8_t buf[U8_LEN_MAX]; + n += rtou8(buf, sizeof(buf), rv.p[i]); + } else + n += rtou8(dst + n, dstn - n, rv.p[i]); + } + + ctx_t.after_i = ch == 'i'; + ctx_l.after_I = ch == 'I'; + } + + return n; +} -- cgit v1.2.3