diff options
author | Thomas Voss <mail@thomasvoss.com> | 2024-05-04 04:01:45 +0200 |
---|---|---|
committer | Thomas Voss <mail@thomasvoss.com> | 2024-05-04 04:01:45 +0200 |
commit | ac1b4bcbaeaee7d2ef9132dcdc254f2d08691650 (patch) | |
tree | 90250966629653f0462cf17bc0b6f2476fb6d1fc | |
parent | 8b923ba5e5bb37ea26350b4c1c688b8697706609 (diff) |
Go all in on string views, and fix manuals
30 files changed, 213 insertions, 268 deletions
diff --git a/include/_qmacros.h b/include/_qmacros.h deleted file mode 100644 index d496581..0000000 --- a/include/_qmacros.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef MLIB__QMACROS_H -#define MLIB__QMACROS_H - -/* Macros for qualifier-preserving functions. These are used to create wrappers - around some functions which will return a const-qualified pointer if the - input pointer is const-qualified, and a non-const-qualified pointer - otherwise. - - The macros are taken from the N3020 proposal for C23. */ - -/* clang-format off */ -#define _MLIB_PTR_IS_CONST(P) \ - _Generic(1 ? (P) : (void *)(P), \ - const void *: 1, \ - default: 0) -#define _MLIB_STATIC_IF(P, T, E) \ - _Generic(&(char[!!(P) + 1]){0}, \ - char(*)[2]: T, \ - char(*)[1]: E) -#define _MLIB_Q_PTR(T, F, S, ...) \ - _MLIB_STATIC_IF(_MLIB_PTR_IS_CONST((S)), \ - (const T *)(F)(__VA_ARGS__), \ - (T *)(F)(__VA_ARGS__)) -/* clang-format on */ - -#endif /* !MLIB__QMACROS_H */ diff --git a/include/macros.h b/include/macros.h index d4b53b4..32e8b7c 100644 --- a/include/macros.h +++ b/include/macros.h @@ -7,9 +7,9 @@ #define lengthof(a) (sizeof(a) / sizeof(*(a))) -#define memeq(...) (!memcmp(__VA_ARGS__)) -#define streq(...) (!strcmp(__VA_ARGS__)) -#define u8eq(...) (!u8cmp(__VA_ARGS__)) +#define memeq(x, y, n) (!memcmp((x), (y), (n))) +#define streq(x, y) (!strcmp((x), (y))) +#define u8eq(x, y) (!u8cmp((x), (y))) #define _MLIB_STR(s) #s #define _MLIB_CONCAT(x, y) x##y diff --git a/include/mbstring.h b/include/mbstring.h index d908284..ba654bb 100644 --- a/include/mbstring.h +++ b/include/mbstring.h @@ -4,14 +4,13 @@ #include <stddef.h> #include "_charN_t.h" -#include "_qmacros.h" #include "_rune.h" #include "_u8view.h" #define U8(...) \ ((struct u8view){__VA_OPT__(u8##__VA_ARGS__, sizeof(u8##__VA_ARGS__) - 1)}) -#define U8_ARGS(s) ((s).p), ((s).len) -#define U8_ARGSP(s) (&(s).p), (&(s).len) + +#define VSHFT(sv, n) ((sv)->p += (n), (sv)->len -= (n)) /* clang-format off */ #define u8byte1(x) (((x) & 0x80) == 0x00) @@ -29,33 +28,21 @@ constexpr rune U8_4B_MAX = 0x10FFFF; constexpr int U8_LEN_MAX = 4; #define PRIsU8 ".*s" -#define U8_PRI_ARGS(sv) ((int)(sv).len), ((sv).p) - -[[nodiscard]] bool u8haspfx(const char8_t *, size_t, const char8_t *, size_t); -[[nodiscard]] bool u8hassfx(const char8_t *, size_t, const char8_t *, size_t); - -[[nodiscard]] char8_t *u8chk(const char8_t *, size_t); - -[[nodiscard]] char8_t *u8chr(const char8_t *, size_t, rune); -[[nodiscard]] char8_t *u8rchr(const char8_t *, size_t, rune); +#define SV_PRI_ARGS(sv) ((int)(sv).len), ((sv).p) int rtou8(char8_t *, size_t, rune); -int u8tor(rune *, const char8_t *); - -[[nodiscard]] int u8cmp(const char8_t *, size_t, const char8_t *, size_t); - -int u8next(rune *, const char8_t **, size_t *); +int u8next(rune *, struct u8view *); int u8prev(rune *, const char8_t **, const char8_t *); - -[[nodiscard]] size_t u8spn(const char8_t *, size_t, const rune *, size_t); -[[nodiscard]] size_t u8cspn(const char8_t *, size_t, const rune *, size_t); - -[[nodiscard]] size_t u8len(const char8_t *, size_t); - -struct u8view u8split(const char8_t **, size_t *, rune); - -#define u8chk(s, n) _MLIB_Q_PTR(char8_t, u8chk, (s), (s), (n)) -#define u8chr(s, n, ch) _MLIB_Q_PTR(char8_t, u8chr, (s), (s), (n), (ch)) -#define u8rchr(s, n, ch) _MLIB_Q_PTR(char8_t, u8rchr, (s), (s), (n), (ch)) +int u8tor(rune *, const char8_t *); +[[nodiscard]] bool u8haspfx(struct u8view, struct u8view); +[[nodiscard]] bool u8hassfx(struct u8view, struct u8view); +[[nodiscard]] const char8_t *u8chk(struct u8view); +[[nodiscard]] const char8_t *u8chr(struct u8view, rune); +[[nodiscard]] const char8_t *u8rchr(struct u8view, rune); +[[nodiscard]] int u8cmp(struct u8view, struct u8view); +[[nodiscard]] size_t u8cspn(struct u8view, const rune *, size_t); +[[nodiscard]] size_t u8len(struct u8view); +[[nodiscard]] size_t u8spn(struct u8view, const rune *, size_t); +struct u8view u8split(struct u8view *, rune); #endif /* !MLIB_MBSTRING_H */ diff --git a/include/unicode/string.h b/include/unicode/string.h index 0ae49f0..bb8cafd 100644 --- a/include/unicode/string.h +++ b/include/unicode/string.h @@ -21,22 +21,22 @@ enum [[clang::flag_enum]] caseflags { /* clang-format on */ -[[nodiscard]] size_t u8gcnt(const char8_t *, size_t); -[[nodiscard]] size_t u8wcnt(const char8_t *, size_t); -[[nodiscard]] size_t u8wcnt_human(const char8_t *, size_t); - -size_t u8gnext(struct u8view *, const char8_t **, size_t *); -size_t u8wnext(struct u8view *, const char8_t **, size_t *); -size_t u8wnext_human(struct u8view *, const char8_t **, size_t *); - -[[mlib_warn_trunc]] size_t u8lower(char8_t *restrict, size_t, const char8_t *, - size_t, enum caseflags); -[[mlib_warn_trunc]] size_t u8title(char8_t *restrict, size_t, const char8_t *, - size_t, enum caseflags); -[[mlib_warn_trunc]] size_t u8upper(char8_t *restrict, size_t, const char8_t *, - size_t, enum caseflags); -[[mlib_warn_trunc]] size_t u8casefold(char8_t *restrict, size_t, - const char8_t *, size_t, enum caseflags); +[[nodiscard]] size_t u8gcnt(struct u8view); +[[nodiscard]] size_t u8wcnt(struct u8view); +[[nodiscard]] size_t u8wcnt_human(struct u8view); + +size_t u8gnext(struct u8view *, struct u8view *); +size_t u8wnext(struct u8view *, struct u8view *); +size_t u8wnext_human(struct u8view *, struct u8view *); + +[[mlib_warn_trunc]] size_t u8lower(char8_t *restrict, size_t, struct u8view, + enum caseflags); +[[mlib_warn_trunc]] size_t u8title(char8_t *restrict, size_t, struct u8view, + enum caseflags); +[[mlib_warn_trunc]] size_t u8upper(char8_t *restrict, size_t, struct u8view, + enum caseflags); +[[mlib_warn_trunc]] size_t u8casefold(char8_t *restrict, size_t, struct u8view, + enum caseflags); constexpr double U8LOWER_SCALE = 1.5; constexpr double U8LOWER_SCALE_LT = 3; diff --git a/lib/mbstring/u8chk.c b/lib/mbstring/u8chk.c index 2566bac..20c4f3f 100644 --- a/lib/mbstring/u8chk.c +++ b/lib/mbstring/u8chk.c @@ -1,17 +1,15 @@ #include "rune.h" #include "mbstring.h" -char8_t * -(u8chk)(const char8_t *s, size_t n) +const char8_t * +u8chk(struct u8view sv) { - while (n) { - rune ch; - int m = u8tor(&ch, s); + int w; + rune ch; + while (w = u8next(&ch, &sv)) { if (ch == RUNE_ERROR) - return (char8_t *)s; - n -= m; - s += m; + return sv.p - w; } return nullptr; diff --git a/lib/mbstring/u8chr.c b/lib/mbstring/u8chr.c index 395a328..4831695 100644 --- a/lib/mbstring/u8chr.c +++ b/lib/mbstring/u8chr.c @@ -31,7 +31,7 @@ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -static char8_t * +static const char8_t * memmem2(const char8_t *h, size_t k, const char8_t *n) { uint16_t hw, nw; @@ -40,12 +40,12 @@ memmem2(const char8_t *h, size_t k, const char8_t *n) for (h += 2, k -= 2; k; k--, hw = hw << 8 | *h++) { if (hw == nw) - return (char8_t *)h - 2; + return h - 2; } - return hw == nw ? (char8_t *)h - 2 : nullptr; + return hw == nw ? h - 2 : nullptr; } -static char8_t * +static const char8_t * memmem3(const char8_t *h, size_t k, const char8_t *n) { uint32_t hw, nw; @@ -54,12 +54,12 @@ memmem3(const char8_t *h, size_t k, const char8_t *n) for (h += 3, k -= 3; k; k--, hw = (hw | *h++) << 8) { if (hw == nw) - return (char8_t *)h - 3; + return h - 3; } - return hw == nw ? (char8_t *)h - 3 : nullptr; + return hw == nw ? h - 3 : nullptr; } -static char8_t * +static const char8_t * memmem4(const char8_t *h, size_t k, const char8_t *n) { uint32_t hw, nw; @@ -68,28 +68,28 @@ memmem4(const char8_t *h, size_t k, const char8_t *n) for (h += 4, k -= 4; k; k--, hw = hw << 8 | *h++) { if (hw == nw) - return (char8_t *)h - 4; + return h - 4; } - return hw == nw ? (char8_t *)h - 4 : nullptr; + return hw == nw ? h - 4 : nullptr; } -char8_t * -(u8chr)(const char8_t *s, size_t n, rune ch) +const char8_t * +u8chr(struct u8view sv, rune ch) { char8_t buf[U8_LEN_MAX]; int m = rtou8(buf, sizeof(buf), ch); - if (n < (size_t)m) + if (sv.len < (size_t)m) return nullptr; switch (m) { case 1: - return memchr(s, ch, n); + return memchr(sv.p, ch, sv.len); case 2: - return memmem2(s, n, buf); + return memmem2(sv.p, sv.len, buf); case 3: - return memmem3(s, n, buf); + return memmem3(sv.p, sv.len, buf); case 4: - return memmem4(s, n, buf); + return memmem4(sv.p, sv.len, buf); } unreachable(); diff --git a/lib/mbstring/u8cmp.c b/lib/mbstring/u8cmp.c index 8bd2400..0059020 100644 --- a/lib/mbstring/u8cmp.c +++ b/lib/mbstring/u8cmp.c @@ -3,7 +3,7 @@ #include "mbstring.h" int -u8cmp(const char8_t *x, size_t n, const char8_t *y, size_t m) +u8cmp(struct u8view x, struct u8view y) { - return n != m ? (n > m ? +1 : -1) : memcmp(x, y, n); + return x.len != y.len ? (x.len > y.len ? +1 : -1) : memcmp(x.p, y.p, x.len); } diff --git a/lib/mbstring/u8cspn.c b/lib/mbstring/u8cspn.c index 4892de4..827373f 100644 --- a/lib/mbstring/u8cspn.c +++ b/lib/mbstring/u8cspn.c @@ -1,13 +1,13 @@ #include "mbstring.h" size_t -u8cspn(const char8_t *s, size_t n, const rune *p, size_t m) +u8cspn(struct u8view sv, const rune *p, size_t n) { rune ch; - size_t k, l; + size_t k, w; - for (k = 0; (l = u8next(&ch, &s, &n)); k += l) { - for (size_t i = 0; i < m; i++) { + for (k = 0; w = u8next(&ch, &sv); k += w) { + for (size_t i = 0; i < n; i++) { if (p[i] == ch) goto found; } diff --git a/lib/mbstring/u8haspfx.c b/lib/mbstring/u8haspfx.c index b6cea50..c61efbb 100644 --- a/lib/mbstring/u8haspfx.c +++ b/lib/mbstring/u8haspfx.c @@ -4,7 +4,7 @@ #include "mbstring.h" bool -u8haspfx(const char8_t *s, size_t n, const char8_t *pfx, size_t m) +u8haspfx(struct u8view sv, struct u8view pfx) { - return n >= m && memeq(s, pfx, m); + return sv.len >= pfx.len && memeq(sv.p, pfx.p, pfx.len); } diff --git a/lib/mbstring/u8hassfx.c b/lib/mbstring/u8hassfx.c index e31bb4b..8ea4456 100644 --- a/lib/mbstring/u8hassfx.c +++ b/lib/mbstring/u8hassfx.c @@ -4,7 +4,7 @@ #include "mbstring.h" bool -u8hassfx(const char8_t *s, size_t n, const char8_t *sfx, size_t m) +u8hassfx(struct u8view sv, struct u8view sfx) { - return n >= m && memeq(s + n - m, sfx, m); + return sv.len >= sfx.len && memeq(sv.p + sv.len - sfx.len, sfx.p, sfx.len); } diff --git a/lib/mbstring/u8len.c b/lib/mbstring/u8len.c index 217ab66..23c55c5 100644 --- a/lib/mbstring/u8len.c +++ b/lib/mbstring/u8len.c @@ -1,10 +1,10 @@ #include "mbstring.h" size_t -u8len(const char8_t *s, size_t n) +u8len(struct u8view sv) { size_t m = 0; - while (u8next(nullptr, &s, &n)) + while (u8next(nullptr, &sv)) m++; return m; } diff --git a/lib/mbstring/u8next.c b/lib/mbstring/u8next.c index 82d2ad7..518de49 100644 --- a/lib/mbstring/u8next.c +++ b/lib/mbstring/u8next.c @@ -1,16 +1,15 @@ #include "mbstring.h" int -u8next(rune *ch, const char8_t **s, size_t *n) +u8next(rune *ch, struct u8view *sv) { - rune _; - int m = 0; + int n = 0; - if (*n) { - m = u8tor(ch ? ch : &_, *s); - *n -= m; - *s += m; + if (sv->len) { + rune _; + n = u8tor(ch ? ch : &_, sv->p); + VSHFT(sv, n); } - return m; + return n; } diff --git a/lib/mbstring/u8rchr.c b/lib/mbstring/u8rchr.c index 09aa111..825f8fd 100644 --- a/lib/mbstring/u8rchr.c +++ b/lib/mbstring/u8rchr.c @@ -3,17 +3,17 @@ #include "mbstring.h" -static char8_t * +static const char8_t * memrchr1(const char8_t *s, size_t k, const char8_t *n) { for (const char8_t *p = s + k - 1; k-- > 0; p--) { if (*p == *n) - return (char8_t *)p; + return p; } return nullptr; } -static char8_t * +static const char8_t * memrchr2(const char8_t *h, size_t k, const char8_t *n) { uint16_t hw, nw; @@ -23,13 +23,13 @@ memrchr2(const char8_t *h, size_t k, const char8_t *n) for (H -= 2, k -= 2; k; k--, hw = hw >> 8 | (*H-- << 8)) { if (hw == nw) - return (char8_t *)H + 1; + return H + 1; } - return hw == nw ? (char8_t *)H + 1 : nullptr; + return hw == nw ? H + 1 : nullptr; } -static char8_t * +static const char8_t * memrchr3(const char8_t *h, size_t k, const char8_t *n) { uint32_t hw, nw; @@ -41,13 +41,13 @@ memrchr3(const char8_t *h, size_t k, const char8_t *n) k--, hw = (hw >> 8 | (*H-- << 24)) & UINT32_C(0xFFFFFF00)) { if (hw == nw) - return (char8_t *)H + 1; + return H + 1; } - return hw == nw ? (char8_t *)H + 1 : nullptr; + return hw == nw ? H + 1 : nullptr; } -static char8_t * +static const char8_t * memrchr4(const char8_t *h, size_t k, const char8_t *n) { uint32_t hw, nw; @@ -57,29 +57,29 @@ memrchr4(const char8_t *h, size_t k, const char8_t *n) for (H -= 4, k -= 4; k; k--, hw = hw >> 8 | (*H-- << 24)) { if (hw == nw) - return (char8_t *)H + 1; + return H + 1; } - return hw == nw ? (char8_t *)H + 1 : nullptr; + return hw == nw ? H + 1 : nullptr; } -char8_t * -(u8rchr)(const char8_t *s, size_t n, rune ch) +const char8_t * +u8rchr(struct u8view sv, rune ch) { char8_t buf[U8_LEN_MAX]; - int m = rtou8(buf, ch, sizeof(buf)); + int n = rtou8(buf, ch, sizeof(buf)); - if (n < (size_t)m) + if (sv.len < (size_t)n) return nullptr; - switch (m) { + switch (n) { case 1: - return (char8_t *)memrchr1(s, n, buf); + return memrchr1(sv.p, sv.len, buf); case 2: - return (char8_t *)memrchr2(s, n, buf); + return memrchr2(sv.p, sv.len, buf); case 3: - return (char8_t *)memrchr3(s, n, buf); + return memrchr3(sv.p, sv.len, buf); case 4: - return (char8_t *)memrchr4(s, n, buf); + return memrchr4(sv.p, sv.len, buf); } unreachable(); diff --git a/lib/mbstring/u8split.c b/lib/mbstring/u8split.c index 5ee3bc0..c26f48b 100644 --- a/lib/mbstring/u8split.c +++ b/lib/mbstring/u8split.c @@ -1,16 +1,16 @@ #include "mbstring.h" struct u8view -u8split(const char8_t **p, size_t *n, rune ch) +u8split(struct u8view *rhs, rune ch) { - struct u8view lhs = {.p = *p}; - if ((*p = u8chr(*p, *n, ch)) == nullptr) { - lhs.len = *n; - *n = 0; + struct u8view lhs = {.p = rhs->p}; + if ((rhs->p = u8chr(*rhs, ch)) == nullptr) { + lhs.len = rhs->len; + rhs->len = 0; } else { - lhs.len = *p - lhs.p; - *n -= lhs.len; - u8next(nullptr, p, n); + lhs.len = rhs->p - lhs.p; + rhs->len -= lhs.len; + u8next(nullptr, rhs); } return lhs; } diff --git a/lib/mbstring/u8spn.c b/lib/mbstring/u8spn.c index 1cf45f2..d41fcbc 100644 --- a/lib/mbstring/u8spn.c +++ b/lib/mbstring/u8spn.c @@ -1,15 +1,15 @@ #include "mbstring.h" size_t -u8spn(const char8_t *s, size_t n, const rune *p, size_t m) +u8spn(struct u8view sv, const rune *p, size_t n) { rune ch; - size_t k = 0, l; + size_t k = 0, w; - while ((l = u8next(&ch, &s, &n))) { - for (size_t i = 0; i < m; i++) { + while (w = u8next(&ch, &sv)) { + for (size_t i = 0; i < n; i++) { if (p[i] == ch) { - k += l; + k += w; goto found; } } diff --git a/lib/optparse/optparse.c b/lib/optparse/optparse.c index 407fa62..757dd47 100644 --- a/lib/optparse/optparse.c +++ b/lib/optparse/optparse.c @@ -44,11 +44,10 @@ optparse(struct optparse *st, const struct op_option *opts, size_t nopts) st->optind++; /* Skip ‘--’ */ - opt.p += 2; - opt.len -= 2; + VSHFT(&opt, 2); const struct op_option *o = nullptr; - const char8_t *eq_p = u8chr(opt.p, '=', opt.len); + const char8_t *eq_p = u8chr(opt, '='); struct u8view opt_no_eq = { .p = opt.p, .len = eq_p == nullptr ? opt.len : (size_t)(eq_p - opt.p), @@ -56,7 +55,7 @@ optparse(struct optparse *st, const struct op_option *opts, size_t nopts) for (size_t i = 0; i < nopts; i++) { struct u8view lo = opts[i].longopt; - if (lo.p == nullptr || !u8haspfx(U8_ARGS(lo), U8_ARGS(opt_no_eq))) + if (lo.p == nullptr || !u8haspfx(lo, opt_no_eq)) continue; if (o != nullptr) return error(st, OPT_MSG_INVALID, opt_no_eq); @@ -146,7 +145,7 @@ rune error_s(struct optparse *st, const char *msg, struct u8view s) { snprintf(st->errmsg, sizeof(st->errmsg), u8"%s — ‘%.*s’", msg, - U8_PRI_ARGS(s)); + SV_PRI_ARGS(s)); return -1; } diff --git a/lib/unicode/string/u8casefold.c b/lib/unicode/string/u8casefold.c index 6c0b61d..2ab7c7c 100644 --- a/lib/unicode/string/u8casefold.c +++ b/lib/unicode/string/u8casefold.c @@ -3,13 +3,13 @@ #include "unicode/string.h" size_t -u8casefold(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, +u8casefold(char8_t *restrict dst, size_t dstn, struct u8view sv, enum caseflags flags) { rune ch; size_t n = 0; - while (u8next(&ch, &src, &srcn)) { + while (u8next(&ch, &sv)) { struct rview rv = uprop_get_cf(ch, flags & CF_LANG_AZ); for (size_t i = 0; i < rv.len; i++) { if (n >= dstn) { diff --git a/lib/unicode/string/u8gcnt.c b/lib/unicode/string/u8gcnt.c index 81a0f97..6dfc519 100644 --- a/lib/unicode/string/u8gcnt.c +++ b/lib/unicode/string/u8gcnt.c @@ -1,10 +1,10 @@ #include "unicode/string.h" size_t -u8gcnt(const char8_t *s, size_t n) +u8gcnt(struct u8view sv) { size_t m = 0; - while (u8gnext(nullptr, &s, &n)) + while (u8gnext(nullptr, &sv)) m++; return m; } diff --git a/lib/unicode/string/u8gnext.c b/lib/unicode/string/u8gnext.c index a050bd5..3b0b410 100644 --- a/lib/unicode/string/u8gnext.c +++ b/lib/unicode/string/u8gnext.c @@ -20,17 +20,17 @@ static bool u8isgbrk(rune, rune, struct gbrk_state *); _MLIB_DEFINE_BSEARCH(gbrk_prop, gbrk_prop_tbl, GBP_OTHER) size_t -u8gnext(struct u8view *g, const char8_t **s, size_t *n) +u8gnext(struct u8view *g, struct u8view *sv) { int m; rune ch1; const char8_t *p; struct gbrk_state gs = {0}; - if (*n == 0) + if (sv->len == 0) return 0; - p = *s; + p = sv->p; if (g) g->p = p; p += u8tor(&ch1, p); @@ -38,14 +38,13 @@ u8gnext(struct u8view *g, const char8_t **s, size_t *n) for (;;) { rune ch2; - if ((size_t)(p - *s) >= *n) + if ((size_t)(p - sv->p) >= sv->len) ch2 = 0; else m = u8tor(&ch2, p); if (u8isgbrk(ch1, ch2, &gs)) { - ptrdiff_t d = p - *s; - *n -= d; - *s = p; + ptrdiff_t d = p - sv->p; + VSHFT(sv, d); if (g) g->len = d; return d; diff --git a/lib/unicode/string/u8lower.c b/lib/unicode/string/u8lower.c index 63fdae4..907077b 100644 --- a/lib/unicode/string/u8lower.c +++ b/lib/unicode/string/u8lower.c @@ -13,7 +13,7 @@ uprop_ccc_0_or_230(rune ch) } size_t -u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, +u8lower(char8_t *restrict dst, size_t dstn, struct u8view sv, enum caseflags flags) { struct lcctx ctx = { @@ -32,21 +32,21 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, n = before_dot_cnt = more_above_cnt = 0; - while (u8next(&ch, &src, &srcn)) { + while (u8next(&ch, &sv)) { rune next = 0; - if (srcn > 0) - u8tor(&next, src); + if (sv.len > 0) + u8tor(&next, sv.p); if (ctx.az_or_tr || ctx.lt) { if (before_dot_cnt == 0 || more_above_cnt == 0) { rune ch = 0; before_dot_cnt = more_above_cnt = 0; - struct u8view cpy = {src, srcn}; + struct u8view cpy = sv; do { before_dot_cnt++; more_above_cnt++; - } while (u8next(&ch, U8_ARGSP(cpy)) && !uprop_ccc_0_or_230(ch)); + } while (u8next(&ch, &cpy) && !uprop_ccc_0_or_230(ch)); if (ch != COMB_DOT_ABOVE) before_dot_cnt = 0; @@ -60,11 +60,11 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, if (final_sigma.after == 0) { rune ch = 0; - struct u8view cpy = {src, srcn}; + struct u8view cpy = sv; do final_sigma.after++; - while (u8next(&ch, U8_ARGSP(cpy)) && uprop_is_ci(ch)); + while (u8next(&ch, &cpy) && uprop_is_ci(ch)); if (!uprop_is_cased(ch)) final_sigma.after = 0; diff --git a/lib/unicode/string/u8title.c b/lib/unicode/string/u8title.c index 01e9d2e..f4d9b7e 100644 --- a/lib/unicode/string/u8title.c +++ b/lib/unicode/string/u8title.c @@ -14,7 +14,7 @@ uprop_ccc_0_or_230(rune ch) } size_t -u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, +u8title(char8_t *restrict dst, size_t dstn, struct u8view sv, enum caseflags flags) { struct tcctx ctx_t; @@ -26,7 +26,7 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, rune ch; bool nl_IJ = false; size_t n, before_dot_cnt, more_above_cnt; - struct u8view word = {}, wcpy = {src, srcn}; + struct u8view word = {}, wcpy = sv; struct { bool before; size_t after; @@ -39,9 +39,9 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, n = before_dot_cnt = more_above_cnt = 0; - while (u8next(&ch, &src, &srcn)) { - if (src > word.p + word.len) { - u8wnext(&word, U8_ARGSP(wcpy)); + while (u8next(&ch, &sv)) { + if (sv.p > word.p + word.len) { + u8wnext(&word, &wcpy); ctx_t.after_soft_dotted = false; state = TITLE; } @@ -50,12 +50,12 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, if (before_dot_cnt == 0 || more_above_cnt == 0) { rune ch = 0; before_dot_cnt = more_above_cnt = 0; - struct u8view cpy = {src, srcn}; + struct u8view cpy = sv; do { before_dot_cnt++; more_above_cnt++; - } while (u8next(&ch, U8_ARGSP(cpy)) && !uprop_ccc_0_or_230(ch)); + } while (u8next(&ch, &cpy) && !uprop_ccc_0_or_230(ch)); if (ch != COMB_DOT_ABOVE) before_dot_cnt = 0; @@ -69,11 +69,11 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, if (final_sigma.after == 0) { rune ch = 0; - struct u8view cpy = {src, srcn}; + struct u8view cpy = sv; do final_sigma.after++; - while (u8next(&ch, U8_ARGSP(cpy)) && uprop_is_ci(ch)); + while (u8next(&ch, &cpy) && uprop_is_ci(ch)); if (!uprop_is_cased(ch)) final_sigma.after = 0; @@ -95,8 +95,8 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, if (flags & CF_LANG_NL) { rune next = 0; - if (srcn > 0) - u8tor(&next, src); + if (sv.len > 0) + u8tor(&next, sv.p); nl_IJ = (ch == 'i' || ch == 'I') && (next == 'j' || next == 'J'); } diff --git a/lib/unicode/string/u8upper.c b/lib/unicode/string/u8upper.c index 086a160..6d4026d 100644 --- a/lib/unicode/string/u8upper.c +++ b/lib/unicode/string/u8upper.c @@ -3,7 +3,7 @@ #include "unicode/string.h" size_t -u8upper(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, +u8upper(char8_t *restrict dst, size_t dstn, struct u8view sv, enum caseflags flags) { struct ucctx ctx = { @@ -15,7 +15,7 @@ u8upper(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, rune ch; size_t n = 0; - while (u8next(&ch, &src, &srcn)) { + while (u8next(&ch, &sv)) { struct rview rv = uprop_get_uc(ch, ctx); for (size_t i = 0; i < rv.len; i++) { if (n >= dstn) { diff --git a/lib/unicode/string/u8wcnt.c b/lib/unicode/string/u8wcnt.c index f1b1742..f71faf5 100644 --- a/lib/unicode/string/u8wcnt.c +++ b/lib/unicode/string/u8wcnt.c @@ -1,10 +1,10 @@ #include "unicode/string.h" size_t -u8wcnt(const char8_t *s, size_t n) +u8wcnt(struct u8view sv) { size_t m = 0; - while (u8wnext(nullptr, &s, &n)) + while (u8wnext(nullptr, &sv)) m++; return m; } diff --git a/lib/unicode/string/u8wcnt_human.c b/lib/unicode/string/u8wcnt_human.c index 6e70398..60e7f95 100644 --- a/lib/unicode/string/u8wcnt_human.c +++ b/lib/unicode/string/u8wcnt_human.c @@ -1,10 +1,10 @@ #include "unicode/string.h" size_t -u8wcnt_human(const char8_t *s, size_t n) +u8wcnt_human(struct u8view sv) { size_t m = 0; - while (u8wnext_human(nullptr, &s, &n)) + while (u8wnext_human(nullptr, &sv)) m++; return m; } diff --git a/lib/unicode/string/u8wnext.c b/lib/unicode/string/u8wnext.c index 5e893c6..6655c5d 100644 --- a/lib/unicode/string/u8wnext.c +++ b/lib/unicode/string/u8wnext.c @@ -29,22 +29,20 @@ static size_t findwbrk(struct u8view); static struct wbrk_state mkwbrkstate(struct u8view); size_t -u8wnext(struct u8view *w, const char8_t **s, size_t *n) +u8wnext(struct u8view *w, struct u8view *sv) { - ASSUME(n != nullptr); - ASSUME(s != nullptr); - ASSUME(*s != nullptr); + ASSUME(sv != nullptr); + ASSUME(sv->p != nullptr); - if (*n == 0) + if (sv->len == 0) return 0; - size_t off = findwbrk((struct u8view){*s, *n}); + size_t off = findwbrk(*sv); if (w != nullptr) - *w = (struct u8view){*s, off}; + *w = (struct u8view){sv->p, off}; - ASSUME(*n >= off); - *s += off; - *n -= off; + ASSUME(sv->len >= off); + VSHFT(sv, off); return off; } @@ -196,13 +194,13 @@ mkwbrkstate(struct u8view sv) rune ch; for (size_t i = 0; - i < lengthof(ws.raw.next) && u8next(&ch, U8_ARGSP(ws.raw_v)) != 0; i++) + i < lengthof(ws.raw.next) && u8next(&ch, &ws.raw_v) != 0; i++) { ws.raw.next[i] = mlib_lookup(ch); } for (size_t i = 0; - i < lengthof(ws.raw.next) && u8next(&ch, U8_ARGSP(ws.skip_v)) != 0;) + i < lengthof(ws.raw.next) && u8next(&ch, &ws.skip_v) != 0;) { ws.skip.next[i] = mlib_lookup(ch); if (!IS_IGNORE(ws.skip.next[i])) @@ -224,10 +222,10 @@ advance(struct wbrk_state *ws) ws->raw.prev[0] = ws->raw.next[0]; ws->raw.next[0] = ws->raw.next[1]; ws->raw.next[1] = - u8next(&ch, U8_ARGSP(ws->raw_v)) != 0 ? mlib_lookup(ch) : WBRK_EOT; + u8next(&ch, &ws->raw_v) != 0 ? mlib_lookup(ch) : WBRK_EOT; /* Increment the midpoint */ - u8next(nullptr, U8_ARGSP(ws->mid_v)); + u8next(nullptr, &ws->mid_v); /* Ignore ignorable properties */ if (!IS_IGNORE(ws->raw.prev[0])) { @@ -237,7 +235,7 @@ advance(struct wbrk_state *ws) ws->ri_parity = ws->ri_parity == 0 && ws->skip.prev[0] == WBRK_RI; do { - if (u8next(&ch, U8_ARGSP(ws->skip_v)) == 0) { + if (u8next(&ch, &ws->skip_v) == 0) { ws->skip.next[1] = WBRK_EOT; break; } diff --git a/lib/unicode/string/u8wnext_human.c b/lib/unicode/string/u8wnext_human.c index d85abf1..953d942 100644 --- a/lib/unicode/string/u8wnext_human.c +++ b/lib/unicode/string/u8wnext_human.c @@ -4,17 +4,16 @@ #include "unicode/string.h" size_t -u8wnext_human(struct u8view *dst, const char8_t **s, size_t *n) +u8wnext_human(struct u8view *dst, struct u8view *sv) { - ASSUME(n != nullptr); - ASSUME(s != nullptr); - ASSUME(*s != nullptr); + ASSUME(sv != nullptr); + ASSUME(sv->p != nullptr); struct u8view w; - while (u8wnext(&w, s, n)) { + while (u8wnext(&w, sv)) { rune ch; struct u8view cpy = w; - while (u8next(&ch, U8_ARGSP(cpy))) { + while (u8next(&ch, &cpy)) { if (uprop_get_gc(ch) & (GC_L | GC_N)) { if (dst != nullptr) *dst = w; diff --git a/man/u8len.3 b/man/u8len.3 index f4d152f..5b51cd0 100644 --- a/man/u8len.3 +++ b/man/u8len.3 @@ -1,4 +1,4 @@ -.Dd 27 April 2024 +.Dd 4 May 2024 .Dt U8LEN 3 .Os .Sh NAME @@ -9,38 +9,38 @@ .Sh SYNOPSIS .In mbstring.h .Ft size_t -.Fn u8len "const char8_t *s" "size_t n" +.Fn u8len "struct u8view sv" .Sh DESCRIPTION The .Fn u8len function returns the number of UTF-8 encoded Unicode codepoints in the -buffer -.Fa s -of length -.Fa n -bytes. +string view +.Fa sv . .Pp Invalid bytes are interpreted as having a length of 1 byte. .Sh RETURN VALUES The .Fn u8len -function returns the number of codepoints in the buffer -.Fa s . +function returns the number of codepoints in the string view +.Fa sv . .Sh EXAMPLES The following call to .Fn u8len will return 17 while the call to .Fn strlen will return 22 as a result of use of multibyte-characters in -.Fa s . +.Fa sv . .Bd -literal -offset indent -struct u8view sv = U8(u8\(dq„Der Große Duden“\(dq); -size_t blen = strlen((char *)sv.p); -size_t cplen = u8len(U8_ARGS(sv)); +size_t n; +struct u8view sv = U8(\(dq„Der Große Duden“\(dq); + +n = u8len(sv); /* 17 */ +n = strlen((char *)sv.p); /* 22 */ .Ed .Sh SEE ALSO -.Xr u8gcnt 3 , .Xr U8 3 , +.Xr u8gcnt 3 , +.Xr u8view 3 , .Xr unicode 7 , .Xr utf\-8 7 .Sh STANDARDS @@ -56,10 +56,11 @@ size_t cplen = u8len(U8_ARGS(sv)); The return value of .Fn u8len does not necessarily represent the number of human-preceived characters -in the given buffer; -multiple codepoints may combine to form one human-preceived character -that spans a single column. -To count user-preceived codepoints +in the given string view; +multiple codepoints may combine to form one human-preceived character. +These human-preceived characters may even take up multiple columns in a +monospaced-environment such as in a terminal emulator. +To count user-preceived characters .Pq also known as graphemes , you may want to use the .Xr u8gcnt 3 diff --git a/man/u8next.3 b/man/u8next.3 index 1ba39f0..68079f1 100644 --- a/man/u8next.3 +++ b/man/u8next.3 @@ -1,4 +1,4 @@ -.Dd 20 February 2024 +.Dd 4 May 2024 .Dt U8NEXT 3 .Os .Sh NAME @@ -10,30 +10,25 @@ .Sh SYNOPSIS .In mbstring.h .Ft int -.Fn u8next "rune *ch" "const char8_t **s" "size_t *n" +.Fn u8next "rune *ch" "struct u8view sv" .Ft int .Fn u8prev "rune *ch" "const char8_t **s" "const char8_t *start" .Sh DESCRIPTION The .Fn u8next -function decodes the first rune in the UTF-8 encoded string pointed to by -.Fa s -of length -.Fa n +function decodes the first rune in the UTF-8 encoded string view +.Fa sv and stores the result in .Fa ch . -It then updates -.Fa s -to point to the next codepoint in the buffer and updates the length -.Fa n -accordingly. +It then shrinks +.Fa sv +so that the decoded rune is removed. .Pp The .Fn u8prev function takes a pointer .Fa start -which points to the start of the string instead of a length, -and updates +which points to the start of the string and updates .Fa s to point to the previous codepoint in the buffer. The rune @@ -59,19 +54,16 @@ or 0 at the end of iteration. The following calls to .Fn u8next iterate over and print all the codepoints in -.Va s . +.Va sv . .Bd -literal -offset indent #include <rune.h> /* For PRIXRUNE; see rune(3) */ -#define STRING u8"Ta’ Ħaġrat" - int w; rune ch; -const char8_t *s = STRING; -size_t n = sizeof(STRING) - 1; +struct u8view sv = U8("Ta’ Ħaġrat"); -while (w = u8next(&ch, &s, &n)) - printf("U+%04" PRIXRUNE ": ‘%.*s’\en", ch, w, s - w); +while (w = u8next(&ch, &sv)) + printf("U+%04" PRIXRUNE ": ‘%.*s’\en", ch, w, sv.p - w); .Ed .Pp The following example is the same as the previous, @@ -81,23 +73,20 @@ function to iterate backwards. .Bd -literal -offset indent #include <rune.h> /* For PRIXRUNE; see rune(3) */ -#define STRING u8"Ta’ Ħaġrat" - int w; rune ch; -const char8_t *s, *start; -size_t n = sizeof(STRING) - 1; - -start = STRING; -s = start + n; +struct u8view sv = U8("Ta’ Ħaġrat"); +const char8_t *s = sv.p + sv.len; -while (w = u8prev(&ch, &s, start)) +while (w = u8prev(&ch, &s, sv.p)) printf("U+%04" PRIXRUNE ": ‘%.*s’\en", ch, w, s); .Ed .Sh SEE ALSO .Xr rune 3 , +.Xr U8 3 , .Xr u8gnext 3 , .Xr u8tor 3 , +.Xr u8view 3type , .Xr RUNE_ERROR 3const , .Xr unicode 7 , .Xr utf\-8 7 diff --git a/man/u8tor.3 b/man/u8tor.3 index 6e3511e..8886193 100644 --- a/man/u8tor.3 +++ b/man/u8tor.3 @@ -1,4 +1,4 @@ -.Dd 10 March 2024 +.Dd 4 May 2024 .Dt U8TOR 3 .Os .Sh NAME @@ -37,18 +37,20 @@ The following call to attempts to decode the first UTF-8 codepoint in .Va buf . .Bd -literal -offset indent -/* Implementation of read_codepoint() omitted */ +#include <errors.h> /* For err(); see errors(3) */ +#include <rune.h> /* For PRIXRUNE; see rune(3) */ rune ch; -char8_t *buf = read_codepoint(stdin); +char8_t *buf = u8"Γειά σου Κόσμε"; int w = u8tor(&ch, buf); if (ch == RUNE_ERROR) - errx("Got invalid UTF-8 codepoint"); -printf("Got rune ‘%.*s’\en", w, buf); + err("Got invalid UTF-8 codepoint"); +printf("Got rune ‘%.*s’ (U+%04" PRIXRUNE ")\en", w, buf, ch); .Ed .Sh SEE ALSO -.Xr errx 3mlib , +.Xr errors 3 , .Xr rtou8 3 , +.Xr rune 3 , .Xr u8chk 3 , .Xr u8next 3 , .Xr RUNE_ERROR 3const , diff --git a/man/usage.3 b/man/usage.3 index 92b9b43..ead0f29 100644 --- a/man/usage.3 +++ b/man/usage.3 @@ -1,4 +1,4 @@ -.Dd 27 April 2024 +.Dd 4 May 2024 .Dt USAGE 3 .Os .Sh NAME @@ -34,9 +34,9 @@ be provided to the example executable. #include <optparse.h> static const struct op_option opts[] = { - {'a', U8(nullptr), OPT_NONE}, - {'b', U8(nullptr), OPT_NONE}, - {'h', U8(nullptr), OPT_NONE}, + {'a', U8(), OPT_NONE}, + {'b', U8(), OPT_NONE}, + {'h', U8("help"), OPT_NONE}, }; int |