From 1aeb7e2b426e7a94cdd4f83c4337f44c0f5a2ca8 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Tue, 14 May 2024 23:59:05 +0200 Subject: Add encoding-generic macros --- include/_u8view.h | 13 ------------- include/_uNview.h | 23 +++++++++++++++++++++++ include/cli.h | 2 +- include/macros.h | 2 ++ include/mbstring.h | 32 +++++++++++++++++++++++++++++--- include/unicode/prop.h | 2 +- include/unicode/string.h | 31 +++++++++++++++++++++++++++---- lib/cli/optparse.c | 4 ++-- test/_brk-test.h | 10 +++++----- test/_case-test.h | 22 +++++++++++----------- test/norm-nfd-test.c | 10 +++++----- test/wbrk-human-test.c | 10 +++++----- 12 files changed, 111 insertions(+), 50 deletions(-) delete mode 100644 include/_u8view.h create mode 100644 include/_uNview.h diff --git a/include/_u8view.h b/include/_u8view.h deleted file mode 100644 index 5d6a9b8..0000000 --- a/include/_u8view.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef MLIB__U8VIEW_H -#define MLIB__U8VIEW_H - -#include - -#include "_charN_t.h" - -struct u8view { - const char8_t *p; - size_t len; -}; - -#endif /* !MLIB__U8VIEW_H */ diff --git a/include/_uNview.h b/include/_uNview.h new file mode 100644 index 0000000..9d0d5e2 --- /dev/null +++ b/include/_uNview.h @@ -0,0 +1,23 @@ +#ifndef MLIB__U8VIEW_H +#define MLIB__U8VIEW_H + +#include + +#include "_charN_t.h" + +struct u8view { + const char8_t *p; + size_t len; +}; + +struct u16view { + const char16_t *p; + size_t len; +}; + +struct u32view { + const char32_t *p; + size_t len; +}; + +#endif /* !MLIB__U8VIEW_H */ diff --git a/include/cli.h b/include/cli.h index 7b5b611..b2545ae 100644 --- a/include/cli.h +++ b/include/cli.h @@ -5,7 +5,7 @@ #include "_attrs.h" #include "_rune.h" -#include "_u8view.h" +#include "_uNview.h" struct optparser { bool _b; diff --git a/include/macros.h b/include/macros.h index 32e8b7c..5c38215 100644 --- a/include/macros.h +++ b/include/macros.h @@ -11,6 +11,8 @@ #define streq(x, y) (!strcmp((x), (y))) #define u8eq(x, y) (!u8cmp((x), (y))) +#define ucseq(lhs, rhs) (!_Generic((lhs), struct u8view: u8cmp)((lhs), (rhs))) + #define _MLIB_STR(s) #s #define _MLIB_CONCAT(x, y) x##y diff --git a/include/mbstring.h b/include/mbstring.h index d725e0d..947195f 100644 --- a/include/mbstring.h +++ b/include/mbstring.h @@ -5,10 +5,14 @@ #include "_charN_t.h" #include "_rune.h" -#include "_u8view.h" +#include "_uNview.h" -#define U8(...) \ +#define U8(...) \ ((struct u8view){__VA_OPT__(u8##__VA_ARGS__, sizeof(u8##__VA_ARGS__) - 1)}) +#define U16(...) \ + ((struct u16view){__VA_OPT__(u##__VA_ARGS__, sizeof(u##__VA_ARGS__) - 1)}) +#define U32(...) \ + ((struct u32view){__VA_OPT__(U##__VA_ARGS__, sizeof(U##__VA_ARGS__) - 1)}) #define VSHFT(sv, n) ((sv)->p += (n), (sv)->len -= (n)) @@ -28,7 +32,7 @@ constexpr rune U8_4B_MAX = 0x10FFFF; constexpr rune MBEND = 0x110000; -#define PRIsU8 ".*s" +#define PRIsSV ".*s" #define SV_PRI_ARGS(sv) ((int)(sv).len), ((sv).p) int rtou8(char8_t *, size_t, rune); @@ -47,4 +51,26 @@ int u8tor(rune *, const char8_t *); rune u8cut(struct u8view *restrict, struct u8view *restrict, const rune *, size_t); +/* Encoding-generic macros */ +#define rtoucs(buf, bufsz, ch) \ + _Generic((buf), char8_t *: rtou8)((buf), (bufsz), (ch)) +#define ucsnext(ch, sv) _Generic((sv), struct u8view: u8next)((ch), (sv)) +#define ucsprev(ch, sv, start) \ + _Generic((sv), const char8_t **: u8prev)((ch), (sv), (start)) +#define ucstor(ch, p) \ + _Generic((p), char8_t *: u8tor, const char8_t *: u8tor)((ch), (p)) +#define ucshaspfx(sv, pfx) _Generic((sv), struct u8view: u8haspfx)((sv), (pfx)) +#define ucshassfx(sv, sfx) _Generic((sv), struct u8view: u8hassfx)((sv), (sfx)) +#define ucschk(sv) _Generic((sv), struct u8view: u8chk)((sv)) +#define ucschr(sv, ch) _Generic((sv), struct u8view: u8chr)((sv), (ch)) +#define ucsrchr(sv, ch) _Generic((sv), struct u8view: u8rchr)((sv), (ch)) +#define ucscmp(lhs, rhs) _Generic((lhs), struct u8view: u8cmp)((lhs), (rhs)) +#define ucscspn(sv, delims, ndelims) \ + _Generic((sv), struct u8view: u8cspn)((sv), (delims), (ndelims)) +#define ucslen(sv) _Generic((sv), struct u8view: u8len)((sv)) +#define ucsspn(sv, delims, ndelims) \ + _Generic((sv), struct u8view: u8spn)((sv), (delims), (ndelims)) +#define ucscut(x, y, seps, nseps) \ + _Generic((y), struct u8view *: u8cut)(x, y, seps, nseps) + #endif /* !MLIB_MBSTRING_H */ diff --git a/include/unicode/prop.h b/include/unicode/prop.h index 0fbd479..422fa58 100644 --- a/include/unicode/prop.h +++ b/include/unicode/prop.h @@ -6,7 +6,7 @@ #include "_attrs.h" #include "_rune.h" -#include "_u8view.h" +#include "_uNview.h" struct rview { const rune *p; diff --git a/include/unicode/string.h b/include/unicode/string.h index 0c7ef79..a5b1cdb 100644 --- a/include/unicode/string.h +++ b/include/unicode/string.h @@ -5,7 +5,7 @@ #include "_alloc_fn.h" #include "_charN_t.h" -#include "_u8view.h" +#include "_uNview.h" /* clang-format off */ @@ -23,11 +23,9 @@ enum [[clang::flag_enum]] caseflags { [[nodiscard]] size_t u8gcnt(struct u8view); [[nodiscard]] size_t u8wcnt(struct u8view); [[nodiscard]] size_t u8wcnt_human(struct u8view); - size_t u8gnext(struct u8view *, struct u8view *); size_t u8wnext(struct u8view *, struct u8view *); size_t u8wnext_human(struct u8view *, struct u8view *); - [[nodiscard]] char8_t *u8casefold(size_t *, struct u8view, enum caseflags, alloc_fn, void *); [[nodiscard]] char8_t *u8lower(size_t *, struct u8view, enum caseflags, @@ -36,9 +34,34 @@ size_t u8wnext_human(struct u8view *, struct u8view *); alloc_fn, void *); [[nodiscard]] char8_t *u8upper(size_t *, struct u8view, enum caseflags, alloc_fn, void *); - +[[nodiscard]] char8_t *u8norm_nfc(size_t *, struct u8view, alloc_fn, void *); [[nodiscard]] char8_t *u8norm_nfd(size_t *, struct u8view, alloc_fn, void *); +/* Encoding-generic macros */ +#define ucsgcnt(sv) _Generic((sv), struct u8view: u8gcnt)((sv)) +#define ucswcnt(sv) _Generic((sv), struct u8view: u8wcnt)((sv)) +#define ucswcnt_human(sv) _Generic((sv), struct u8view: u8wcnt_human)((sv)) +#define ucsgnext(g, sv) _Generic((sv), struct u8view *: u8gnext)((g), (sv)) +#define ucswnext(g, sv) _Generic((sv), struct u8view *: u8wnext)((g), (sv)) +#define ucswnext_human(g, sv) \ + _Generic((sv), struct u8view *: u8wnext_human)((g), (sv)) +#define ucscasefold(dstn, sv, flags, alloc, ctx) \ + _Generic((sv), struct u8view: u8casefold)((dstn), (sv), (flags), (alloc), \ + (ctx)) +#define ucslower(dstn, sv, flags, alloc, ctx) \ + _Generic((sv), struct u8view: u8lower)((dstn), (sv), (flags), (alloc), \ + (ctx)) +#define ucstitle(dstn, sv, flags, alloc, ctx) \ + _Generic((sv), struct u8view: u8title)((dstn), (sv), (flags), (alloc), \ + (ctx)) +#define ucsupper(dstn, sv, flags, alloc, ctx) \ + _Generic((sv), struct u8view: u8upper)((dstn), (sv), (flags), (alloc), \ + (ctx)) +#define ucsnorm_nfc(dstn, sv, alloc, ctx) \ + _Generic((sv), struct u8view: u8norm_nfc)((dstn), (sv), (alloc), (ctx)) +#define ucsnorm_nfd(dstn, sv, alloc, ctx) \ + _Generic((sv), struct u8view: u8norm_nfd)((dstn), (sv), (alloc), (ctx)) + constexpr double U8CASEFOLD_SCALE = 3; constexpr double U8LOWER_SCALE = 1.5; constexpr double U8LOWER_SCALE_LT = 3; diff --git a/lib/cli/optparse.c b/lib/cli/optparse.c index ce688cf..7134b37 100644 --- a/lib/cli/optparse.c +++ b/lib/cli/optparse.c @@ -104,8 +104,8 @@ rune shortopt(struct optparser *st, const struct cli_option *opts, size_t nopts) { rune ch; - const char *opt = st->_argv[st->optind]; - st->_subopt += u8tor(&ch, opt + st->_subopt + 1); + const char8_t *opt = st->_argv[st->optind]; + st->_subopt += ucstor(&ch, opt + st->_subopt + 1); if (ch == '\0') { st->_subopt = 0; st->optind++; diff --git a/test/_brk-test.h b/test/_brk-test.h index 396138b..21a6a2b 100644 --- a/test/_brk-test.h +++ b/test/_brk-test.h @@ -15,8 +15,8 @@ #include #define TESTFILE STR(BRKTYPE) "brk.in" -#define ITERFUNC CONCAT(CONCAT(u8, BRKTYPE), next) -#define CNTFUNC CONCAT(CONCAT(u8, BRKTYPE), cnt) +#define ITERFUNC CONCAT(CONCAT(ucs, BRKTYPE), next) +#define CNTFUNC CONCAT(CONCAT(ucs, BRKTYPE), cnt) static bool test(struct u8view, int); @@ -68,12 +68,12 @@ test(struct u8view sv, int id) rune op; struct u8view sv_cpy = sv; - while ((op = u8cut(nullptr, &sv_cpy, U"×÷", 2)) != MBEND) { + while ((op = ucscut(nullptr, &sv_cpy, U"×÷", 2)) != MBEND) { rune ch; sscanf(sv_cpy.p, "%" SCNxRUNE, &ch); char8_t buf[U8_LEN_MAX]; - int w = rtou8(buf, sizeof(buf), ch); + int w = rtoucs(buf, sizeof(buf), ch); total += w; if (op == U'÷') @@ -103,7 +103,7 @@ test(struct u8view sv, int id) struct u8view it1, buf_cpy = buf; for (size_t i = 0; ITERFUNC(&it1, &buf_cpy); i++) { item it2 = items.buf[i]; - if (!u8eq(it1, ((struct u8view){it2.buf, it2.len}))) { + if (!ucseq(it1, ((struct u8view){it2.buf, it2.len}))) { warn("case %d: expected %s ‘%.*s’ but got ‘%.*s’", id, STR(BRKTYPE_LONG), (int)it2.len, it2.buf, SV_PRI_ARGS(it1)); rv = false; diff --git a/test/_case-test.h b/test/_case-test.h index 701d884..24a18f1 100644 --- a/test/_case-test.h +++ b/test/_case-test.h @@ -14,7 +14,7 @@ #include #define TESTFILE STR(CASETYPE) ".in" -#define FUNC CONCAT(u8, CASETYPE) +#define FUNC CONCAT(ucs, CASETYPE) static bool test(const char8_t *, int); @@ -54,22 +54,22 @@ test(const char8_t *line, int id) { struct u8view mapped, sv = {line, strlen(line)}; struct u8view before, after, flags; - u8cut(&before, &sv, U";", 1); - u8cut(&after, &sv, U";", 1); - u8cut(&flags, &sv, U";", 1); + ucscut(&before, &sv, U";", 1); + ucscut(&after, &sv, U";", 1); + ucscut(&flags, &sv, U";", 1); - enum caseflags cf = u8eq(flags, U8("ẞ")) ? CF_ẞ - : u8eq(flags, U8("AZ")) ? CF_LANG_AZ - : u8eq(flags, U8("LT")) ? CF_LANG_LT - : u8eq(flags, U8("NL")) ? CF_LANG_NL + enum caseflags cf = ucseq(flags, U8("ẞ")) ? CF_ẞ + : ucseq(flags, U8("AZ")) ? CF_LANG_AZ + : ucseq(flags, U8("LT")) ? CF_LANG_LT + : ucseq(flags, U8("NL")) ? CF_LANG_NL : 0; arena a = mkarena(0); - mapped.p = FUNC(&mapped.len, before, cf, alloc_arena, &(struct arena_ctx){ + mapped.p = FUNC(&mapped.len, before, cf, alloc_arena, &((struct arena_ctx){ .a = &a, - }); + })); - if (!u8eq(mapped, after)) { + if (!ucseq(mapped, after)) { warn("case %d: expected ‘%.*s’ but got ‘%.*s’", id, SV_PRI_ARGS(after), SV_PRI_ARGS(mapped)); arena_free(&a); diff --git a/test/norm-nfd-test.c b/test/norm-nfd-test.c index 02fde47..95bc8d5 100644 --- a/test/norm-nfd-test.c +++ b/test/norm-nfd-test.c @@ -60,7 +60,7 @@ test(struct u8view sv, int id) }; struct u8view column; - while (u8cut(&column, &sv, U";", 1) != MBEND) { + while (ucscut(&column, &sv, U";", 1) != MBEND) { dynarr(char8_t) s = { .alloc = alloc_arena, .ctx = &ctx, @@ -70,10 +70,10 @@ test(struct u8view sv, int id) struct u8view cp; do { rune ch; - _ = u8cut(&cp, &column, U" ", 1); + _ = ucscut(&cp, &column, U" ", 1); sscanf(cp.p, "%" SCNxRUNE, &ch); char8_t buf[U8_LEN_MAX]; - int w = rtou8(buf, sizeof(buf), ch); + int w = rtoucs(buf, sizeof(buf), ch); DAEXTEND(&s, buf, w); } while (_ != MBEND); @@ -83,8 +83,8 @@ test(struct u8view sv, int id) for (size_t i = 0; i < 5; i++) { size_t base = i < 3 ? 2 : 4; struct u8view normd = {}; - normd.p = u8norm_nfd(&normd.len, columns.buf[i], alloc_arena, &ctx); - if (!u8eq(columns.buf[base], normd)) { + normd.p = ucsnorm_nfd(&normd.len, columns.buf[i], alloc_arena, &ctx); + if (!ucseq(columns.buf[base], normd)) { warn("case %d: expected c%zu to be ‘%.*s’ but got ‘%.*s’", id, i + 1, SV_PRI_ARGS(columns.buf[base]), SV_PRI_ARGS(normd)); rv = false; diff --git a/test/wbrk-human-test.c b/test/wbrk-human-test.c index 6f8bf7a..24a3513 100644 --- a/test/wbrk-human-test.c +++ b/test/wbrk-human-test.c @@ -47,26 +47,26 @@ bool test(struct u8view sv, int id) { struct u8view src; - u8cut(&src, &sv, U";", 1); + ucscut(&src, &sv, U";", 1); struct u8view w; dynarr(struct u8view) ws = {.alloc = alloc_heap}; - while (u8cut(&w, &sv, U"|", 1) != MBEND) + while (ucscut(&w, &sv, U"|", 1) != MBEND) DAPUSH(&ws, w); if (w.len > 0) DAPUSH(&ws, w); /* Assert the word count is correct */ size_t n; - if ((n = u8wcnt_human(src)) != ws.len) { + if ((n = ucswcnt_human(src)) != ws.len) { warn("case %d: expected %zu words but got %zu", id, ws.len, n); return false; } /* Assert the individual words are correct */ - for (size_t i = 0; u8wnext_human(&w, &src) != 0; i++) { - if (!u8eq(w, ws.buf[i])) { + for (size_t i = 0; ucswnext_human(&w, &src) != 0; i++) { + if (!ucseq(w, ws.buf[i])) { warn("case %d: expected word %zu to be ‘%.*s’ but got ‘%.*s’", id, i, SV_PRI_ARGS(ws.buf[i]), SV_PRI_ARGS(w)); return false; -- cgit v1.2.3