From ac1b4bcbaeaee7d2ef9132dcdc254f2d08691650 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Sat, 4 May 2024 04:01:45 +0200 Subject: Go all in on string views, and fix manuals --- man/u8len.3 | 37 +++++++++++++++++++------------------ man/u8next.3 | 45 +++++++++++++++++---------------------------- man/u8tor.3 | 14 ++++++++------ man/usage.3 | 8 ++++---- 4 files changed, 48 insertions(+), 56 deletions(-) (limited to 'man') diff --git a/man/u8len.3 b/man/u8len.3 index f4d152f..5b51cd0 100644 --- a/man/u8len.3 +++ b/man/u8len.3 @@ -1,4 +1,4 @@ -.Dd 27 April 2024 +.Dd 4 May 2024 .Dt U8LEN 3 .Os .Sh NAME @@ -9,38 +9,38 @@ .Sh SYNOPSIS .In mbstring.h .Ft size_t -.Fn u8len "const char8_t *s" "size_t n" +.Fn u8len "struct u8view sv" .Sh DESCRIPTION The .Fn u8len function returns the number of UTF-8 encoded Unicode codepoints in the -buffer -.Fa s -of length -.Fa n -bytes. +string view +.Fa sv . .Pp Invalid bytes are interpreted as having a length of 1 byte. .Sh RETURN VALUES The .Fn u8len -function returns the number of codepoints in the buffer -.Fa s . +function returns the number of codepoints in the string view +.Fa sv . .Sh EXAMPLES The following call to .Fn u8len will return 17 while the call to .Fn strlen will return 22 as a result of use of multibyte-characters in -.Fa s . +.Fa sv . .Bd -literal -offset indent -struct u8view sv = U8(u8\(dq„Der Große Duden“\(dq); -size_t blen = strlen((char *)sv.p); -size_t cplen = u8len(U8_ARGS(sv)); +size_t n; +struct u8view sv = U8(\(dq„Der Große Duden“\(dq); + +n = u8len(sv); /* 17 */ +n = strlen((char *)sv.p); /* 22 */ .Ed .Sh SEE ALSO -.Xr u8gcnt 3 , .Xr U8 3 , +.Xr u8gcnt 3 , +.Xr u8view 3 , .Xr unicode 7 , .Xr utf\-8 7 .Sh STANDARDS @@ -56,10 +56,11 @@ size_t cplen = u8len(U8_ARGS(sv)); The return value of .Fn u8len does not necessarily represent the number of human-preceived characters -in the given buffer; -multiple codepoints may combine to form one human-preceived character -that spans a single column. 
-To count user-preceived codepoints +in the given string view; +multiple codepoints may combine to form one human-perceived character. +These human-perceived characters may even take up multiple columns in a +monospaced environment such as in a terminal emulator. +To count user-perceived characters .Pq also known as graphemes , you may want to use the .Xr u8gcnt 3 diff --git a/man/u8next.3 b/man/u8next.3 index 1ba39f0..68079f1 100644 --- a/man/u8next.3 +++ b/man/u8next.3 @@ -1,4 +1,4 @@ -.Dd 20 February 2024 +.Dd 4 May 2024 .Dt U8NEXT 3 .Os .Sh NAME @@ -10,30 +10,25 @@ .Sh SYNOPSIS .In mbstring.h .Ft int -.Fn u8next "rune *ch" "const char8_t **s" "size_t *n" +.Fn u8next "rune *ch" "struct u8view *sv" .Ft int .Fn u8prev "rune *ch" "const char8_t **s" "const char8_t *start" .Sh DESCRIPTION The .Fn u8next -function decodes the first rune in the UTF-8 encoded string pointed to by -.Fa s -of length -.Fa n +function decodes the first rune in the UTF-8 encoded string view +.Fa sv and stores the result in .Fa ch . -It then updates -.Fa s -to point to the next codepoint in the buffer and updates the length -.Fa n -accordingly. +It then shrinks +.Fa sv +so that the decoded rune is removed. .Pp The .Fn u8prev function takes a pointer .Fa start -which points to the start of the string instead of a length, -and updates +which points to the start of the string and updates .Fa s to point to the previous codepoint in the buffer. The rune @@ -59,19 +54,16 @@ or 0 at the end of iteration. The following calls to .Fn u8next iterate over and print all the codepoints in -.Va s . +.Va sv . 
.Bd -literal -offset indent #include <rune.h> /* For PRIXRUNE; see rune(3) */ -#define STRING u8"Ta’ Ħaġrat" - int w; rune ch; -const char8_t *s = STRING; -size_t n = sizeof(STRING) - 1; +struct u8view sv = U8("Ta’ Ħaġrat"); -while (w = u8next(&ch, &s, &n)) - printf("U+%04" PRIXRUNE ": ‘%.*s’\en", ch, w, s - w); +while (w = u8next(&ch, &sv)) + printf("U+%04" PRIXRUNE ": ‘%.*s’\en", ch, w, sv.p - w); .Ed .Pp The following example is the same as the previous, @@ -81,23 +73,20 @@ function to iterate backwards. .Bd -literal -offset indent #include <rune.h> /* For PRIXRUNE; see rune(3) */ -#define STRING u8"Ta’ Ħaġrat" - int w; rune ch; -const char8_t *s, *start; -size_t n = sizeof(STRING) - 1; - -start = STRING; -s = start + n; +struct u8view sv = U8("Ta’ Ħaġrat"); +const char8_t *s = sv.p + sv.len; -while (w = u8prev(&ch, &s, start)) +while (w = u8prev(&ch, &s, sv.p)) printf("U+%04" PRIXRUNE ": ‘%.*s’\en", ch, w, s); .Ed .Sh SEE ALSO .Xr rune 3 , +.Xr U8 3 , .Xr u8gnext 3 , .Xr u8tor 3 , +.Xr u8view 3type , .Xr RUNE_ERROR 3const , .Xr unicode 7 , .Xr utf\-8 7 diff --git a/man/u8tor.3 b/man/u8tor.3 index 6e3511e..8886193 100644 --- a/man/u8tor.3 +++ b/man/u8tor.3 @@ -1,4 +1,4 @@ -.Dd 10 March 2024 +.Dd 4 May 2024 .Dt U8TOR 3 .Os .Sh NAME @@ -37,18 +37,20 @@ The following call to attempts to decode the first UTF-8 codepoint in .Va buf . 
.Bd -literal -offset indent -/* Implementation of read_codepoint() omitted */ +#include <errors.h> /* For err(); see errors(3) */ +#include <rune.h> /* For PRIXRUNE; see rune(3) */ rune ch; -char8_t *buf = read_codepoint(stdin); +char8_t *buf = u8"Γειά σου Κόσμε"; int w = u8tor(&ch, buf); if (ch == RUNE_ERROR) - errx("Got invalid UTF-8 codepoint"); -printf("Got rune ‘%.*s’\en", w, buf); + err("Got invalid UTF-8 codepoint"); +printf("Got rune ‘%.*s’ (U+%04" PRIXRUNE ")\en", w, buf, ch); .Ed .Sh SEE ALSO -.Xr errx 3mlib , +.Xr errors 3 , .Xr rtou8 3 , +.Xr rune 3 , .Xr u8chk 3 , .Xr u8next 3 , .Xr RUNE_ERROR 3const , diff --git a/man/usage.3 b/man/usage.3 index 92b9b43..ead0f29 100644 --- a/man/usage.3 +++ b/man/usage.3 @@ -1,4 +1,4 @@ -.Dd 27 April 2024 +.Dd 4 May 2024 .Dt USAGE 3 .Os .Sh NAME @@ -34,9 +34,9 @@ be provided to the example executable. #include static const struct op_option opts[] = { - {'a', U8(nullptr), OPT_NONE}, - {'b', U8(nullptr), OPT_NONE}, - {'h', U8(nullptr), OPT_NONE}, + {'a', U8(), OPT_NONE}, + {'b', U8(), OPT_NONE}, + {'h', U8("help"), OPT_NONE}, }; int -- cgit v1.2.3