diff options
-rw-r--r-- | man/Lb-desc.tmac | 1 | ||||
-rw-r--r-- | man/NOTE | 1 | ||||
-rw-r--r-- | man/rtou8.3 | 69 | ||||
-rw-r--r-- | man/u8len.3 | 66 | ||||
-rw-r--r-- | man/u8next.3 | 112 | ||||
-rw-r--r-- | man/u8prev.3 | 1 | ||||
-rw-r--r-- | man/u8tor.3 | 65 |
7 files changed, 315 insertions, 0 deletions
diff --git a/man/Lb-desc.tmac b/man/Lb-desc.tmac new file mode 100644 index 0000000..6da2a7f --- /dev/null +++ b/man/Lb-desc.tmac @@ -0,0 +1 @@ +.ds doc-str-Lb-mlib C Supplementary Library (mlib, \-lmlib) diff --git a/man/NOTE b/man/NOTE new file mode 100644 index 0000000..dd57bad --- /dev/null +++ b/man/NOTE @@ -0,0 +1 @@ +sudo tee -a /usr/share/groff/site-tmac/mdoc.local <man/Lb-desc.tmac >/dev/null diff --git a/man/rtou8.3 b/man/rtou8.3 new file mode 100644 index 0000000..0803e84 --- /dev/null +++ b/man/rtou8.3 @@ -0,0 +1,69 @@ +.Dd March 10 2024 +.Dt RTOU8 3 +.Os +.Sh NAME +.Nm rtou8 +.Nd encode a rune to UTF-8 +.Sh LIBRARY +.Lb mlib +.Sh SYNOPSIS +.In mbstring.h +.Ft int +.Fn rtou8 "char8_t *s" "size_t n" "rune ch" +.Sh DESCRIPTION +The +.Fn rtou8 +function writes the rune +.Fa ch +to the UTF-8 encoded buffer +.Fa s +of length +.Fa n , +returning the number of bytes required to UTF-8 encode +.Fa ch . +If +.Fa s +is too small, +no data is written to it but the number of bytes required to UTF-8 encode +.Fa ch +is still returned. +.Pp +An invalid rune is treated as if it were +.Dv RUNE_ERROR . +.Sh RETURN VALUES +The +.Fn rtou8 +function returns the number of bytes required to write +.Fa ch +to the buffer +.Fa s . +.Sh EXAMPLES +The following call to +.Fn rtou8 +is used to print a rune to the standard output. +.Bd -literal -offset indent +#include <rune.h> /* For PRIXRUNE; see rune(3) */ + +rune ch = U\(aqĦ\(aq; + +char buf[U8_LEN_MAX]; +int w = rtou8(buf, sizeof(buf), ch); + +/* U+0126: ‘Ħ’ */ +printf(\(dqU+%04\(dq PRIXRUNE \(dq: ‘%.*s’\en\(dq, ch, w, buf); +.Ed +.Sh SEE ALSO +.Xr rune 3 , +.Xr u8tor 3 , +.Xr RUNE_ERROR 3const , +.Xr unicode 7 , +.Xr utf\-8 7 +.Sh STANDARDS +.Rs +.%A F. 
Yergeau +.%D November 2003 +.%R RFC 3629 +.%T UTF-8, a transformation format of ISO 10646 +.Re +.Sh AUTHORS +.An Thomas Voss Aq Mt mail@thomasvoss.com diff --git a/man/u8len.3 b/man/u8len.3 new file mode 100644 index 0000000..a2968e7 --- /dev/null +++ b/man/u8len.3 @@ -0,0 +1,66 @@ +.Dd March 10 2024 +.Dt U8LEN 3 +.Os +.Sh NAME +.Nm u8len +.Nd count Unicode codepoints +.Sh LIBRARY +.Lb mlib +.Sh SYNOPSIS +.In mbstring.h +.Ft size_t +.Fn u8len "const char8_t *s" "size_t n" +.Sh DESCRIPTION +The +.Fn u8len +function returns the number of UTF-8 encoded Unicode codepoints in the +buffer +.Fa s +of length +.Fa n +bytes. +.Pp +Invalid bytes are interpreted as having a length of 1 byte. +.Sh RETURN VALUES +The +.Fn u8len +function returns the number of codepoints in the buffer +.Fa s . +.Sh EXAMPLES +The following call to +.Fn u8len +will return 17 while the call to +.Fn strlen +will return 22 as a result of the use of multibyte characters in +.Fa s . +.Bd -literal -offset indent +struct u8view sv = U8V(u8\(dq„Der Große Duden“\(dq); +size_t blen = strlen((char *)sv.p); +size_t cplen = u8len(U8_ARGS(sv)); +.Ed +.Sh SEE ALSO +.Xr u8glen 3 , +.Xr U8V 3 , +.Xr unicode 7 , +.Xr utf\-8 7 +.Sh STANDARDS +.Rs +.%A F. Yergeau +.%D November 2003 +.%R RFC 3629 +.%T UTF-8, a transformation format of ISO 10646 +.Re +.Sh AUTHORS +.An Thomas Voss Aq Mt mail@thomasvoss.com +.Sh CAVEATS +The return value of +.Fn u8len +does not necessarily represent the number of human-perceived characters +in the given buffer; +multiple codepoints may combine to form one human-perceived character +that spans a single column. +To count user-perceived characters +.Pq also known as graphemes , +you may want to use the +.Xr u8glen 3 +function. 
diff --git a/man/u8next.3 b/man/u8next.3 new file mode 100644 index 0000000..388301f --- /dev/null +++ b/man/u8next.3 @@ -0,0 +1,112 @@ +.Dd February 20 2024 +.Dt U8NEXT 3 +.Os +.Sh NAME +.Nm u8next , +.Nm u8prev +.Nd iterate over Unicode codepoints +.Sh LIBRARY +.Lb mlib +.Sh SYNOPSIS +.In mbstring.h +.Ft int +.Fn u8next "rune *ch" "const char8_t **s" "size_t *n" +.Ft int +.Fn u8prev "rune *ch" "const char8_t **s" "const char8_t *start" +.Sh DESCRIPTION +The +.Fn u8next +function decodes the first rune in the UTF-8 encoded string pointed to by +.Fa s +of length +.Fa n +and stores the result in +.Fa ch . +It then updates +.Fa s +to point to the next codepoint in the buffer and updates the length +.Fa n +accordingly. +.Pp +The +.Fn u8prev +function takes a pointer +.Fa start +which points to the start of the string instead of a length, +and updates +.Fa s +to point to the previous codepoint in the buffer. +The rune +.Fa ch +is set to the UTF-8 codepoint pointed to by +.Fa s +after iteration. +.Pp +Both of these functions set +.Va *ch +to +.Dv RUNE_ERROR +in the case of invalid UTF-8. +.Sh RETURN VALUES +The +.Fn u8next +and +.Fn u8prev +functions return the length of the UTF-8-encoded rune iterated over in +bytes, +or 0 at the end of iteration. +.Sh EXAMPLES +The following calls to +.Fn u8next +iterate over and print all the codepoints in +.Va s . +.Bd -literal -offset indent +#include <rune.h> /* For PRIXRUNE; see rune(3) */ + +#define STRING u8"Ta’ Ħaġrat" + +int w; +rune ch; +const char8_t *s = STRING; +size_t n = sizeof(STRING) - 1; + +while (w = u8next(&ch, &s, &n)) + printf("U+%04" PRIXRUNE ": ‘%.*s’\en", ch, w, s - w); +.Ed +.Pp +The following example is the same as the previous, +but it uses the +.Fn u8prev +function to iterate backwards. 
+.Bd -literal -offset indent +#include <rune.h> /* For PRIXRUNE; see rune(3) */ + +#define STRING u8"Ta’ Ħaġrat" + +int w; +rune ch; +const char8_t *s, *start; +size_t n = sizeof(STRING) - 1; + +start = STRING; +s = start + n; + +while (w = u8prev(&ch, &s, start)) + printf("U+%04" PRIXRUNE ": ‘%.*s’\en", ch, w, s); +.Ed +.Sh SEE ALSO +.Xr rune 3 , +.Xr u8gnext 3 , +.Xr u8tor 3 , +.Xr RUNE_ERROR 3const , +.Xr unicode 7 , +.Xr utf\-8 7 +.Sh STANDARDS +.Rs +.%A F. Yergeau +.%D November 2003 +.%R RFC 3629 +.%T UTF-8, a transformation format of ISO 10646 +.Re +.Sh AUTHORS +.An Thomas Voss Aq Mt mail@thomasvoss.com diff --git a/man/u8prev.3 b/man/u8prev.3 new file mode 100644 index 0000000..cf1364e --- /dev/null +++ b/man/u8prev.3 @@ -0,0 +1 @@ +.so u8next diff --git a/man/u8tor.3 b/man/u8tor.3 new file mode 100644 index 0000000..ba08110 --- /dev/null +++ b/man/u8tor.3 @@ -0,0 +1,65 @@ +.Dd March 10 2024 +.Dt U8TOR 3 +.Os +.Sh NAME +.Nm u8tor +.Nd decode UTF-8 into a rune +.Sh LIBRARY +.Lb mlib +.Sh SYNOPSIS +.In mbstring.h +.Ft int +.Fn u8tor "rune *ch" "const char8_t *s" +.Sh DESCRIPTION +The +.Fn u8tor +function decodes the first rune in the UTF-8 buffer +.Fa s , +storing the result in the rune pointed to by +.Fa ch +and returns the number of bytes which compose the decoded +UTF-8. +.Pp +If attempting to decode an invalid byte, +.Va *ch +will be set to +.Dv RUNE_ERROR . +.Sh RETURN VALUES +The +.Fn u8tor +function returns the number of bytes from +.Fa s +decoded into +.Fa ch . +.Sh EXAMPLES +The following call to +.Fn u8tor +attempts to decode the first UTF-8 codepoint in +.Va buf . 
+.Bd -literal -offset indent +/* Implementation of read_codepoint() omitted */ + +rune ch; +char8_t *buf = read_codepoint(stdin); +int w = u8tor(&ch, buf); +if (ch == RUNE_ERROR) + errx("Got invalid UTF-8 codepoint"); +printf("Got rune ‘%.*s’\en", w, buf); +.Ed +.Sh SEE ALSO +.Xr errx 3mlib , +.Xr rtou8 3 , +.Xr u8chk 3 , +.Xr u8next 3 , +.Xr RUNE_ERROR 3const , +.Xr unicode 7 , +.Xr utf\-8 7 +.Sh STANDARDS +.Rs +.%A F. Yergeau +.%D November 2003 +.%R RFC 3629 +.%T UTF-8, a transformation format of ISO 10646 +.Re +.Sh AUTHORS +.An Thomas Voss Aq Mt mail@thomasvoss.com |