7 files changed, 315 insertions, 0 deletions
diff --git a/man/Lb-desc.tmac b/man/Lb-desc.tmac
new file mode 100644
index 0000000..6da2a7f
--- /dev/null
+++ b/man/Lb-desc.tmac
@@ -0,0 +1 @@
+.ds doc-str-Lb-mlib C Supplementary Library (mlib, \-lmlib)
diff --git a/man/NOTE b/man/NOTE
new file mode 100644
index 0000000..dd57bad
--- /dev/null
+++ b/man/NOTE
@@ -0,0 +1 @@
+sudo tee -a /usr/share/groff/site-tmac/mdoc.local <man/Lb-desc.tmac >/dev/null
diff --git a/man/rtou8.3 b/man/rtou8.3
new file mode 100644
index 0000000..0803e84
--- /dev/null
+++ b/man/rtou8.3
@@ -0,0 +1,69 @@
+.Dd March 10, 2024
+.Dt RTOU8 3
+.Os
+.Sh NAME
+.Nm rtou8
+.Nd encode a rune to UTF-8
+.Sh LIBRARY
+.Lb mlib
+.Sh SYNOPSIS
+.In mbstring.h
+.Ft int
+.Fn rtou8 "char8_t *s" "size_t n" "rune ch"
+.Sh DESCRIPTION
+The
+.Fn rtou8
+function writes the rune
+.Fa ch
+to the UTF-8 encoded buffer
+.Fa s
+of length
+.Fa n ,
+returning the number of bytes required to UTF-8 encode
+.Fa ch .
+If
+.Fa s
+is too small,
+no data is written to it but the number of bytes required to UTF-8 encode
+.Fa ch
+is still returned.
+.Pp
+An invalid rune is treated as if it were
+.Dv RUNE_ERROR .
+.Sh RETURN VALUES
+The
+.Fn rtou8
+function returns the number of bytes required to write
+.Fa ch
+to the buffer
+.Fa s .
+.Sh EXAMPLES
+The following call to
+.Fn rtou8
+is used to print a rune to the standard output.
+.Bd -literal -offset indent
+#include <rune.h> /* For PRIXRUNE; see rune(3) */
+
+rune ch = U\(aqĦ\(aq;
+
+char buf[U8_LEN_MAX];
+int w = rtou8(buf, sizeof(buf), ch);
+
+/* U+0126: ‘Ħ’ */
+printf(\(dqU+%04\(dq PRIXRUNE \(dq: ‘%.*s’\en\(dq, ch, w, buf);
+.Ed
+.Sh SEE ALSO
+.Xr rune 3 ,
+.Xr u8tor 3 ,
+.Xr RUNE_ERROR 3const ,
+.Xr unicode 7 ,
+.Xr utf\-8 7
+.Sh STANDARDS
+.Rs
+.%A F. Yergeau
+.%D November 2003
+.%R RFC 3629
+.%T UTF-8, a transformation format of ISO 10646
+.Re
+.Sh AUTHORS
+.An Thomas Voss Aq Mt mail@thomasvoss.com
diff --git a/man/u8len.3 b/man/u8len.3
new file mode 100644
index 0000000..a2968e7
--- /dev/null
+++ b/man/u8len.3
@@ -0,0 +1,66 @@
+.Dd March 10, 2024
+.Dt U8LEN 3
+.Os
+.Sh NAME
+.Nm u8len
+.Nd count Unicode codepoints
+.Sh LIBRARY
+.Lb mlib
+.Sh SYNOPSIS
+.In mbstring.h
+.Ft size_t
+.Fn u8len "const char8_t *s" "size_t n"
+.Sh DESCRIPTION
+The
+.Fn u8len
+function returns the number of UTF-8 encoded Unicode codepoints in the
+buffer
+.Fa s
+of length
+.Fa n
+bytes.
+.Pp
+Invalid bytes are interpreted as having a length of 1 byte.
+.Sh RETURN VALUES
+The
+.Fn u8len
+function returns the number of codepoints in the buffer
+.Fa s .
+.Sh EXAMPLES
+The following call to
+.Fn u8len
+will return 17 while the call to
+.Fn strlen
+will return 22 as a result of use of multibyte characters in
+.Fa s .
+.Bd -literal -offset indent
+struct u8view sv = U8V(u8\(dq„Der Große Duden“\(dq);
+size_t blen = strlen((char *)sv.p);
+size_t cplen = u8len(U8_ARGS(sv));
+.Ed
+.Sh SEE ALSO
+.Xr u8glen 3 ,
+.Xr U8V 3 ,
+.Xr unicode 7 ,
+.Xr utf\-8 7
+.Sh STANDARDS
+.Rs
+.%A F. Yergeau
+.%D November 2003
+.%R RFC 3629
+.%T UTF-8, a transformation format of ISO 10646
+.Re
+.Sh AUTHORS
+.An Thomas Voss Aq Mt mail@thomasvoss.com
+.Sh CAVEATS
+The return value of
+.Fn u8len
+does not necessarily represent the number of human-perceived characters
+in the given buffer;
+multiple codepoints may combine to form one human-perceived character
+that spans a single column.
+To count user-perceived characters
+.Pq also known as graphemes ,
+you may want to use the
+.Xr u8glen 3
+function.
diff --git a/man/u8next.3 b/man/u8next.3
new file mode 100644
index 0000000..388301f
--- /dev/null
+++ b/man/u8next.3
@@ -0,0 +1,112 @@
+.Dd February 20, 2024
+.Dt U8NEXT 3
+.Os
+.Sh NAME
+.Nm u8next ,
+.Nm u8prev
+.Nd iterate over Unicode codepoints
+.Sh LIBRARY
+.Lb mlib
+.Sh SYNOPSIS
+.In mbstring.h
+.Ft int
+.Fn u8next "rune *ch" "const char8_t **s" "size_t *n"
+.Ft int
+.Fn u8prev "rune *ch" "const char8_t **s" "const char8_t *start"
+.Sh DESCRIPTION
+The
+.Fn u8next
+function decodes the first rune in the UTF-8 encoded string pointed to by
+.Fa s
+of length
+.Fa n
+and stores the result in
+.Fa ch .
+It then updates
+.Fa s
+to point to the next codepoint in the buffer and updates the length
+.Fa n
+accordingly.
+.Pp
+The
+.Fn u8prev
+function takes a pointer
+.Fa start
+which points to the start of the string instead of a length,
+and updates
+.Fa s
+to point to the previous codepoint in the buffer.
+The rune
+.Fa ch
+is set to the UTF-8 codepoint pointed to by
+.Fa s
+after iteration.
+.Pp
+Both of these functions set
+.Va *ch
+to
+.Dv RUNE_ERROR
+in the case of invalid UTF-8.
+.Sh RETURN VALUES
+The
+.Fn u8next
+and
+.Fn u8prev
+functions return the length of the UTF-8-encoded rune iterated over in
+bytes,
+or 0 at the end of iteration.
+.Sh EXAMPLES
+The following calls to
+.Fn u8next
+iterate over and print all the codepoints in
+.Va s .
+.Bd -literal -offset indent
+#include <rune.h> /* For PRIXRUNE; see rune(3) */
+
+#define STRING u8"Ta’ Ħaġrat"
+
+int w;
+rune ch;
+const char8_t *s = STRING;
+size_t n = sizeof(STRING) - 1;
+
+while (w = u8next(&ch, &s, &n))
+	printf("U+%04" PRIXRUNE ": ‘%.*s’\en", ch, w, s - w);
+.Ed
+.Pp
+The following example is the same as the previous,
+but it uses the
+.Fn u8prev
+function to iterate backwards.
+.Bd -literal -offset indent
+#include <rune.h> /* For PRIXRUNE; see rune(3) */
+
+#define STRING u8"Ta’ Ħaġrat"
+
+int w;
+rune ch;
+const char8_t *s, *start;
+size_t n = sizeof(STRING) - 1;
+
+start = STRING;
+s = start + n;
+
+while (w = u8prev(&ch, &s, start))
+	printf("U+%04" PRIXRUNE ": ‘%.*s’\en", ch, w, s);
+.Ed
+.Sh SEE ALSO
+.Xr rune 3 ,
+.Xr u8gnext 3 ,
+.Xr u8tor 3 ,
+.Xr RUNE_ERROR 3const ,
+.Xr unicode 7 ,
+.Xr utf\-8 7
+.Sh STANDARDS
+.Rs
+.%A F. Yergeau
+.%D November 2003
+.%R RFC 3629
+.%T UTF-8, a transformation format of ISO 10646
+.Re
+.Sh AUTHORS
+.An Thomas Voss Aq Mt mail@thomasvoss.com
diff --git a/man/u8prev.3 b/man/u8prev.3
new file mode 100644
index 0000000..cf1364e
--- /dev/null
+++ b/man/u8prev.3
@@ -0,0 +1 @@
+.so man3/u8next.3
diff --git a/man/u8tor.3 b/man/u8tor.3
new file mode 100644
index 0000000..ba08110
--- /dev/null
+++ b/man/u8tor.3
@@ -0,0 +1,65 @@
+.Dd March 10, 2024
+.Dt U8TOR 3
+.Os
+.Sh NAME
+.Nm u8tor
+.Nd decode UTF-8 into a rune
+.Sh LIBRARY
+.Lb mlib
+.Sh SYNOPSIS
+.In mbstring.h
+.Ft int
+.Fn u8tor "rune *ch" "const char8_t *s"
+.Sh DESCRIPTION
+The
+.Fn u8tor
+function decodes the first rune in the UTF-8 buffer
+.Fa s ,
+storing the result in the rune pointed to by
+.Fa ch
+and returns the number of bytes which compose the decoded
+UTF-8.
+.Pp
+If attempting to decode an invalid byte,
+.Va *ch
+will be set to
+.Dv RUNE_ERROR .
+.Sh RETURN VALUES
+The
+.Fn u8tor
+function returns the number of bytes from
+.Fa s
+decoded into
+.Fa ch .
+.Sh EXAMPLES
+The following call to
+.Fn u8tor
+attempts to decode the first UTF-8 codepoint in
+.Va buf .
+.Bd -literal -offset indent
+/* Implementation of read_codepoint() omitted */
+
+rune ch;
+char8_t *buf = read_codepoint(stdin);
+int w = u8tor(&ch, buf);
+if (ch == RUNE_ERROR)
+	errx("Got invalid UTF-8 codepoint");
+printf("Got rune ‘%.*s’\en", w, buf);
+.Ed
+.Sh SEE ALSO
+.Xr errx 3mlib ,
+.Xr rtou8 3 ,
+.Xr u8chk 3 ,
+.Xr u8next 3 ,
+.Xr RUNE_ERROR 3const ,
+.Xr unicode 7 ,
+.Xr utf\-8 7
+.Sh STANDARDS
+.Rs
+.%A F. Yergeau
+.%D November 2003
+.%R RFC 3629
+.%T UTF-8, a transformation format of ISO 10646
+.Re
+.Sh AUTHORS
+.An Thomas Voss Aq Mt mail@thomasvoss.com