Support line- & column-based match locations

author: Thomas Voss <mail@thomasvoss.com> 2024-01-21 03:03:58 +0100
committer: Thomas Voss <mail@thomasvoss.com> 2024-01-21 03:03:58 +0100
commit: 4f93f935dc7a981ca073a322425c3f5929ffb644 (patch)
tree: 4460586408ec7fdfcecf3ba4584f0435067125a6 /vendor/librune/man
parent: 72ea25a4d73e3e026366d4165f5bc4ec9e7418cb (diff)
11 files changed, 581 insertions, 0 deletions
diff --git a/vendor/librune/man/Lb-desc.tmac b/vendor/librune/man/Lb-desc.tmac
new file mode 100644
index 0000000..1ec8f8f
--- /dev/null
+++ b/vendor/librune/man/Lb-desc.tmac
@@ -0,0 +1 @@
+.ds doc-str-Lb-librune Unicode- and UTF-8 Library (librune, \-lrune)
diff --git a/vendor/librune/man/rtou8.3 b/vendor/librune/man/rtou8.3
new file mode 100644
index 0000000..a83d3e8
--- /dev/null
+++ b/vendor/librune/man/rtou8.3
@@ -0,0 +1 @@
+.so u8set.3
diff --git a/vendor/librune/man/u8glen.3 b/vendor/librune/man/u8glen.3
new file mode 100644
index 0000000..dfe0283
--- /dev/null
+++ b/vendor/librune/man/u8glen.3
@@ -0,0 +1,62 @@
+.Dd January 15 2024
+.Dt U8GLEN 3
+.Os
+.Sh NAME
+.Nm u8glen
+.Nd count Unicode graphemes
+.Sh LIBRARY
+.Lb librune
+.Sh SYNOPSIS
+.In gbrk.h
+.Ft size_t
+.Fn u8glen "const char8_t *s" "size_t n"
+.Sh DESCRIPTION
+The
+.Fn u8glen
+function returns the number of UTF-8 encoded Unicode graphemes in the
+buffer
+.Fa s
+of length
+.Fa n
+bytes.
+.Pp
+This function assumes that
+.Fa s
+contains only valid UTF-8.
+.Sh RETURN VALUES
+The
+.Fn u8glen
+function returns the number of graphemes in the buffer
+.Fa s .
+.Sh EXAMPLES
+The following call to
+.Fn u8glen
+will return 6 while the call to
+.Fn u8len
+will return 7 as a result of user-perceived characters such as
+.Sq е́
+taking up multiple codepoints.
+.Bd -literal -offset indent
+char8_t s[] = u8\(dqПриве́т\(dq;
+size_t cplen, glen;
+
+cplen = u8len(s, sizeof(s) - 1);
+glen = u8glen(s, sizeof(s) - 1);
+.Ed
+.Sh SEE ALSO
+.Xr u8len 3 ,
+.Xr u8wdth 3 ,
+.Xr unicode 7 ,
+.Xr utf\-8 7
+.Sh STANDARDS
+.Rs
+.%A F. Yergeau
+.%D November 2003
+.%R RFC 3629
+.%T UTF-8, a transformation format of ISO 10646
+.Re
+.Pp
+.Lk https://www.unicode.org/versions/Unicode15.1.0/ \
+"The Unicode\(rg Standard Version 15.1.0"
+.Sh AUTHORS
+.An Thomas Voss Aq Mt mail@thomasvoss.com
diff --git a/vendor/librune/man/u8gnext.3 b/vendor/librune/man/u8gnext.3
new file mode 100644
index 0000000..0053aa5
--- /dev/null
+++ b/vendor/librune/man/u8gnext.3
@@ -0,0 +1,72 @@
+.Dd January 18 2024
+.Dt U8GNEXT 3
+.Os
+.Sh NAME
+.Nm u8gnext
+.Nd iterate over Unicode graphemes
+.Sh LIBRARY
+.Lb librune
+.Sh SYNOPSIS
+.In gbrk.h
+.Ft "const char8_t *"
+.Fn u8gnext "struct u8view *v" "const char8_t **s" "size_t *n"
+.Sh DESCRIPTION
+The
+.Fn u8gnext
+function stores a view of the first grapheme in the buffer
+.Fa s
+of length
+.Fa n
+in the structure pointed to by
+.Fa v .
+It then updates
+.Fa s
+to point to the next grapheme in the buffer and also updates
+.Fa n
+accordingly.
+.Pp
+The
+.Vt "struct u8view"
+type is described in the
+.Xr u8view 3
+manual.
+.Sh RETURN VALUES
+The
+.Fn u8gnext
+function returns the updated value of
+.Fa s
+or
+.Dv NULL
+at the end of iteration.
+.Sh EXAMPLES
+The following calls to
+.Fn u8gnext
+iterate over and print all the graphemes in
+.Va s .
+.Bd -literal -offset indent
+#define STRING u8"नमस्कार विश्व"
+
+struct u8view v;
+const char8_t *s = STRING;
+size_t n = sizeof(STRING) - 1;
+
+while (u8gnext(&v, &s, &n))
+	printf("‘%.*s’\en", (int)v.len, v.p);
+.Ed
+.Sh SEE ALSO
+.Xr u8next 3 ,
+.Xr u8view 3 ,
+.Xr unicode 7 ,
+.Xr utf\-8 7
+.Sh STANDARDS
+.Rs
+.%A F. Yergeau
+.%D November 2003
+.%R RFC 3629
+.%T UTF-8, a transformation format of ISO 10646
+.Re
+.Pp
+.Lk https://www.unicode.org/versions/Unicode15.1.0/ \
+"The Unicode\(rg Standard Version 15.1.0"
+.Sh AUTHORS
+.An Thomas Voss Aq Mt mail@thomasvoss.com
diff --git a/vendor/librune/man/u8len.3 b/vendor/librune/man/u8len.3
new file mode 100644
index 0000000..4e58e14
--- /dev/null
+++ b/vendor/librune/man/u8len.3
@@ -0,0 +1,70 @@
+.Dd January 15 2024
+.Dt U8LEN 3
+.Os
+.Sh NAME
+.Nm u8len
+.Nd count Unicode codepoints
+.Sh LIBRARY
+.Lb librune
+.Sh SYNOPSIS
+.In utf8.h
+.Ft size_t
+.Fn u8len "const char8_t *s" "size_t n"
+.Sh DESCRIPTION
+The
+.Fn u8len
+function returns the number of UTF-8 encoded Unicode codepoints in the
+buffer
+.Fa s
+of length
+.Fa n
+bytes.
+.Pp
+This function assumes that
+.Fa s
+contains only valid UTF-8.
+.Sh RETURN VALUES
+The
+.Fn u8len
+function returns the number of codepoints in the buffer
+.Fa s .
+.Sh EXAMPLES
+The following call to
+.Fn u8len
+will return 17 while the call to
+.Fn strlen
+will return 22 as a result of the use of multibyte characters in
+.Fa s .
+.Bd -literal -offset indent
+char8_t s[] = u8\(dq„Der Große Duden“\(dq;
+size_t blen, cplen;
+
+blen = strlen((char *)s);
+cplen = u8len(s, sizeof(s) - 1);
+.Ed
+.Sh SEE ALSO
+.Xr u8glen 3 ,
+.Xr u8wdth 3 ,
+.Xr unicode 7 ,
+.Xr utf\-8 7
+.Sh STANDARDS
+.Rs
+.%A F. Yergeau
+.%D November 2003
+.%R RFC 3629
+.%T UTF-8, a transformation format of ISO 10646
+.Re
+.Sh AUTHORS
+.An Thomas Voss Aq Mt mail@thomasvoss.com
+.Sh CAVEATS
+The return value of
+.Fn u8len
+does not necessarily represent the number of human-perceived characters
+in the given buffer;
+multiple codepoints may combine to form one human-perceived character
+that spans a single column.
+To count user-perceived characters
+.Pq also known as graphemes ,
+you may want to use the
+.Xr u8glen 3
+function.
diff --git a/vendor/librune/man/u8next.3 b/vendor/librune/man/u8next.3
new file mode 100644
index 0000000..93a4f5d
--- /dev/null
+++ b/vendor/librune/man/u8next.3
@@ -0,0 +1,111 @@
+.Dd January 18 2024
+.Dt U8NEXT 3
+.Os
+.Sh NAME
+.Nm u8next ,
+.Nm u8prev
+.Nd iterate over Unicode codepoints
+.Sh LIBRARY
+.Lb librune
+.Sh SYNOPSIS
+.In utf8.h
+.Ft "const char8_t *"
+.Fn u8next "rune *ch" "const char8_t **s" "size_t *n"
+.Ft "const char8_t *"
+.Fn u8prev "rune *ch" "const char8_t **s" "const char8_t *start"
+.Sh DESCRIPTION
+The
+.Fn u8next
+function decodes the first rune in the UTF-8 encoded string pointed to by
+.Fa s
+of length
+.Fa n
+and stores the result in
+.Fa ch .
+It then updates
+.Fa s
+to point to the next codepoint in the buffer and updates the length
+.Fa n
+accordingly.
+.Pp
+The
+.Fn u8prev
+function takes a pointer
+.Fa start
+which points to the start of the string instead of a length,
+and updates
+.Fa s
+to point to the previous codepoint in the buffer.
+The rune
+.Fa ch
+is set to the UTF-8 codepoint pointed to by
+.Fa s
+after iteration.
+.Pp
+Both of these functions assume the input is valid UTF-8.
+.Sh RETURN VALUES
+The
+.Fn u8next
+and
+.Fn u8prev
+functions return the updated value of
+.Fa s
+or
+.Dv NULL
+at the end of iteration.
+.Sh EXAMPLES
+The following calls to
+.Fn u8next
+iterate over and print all the codepoints in
+.Va s .
+.Bd -literal -offset indent
+#include <rune.h> /* For PRIXRUNE; see rune(3) */
+
+#define STRING u8"Ta’ Ħaġrat"
+
+rune ch;
+const char8_t *s = STRING;
+size_t n = sizeof(STRING) - 1;
+
+while (u8next(&ch, &s, &n)) {
+	int w = u8wdth(ch);
+	printf("U+%04" PRIXRUNE ": ‘%.*s’\en", ch, w, s - w);
+}
+.Ed
+.Pp
+The following example is the same as the previous,
+but it uses the
+.Fn u8prev
+function to iterate backwards.
+.Bd -literal -offset indent
+#include <rune.h> /* For PRIXRUNE; see rune(3) */
+
+#define STRING u8"Ta’ Ħaġrat"
+
+rune ch;
+const char8_t *s, *start;
+size_t n = sizeof(STRING) - 1;
+
+start = STRING;
+s = start + n;
+
+while (u8prev(&ch, &s, start)) {
+	int w = u8wdth(ch);
+	printf("U+%04" PRIXRUNE ": ‘%.*s’\en", ch, w, s);
+}
+.Ed
+.Sh SEE ALSO
+.Xr rune 3 ,
+.Xr u8gnext 3 ,
+.Xr u8tor 3 ,
+.Xr unicode 7 ,
+.Xr utf\-8 7
+.Sh STANDARDS
+.Rs
+.%A F. Yergeau
+.%D November 2003
+.%R RFC 3629
+.%T UTF-8, a transformation format of ISO 10646
+.Re
+.Sh AUTHORS
+.An Thomas Voss Aq Mt mail@thomasvoss.com
diff --git a/vendor/librune/man/u8prev.3 b/vendor/librune/man/u8prev.3
new file mode 100644
index 0000000..cf1364e
--- /dev/null
+++ b/vendor/librune/man/u8prev.3
@@ -0,0 +1 @@
+.so u8next.3
diff --git a/vendor/librune/man/u8set.3 b/vendor/librune/man/u8set.3
new file mode 100644
index 0000000..307f84e
--- /dev/null
+++ b/vendor/librune/man/u8set.3
@@ -0,0 +1,103 @@
+.Dd January 18 2024
+.Dt U8SET 3
+.Os
+.Sh NAME
+.Nm rtou8 ,
+.Nm u8set
+.Nd encode a rune to UTF-8
+.Sh LIBRARY
+.Lb librune
+.Sh SYNOPSIS
+.In utf8.h
+.Ft int
+.Fn rtou8 "char8_t *s" "rune ch" "size_t n"
+.Ft size_t
+.Fn u8set "char8_t *s" "rune ch" "size_t n"
+.Sh DESCRIPTION
+The
+.Fn rtou8
+function writes the rune
+.Fa ch
+to the UTF-8 encoded buffer
+.Fa s
+of length
+.Fa n ,
+returning the number of bytes required to UTF-8 encode
+.Fa ch .
+.Pp
+The
+.Fn u8set
+function fills the buffer
+.Fa s
+of length
+.Fa n
+with the constant rune
+.Fa ch .
+It is similar to the
+.Fn rtou8
+function,
+but writes more than 1 rune if the given buffer has the capacity.
+Unlike
+.Fn rtou8 ,
+this function returns the number of bytes that were successfully written
+to
+.Fa s .
+If
+.Fa n
+is a multiple of
+.Fn u8wdth ch
+the return value will be equal to
+.Fa n ,
+however in the case that
+.Fa n
+is not a multiple then
+.Fa s
+is filled as much as possible,
+and a count shorter than
+.Fa n
+is returned.
+.Pp
+Both of these functions assume the input is valid UTF-8.
+.Sh RETURN VALUES
+The
+.Fn rtou8
+function returns the number of bytes required to write
+.Fa ch
+to the buffer
+.Fa s .
+.Pp
+The
+.Fn u8set
+function returns the number of bytes written to the buffer
+.Fa s .
+.Sh EXAMPLES
+The following calls to
+.Fn rtou8
+and
+.Fn u8set
+fill a buffer with box-drawing characters to create a top-row of a box.
+.Bd -literal -offset indent
+#define SCREEN_WIDTH 80
+
+int bdr_wdth = u8wdth(U'─'); /* box-drawing rune width */
+size_t bufsiz = SCREEN_WIDTH * bdr_wdth;
+char8_t *s = malloc(bufsiz);
+
+rtou8(s, U'┌', bdr_wdth);
+u8set(s + bdr_wdth, U'─', bufsiz - bdr_wdth * 2);
+rtou8(s + bufsiz - bdr_wdth, U'┐', bdr_wdth);
+.Ed
+.Sh SEE ALSO
+.Xr u8tor 3 ,
+.Xr u8tor_uc 3 ,
+.Xr unicode 7 ,
+.Xr utf\-8 7
+.Sh STANDARDS
+.Rs
+.%A F. Yergeau
+.%D November 2003
+.%R RFC 3629
+.%T UTF-8, a transformation format of ISO 10646
+.Re
+.Sh AUTHORS
+.An Thomas Voss Aq Mt mail@thomasvoss.com
diff --git a/vendor/librune/man/u8tor.3 b/vendor/librune/man/u8tor.3
new file mode 100644
index 0000000..147f7c1
--- /dev/null
+++ b/vendor/librune/man/u8tor.3
@@ -0,0 +1,90 @@
+.Dd January 18 2024
+.Dt U8TOR 3
+.Os
+.Sh NAME
+.Nm u8tor ,
+.Nm u8tor_uc
+.Nd decode UTF-8 into a rune
+.Sh LIBRARY
+.Lb librune
+.Sh SYNOPSIS
+.In utf8.h
+.Ft int
+.Fn u8tor "rune *ch" "const char8_t *s"
+.Ft int
+.Fn u8tor_uc "rune *ch" "const char8_t *s"
+.Sh DESCRIPTION
+The
+.Fn u8tor
+and
+.Fn u8tor_uc
+functions decode the first rune in the UTF-8 buffer
+.Fa s ,
+storing the result in the rune pointed to by
+.Fa ch .
+Both functions return the number of bytes which compose the decoded
+UTF-8.
+.Pp
+The two functions are nearly identical,
+however
+.Fn u8tor_uc
+performs fewer range checks than
+.Fn u8tor
+allowing it to process data more efficiently.
+When provided with invalid UTF-8 however,
+.Fn u8tor_uc
+invokes undefined behavior.
+The
+.Fn u8tor
+function on the other hand handles invalid UTF-8 by storing
+.Dv RUNE_ERROR
+in
+.Fa ch
+and returning 1.
+.Sh RETURN VALUES
+The
+.Fn u8tor
+and
+.Fn u8tor_uc
+functions return the number of bytes from
+.Fa s
+decoded into
+.Fa ch .
+.Pp
+The
+.Fn u8tor
+function returns 1 on invalid UTF-8.
+.Sh EXAMPLES
+The following call to
+.Fn u8tor
+attempts to decode the first UTF-8 codepoint in
+.Va buf .
+.Bd -literal -offset indent
+/* Implementation of read_codepoint() omitted */
+
+int w;
+rune ch;
+char8_t *buf = read_codepoint(stdin);
+
+w = u8tor(&ch, buf);
+if (ch == RUNE_ERROR) {
+	fputs("Got invalid UTF-8 codepoint", stderr);
+	exit(EXIT_FAILURE);
+}
+printf("Got rune ‘%.*s’\en", w, buf);
+.Ed
+.Sh SEE ALSO
+.Xr rtou8 3 ,
+.Xr u8chk 3 ,
+.Xr u8next 3 ,
+.Xr unicode 7 ,
+.Xr utf\-8 7
+.Sh STANDARDS
+.Rs
+.%A F. Yergeau
+.%D November 2003
+.%R RFC 3629
+.%T UTF-8, a transformation format of ISO 10646
+.Re
+.Sh AUTHORS
+.An Thomas Voss Aq Mt mail@thomasvoss.com
diff --git a/vendor/librune/man/u8tor_uc.3 b/vendor/librune/man/u8tor_uc.3
new file mode 100644
index 0000000..1527e52
--- /dev/null
+++ b/vendor/librune/man/u8tor_uc.3
@@ -0,0 +1 @@
+.so u8tor.3
diff --git a/vendor/librune/man/u8wdth.3 b/vendor/librune/man/u8wdth.3
new file mode 100644
index 0000000..60fcada
--- /dev/null
+++ b/vendor/librune/man/u8wdth.3
@@ -0,0 +1,69 @@
+.Dd January 16 2024
+.Dt U8WDTH 3
+.Os
+.Sh NAME
+.Nm u8wdth
+.Nd Unicode codepoint width
+.Sh LIBRARY
+.Lb librune
+.Sh SYNOPSIS
+.In utf8.h
+.Ft int
+.Fn u8wdth "rune ch"
+.Sh DESCRIPTION
+The
+.Fn u8wdth
+function returns the number of bytes that would be occupied by the
+Unicode-codepoint
+.Fa ch
+if it were encoded as UTF-8.
+If
+.Fa ch
+is greater than
+.Dv RUNE_MAX ,
+a width of 0 is returned.
+.Pp
+If the exact UTF-8 encoded size of a codepoint is not relevant and you
+simply wish to allocate a buffer capable of holding a given number of
+UTF-8 codepoints,
+the
+.Dv U8_LEN_MAX
+macro may be preferable.
+.Pp
+This function treats invalid codepoints smaller than
+.Dv RUNE_MAX
+such as UTF-16 surrogates as valid.
+.Sh RETURN VALUES
+The
+.Fn u8wdth
+function returns the number of bytes required to UTF-8 encode the
+codepoint
+.Fa ch .
+.Sh EXAMPLES
+The following example allocates a buffer which is exactly large enough to
+hold the given UTF-32 string once it is converted to UTF-8.
+.Bd -literal -offset indent
+#define lengthof(a) (sizeof(a) / sizeof(*(a)))
+
+size_t bufsiz = 0;
+char8_t *buf;
+char32_t s[] = U\(dqĲsselmeer\(dq; /* ‘Ĳ’ takes 2 bytes */
+
+for (size_t i = 0; i < lengthof(s) - 1; i++)
+	bufsiz += u8wdth(s[i]);
+buf = malloc(bufsiz);
+.Ed
+.Sh SEE ALSO
+.Xr u8glen 3 ,
+.Xr u8len 3 ,
+.Xr unicode 7 ,
+.Xr utf\-8 7
+.Sh STANDARDS
+.Rs
+.%A F. Yergeau
+.%D November 2003
+.%R RFC 3629
+.%T UTF-8, a transformation format of ISO 10646
+.Re
+.Sh AUTHORS
+.An Thomas Voss Aq Mt mail@thomasvoss.com
author	Thomas Voss <mail@thomasvoss.com>	2024-01-21 03:03:58 +0100
committer	Thomas Voss <mail@thomasvoss.com>	2024-01-21 03:03:58 +0100
commit	4f93f935dc7a981ca073a322425c3f5929ffb644 (patch)
tree	4460586408ec7fdfcecf3ba4584f0435067125a6 /vendor/librune/man
parent	72ea25a4d73e3e026366d4165f5bc4ec9e7418cb (diff)