diff options
author | Thomas Voss <mail@thomasvoss.com> | 2024-01-21 03:03:58 +0100 |
---|---|---|
committer | Thomas Voss <mail@thomasvoss.com> | 2024-01-21 03:03:58 +0100 |
commit | 4f93f935dc7a981ca073a322425c3f5929ffb644 (patch) | |
tree | 4460586408ec7fdfcecf3ba4584f0435067125a6 /vendor/librune/man | |
parent | 72ea25a4d73e3e026366d4165f5bc4ec9e7418cb (diff) |
Support line- & column-based match locations
Diffstat (limited to 'vendor/librune/man')
-rw-r--r-- | vendor/librune/man/Lb-desc.tmac | 1 | ||||
-rw-r--r-- | vendor/librune/man/rtou8.3 | 1 | ||||
-rw-r--r-- | vendor/librune/man/u8glen.3 | 62 | ||||
-rw-r--r-- | vendor/librune/man/u8gnext.3 | 72 | ||||
-rw-r--r-- | vendor/librune/man/u8len.3 | 70 | ||||
-rw-r--r-- | vendor/librune/man/u8next.3 | 111 | ||||
-rw-r--r-- | vendor/librune/man/u8prev.3 | 1 | ||||
-rw-r--r-- | vendor/librune/man/u8set.3 | 103 | ||||
-rw-r--r-- | vendor/librune/man/u8tor.3 | 90 | ||||
-rw-r--r-- | vendor/librune/man/u8tor_uc.3 | 1 | ||||
-rw-r--r-- | vendor/librune/man/u8wdth.3 | 69 |
11 files changed, 581 insertions, 0 deletions
diff --git a/vendor/librune/man/Lb-desc.tmac b/vendor/librune/man/Lb-desc.tmac new file mode 100644 index 0000000..1ec8f8f --- /dev/null +++ b/vendor/librune/man/Lb-desc.tmac @@ -0,0 +1 @@ +.ds doc-str-Lb-librune Unicode- and UTF-8 Library (librune, \-lrune) diff --git a/vendor/librune/man/rtou8.3 b/vendor/librune/man/rtou8.3 new file mode 100644 index 0000000..a83d3e8 --- /dev/null +++ b/vendor/librune/man/rtou8.3 @@ -0,0 +1 @@ +.so u8set.3 diff --git a/vendor/librune/man/u8glen.3 b/vendor/librune/man/u8glen.3 new file mode 100644 index 0000000..dfe0283 --- /dev/null +++ b/vendor/librune/man/u8glen.3 @@ -0,0 +1,62 @@ +.Dd January 15 2024 +.Dt U8GLEN 3 +.Os +.Sh NAME +.Nm u8glen +.Nd count Unicode graphemes +.Sh LIBRARY +.Lb librune +.Sh SYNOPSIS +.In gbrk.h +.Ft size_t +.Fn u8glen "const char8_t *s" "size_t n" +.Sh DESCRIPTION +The +.Fn u8glen +function returns the number of UTF-8 encoded Unicode graphemes in the +buffer +.Fa s +of length +.Fa n +bytes. +.Pp +This function assumes that +.Fa s +contains only valid UTF-8. +.Sh RETURN VALUES +The +.Fn u8glen +function returns the number of graphemes in the buffer +.Fa s . +.Sh EXAMPLES +The following call to +.Fn u8glen +will return 6 while the call to +.Fn u8len +will return 7 as a result of user-perceived characters such as +.Sq е́ +taking up multiple codepoints. +.Bd -literal -offset indent +char8_t s[] = u8\(dqПриве́т\(dq; +size_t cplen, glen; + +cplen = u8len(s, sizeof(s) - 1); +glen = u8glen(s, sizeof(s) - 1); +.Ed +.Sh SEE ALSO +.Xr u8len 3 , +.Xr u8wdth 3 , +.Xr unicode 7 , +.Xr utf\-8 7 +.Sh STANDARDS +.Rs +.%A F. 
Yergeau +.%D November 2003 +.%R RFC 3629 +.%T UTF-8, a transformation format of ISO 10646 +.Re +.Pp +.Lk https://www.unicode.org/versions/Unicode15.1.0/ \ +"The Unicode\(rg Standard Version 15.1.0" +.Sh AUTHORS +.An Thomas Voss Aq Mt mail@thomasvoss.com diff --git a/vendor/librune/man/u8gnext.3 b/vendor/librune/man/u8gnext.3 new file mode 100644 index 0000000..0053aa5 --- /dev/null +++ b/vendor/librune/man/u8gnext.3 @@ -0,0 +1,72 @@ +.Dd January 18 2024 +.Dt U8GNEXT 3 +.Os +.Sh NAME +.Nm u8gnext +.Nd iterate over Unicode graphemes +.Sh LIBRARY +.Lb librune +.Sh SYNOPSIS +.In gbrk.h +.Ft "const char8_t *" +.Fn u8gnext "struct u8view *v" "const char8_t **s" "size_t *n" +.Sh DESCRIPTION +The +.Fn u8gnext +function stores a view of the first grapheme in the buffer +.Fa s +of length +.Fa n +in the structure pointed to by +.Fa v . +It then updates +.Fa s +to point to the next grapheme in the buffer and also updates +.Fa n +accordingly. +.Pp +The +.Vt "struct u8view" +type is described in the +.Xr u8view 3 +manual. +.Sh RETURN VALUES +The +.Fn u8gnext +function returns the updated value of +.Fa s +or +.Dv NULL +at the end of iteration. +.Sh EXAMPLES +The following calls to +.Fn u8gnext +iterate over and print all the graphemes in +.Va s . +.Bd -literal -offset indent +#define STRING u8"नमस्कार विश्व" + +struct u8view v; +const char8_t *s = STRING; +size_t n = sizeof(STRING) - 1; + +while (u8gnext(&v, &s, &n)) + printf("‘%.*s’\en", (int)v.len, v.p); +.Ed +.Sh SEE ALSO +.Xr u8next 3 , +.Xr u8view 3 , +.Xr unicode 7 , +.Xr utf\-8 7 +.Sh STANDARDS +.Rs +.%A F. 
Yergeau +.%D November 2003 +.%R RFC 3629 +.%T UTF-8, a transformation format of ISO 10646 +.Re +.Pp +.Lk https://www.unicode.org/versions/Unicode15.1.0/ \ +"The Unicode\(rg Standard Version 15.1.0" +.Sh AUTHORS +.An Thomas Voss Aq Mt mail@thomasvoss.com diff --git a/vendor/librune/man/u8len.3 b/vendor/librune/man/u8len.3 new file mode 100644 index 0000000..4e58e14 --- /dev/null +++ b/vendor/librune/man/u8len.3 @@ -0,0 +1,70 @@ +.Dd January 15 2024 +.Dt U8LEN 3 +.Os +.Sh NAME +.Nm u8len +.Nd count Unicode codepoints +.Sh LIBRARY +.Lb librune +.Sh SYNOPSIS +.In utf8.h +.Ft size_t +.Fn u8len "const char8_t *s" "size_t n" +.Sh DESCRIPTION +The +.Fn u8len +function returns the number of UTF-8 encoded Unicode codepoints in the +buffer +.Fa s +of length +.Fa n +bytes. +.Pp +This function assumes that +.Fa s +contains only valid UTF-8. +.Sh RETURN VALUES +The +.Fn u8len +function returns the number of codepoints in the buffer +.Fa s . +.Sh EXAMPLES +The following call to +.Fn u8len +will return 17 while the call to +.Fn strlen +will return 22 as a result of use of multibyte-characters in +.Fa s . +.Bd -literal -offset indent +char8_t s[] = u8\(dq„Der Große Duden“\(dq; +size_t blen, cplen; + +blen = strlen((char *)s); +cplen = u8len(s, sizeof(s) - 1); +.Ed +.Sh SEE ALSO +.Xr u8glen 3 , +.Xr u8wdth 3 , +.Xr unicode 7 , +.Xr utf\-8 7 +.Sh STANDARDS +.Rs +.%A F. Yergeau +.%D November 2003 +.%R RFC 3629 +.%T UTF-8, a transformation format of ISO 10646 +.Re +.Sh AUTHORS +.An Thomas Voss Aq Mt mail@thomasvoss.com +.Sh CAVEATS +The return value of +.Fn u8len +does not necessarily represent the number of human-perceived characters +in the given buffer; +multiple codepoints may combine to form one human-perceived character +that spans a single column. +To count user-perceived characters +.Pq also known as graphemes , +you may want to use the +.Xr u8glen 3 +function. 
diff --git a/vendor/librune/man/u8next.3 b/vendor/librune/man/u8next.3 new file mode 100644 index 0000000..93a4f5d --- /dev/null +++ b/vendor/librune/man/u8next.3 @@ -0,0 +1,111 @@ +.Dd January 18 2024 +.Dt U8NEXT 3 +.Os +.Sh NAME +.Nm u8next , +.Nm u8prev +.Nd iterate over Unicode codepoints +.Sh LIBRARY +.Lb librune +.Sh SYNOPSIS +.In utf8.h +.Ft "const char8_t *" +.Fn u8next "rune *ch" "const char8_t **s" "size_t *n" +.Ft "const char8_t *" +.Fn u8prev "rune *ch" "const char8_t **s" "const char8_t *start" +.Sh DESCRIPTION +The +.Fn u8next +function decodes the first rune in the UTF-8 encoded string pointed to by +.Fa s +of length +.Fa n +and stores the result in +.Fa ch . +It then updates +.Fa s +to point to the next codepoint in the buffer and updates the length +.Fa n +accordingly. +.Pp +The +.Fn u8prev +function takes a pointer +.Fa start +which points to the start of the string instead of a length, +and updates +.Fa s +to point to the previous codepoint in the buffer. +The rune +.Fa ch +is set to the UTF-8 codepoint pointed to by +.Fa s +after iteration. +.Pp +Both of these functions assume the input is valid UTF-8. +.Sh RETURN VALUES +The +.Fn u8next +and +.Fn u8prev +functions return the updated value of +.Fa s +or +.Dv NULL +at the end of iteration. +.Sh EXAMPLES +The following calls to +.Fn u8next +iterate over and print all the codepoints in +.Va s . +.Bd -literal -offset indent +#include <rune.h> /* For PRIXRUNE; see rune(3) */ + +#define STRING u8"Ta’ Ħaġrat" + +rune ch; +const char8_t *s = STRING; +size_t n = sizeof(STRING) - 1; + +while (u8next(&ch, &s, &n)) { + int w = u8wdth(ch); + printf("U+%04" PRIXRUNE ": ‘%.*s’\en", ch, w, s - w); +} +.Ed +.Pp +The following example is the same as the previous, +but it uses the +.Fn u8prev +function to iterate backwards. 
+.Bd -literal -offset indent +#include <rune.h> /* For PRIXRUNE; see rune(3) */ + +#define STRING u8"Ta’ Ħaġrat" + +rune ch; +const char8_t *s, *start; +size_t n = sizeof(STRING) - 1; + +start = STRING; +s = start + n; + +while (u8prev(&ch, &s, start)) { + int w = u8wdth(ch); + printf("U+%04" PRIXRUNE ": ‘%.*s’\en", ch, w, s); +} +.Ed +.Sh SEE ALSO +.Xr rune 3 , +.Xr u8gnext 3 , +.Xr u8tor 3 , +.Xr unicode 7 , +.Xr utf\-8 7 +.Sh STANDARDS +.Rs +.%A F. Yergeau +.%D November 2003 +.%R RFC 3629 +.%T UTF-8, a transformation format of ISO 10646 +.Re +.Sh AUTHORS +.An Thomas Voss Aq Mt mail@thomasvoss.com diff --git a/vendor/librune/man/u8prev.3 b/vendor/librune/man/u8prev.3 new file mode 100644 index 0000000..cf1364e --- /dev/null +++ b/vendor/librune/man/u8prev.3 @@ -0,0 +1 @@ +.so u8next.3 diff --git a/vendor/librune/man/u8set.3 b/vendor/librune/man/u8set.3 new file mode 100644 index 0000000..307f84e --- /dev/null +++ b/vendor/librune/man/u8set.3 @@ -0,0 +1,103 @@ +.Dd January 18 2024 +.Dt U8SET 3 +.Os +.Sh NAME +.Nm rtou8 , +.Nm u8set +.Nd encode a rune to UTF-8 +.Sh LIBRARY +.Lb librune +.Sh SYNOPSIS +.In utf8.h +.Ft int +.Fn rtou8 "char8_t *s" "rune ch" "size_t n" +.Ft size_t +.Fn u8set "char8_t *s" "rune ch" "size_t n" +.Sh DESCRIPTION +The +.Fn rtou8 +function writes the rune +.Fa ch +to the UTF-8 encoded buffer +.Fa s +of length +.Fa n , +returning the number of bytes required to UTF-8 encode +.Fa ch . +.Pp +The +.Fn u8set +function fills the buffer +.Fa s +of length +.Fa n +with the constant rune +.Fa ch . +It is similar to the +.Fn rtou8 +function, +but writes more than 1 rune if the given buffer has the capacity. +Unlike +.Fn rtou8 , +this function returns the number of bytes that were successfully written +to +.Fa s . +If +.Fa n +is a multiple of +.Fn u8wdth ch +the return value will be equal to +.Fa n , +however in the case that +.Fa n +is not a multiple then +.Fa s +is filled as much as possible, +and a count shorter than +.Fa n +is returned. 
+.Pp +Both of these functions assume the input is valid UTF-8. +.Sh RETURN VALUES +The +.Fn rtou8 +function returns the number of bytes required to write +.Fa ch +to the buffer +.Fa s . +.Pp +The +.Fn u8set +function returns the number of bytes written to the buffer +.Fa s . +.Sh EXAMPLES +The following calls to +.Fn rtou8 +and +.Fn u8set +fill a buffer with box-drawing characters to create a top-row of a box. +.Bd -literal -offset indent +#define SCREEN_WIDTH 80 + +int bdr_wdth = u8wdth(U'─'); /* box-drawing rune width */ +size_t bufsiz = SCREEN_WIDTH * bdr_wdth; +char8_t *s = malloc(bufsiz); + +rtou8(s, U'┌', bdr_wdth); +u8set(s + bdr_wdth, U'─', bufsiz - bdr_wdth * 2); +rtou8(s + bufsiz - bdr_wdth, U'┐', bdr_wdth); +.Ed +.Sh SEE ALSO +.Xr u8tor 3 , +.Xr u8tor_uc 3 , +.Xr unicode 7 , +.Xr utf\-8 7 +.Sh STANDARDS +.Rs +.%A F. Yergeau +.%D November 2003 +.%R RFC 3629 +.%T UTF-8, a transformation format of ISO 10646 +.Re +.Sh AUTHORS +.An Thomas Voss Aq Mt mail@thomasvoss.com diff --git a/vendor/librune/man/u8tor.3 b/vendor/librune/man/u8tor.3 new file mode 100644 index 0000000..147f7c1 --- /dev/null +++ b/vendor/librune/man/u8tor.3 @@ -0,0 +1,90 @@ +.Dd January 18 2024 +.Dt U8TOR 3 +.Os +.Sh NAME +.Nm u8tor , +.Nm u8tor_uc +.Nd decode UTF-8 into a rune +.Sh LIBRARY +.Lb librune +.Sh SYNOPSIS +.In utf8.h +.Ft int +.Fn u8tor "rune *ch" "const char8_t *s" +.Ft int +.Fn u8tor_uc "rune *ch" "const char8_t *s" +.Sh DESCRIPTION +The +.Fn u8tor +and +.Fn u8tor_uc +functions decode the first rune in the UTF-8 buffer +.Fa s , +storing the result in the rune pointed to by +.Fa ch . +Both functions return the number of bytes which compose the decoded +UTF-8. +.Pp +The two functions are nearly identical, +however +.Fn u8tor_uc +performs fewer range checks than +.Fn u8tor +allowing it to process data more efficiently. +When provided with invalid UTF-8 however, +.Fn u8tor_uc +engages in undefined-behavior. 
+The +.Fn u8tor +function on the other hand handles invalid UTF-8 by storing +.Dv RUNE_ERROR +in +.Fa ch +and returning 1. +.Sh RETURN VALUES +The +.Fn u8tor +and +.Fn u8tor_uc +functions return the number of bytes from +.Fa s +decoded into +.Fa ch . +.Pp +The +.Fn u8tor +function returns 1 on invalid UTF-8. +.Sh EXAMPLES +The following call to +.Fn u8tor +attempts to decode the first UTF-8 codepoint in +.Va buf . +.Bd -literal -offset indent +/* Implementation of read_codepoint() omitted */ + +int w; +rune ch; +char8_t *buf = read_codepoint(stdin); + +w = u8tor(&ch, buf); +if (ch == RUNE_ERROR) { + fputs("Got invalid UTF-8 codepoint", stderr); + exit(EXIT_FAILURE); +} +printf("Got rune ‘%.*s’\en", w, buf); +.Ed +.Sh SEE ALSO +.Xr rtou8 3 , +.Xr u8chk 3 , +.Xr u8next 3 , +.Xr unicode 7 , +.Xr utf\-8 7 +.Sh STANDARDS +.Rs +.%A F. Yergeau +.%D November 2003 +.%R RFC 3629 +.%T UTF-8, a transformation format of ISO 10646 +.Re +.Sh AUTHORS +.An Thomas Voss Aq Mt mail@thomasvoss.com diff --git a/vendor/librune/man/u8tor_uc.3 b/vendor/librune/man/u8tor_uc.3 new file mode 100644 index 0000000..1527e52 --- /dev/null +++ b/vendor/librune/man/u8tor_uc.3 @@ -0,0 +1 @@ +.so u8tor.3 diff --git a/vendor/librune/man/u8wdth.3 b/vendor/librune/man/u8wdth.3 new file mode 100644 index 0000000..60fcada --- /dev/null +++ b/vendor/librune/man/u8wdth.3 @@ -0,0 +1,69 @@ +.Dd January 16 2024 +.Dt U8WDTH 3 +.Os +.Sh NAME +.Nm u8wdth +.Nd Unicode codepoint width +.Sh LIBRARY +.Lb librune +.Sh SYNOPSIS +.In utf8.h +.Ft int +.Fn u8wdth "rune ch" +.Sh DESCRIPTION +The +.Fn u8wdth +function returns the number of bytes that would be occupied by the +Unicode-codepoint +.Fa ch +if it was encoded as UTF-8. +If +.Fa ch +is greater than +.Dv RUNE_MAX , +a width of 0 is returned. +.Pp +If the exact UTF-8 encoded size of a codepoint is not relevant and you +simply wish to allocate a buffer capable of holding a given number of +UTF-8 codepoints, +the +.Dv U8_LEN_MAX +macro may be preferable. 
+.Pp +This function treats invalid codepoints smaller than +.Dv RUNE_MAX +such as UTF-16 surrogates as valid. +.Sh RETURN VALUES +The +.Fn u8wdth +function returns the number of bytes required to UTF-8 encode the +codepoint +.Fa ch . +.Sh EXAMPLES +The following example allocates a buffer which is exactly large enough to +hold the given UTF-32 string once it is converted to UTF-8. +.Bd -literal -offset indent +#define lengthof(a) (sizeof(a) / sizeof(*(a))) + +size_t bufsiz = 0; +char8_t *buf; +char32_t s[] = U\(dqIJsselmeer\(dq; /* ‘IJ’ takes 2 bytes */ + +for (size_t i = 0; i < lengthof(s) - 1; i++) + bufsiz += u8wdth(s[i]); +buf = malloc(bufsiz); +.Ed +.Sh SEE ALSO +.Xr u8glen 3 , +.Xr u8len 3 , +.Xr unicode 7 , +.Xr utf\-8 7 +.Sh STANDARDS +.Rs +.%A F. Yergeau +.%D November 2003 +.%R RFC 3629 +.%T UTF-8, a transformation format of ISO 10646 +.Re +.Sh AUTHORS +.An Thomas Voss Aq Mt mail@thomasvoss.com |