From 042e43247f396a9000fead59d9bff87bf12806d6 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Wed, 30 Oct 2024 01:51:14 +0100 Subject: Completely revamp the grab source code Some of the (many) few changes are: - Multithreading for significantly faster performance - The -p/--predicate flag - Byte offsets as the default - No customizable colors (maybe this will come back later) - Newer edition of mlib (formerly librune) --- vendor/librune/man/Lb-desc.tmac | 1 - vendor/librune/man/rtou8.3 | 1 - vendor/librune/man/u8glen.3 | 62 ----------------------- vendor/librune/man/u8gnext.3 | 69 -------------------------- vendor/librune/man/u8len.3 | 70 -------------------------- vendor/librune/man/u8next.3 | 107 ---------------------------------------- vendor/librune/man/u8prev.3 | 1 - vendor/librune/man/u8set.3 | 103 -------------------------------------- vendor/librune/man/u8tor.3 | 90 --------------------------------- vendor/librune/man/u8tor_uc.3 | 1 - vendor/librune/man/u8wdth.3 | 69 -------------------------- 11 files changed, 574 deletions(-) delete mode 100644 vendor/librune/man/Lb-desc.tmac delete mode 100644 vendor/librune/man/rtou8.3 delete mode 100644 vendor/librune/man/u8glen.3 delete mode 100644 vendor/librune/man/u8gnext.3 delete mode 100644 vendor/librune/man/u8len.3 delete mode 100644 vendor/librune/man/u8next.3 delete mode 100644 vendor/librune/man/u8prev.3 delete mode 100644 vendor/librune/man/u8set.3 delete mode 100644 vendor/librune/man/u8tor.3 delete mode 100644 vendor/librune/man/u8tor_uc.3 delete mode 100644 vendor/librune/man/u8wdth.3 (limited to 'vendor/librune/man') diff --git a/vendor/librune/man/Lb-desc.tmac b/vendor/librune/man/Lb-desc.tmac deleted file mode 100644 index 1ec8f8f..0000000 --- a/vendor/librune/man/Lb-desc.tmac +++ /dev/null @@ -1 +0,0 @@ -.ds doc-str-Lb-librune Unicode- and UTF-8 Library (librune, \-lrune) diff --git a/vendor/librune/man/rtou8.3 b/vendor/librune/man/rtou8.3 deleted file mode 100644 index a83d3e8..0000000 --- 
a/vendor/librune/man/rtou8.3 +++ /dev/null @@ -1 +0,0 @@ -.so u8set.3 diff --git a/vendor/librune/man/u8glen.3 b/vendor/librune/man/u8glen.3 deleted file mode 100644 index dfe0283..0000000 --- a/vendor/librune/man/u8glen.3 +++ /dev/null @@ -1,62 +0,0 @@ -.Dd January 15 2024 -.Dt U8GLEN 3 -.Os -.Sh NAME -.Nm u8glen -.Nd count Unicode graphemes -.Sh LIBRARY -.Lb librune -.Sh SYNOPSIS -.In gbrk.h -.Ft size_t -.Fn u8glen "const char8_t *s" "size_t n" -.Sh DESCRIPTION -The -.Fn u8glen -function returns the number of UTF-8 encoded Unicode graphemes in the -buffer -.Fa s -of length -.Fa n -bytes. -.Pp -This function assumes that -.Fa s -contains only valid UTF-8. -.Sh RETURN VALUES -The -.Fn u8glen -function returns the number of graphemes in the buffer -.Fa s . -.Sh EXAMPLES -The following call to -.Fn u8glen -will return 6 while the call to -.Fn u8len -will return 7 as a result of user-preceived characters such as -.Sq е́ -taking up multiple codepoints. -.Bd -literal -offset indent -char8_t s[] = u8\(dqПриве́т\(dq; -size_t cplen, glen; - -cplen = u8len(s, sizeof(s) - 1); -glen = u8glen(s, sizeof(s) - 1); -.Ed -.Sh SEE ALSO -.Xr u8len 3 , -.Xr u8wdth 3 , -.Xr unicode 7 , -.Xr utf\-8 7 -.Sh STANDARDS -.Rs -.%A F. 
Yergeau -.%D November 2003 -.%R RFC 3629 -.%T UTF-8, a transformation format of ISO 10646 -.Re -.Pp -.Lk https://www.unicode.org/versions/Unicode15.1.0/ \ -"The Unicode\(rg Standard Version 15.1.0" -.Sh AUTHORS -.An Thomas Voss Aq Mt mail@thomasvoss.com diff --git a/vendor/librune/man/u8gnext.3 b/vendor/librune/man/u8gnext.3 deleted file mode 100644 index e50c250..0000000 --- a/vendor/librune/man/u8gnext.3 +++ /dev/null @@ -1,69 +0,0 @@ -.Dd January 27 2024 -.Dt U8GNEXT 3 -.Os -.Sh NAME -.Nm u8gnext , -.Nd iterate over Unicode codepoints -.Sh LIBRARY -.Lb librune -.Sh SYNOPSIS -.In gbrk.h -.Ft size_t -.Fn u8gnext "struct u8view *v" "const char8_t **s" "size_t *n" -.Sh DESCRIPTION -The -.Fn u8gnext -function stores a view of the first grapheme in the buffer -.Fa s -of length -.Fa n -in the structure pointed to by -.Fa v . -It then updates -.Fa s -to point to the next grapheme in the buffer and also updates -.Fa n -accordingly. -.Pp -The -.Vt "struct u8view" -type is described in the -.Xr u8view 3 -manual. -.Sh RETURN VALUES -The -.Fn u8gnext -function returns the length of the grapheme iterated over in bytes, -or 0 at the end of iteration. -.Sh EXAMPLES -The following calls to -.Fn u8gnext -iterate over and print all the graphemes in -.Va s . -.Bd -literal -offset indent -#define STRING u8"नमस्कार विश्व" - -struct u8view v; -const char8_t *s = STRING; -size_t n = sizeof(STRING) - 1; - -while (u8gnext(&v, &s, &n)) - printf("‘%.*s’\en", (int)g.len, g.p); -.Ed -.Sh SEE ALSO -.Xr u8next 3 , -.Xr u8view 3 , -.Xr unicode 7 , -.Xr utf\-8 7 -.Sh STANDARDS -.Rs -.%A F. 
Yergeau -.%D November 2003 -.%R RFC 3629 -.%T UTF-8, a transformation format of ISO 10646 -.Re -.Pp -.Lk https://www.unicode.org/versions/Unicode15.1.0/ \ -"The Unicode\(rg Standard Version 15.1.0" -.Sh AUTHORS -.An Thomas Voss Aq Mt mail@thomasvoss.com diff --git a/vendor/librune/man/u8len.3 b/vendor/librune/man/u8len.3 deleted file mode 100644 index 4e58e14..0000000 --- a/vendor/librune/man/u8len.3 +++ /dev/null @@ -1,70 +0,0 @@ -.Dd January 15 2024 -.Dt U8LEN 3 -.Os -.Sh NAME -.Nm u8len -.Nd count Unicode codepoints -.Sh LIBRARY -.Lb librune -.Sh SYNOPSIS -.In utf8.h -.Ft size_t -.Fn u8len "const char8_t *s" "size_t n" -.Sh DESCRIPTION -The -.Fn u8len -function returns the number of UTF-8 encoded Unicode codepoints in the -buffer -.Fa s -of length -.Fa n -bytes. -.Pp -This function assumes that -.Fa s -contains only valid UTF-8. -.Sh RETURN VALUES -The -.Fn u8len -function returns the number of codepoints in the buffer -.Fa s . -.Sh EXAMPLES -The following call to -.Fn u8len -will return 17 while the call to -.Fn strlen -will return 22 as a result of use of multibyte-characters in -.Fa s . -.Bd -literal -offset indent -char8_t s[] = u8\(dq„Der Große Duden“\(dq; -size_t blen, cplen; - -blen = strlen((char *)s); -cplen = u8len(s, sizeof(s) - 1); -.Ed -.Sh SEE ALSO -.Xr u8glen 3 , -.Xr u8wdth 3 , -.Xr unicode 7 , -.Xr utf\-8 7 -.Sh STANDARDS -.Rs -.%A F. Yergeau -.%D November 2003 -.%R RFC 3629 -.%T UTF-8, a transformation format of ISO 10646 -.Re -.Sh AUTHORS -.An Thomas Voss Aq Mt mail@thomasvoss.com -.Sh CAVEATS -The return value of -.Fn u8len -does not necessarily represent the number of human-preceived characters -in the given buffer; -multiple codepoints may combine to form one human-preceived character -that spans a single column. -To count user-preceived codepoints -.Pq also known as graphemes , -you may want to use the -.Xr u8glen 3 -function. 
diff --git a/vendor/librune/man/u8next.3 b/vendor/librune/man/u8next.3 deleted file mode 100644 index eabf2a2..0000000 --- a/vendor/librune/man/u8next.3 +++ /dev/null @@ -1,107 +0,0 @@ -.Dd January 27 2024 -.Dt U8NEXT 3 -.Os -.Sh NAME -.Nm u8next , -.Nm u8prev -.Nd iterate over Unicode codepoints -.Sh LIBRARY -.Lb librune -.Sh SYNOPSIS -.In utf8.h -.Ft int -.Fn u8next "rune *ch" "const char8_t **s" "size_t *n" -.Ft int -.Fn u8prev "rune *ch" "const char8_t **s" "const char8_t *start" -.Sh DESCRIPTION -The -.Fn u8next -function decodes the first rune in the UTF-8 encoded string pointed to by -.Fa s -of length -.Fa n -and stores the result in -.Fa ch . -It then updates -.Fa s -to point to the next codepoint in the buffer and updates the length -.Fa n -accordingly. -.Pp -The -.Fn u8prev -function takes a pointer -.Fa start -which points to the start of the string instead of a length, -and updates -.Fa s -to point to the previous codepoint in the buffer. -The rune -.Fa ch -is set to UTF-8 codepoint pointed to by -.Fa s -after iteration. -.Pp -Both of these functions assume the input is valid UTF-8. -.Sh RETURN VALUES -The -.Fn u8next -and -.Fn u8prev -functions return the length of the UTF-8-encoded rune iterated over in -bytes, -or 0 at the end of iteration. -.Sh EXAMPLES -The following calls to -.Fn u8next -iterate over and print all the codepoints in -.Va s . -.Bd -literal -offset indent -#include <rune.h> /* For PRIXRUNE; see rune(3) */ - -#define STRING u8"Ta’ Ħaġrat" - -int w; -rune ch; -const char8_t *s = STRING; -size_t n = sizeof(STRING) - 1; - -while (w = u8next(&ch, &s, &n)) - printf("U+%04" PRIXRUNE ": ‘%.*s’\en", ch, w, s - w); -.Ed -.Pp -The following example is the same as the previous, -but it uses the -.Fn u8prev -function to iterate backwards.
-.Bd -literal -offset indent -#include <rune.h> /* For PRIXRUNE; see rune(3) */ - -#define STRING u8"Ta’ Ħaġrat" - -int w; -rune ch; -const char8_t *s, *start; -size_t n = sizeof(STRING) - 1; - -start = STRING; -s = start + n; - -while (w = u8prev(&ch, &s, start)) - printf("U+%04" PRIXRUNE ": ‘%.*s’\en", ch, w, s); -.Ed -.Sh SEE ALSO -.Xr rune 3 , -.Xr u8gnext 3 , -.Xr u8tor 3 , -.Xr unicode 7 , -.Xr utf\-8 7 -.Sh STANDARDS -.Rs -.%A F. Yergeau -.%D November 2003 -.%R RFC 3629 -.%T UTF-8, a transformation format of ISO 10646 -.Re -.Sh AUTHORS -.An Thomas Voss Aq Mt mail@thomasvoss.com diff --git a/vendor/librune/man/u8prev.3 b/vendor/librune/man/u8prev.3 deleted file mode 100644 index cf1364e..0000000 --- a/vendor/librune/man/u8prev.3 +++ /dev/null @@ -1 +0,0 @@ -.so u8next diff --git a/vendor/librune/man/u8set.3 b/vendor/librune/man/u8set.3 deleted file mode 100644 index d579b0b..0000000 --- a/vendor/librune/man/u8set.3 +++ /dev/null @@ -1,103 +0,0 @@ -.Dd January 27 2024 -.Dt U8SET 3 -.Os -.Sh NAME -.Nm rtou8 , -.Nm u8set -.Nd encode a rune to UTF-8 -.Sh LIBRARY -.Lb librune -.Sh SYNOPSIS -.In utf8.h -.Ft int -.Fn rtou8 "char8_t *s" "rune ch" "size_t n" -.Ft size_t -.Fn u8set "char8_t *s" "rune ch" "size_t n" -.Sh DESCRIPTION -The -.Fn rtou8 -function writes the rune -.Fa ch -to the UTF-8 encoded buffer -.Fa s -of length -.Fa n , -returning the number of bytes required to UTF-8 encode -.Fa ch . -.Pp -The -.Fn u8set -function fills the buffer -.Fa s -of length -.Fa n -with the constant rune -.Fa ch . -It is similar to the -.Fn rtou8 -function, -but writes more than 1 rune if the given buffer has the capacity. -Unlike -.Fn rtou8 , -this function returns the number of bytes that were successfully written -to -.Fa s . -If -.Fa n -is a multiple of -.Fn u8wdth ch -the return value will be equal to -.Fa n , -however in the case that -.Fa n -is not a multiple then -.Fa s -is filled as much as possible, -and a count shorter than -.Fa n -is returned.
-.Pp -Both of these functions assume the input is valid UTF-8. -.Sh RETURN VALUES -The -.Fn rtou8 -function returns the number of bytes required to write -.Fa ch -to the buffer -.Fa s . -.Pp -The -.Fn u8set -function returns the number of bytes written to the buffer -.Fa s . -.Sh EXAMPLES -The following calls to -.Fn rtou8 -and -.Fn u8set -fill a buffer with box-drawing characters to create a top-row of a box. -.Bd -literal -offset indent -#define SCREEN_WIDTH 80 - -int bdr_wdth = u8wdth(U'─'); /* box-drawing rune width */ -size_t bufsiz = SCREEN_WIDTH * bdr_wdth; -char8_t *s = malloc(bufsiz); - -rtou8(s, U'┌', bdr_wdth); -u8set(s + bdr_wdth, U'─', bufsiz - bdr_wdth * 2); -rtou8(s + bufsiz - bdr_wdth, U'┐', bdr_wdth); -.Ed -.Sh SEE ALSO -.Xr u8tor 3 , -.Xr u8tor_uc 3 , -.Xr unicode 7 , -.Xr utf\-8 7 -.Sh STANDARDS -.Rs -.%A F. Yergeau -.%D November 2003 -.%R RFC 3629 -.%T UTF-8, a transformation format of ISO 10646 -.Re -.Sh AUTHORS -.An Thomas Voss Aq Mt mail@thomasvoss.com diff --git a/vendor/librune/man/u8tor.3 b/vendor/librune/man/u8tor.3 deleted file mode 100644 index 147f7c1..0000000 --- a/vendor/librune/man/u8tor.3 +++ /dev/null @@ -1,90 +0,0 @@ -.Dd January 18 2024 -.Dt U8TOR 3 -.Os -.Sh NAME -.Nm u8tor , -.Nm u8tor_uc -.Nd decode UTF-8 into a rune -.Sh LIBRARY -.Lb librune -.Sh SYNOPSIS -.In utf8.h -.Ft int -.Fn u8tor "rune *ch" "const char8_t *s" -.Ft int -.Fn u8tor_uc "rune *ch" "const char8_t *s" -.Sh DESCRIPTION -The -.Fn u8tor -and -.Fn u8tor_uc -functions decode the first rune in the UTF-8 buffer -.Fa s , -storing the result in the rune pointed to by -.Fa ch . -Both functions return the number of bytes which compose the decoded -UTF-8. -.Pp -The two functions are nearly identical, -however -.Fn u8tor_uc -performs fewer range checks than -.Fn u8tor -allowing it to process data more efficiently. -When provided with invalid UTF-8 however, -.Fn u8tor_uc -engages in undefined-behavior. 
-The -.Fn u8tor -function on the other hand handles invalid UTF-8 by storing -.Dv RUNE_ERROR -in -.Fa ch -and returning 1. -.Sh RETURN VALUES -The -.Fn u8tor -and -.Fn u8tor_uc -functions return the number of bytes from -.Fa s -decoded into -.Fa ch . -.Pp -The -.Fn u8tor -function returns 1 on invalid UTF-8. -.Sh EXAMPLES -The following call to -.Fn u8tor -attempts to decode the first UTF-8 codepoint in -.Va buf . -.Bd -literal -offset indent -/* Implementation of read_codepoint() omitted */ - -int w; -rune ch; -char8_t *buf = read_codepoint(stdin); - -w = u8tor(&ch, buf); -if (ch == RUNE_ERROR) { - fputs("Got invalid UTF-8 codepoint", stderr); - exit(EXIT_FAILURE); -} -printf("Got rune ‘%.*s’\en", w, buf); -.Ed -.Sh SEE ALSO -.Xr rtou8 3 , -.Xr u8chk 3 , -.Xr u8next 3 , -.Xr unicode 7 , -.Xr utf\-8 7 -.Sh STANDARDS -.Rs -.%A F. Yergeau -.%D November 2003 -.%R RFC 3629 -.%T UTF-8, a transformation format of ISO 10646 -.Re -.Sh AUTHORS -.An Thomas Voss Aq Mt mail@thomasvoss.com diff --git a/vendor/librune/man/u8tor_uc.3 b/vendor/librune/man/u8tor_uc.3 deleted file mode 100644 index 1527e52..0000000 --- a/vendor/librune/man/u8tor_uc.3 +++ /dev/null @@ -1 +0,0 @@ -.so u8tor.3 diff --git a/vendor/librune/man/u8wdth.3 b/vendor/librune/man/u8wdth.3 deleted file mode 100644 index 60fcada..0000000 --- a/vendor/librune/man/u8wdth.3 +++ /dev/null @@ -1,69 +0,0 @@ -.Dd January 16 2024 -.Dt U8WDTH 3 -.Os -.Sh NAME -.Nm u8wdth -.Nd Unicode codepoint width -.Sh LIBRARY -.Lb librune -.Sh SYNOPSIS -.In utf8.h -.Ft int -.Fn u8wdth "rune ch" -.Sh DESCRIPTION -The -.Fn u8wdth -function returns the number of bytes that would be occupied by the -Unicode-codepoint -.Fa ch -if it was encoded as UTF-8. -If -.Fa ch -is greater than -.Dv RUNE_MAX , -a width of 0 is returned. -.Pp -If the exact UTF-8 encoded size of a codepoint is not relevant and you -simply wish to allocate a buffer capable of holding a given number of -UTF-8 codepoints, -the -.Dv U8_LEN_MAX -macro may be preferable. 
-.Pp -This function treats invalid codepoints smaller than -.Dv RUNE_MAX -such as UTF-16 surrogates as valid. -.Sh RETURN VALUES -The -.Fn u8wdth -function returns the number of bytes required to UTF-8 encode the -codepoint -.Fa ch . -.Sh EXAMPLES -The following example allocates a buffer which is exactly large enough to -hold the given UTF-32 string once it is converted to UTF-8. -.Bd -literal -offset indent -#define lengthof(a) (sizeof(a) / sizeof(*(a))) - -size_t bufsiz = 0; -char8_t *buf; -char32_t s[] = U\(dqIJsselmeer\(dq; /* ‘IJ’ takes 2 bytes */ - -for (size_t i = 0; i < lengthof(s) - 1; i++) - bufsiz += u8wdth(s[i]); -buf = malloc(bufsiz); -.Ed -.Sh SEE ALSO -.Xr u8glen 3 , -.Xr u8len 3 , -.Xr unicode 7 , -.Xr utf-8 7 -.Sh STANDARDS -.Rs -.%A F. Yergeau -.%D November 2003 -.%R RFC 3629 -.%T UTF-8, a transformation format of ISO 10646 -.Re -.Sh AUTHORS -.An Thomas Voss Aq Mt mail@thomasvoss.com -- cgit v1.2.3