aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author: Thomas Voss <mail@thomasvoss.com> 2024-10-30 01:51:14 +0100
committer: Thomas Voss <mail@thomasvoss.com> 2024-10-30 01:51:14 +0100
commit: 042e43247f396a9000fead59d9bff87bf12806d6 (patch)
tree: e902784464cbe9ce3c5114d513b016523e7e4b29 /src
parent: 170b8a92434233241c990c3e9432786de3262bcd (diff)
Completely revamp the grab source code
Some of the (many) few changes are:
- Multithreading for significantly faster performance
- The -p/--predicate flag
- Byte offsets as the default
- No customizable colors (maybe this will come back later)
- Newer edition of mlib (formerly librune)
Diffstat (limited to 'src')
-rw-r--r-- src/da.h 121
-rw-r--r-- src/exitcodes.h 9
-rw-r--r-- src/flags.h 23
-rw-r--r-- src/grab.c 859
-rw-r--r-- src/main.c 372
-rw-r--r-- src/tpool.c 127
-rw-r--r-- src/tpool.h 19
-rw-r--r-- src/work.c 453
-rw-r--r-- src/work.h 16
9 files changed, 1019 insertions, 980 deletions
diff --git a/src/da.h b/src/da.h
deleted file mode 100644
index 8891971..0000000
--- a/src/da.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Simple & stupid dynamic array single-header implementation. You can use the
- * macros defined in this file with any structure that has the following fields:
- *
- * struct dyn_array {
- * T *buf // Array of items
- * N len // Length of array
- * N cap // Capacity of array
- * }
- *
- * The type ‘T’ is whatever type you want to store. The type ‘N’ is any numeric
- * type — most likely ‘size_t’ — but it could be sized as well.
- *
- * The daremove() macro also doesn’t bother with shrinking your array when the
- * length is far lower than the capacity. If you care about that, do it
- * yourself.
- *
- * Remember to call free() on your dynamic arrays ‘buf’ field after use.
- *
- *
- * Macro Overview
- * ――――――――――――――
- * The argument ‘a’ to all of the below macros is a pointer to the dynamic array
- * structure.
- *
- * dainit(a, n) Initialize the array with a capacity of ‘n’ items.
- * dapush(a, x) Append the item ‘x’ to the array
- * daremove(a, x) Remove the item at index ‘x’ from the array
- * da_remove_range(a, x, y) Remove the items between the range [x, y)
- * da_foreach(a, p) Iterate the pointer ‘p’ over each element of the
- * array. The type of ‘p’ is inferred.
- *
- * The ‘dapush()’ macro will double the arrays capacity when it gets full. If
- * you would like your arrays to grow with a different scale, edit this file.
- *
- *
- * Example
- * ―――――――
- *
- * struct {
- * int *buf;
- * size_t len, cap;
- * } nums;
- *
- * // Initialize nums with capacity == 4
- * dainit(&nums, 4);
- *
- * // Append 69, 1337, and 420 to nums
- * dapush(&nums, 69);
- * dapush(&nums, 1337);
- * dapush(&nums, 420);
- *
- * da_foreach (&nums, n) {
- * int x = *n << 1;
- * printf("n = %d; n² = %d\n", *n, x);
- * }
- *
- * // Remove 1337 and 420 from nums
- * da_remove_range(&nums, 1, 3);
- *
- * // Remove 69 from nums
- * daremove(&nums, 0);
- */
-
-#ifndef MANGO_DA_H
-#define MANGO_DA_H
-
-#include <err.h>
-#include <errno.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-
-#if __STDC_VERSION__ >= 202311L
-# define DA_NULL nullptr
-#else
-# define DA_NULL NULL
-#endif
-
-#define DA_ALLOC(p, n) \
- do { \
- if ((n) && SIZE_MAX / (n) < sizeof(*(p))) { \
- errno = EOVERFLOW; \
- err(EXIT_FAILURE, "realloc"); \
- } \
- if (!((p) = realloc((p), (n) * sizeof(*(p))))) \
- err(EXIT_FAILURE, "realloc"); \
- } while (0)
-
-#define dainit(a, n) \
- do { \
- (a)->buf = DA_NULL; \
- (a)->cap = (n); \
- (a)->len = 0; \
- if (n) \
- DA_ALLOC((a)->buf, (a)->cap); \
- } while (0)
-
-#define dapush(a, x) \
- do { \
- if ((a)->len >= (a)->cap) { \
- (a)->cap = (a)->cap ? (a)->cap * 2 : 1; \
- DA_ALLOC((a)->buf, (a)->cap); \
- } \
- (a)->buf[(a)->len++] = (x); \
- } while (0)
-
-#define daremove(a, i) da_remove_range((a), (i), (i) + 1)
-
-#define da_remove_range(a, i, j) \
- do { \
- memmove((a)->buf + (i), (a)->buf + (j), \
- ((a)->len - (j)) * sizeof(*(a)->buf)); \
- (a)->len -= j - i; \
- } while (0)
-
-#define da_foreach(a, p) \
- for (typeof((a)->buf) p = (a)->buf; (size_t)(p - (a)->buf) < (a)->len; p++)
-
-#endif /* !MANGO_DA_H */
diff --git a/src/exitcodes.h b/src/exitcodes.h
new file mode 100644
index 0000000..00b455c
--- /dev/null
+++ b/src/exitcodes.h
@@ -0,0 +1,9 @@
+#ifndef GRAB_EXITCODES_H
+#define GRAB_EXITCODES_H
+
+/* Process exit statuses.
+   These values should never be changed! Scripts may depend on them. */
+constexpr int EXIT_NOMATCH = 1; /* Initial value of ‘rv’ in main.c */
+constexpr int EXIT_WARNING = 2; /* Stored in ‘rv’ when a regex fails to JIT compile */
+constexpr int EXIT_FATAL = 3; /* Passed to cerr() for errors such as an invalid pattern */
+
+#endif /* !GRAB_EXITCODES_H */
diff --git a/src/flags.h b/src/flags.h
new file mode 100644
index 0000000..a4a4709
--- /dev/null
+++ b/src/flags.h
@@ -0,0 +1,23 @@
+#ifndef GRAB2_FLAGS_H
+#define GRAB2_FLAGS_H
+
+typedef struct { /* Command-line flags; one bit per option parsed in main() */
+ bool c : 1; /* -c, --color */
+ bool i : 1; /* -i, --ignore-case */
+ bool l : 1; /* -l, --line */
+ bool p : 1; /* -p, --predicate */
+ bool s : 1; /* -s, --strip-newline */
+ bool U : 1; /* -U, --no-unicode */
+ bool z : 1; /* -z, --zero */
+
+#if !GIT_GRAB
+ bool do_header : 1; /* Set by main() when file arguments follow the pattern */
+#endif
+} flags_t;
+
+#if !MAIN_C /* main.c defines MAIN_C so that it holds the one definition of ‘flags’ */
+extern
+#endif
+flags_t flags;
+
+#endif /* !GRAB2_FLAGS_H */
diff --git a/src/grab.c b/src/grab.c
deleted file mode 100644
index bf26a79..0000000
--- a/src/grab.c
+++ /dev/null
@@ -1,859 +0,0 @@
-#include <err.h>
-#include <getopt.h>
-#include <libgen.h>
-#include <limits.h>
-#include <locale.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-#if GRAB_DO_PCRE
-# include <pcre2posix.h>
-#else
-# include <regex.h>
-# ifndef REG_DOTALL
-# define REG_DOTALL 0
-# endif
-# define REG_UCP 0
-# define REG_UTF 0
-# ifndef REG_STARTEND
-# error "REG_STARTEND not defined"
-# endif
-#endif
-
-#include <gbrk.h>
-#include <rune.h>
-#include <utf8.h>
-
-#include "da.h"
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-#define FLAGMSK(f) ((uint64_t)1 << ((f) - ((f) < 'a' ? 'A' : 'G')))
-#define FLAGSET(f) (flags & FLAGMSK(f))
-
-#define lengthof(a) (sizeof(a) / sizeof(*(a)))
-
-#define die(...) err(3, __VA_ARGS__)
-#define diex(...) errx(3, __VA_ARGS__)
-#define warn(...) \
- do { \
- warn(__VA_ARGS__); \
- rv = 3; \
- } while (0)
-#define warnx(...) \
- do { \
- warnx(__VA_ARGS__); \
- rv = 3; \
- } while (0)
-
-#define streq(a, b) (!strcmp(a, b))
-#define memeq(a, b, n) (!memcmp(a, b, n))
-
-#define DEFCOL_FN "35"
-#define DEFCOL_HL "01;31"
-#define DEFCOL_LN "32"
-#define DEFCOL_SE "36"
-
-struct matches {
- struct sv *buf;
- size_t len, cap;
-};
-
-struct op {
- char c;
- regex_t pat;
-#ifdef GRAB_DEBUG
- bool alloced;
-#endif
-};
-
-struct ops {
- struct op *buf;
- size_t len, cap;
-};
-
-struct sv {
- char8_t *p;
- size_t len;
-};
-
-typedef unsigned char uchar;
-typedef void cmd_func(struct sv, struct matches *, struct ops, size_t,
- const char *);
-typedef void put_func(struct sv, struct matches *, const char *);
-
-static cmd_func cmdg, cmdh, cmdH, cmdx, cmdX;
-static put_func putm, putm_nc;
-
-#if GIT_GRAB
-static FILE *getfstream(int n, char *v[n]);
-#endif
-static void grab(struct ops, FILE *, const char *);
-static struct ops comppat(char8_t *);
-static regex_t mkregex(char8_t *, size_t);
-static bool islbrk(struct u8view);
-static bool sgrvalid(const char *);
-static bool xisspace(char);
-static int svposcmp(const void *, const void *);
-static char *env_or_default(const char *, const char *);
-
-static int filecnt, rv;
-static bool got_match;
-static uint64_t flags = FLAGMSK('f') * GIT_GRAB;
-static put_func *putf;
-
-static struct {
- const char8_t *p, *bp;
- size_t col, row;
-} pos;
-
-static cmd_func *op_table[UCHAR_MAX] = {
- ['g'] = cmdg, ['G'] = cmdg, ['h'] = cmdh,
- ['H'] = cmdH, ['x'] = cmdx, ['X'] = cmdX,
-};
-
-[[noreturn]] static void
-usage(const char *s)
-{
- fprintf(stderr,
-#if GIT_GRAB
- "Usage: %s [-s | -z] [-bcinU] pattern [glob ...]\n"
-#else
- "Usage: %s [-s | -z] [-bcfinU] pattern [file ...]\n"
-#endif
- " %s -h\n",
- s, s);
- exit(EXIT_FAILURE);
-}
-
-int
-main(int argc, char **argv)
-{
- int opt;
- struct ops ops;
- struct option longopts[] = {
- {"byte-offset", no_argument, nullptr, 'b'},
- {"color", no_argument, nullptr, 'c'},
-#if GIT_GRAB
- {"filenames", no_argument, nullptr, 'f'},
-#endif
- {"help", no_argument, nullptr, 'h'},
- {"ignore-case", no_argument, nullptr, 'i'},
- {"newline", no_argument, nullptr, 'n'},
- {"strip-newline", no_argument, nullptr, 's'},
- {"no-unicode", no_argument, nullptr, 'U'},
- {"zero", no_argument, nullptr, 'z'},
- {nullptr, 0, nullptr, 0 },
- };
-
-#if GIT_GRAB
- char *entry = nullptr;
- size_t len;
- ssize_t nr;
- FILE *flist;
- const char *opts = "bchinsUz";
-#else
- const char *opts = "bcfhinsUz";
-#endif
-
- argv[0] = basename(argv[0]);
- if (argc < 2)
- usage(argv[0]);
-
- setlocale(LC_ALL, "");
-
- while ((opt = getopt_long(argc, argv, opts, longopts, nullptr)) != -1) {
- switch (opt) {
- case '?':
- usage(argv[0]);
- case 'h':
- execlp("man", "man", "1", argv[0], nullptr);
- die("execlp: man 1 %s", argv[0]);
-#if !GRAB_DO_PCRE
- case 'U':
- errx(2, "program not built with PCRE support");
-#endif
- default:
- flags |= FLAGMSK(opt);
- }
- }
-
- if (FLAGSET('s') && FLAGSET('z'))
- usage(argv[0]);
-
- argc -= optind;
- argv += optind;
- filecnt = argc - 1;
-
- if (!FLAGSET('c') && isatty(STDOUT_FILENO) == 1
- && !env_or_default("NO_COLOR", nullptr))
- {
- flags |= FLAGMSK('c') * !streq(env_or_default("TERM", ""), "dumb");
- }
-
- putf = FLAGSET('c') ? putm : putm_nc;
- ops = comppat(argv[0]);
-
-#if GIT_GRAB
- if (!(flist = getfstream(argc - 1, argv + 1)))
- die("getfstream");
- while ((nr = getdelim(&entry, &len, '\0', flist)) > 0) {
- FILE *fp;
-
- if (!(fp = fopen(entry, "r")))
- warn("fopen: %s", entry);
- else {
- grab(ops, fp, entry);
- fclose(fp);
- }
- }
- if (ferror(flist))
- warn("getdelim");
- fclose(flist);
-#else
- if (argc == 1)
- grab(ops, stdin, "-");
- else {
- for (int i = 1; i < argc; i++) {
- FILE *fp;
-
- if (streq(argv[i], "-")) {
- grab(ops, stdin, "-");
- } else if (!(fp = fopen(argv[i], "r"))) {
- warn("fopen: %s", argv[i]);
- } else {
- grab(ops, fp, argv[i]);
- fclose(fp);
- }
- }
- }
-#endif
-
-#ifdef GRAB_DEBUG
-# if GIT_GRAB
- free(entry);
-# endif
- da_foreach (&ops, op) {
- if (op->alloced)
- regfree(&op->pat);
- }
- free(ops.buf);
-#endif
-
- return got_match ? rv : EXIT_FAILURE;
-}
-
-struct ops
-comppat(char8_t *s)
-{
- struct ops ops;
-
- dainit(&ops, 8);
- while (*s && xisspace(*s))
- s++;
- if (!*s)
- diex("input string terminated prematurely");
-
- do {
- int w;
- rune ch;
- size_t len;
- char8_t *p;
- struct op op;
-
- /* Grab the operator and delimiter. All operators are ASCII, but
- u8tor() is used to parse it so that we get properly formed error
- messages when someone uses a non-ASCII operator. */
- w = u8tor(&ch, s);
- if (ch == RUNE_ERROR)
- diex("invalid UTF-8 sequence near ‘%02hhX’", s[-1]);
- if (w > 1 || !op_table[ch])
- diex("invalid operator ‘%.*s’", w, s);
- op.c = *s++;
-
- s += u8tor(&ch, s);
- if (ch == RUNE_ERROR)
- diex("invalid UTF-8 sequence near ‘%02hhX’", s[-1]);
- if (ch == '\0')
- diex("input string terminated prematurely");
-
- /* Find the closing delimiter. The user is allowed to omit the closing
- delimiter if this is the last operation in the query pattern. */
- p = s;
- len = strlen(s);
- if (!(s = u8chr(s, ch, len)))
- s = p + len;
-
- if (s - p == 0) {
- if (op.c != 'h')
- diex("empty regex given to ‘%c’", op.c);
- if (ops.len == 0)
- diex("empty ‘h’ is not allowed as the first operator");
- op.pat = ops.buf[ops.len - 1].pat;
- } else
- op.pat = mkregex(p, s - p);
-
-#if GRAB_DEBUG
- op.alloced = s - p == 0;
-#endif
-
- dapush(&ops, op);
-
- if (*s) {
- s += u8tor(&ch, s);
- if (ch == RUNE_ERROR)
- diex("invalid UTF-8 sequence near ‘%02hhX’", s[-1]);
- }
- while (*s && xisspace(*s))
- s++;
- } while (*s);
-
- return ops;
-}
-
-void
-grab(struct ops ops, FILE *stream, const char *filename)
-{
- size_t n;
- struct {
- char *buf;
- size_t len, cap;
- } chars = {0};
-
- do {
- static_assert(sizeof(char) == 1, "sizeof(char) != 1; wtf?");
- chars.cap += BUFSIZ;
- if (!(chars.buf = realloc(chars.buf, chars.cap)))
- die("realloc");
- chars.len += n = fread(chars.buf + chars.len, 1, BUFSIZ, stream);
- } while (n == BUFSIZ);
-
- if (ferror(stream)) {
- warn("fread: %s", filename);
- goto out;
- }
-
- const char8_t *p;
- struct sv sv = {
- .p = chars.buf,
- .len = chars.len,
- };
- struct matches ms;
-
- if (p = u8chk(chars.buf, chars.len)) {
- warnx("%s: invalid UTF-8 near ‘%02X’", filename, *p);
- goto out;
- }
-
- dainit(&ms, 4);
- pos.col = pos.row = 1;
- pos.bp = pos.p = chars.buf;
- op_table[(uchar)ops.buf[0].c](sv, &ms, ops, 0, filename);
- free(ms.buf);
-
-out:
- free(chars.buf);
-}
-
-void
-cmdg(struct sv sv, struct matches *ms, struct ops ops, size_t i,
- const char *filename)
-{
- int r;
- regmatch_t rm = {
- .rm_so = 0,
- .rm_eo = sv.len,
- };
- struct op op = ops.buf[i];
-
- r = regexec(&op.pat, sv.p, 1, &rm, REG_STARTEND);
- if ((r == REG_NOMATCH && op.c == 'g') || (r != REG_NOMATCH && op.c == 'G'))
- return;
-
- if (i + 1 == ops.len)
- putf(sv, ms, filename);
- else
- op_table[(uchar)ops.buf[i + 1].c](sv, ms, ops, i + 1, filename);
-}
-
-void
-cmdh(struct sv sv, struct matches *ms, struct ops ops, size_t i,
- const char *filename)
-{
- regmatch_t rm = {
- .rm_so = 0,
- .rm_eo = sv.len,
- };
- struct op op = ops.buf[i];
-
- do {
- if (regexec(&op.pat, sv.p, 1, &rm, REG_STARTEND) == REG_NOMATCH)
- break;
-
- if (rm.rm_so < rm.rm_eo)
- dapush(ms, ((struct sv){sv.p + rm.rm_so, rm.rm_eo - rm.rm_so}));
- else {
- rune unused;
- rm.rm_eo += u8tor_uc(&unused, sv.p + rm.rm_eo);
- }
-
- rm = (regmatch_t){
- .rm_so = rm.rm_eo,
- .rm_eo = sv.len,
- };
- } while (rm.rm_so < rm.rm_eo);
-
- if (i + 1 == ops.len)
- putf(sv, ms, filename);
- else {
- size_t save = ms->len;
- op_table[(uchar)ops.buf[i + 1].c](sv, ms, ops, i + 1, filename);
- ms->len = save;
- }
-}
-
-void
-cmdH(struct sv sv, struct matches *ms, struct ops ops, size_t i,
- const char *filename)
-{
- regmatch_t rm = {
- .rm_so = 0,
- .rm_eo = sv.len,
- };
- regmatch_t prev = {
- .rm_so = 0,
- .rm_eo = 0,
- };
- struct op op = ops.buf[i];
-
- do {
- struct sv nsv;
-
- if (regexec(&op.pat, sv.p, 1, &rm, REG_STARTEND) == REG_NOMATCH)
- break;
-
- if (prev.rm_so || prev.rm_eo || rm.rm_so) {
- nsv = (struct sv){
- .p = sv.p + prev.rm_eo,
- .len = rm.rm_so - prev.rm_eo,
- };
- if (nsv.len)
- dapush(ms, nsv);
- }
-
- prev = rm;
- if (rm.rm_so == rm.rm_eo) {
- rune unused;
- rm.rm_eo += u8tor_uc(&unused, sv.p + rm.rm_eo);
- }
- rm = (regmatch_t){
- .rm_so = rm.rm_eo,
- .rm_eo = sv.len,
- };
- } while (rm.rm_so < rm.rm_eo);
-
- if (prev.rm_eo < rm.rm_eo)
- dapush(ms, ((struct sv){sv.p + rm.rm_so, rm.rm_eo - rm.rm_so}));
-
- if (i + 1 == ops.len)
- putf(sv, ms, filename);
- else
- op_table[(uchar)ops.buf[i + 1].c](sv, ms, ops, i + 1, filename);
-}
-
-void
-cmdx(struct sv sv, struct matches *ms, struct ops ops, size_t i,
- const char *filename)
-{
- regmatch_t rm = {
- .rm_so = 0,
- .rm_eo = sv.len,
- };
- struct op op = ops.buf[i];
-
- do {
- struct sv nsv;
-
- if (regexec(&op.pat, sv.p, 1, &rm, REG_STARTEND) == REG_NOMATCH)
- break;
- if (rm.rm_so < rm.rm_eo) {
- nsv = (struct sv){
- .p = sv.p + rm.rm_so,
- .len = rm.rm_eo - rm.rm_so,
- };
- if (i + 1 == ops.len)
- putf(nsv, ms, filename);
- else {
- size_t save = ms->len;
- op_table[(uchar)ops.buf[i + 1].c](nsv, ms, ops, i + 1,
- filename);
- ms->len = save;
- }
- } else {
- rune unused;
- rm.rm_eo += u8tor_uc(&unused, sv.p + rm.rm_eo);
- }
- rm = (regmatch_t){
- .rm_so = rm.rm_eo,
- .rm_eo = sv.len,
- };
- } while (rm.rm_so < rm.rm_eo);
-}
-
-void
-cmdX(struct sv sv, struct matches *ms, struct ops ops, size_t i,
- const char *filename)
-{
- regmatch_t rm = {
- .rm_so = 0,
- .rm_eo = sv.len,
- };
- regmatch_t prev = {
- .rm_so = 0,
- .rm_eo = 0,
- };
- struct op op = ops.buf[i];
-
- do {
- struct sv nsv;
-
- if (regexec(&op.pat, sv.p, 1, &rm, REG_STARTEND) == REG_NOMATCH)
- break;
-
- if (prev.rm_so || prev.rm_eo || rm.rm_so) {
- nsv = (struct sv){
- .p = sv.p + prev.rm_eo,
- .len = rm.rm_so - prev.rm_eo,
- };
- if (nsv.len) {
- if (i + 1 == ops.len)
- putf(nsv, ms, filename);
- else
- op_table[(uchar)ops.buf[i + 1].c](nsv, ms, ops, i + 1,
- filename);
- }
- }
-
- prev = rm;
- if (rm.rm_so == rm.rm_eo) {
- rune unused;
- rm.rm_eo += u8tor_uc(&unused, sv.p + rm.rm_eo);
- }
- rm = (regmatch_t){
- .rm_so = rm.rm_eo,
- .rm_eo = sv.len,
- };
- } while (rm.rm_so < rm.rm_eo);
-
- if (prev.rm_eo < rm.rm_eo) {
- struct sv nsv = {
- .p = sv.p + rm.rm_so,
- .len = rm.rm_eo - rm.rm_so,
- };
- if (i + 1 == ops.len)
- putf(nsv, ms, filename);
- else
- op_table[(uchar)ops.buf[i + 1].c](nsv, ms, ops, i + 1, filename);
- }
-}
-
-int
-svposcmp(const void *a, const void *b)
-{
- struct sv *A, *B;
- A = (struct sv *)a;
- B = (struct sv *)b;
- return A->p != B->p ? A->p - B->p : A->len < B->len ? -1 : A->len != B->len;
-}
-
-void
-putm(struct sv sv, struct matches *ms, const char *filename)
-{
- const char8_t *p;
- struct matches valid;
- static const char *fn, *hl, *ln, *se;
-
- got_match = true;
-
- if (FLAGSET('c') && !fn) {
- char *optstr;
- if ((optstr = env_or_default("GRAB_COLORS", nullptr))) {
- enum {
- OPT_FN,
- OPT_HL,
- OPT_LN,
- OPT_SE,
- };
- /* clang-format off */
- static char *const tokens[] = {
- [OPT_FN] = "fn",
- [OPT_HL] = "hl",
- [OPT_LN] = "ln",
- [OPT_SE] = "se",
- nullptr
- };
- /* clang-format on */
-
- while (*optstr) {
- char *val;
- switch (getsubopt(&optstr, tokens, &val)) {
- case OPT_FN:
- if (sgrvalid(val))
- fn = val;
- break;
- case OPT_HL:
- if (sgrvalid(val))
- hl = val;
- break;
- case OPT_LN:
- if (sgrvalid(val))
- fn = val;
- break;
- case OPT_SE:
- if (sgrvalid(val))
- se = val;
- break;
- default:
- warnx("invalid color value -- '%s'", val);
- }
- }
- }
-
- if (!fn)
- fn = DEFCOL_FN;
- if (!hl)
- hl = DEFCOL_HL;
- if (!ln)
- ln = DEFCOL_LN;
- if (!se)
- se = DEFCOL_SE;
- }
-
- if (FLAGSET('f') || filecnt > 1) {
- char sep = FLAGSET('z') ? '\0' : ':';
- printf("\33[%sm%s\33[0m" /* filename */
- "\33[%sm%c\33[0m", /* separator */
- fn, filename, se, sep);
-
- if (FLAGSET('b')) {
- printf("\33[%sm%td\33[0m" /* byte offset */
- "\33[%sm%c\33[0m", /* separator */
- ln, sv.p - pos.bp, se, sep);
- } else {
- struct u8view v;
- size_t len = sv.p - pos.p;
-
- while (u8gnext(&v, &pos.p, &len)) {
- if (islbrk(v)) {
- pos.col = 1;
- pos.row++;
- } else
- pos.col++;
- }
-
- printf("\33[%sm%zu\33[0m" /* row */
- "\33[%sm%c\33[0m" /* separator */
- "\33[%sm%zu\33[0m" /* column */
- "\33[%sm%c\33[0m", /* separator */
- ln, pos.row, se, sep, ln, pos.col, se, sep);
- }
- }
-
- /* Here we need to take all the views of regions to highlight, and try
- to merge them into a simpler form. This happens in two steps:
-
- 1. Sort the views by their starting position in the matched text.
- 2. Merge overlapping views.
-
- After this process we should have the most reduced possible set of
- views. The next part is to actually print the highlighted regions
- possible which requires bounds-checking as highlighted regions may
- begin before or end after the matched text when using patterns such
- as ‘h/.+/ x/.$/’. */
-
- dainit(&valid, ms->len);
- qsort(ms->buf, ms->len, sizeof(*ms->buf), svposcmp);
- memcpy(valid.buf, ms->buf, ms->len * sizeof(*ms->buf));
- valid.len = ms->len;
-
- for (size_t i = 0; i + 1 < valid.len;) {
- ptrdiff_t d;
- struct sv *a, *b;
-
- a = valid.buf + i;
- b = valid.buf + i + 1;
- d = a->p + a->len - b->p;
-
- if (d >= 0) {
- a->len += MAX(b->len - d, 0);
- daremove(&valid, i + 1);
- } else
- i++;
- }
-
- for (size_t i = 0; i < valid.len; i++) {
- struct sv *m = valid.buf + i;
- if (m->p + m->len < sv.p || m->p >= sv.p + sv.len) {
- daremove(&valid, i);
- i--;
- continue;
- }
-
- if (m->p < sv.p) {
- m->len -= sv.p - m->p;
- m->p = sv.p;
- }
- m->len = MIN(m->len, (size_t)(sv.p + sv.len - m->p));
- }
-
- p = sv.p;
- da_foreach (&valid, m) {
- printf("%.*s\33[%sm%.*s\33[0m", (int)(m->p - p), p, hl, (int)m->len,
- m->p);
- p = m->p + m->len;
- }
- fwrite(p, 1, sv.p + sv.len - p, stdout);
-
- if (!(FLAGSET('s') && sv.p[sv.len - 1] == '\n'))
- putchar(FLAGSET('z') ? '\0' : '\n');
- free(valid.buf);
-}
-
-void
-putm_nc(struct sv sv, struct matches *ms, const char *filename)
-{
- (void)ms;
-
- got_match = true;
-
- if (FLAGSET('f') || filecnt > 1) {
- char sep = FLAGSET('z') ? '\0' : ':';
- printf("%s%c", filename, sep);
-
- if (FLAGSET('b'))
- printf("%td%c", sv.p - pos.bp, sep);
- else {
- struct u8view v;
- size_t len = sv.p - pos.p;
-
- while (u8gnext(&v, &pos.p, &len)) {
- if (islbrk(v)) {
- pos.col = 1;
- pos.row++;
- } else
- pos.col++;
- }
-
- printf("%zu%c%zu%c", pos.row, sep, pos.col, sep);
- }
- }
- fwrite(sv.p, 1, sv.len, stdout);
- if (!(FLAGSET('s') && sv.p[sv.len - 1] == '\n'))
- putchar(FLAGSET('z') ? '\0' : '\n');
-}
-
-bool
-islbrk(struct u8view v)
-{
- return *v.p == '\n' || (v.len == 2 && memeq(v.p, "\r\n", 2));
-}
-
-bool
-sgrvalid(const char *s)
-{
- if (!s || !*s)
- return false;
- do {
- if ((*s < '0' || *s > '9') && *s != ';')
- return false;
- } while (*++s);
-
- return true;
-}
-
-regex_t
-mkregex(char8_t *s, size_t n)
-{
- int ret, cflags;
- regex_t r;
- char8_t c = s[n];
-
- s[n] = 0;
- cflags = REG_EXTENDED | REG_UTF | (FLAGSET('n') ? REG_NEWLINE : REG_DOTALL);
- if (FLAGSET('i'))
- cflags |= REG_ICASE;
- if (!FLAGSET('U'))
- cflags |= REG_UCP;
- if (ret = regcomp(&r, s, cflags)) {
- char emsg[256];
- regerror(ret, &r, emsg, sizeof(emsg));
- diex("failed to compile regex ‘%s’: %s", s, emsg);
- }
- s[n] = c;
-
- return r;
-}
-
-#if GIT_GRAB
-FILE *
-getfstream(int argc, char *argv[argc])
-{
- pid_t pid;
- int fds[2];
- enum {
- FD_R,
- FD_W,
- };
-
- if (pipe(fds) == -1)
- die("pipe");
-
- switch (pid = fork()) {
- case -1:
- die("fork");
- case 0:;
- size_t len;
- char **args;
- static const char *git_grep_args[] = {
- "git", "grep", "--cached", "-Ilz", "",
- };
-
- len = argc + lengthof(git_grep_args) + 1;
-
- close(fds[FD_R]);
- if (dup2(fds[FD_W], STDOUT_FILENO) == -1)
- die("dup2");
- close(fds[FD_W]);
-
- if (!(args = malloc(len * sizeof(char *))))
- die("malloc");
- memcpy(args, git_grep_args, sizeof(git_grep_args));
- memcpy(args + 5, argv, argc * sizeof(char *));
- args[len - 1] = nullptr;
-
- execvp("git", args);
- die("execvp: git grep --cached -Ilz ''");
- }
-
- close(fds[FD_W]);
- return fdopen(fds[FD_R], "r");
-}
-#endif
-
-char *
-env_or_default(const char *e, const char *d)
-{
- const char *s = getenv(e);
- return (char *)(s && *s ? s : d);
-}
-
-bool
-xisspace(char c)
-{
- return c == ' ' || c == '\t' || c == '\n';
-}
diff --git a/src/main.c b/src/main.c
new file mode 100644
index 0000000..5b2efff
--- /dev/null
+++ b/src/main.c
@@ -0,0 +1,372 @@
+#include <langinfo.h>
+#include <locale.h>
+#include <stdatomic.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <alloc.h>
+#include <array.h>
+#include <cli.h>
+#include <errors.h>
+#include <macros.h>
+#include <mbstring.h>
+#include <pcre2.h>
+#include <unicode/prop.h>
+#include <unicode/string.h>
+
+#include "exitcodes.h"
+#include "tpool.h"
+#include "work.h"
+
+#define MAIN_C 1
+#include "flags.h"
+
+static bool use_color_p(void);
+static op_t *pattern_comp(u8view_t pat);
+#if GIT_GRAB
+static FILE *getfstream(int globc, char **globv);
+#endif
+
+atomic_int rv = EXIT_NOMATCH;
+op_t *ops;
+/* For use in diagnostic messages */
+const char *lquot = "`", *rquot = "'";
+
+/* We need to use different matching functions depending on if we’re using JIT
+ matching or not */
+typeof(pcre2_match) *pcre2_match_fn;
+
+static char emsg[256]; /* Buffer for PCRE2 error messages */
+/* TODO: Use the LUT in work.c */
+static const bool opchars[] = {
+ ['g'] = true,
+ ['G'] = true,
+ ['h'] = true,
+ ['H'] = true,
+ ['x'] = true,
+ ['X'] = true,
+};
+
+/* Program entry point.
+
+   Parses the command-line options into ‘flags’, compiles the pattern
+   argument into the global ‘ops’ array, builds the list of files to search
+   (the remaining arguments, or the output of getfstream() in GIT_GRAB
+   builds) and hands that list to tpinit().  If tpinit() reports that no
+   threads are available the files are processed sequentially instead.
+
+   Returns the global status ‘rv’; usage errors exit with EXIT_FATAL. */
+int
+main(int argc, char **argv)
+{
+	mlib_setprogname(argv[0]);
+	setlocale(LC_ALL, "");
+
+	/* Only use typographic quotes in diagnostics in a UTF-8 locale */
+	if (streq(nl_langinfo(CODESET), "UTF-8")) {
+		lquot = u8"‘";
+		rquot = u8"’";
+	}
+
+	optparser_t parser = mkoptparser(argv);
+	static const cli_opt_t opts[] = {
+		{'c', U8C("color"),         CLI_NONE},
+		{'h', U8C("help"),          CLI_NONE},
+		{'i', U8C("ignore-case"),   CLI_NONE},
+		{'l', U8C("line"),          CLI_OPT},
+		{'p', U8C("predicate"),     CLI_NONE},
+		{'s', U8C("strip-newline"), CLI_NONE},
+		{'U', U8C("no-unicode"),    CLI_NONE},
+		{'z', U8C("zero"),          CLI_NONE},
+	};
+
+	for (;;) {
+		rune opt = optparse(&parser, opts, lengthof(opts));
+		if (opt == 0)
+			break;
+		switch (opt) {
+		case 'c':
+			flags.c = true;
+			break;
+		case 'h':
+			execlp("man", "man", "1", mlib_progname(), nullptr);
+			/* Only reached when man(1) could not be executed.  Exit with
+			   EXIT_FATAL like every other fatal error so that the failure
+			   cannot be mistaken for ‘no match’. */
+			cerr(EXIT_FATAL, "execlp: man 1 %s:", mlib_progname());
+		case 'i':
+			flags.i = true;
+			break;
+		case 'l':
+			flags.l = true;
+			break;
+		case 'p':
+			flags.p = true;
+			break;
+		case 's':
+			flags.s = true;
+			break;
+		case 'U':
+			flags.U = true;
+			break;
+		case 'z':
+			flags.z = true;
+			break;
+		case -1:
+			/* The message is not ours to trust as a format string */
+			warn("%s", parser.errmsg);
+			goto usage;
+		}
+	}
+
+	if (flags.p && flags.s) {
+		warn("-p and -s are mutually exclusive");
+		goto usage;
+	}
+	if (flags.p && flags.z) {
+		warn("-p and -z are mutually exclusive");
+		goto usage;
+	}
+	if (flags.s && flags.z) {
+		warn("-s and -z are mutually exclusive");
+		goto usage;
+	}
+
+	argc -= parser.optind;
+	argv += parser.optind;
+
+	if (argc == 0) {
+	usage:
+		usage("[-p | -s | -z] [-cilU] pattern [file ...]", "-h");
+		exit(EXIT_FATAL);
+	}
+
+	flags.c = flags.c || use_color_p();
+	ops = pattern_comp((u8view_t){*argv, strlen(*argv)});
+
+	allocator_t mem = init_heap_allocator(nullptr);
+
+#if GIT_GRAB
+	/* Everything after the pattern is a glob for getfstream() */
+	argc--;
+	argv++;
+
+	FILE *fstream = getfstream(argc, argv);
+	if (fstream == nullptr)
+		cerr(EXIT_FATAL, "getfstream:");
+
+	const char **filenames = array_new(mem, typeof(*filenames), 1024);
+
+	/* The stream yields NUL-delimited filenames */
+	size_t len;
+	ssize_t nr;
+	char *file = nullptr;
+	while ((nr = getdelim(&file, &len, 0, fstream)) > 0) {
+		/* TODO: Would an arena improve performance? */
+		const char *s = strdup(file);
+		if (s == nullptr)
+			cerr(EXIT_FATAL, "strdup:");
+		array_push(&filenames, s);
+	}
+
+	const char **files = filenames;
+	size_t nfiles = (size_t)array_len(filenames);
+#else
+	/* With no file arguments we read the standard input */
+	if (argc == 1)
+		argv = (static char *[]){"-"};
+	else {
+		argc--;
+		argv++;
+		flags.do_header = true;
+	}
+
+	const char **files = (const char **)argv;
+	size_t nfiles = (size_t)argc;
+#endif
+
+	tpool_t tp;
+	int thrds = tpinit(&tp, files, nfiles);
+
+	if (thrds == 0) {
+		/* Failed to spawn threads; process the very same file list
+		   sequentially.  (In GIT_GRAB builds ‘argv’ holds the globs and
+		   not the files, so it must not be used here.) */
+		unsigned char *buf = array_new(mem, typeof(*buf), 4096);
+		for (size_t i = 0; i < nfiles; i++) {
+			process_file(files[i], &buf);
+			fwrite(buf, 1, array_len(buf), stdout);
+			array_hdr(buf)->len = 0;
+		}
+#if DEBUG
+		array_free(buf);
+#endif
+	} else
+		tpfree(&tp);
+
+#if DEBUG
+	pcre2_jit_free_unused_memory(nullptr);
+	array_foreach (ops, op) {
+		if (op->free_me)
+			pcre2_code_free(op->re);
+	}
+	array_free(ops);
+#if GIT_GRAB
+	/* ‘f’ points at the array slot; the strdup()ed string is ‘*f’ */
+	array_foreach (filenames, f)
+		free((void *)*f);
+	array_free(filenames);
+#endif
+#endif
+	return rv;
+}
+
+/* Compile the pattern string ‘pat’ into an array of operators.
+
+   A pattern is a sequence of operators (one of g, G, h, H, x or X), each
+   followed by a delimited regular expression, optionally separated by
+   pattern whitespace.  Every regex is compiled with PCRE2 and then JIT
+   compiled.  As a side effect ‘pcre2_match_fn’ is set to the matching
+   function that is valid for all of the compiled regexes, and ‘rv’ is set
+   to EXIT_WARNING if JIT compilation failed.
+
+   Returns the operator array; any error in the pattern is fatal. */
+op_t *
+pattern_comp(u8view_t pat)
+{
+	allocator_t mem = init_heap_allocator(nullptr);
+	op_t *ops = array_new(mem, op_t, 16);
+
+	/* Assume that JIT matching is usable until a single regex fails to JIT
+	   compile.  This must never be reset per-regex: pcre2_jit_match() is
+	   only valid for regexes that were JIT compiled, while pcre2_match()
+	   works for all of them. */
+	pcre2_match_fn = pcre2_jit_match;
+
+	for (;;) {
+		int w;
+		rune ch;
+
+		/* Skip leading whitespace, un-reading the first rune that isn’t */
+		while ((w = ucsnext(&ch, &pat)) != 0) {
+			if (!uprop_is_pat_ws(ch)) {
+				VSHFT(&pat, -w);
+				break;
+			}
+		}
+		if (pat.len == 0)
+			break;
+
+		/* Grab the operator.  We grab the entire next grapheme for
+		   better error messages in the case that someone tries to use a
+		   non-ASCII grapheme as an operator for whatever reason. */
+
+		op_t op;
+		u8view_t g;
+
+		(void)ucsgnext(&g, &pat);
+		if (g.len != 1 || *g.p >= lengthof(opchars) || !opchars[*g.p]) {
+			cerr(EXIT_FATAL, "Invalid operator %s%.*s%s",
+			     lquot, SV_PRI_ARGS(g), rquot);
+		}
+		op.c = (char)*g.p;
+
+		/* Unlike with the operator, we parse the delimiter as a rune
+		   instead of a grapheme.  This makes it easier for users to
+		   write patterns that match combining characters.  This _may_ be
+		   subject to change in the future but for now this is the
+		   rationale.  Alongside standard delimiters, if the opening
+		   delimiter is a bracket or some other form of paired-bracket
+		   (as determined by Unicode) then the closing delimiter is set
+		   to the right-hand form of the bracket.  This means that the
+		   following are both valid delimited patterns:
+
+		       /regex/
+		       「regex」 */
+
+		rune ldelim, rdelim;
+		if ((w = ucsnext(&ldelim, &pat)) == 0)
+			cerr(EXIT_FATAL, "Premature end of pattern");
+		rdelim = uprop_get_bpb(ldelim);
+
+		/* Find the right delimiter, which is optional for the last
+		   operator */
+		/* TODO: Change u8view_t.len to ptrdiff_t and use -1 here */
+		u8view_t re = {pat.p, (size_t)-1};
+		while ((w = ucsnext(&ch, &pat)) != 0) {
+			if (ch == rdelim) {
+				/* Don’t include the delimiter itself in the regex */
+				re.len = pat.p - re.p - w;
+				break;
+			}
+		}
+		if (re.len == (size_t)-1)
+			re.len = pat.p - re.p;
+		if (re.len == 0) {
+			/* Only ‘h’ may have an empty regex, in which case it reuses
+			   the regex of the previous operator */
+			if (op.c != 'h') {
+				cerr(EXIT_FATAL, "%s%c%s operator given empty regex",
+				     lquot, op.c, rquot);
+			}
+			if (array_len(ops) == 0) {
+				cerr(EXIT_FATAL,
+				     "%sh%s operator given empty regex as the first operator",
+				     lquot, rquot);
+			}
+			op.re = ops[array_len(ops) - 1].re;
+#if DEBUG
+			op.free_me = false;
+#endif
+		} else {
+			int ec;
+			size_t eoff;
+			uint32_t reopts = PCRE2_DOTALL | PCRE2_MATCH_INVALID_UTF | PCRE2_UTF;
+			if (flags.i)
+				reopts |= PCRE2_CASELESS;
+			if (!flags.U)
+				reopts |= PCRE2_UCP;
+			op.re = pcre2_compile(re.p, re.len, reopts, &ec, &eoff, nullptr);
+			if (op.re == nullptr) {
+				/* TODO: Ensure the buffer is large enough for the error message */
+				(void)pcre2_get_error_message(ec, emsg, sizeof(emsg));
+				cerr(EXIT_FATAL, "Failed to compile regex: %s", emsg);
+			}
+			if ((ec = pcre2_jit_compile(op.re, PCRE2_JIT_COMPLETE)) != 0) {
+				/* TODO: Ensure the buffer is large enough for the error message */
+				(void)pcre2_get_error_message(ec, emsg, sizeof(emsg));
+				warn("Failed to JIT compile regex: %s", emsg);
+				rv = EXIT_WARNING;
+				/* Sticky fallback; see the comment at the top */
+				pcre2_match_fn = pcre2_match;
+			}
+#if DEBUG
+			op.free_me = true;
+#endif
+		}
+		array_push(&ops, op);
+	}
+
+	/* Exit with EXIT_FATAL like every other pattern error so that scripts
+	   cannot mistake an empty pattern for ‘no match’ */
+	if (array_len(ops) == 0)
+		cerr(EXIT_FATAL, "Empty pattern");
+
+	return ops;
+}
+
+/* Decide whether colored output should be used when -c was not given.
+
+   The environment is consulted in order of precedence: TERM=dumb and a
+   non-empty NO_COLOR both disable color, a non-empty CLICOLOR_FORCE enables
+   it, and otherwise color is used only if the standard output is a
+   terminal. */
+bool
+use_color_p(void)
+{
+	const char *term = getenv("TERM");
+	if (term != nullptr && streq(term, "dumb"))
+		return false;
+
+	const char *no_color = getenv("NO_COLOR");
+	if (no_color != nullptr && no_color[0] != 0)
+		return false;
+
+	const char *force = getenv("CLICOLOR_FORCE");
+	if (force != nullptr && force[0] != 0)
+		return true;
+
+	return isatty(STDOUT_FILENO);
+}
+
+#if GIT_GRAB
+/* Spawn ‘git grep -Ilz '' [glob ...]’ and return a stream from which the
+   child’s standard output (a NUL-delimited list of filenames) can be read.
+
+   ‘globv’ holds ‘globc’ globs that are appended to the git-grep(1)
+   command-line.  Returns whatever fdopen() returns for the read end of the
+   pipe, so the result is nullptr if fdopen() fails; failures of pipe() and
+   fork() are fatal. */
+FILE *
+getfstream(int globc, char **globv)
+{
+	pid_t pid;
+	int fds[2];
+	enum { R, W };
+
+	/* pipe() reports failure with -1 */
+	if (pipe(fds) == -1)
+		cerr(EXIT_FATAL, "pipe:");
+
+	switch (pid = fork()) {
+	case -1:
+		cerr(EXIT_FATAL, "fork:");
+	case 0:
+		static const char *git_grep_argv[] = {
+			"git", "grep", "-Ilz", "",
+		};
+
+		/* The child only writes; make the pipe its standard output */
+		close(fds[R]);
+		if (dup2(fds[W], STDOUT_FILENO) == -1)
+			cerr(EXIT_FATAL, "dup2:");
+		close(fds[W]);
+
+		/* Fixed arguments, then the globs, then the terminating null */
+		size_t argc = globc + lengthof(git_grep_argv) + 1;
+		char **argv = malloc(argc * sizeof(char *));
+		if (argv == nullptr)
+			cerr(EXIT_FATAL, "malloc:");
+		memcpy(argv, git_grep_argv, sizeof(git_grep_argv));
+		memcpy(argv + lengthof(git_grep_argv), globv, globc * sizeof(char *));
+		argv[argc - 1] = nullptr;
+
+		execvp("git", argv);
+		cerr(EXIT_FATAL, "execvp: git grep -Ilz '':");
+	}
+
+	/* The parent only reads */
+	close(fds[W]);
+	return fdopen(fds[R], "r");
+}
+#endif
diff --git a/src/tpool.c b/src/tpool.c
new file mode 100644
index 0000000..f30bccc
--- /dev/null
+++ b/src/tpool.c
@@ -0,0 +1,127 @@
+/* Thread pool implementation mostly copied from cbs.h */
+
+#include <errno.h>
+#include <limits.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <alloc.h>
+#include <array.h>
+#include <errors.h>
+#include <macros.h>
+
+#include "tpool.h"
+#include "work.h"
+
+#include <stdio.h>
+
+/* Forward declarations of the file-local helpers */
+static int nproc(void);
+static void *tpwork(void *);
+
+/* Statically allocated storage for thread handles; tpinit() uses it whenever
+   the pool has at most 32 threads and only calls malloc() for larger pools */
+static pthread_t thread_buffer[32];
+
+/* Opening and closing quotation marks placed around user-supplied text in
+   diagnostics; defined in another translation unit */
+extern const char *lquot, *rquot;
+
+/* Return the number of worker threads to use; the result is always ≥ 1.
+
+   A non-empty GRAB_NPROCS environment variable takes precedence.  It must
+   hold a positive decimal integer that fits in an int; anything else yields
+   a diagnostic and a result of 1.  Without GRAB_NPROCS the number of online
+   processors reported by sysconf() is used, falling back to 1 on systems
+   that lack the query or when the query fails. */
+int
+nproc(void)
+{
+	/* We don’t want to error on an invalid value for GRAB_NPROCS because we
+	   might be running this tool as part of an editor plugin for example where
+	   finding the root cause of your regexp-search failing may not be so
+	   trivial. */
+
+	const char *ev = getenv("GRAB_NPROCS");
+	if (ev != nullptr && *ev != 0) {
+		char *endptr;
+
+		/* strtol() only reports range errors through errno, so it has
+		   to be cleared right before the call */
+		errno = 0;
+		long n = strtol(ev, &endptr, 10);
+		if (errno != 0)
+			warn("strtol: %s:", ev);
+		else if (*endptr != 0 || n < 1 || n > INT_MAX)
+			warn("Invalid value %s%s%s for GRAB_NPROCS", lquot, ev, rquot);
+		else
+			return (int)n;
+		return 1;
+	}
+
+#ifdef _SC_NPROCESSORS_ONLN
+	/* sysconf() returns -1 when the value can’t be determined */
+	long cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	if (cpus >= 1)
+		return cpus > INT_MAX ? INT_MAX : (int)cpus;
+#endif
+	return 1;
+}
+
+/* Initialize the thread pool ‘tp’ for the ‘filecnt’ entries of ‘files’ and
+   start its worker threads.
+
+   At most nproc() threads are requested, and never more than there are
+   files.  Thread handles live in the static thread_buffer when they fit and
+   in a malloc()ed array otherwise.
+
+   Returns the number of threads that were actually started.  The same number
+   is stored in tp->tcnt so that tpfree() only joins threads that exist. */
+int
+tpinit(tpool_t *tp, const char **files, ptrdiff_t filecnt)
+{
+	int want = nproc();
+	if (filecnt < want)
+		want = (int)filecnt;
+
+	tp->files = files;
+	tp->filecnt = filecnt;
+	tp->tcnt = want;
+	tp->wi = 0;
+
+	if (want <= (int)(sizeof(thread_buffer) / sizeof(thread_buffer[0])))
+		tp->thrds = thread_buffer;
+	else if ((tp->thrds = malloc(sizeof(*tp->thrds) * want)) == nullptr)
+		err("malloc:");
+
+	/* If for whatever reason some threads fail to be created, we don’t
+	   panic but instead just continue using the threads that were able
+	   to spawn.  If all threads fail to spawn we return 0 and the caller
+	   will resort to single-threaded behaviour. */
+
+	int n = 0;
+	for (int i = 0; i < want; i++) {
+		/* Handles are stored densely: a failed slot is reused by the next
+		   attempt */
+		if ((errno = pthread_create(tp->thrds + n, nullptr, tpwork, tp)) != 0)
+			warn("failed to create thread:");
+		else
+			n++;
+	}
+
+	/* Only the first ‘n’ handles are valid.  The workers never read tcnt, so
+	   updating it after they have started is safe. */
+	tp->tcnt = n;
+	return n;
+}
+
+/* Wait for the tp->tcnt worker threads of ‘tp’ to finish.  The handle array
+   is released in DEBUG builds only, and only when it isn’t the static
+   thread_buffer. */
+void
+tpfree(tpool_t *tp)
+{
+	int i = 0;
+	while (i < tp->tcnt) {
+		pthread_join(tp->thrds[i], nullptr);
+		i++;
+	}
+
+#if DEBUG
+	if (tp->thrds == thread_buffer)
+		return;
+	free(tp->thrds);
+#endif
+}
+
+/* Worker thread entry point; ‘arg’ is the tpool_t shared by all workers.
+
+   The worker repeatedly claims the next file index from the pool’s atomic
+   counter and passes that file to process_file(), collecting the output in a
+   buffer of its own.  Once the indices are exhausted the buffer is written to
+   stdout while holding the stream lock.  Always returns nullptr. */
+void *
+tpwork(void *arg)
+{
+	tpool_t *pool = arg;
+	allocator_t mem = init_heap_allocator(nullptr);
+	unsigned char *out = array_new(mem, typeof(*out), 4096);
+
+	ptrdiff_t idx;
+	while ((idx = atomic_fetch_add(&pool->wi, 1)) < pool->filecnt)
+		process_file(pool->files[idx], &out);
+
+	flockfile(stdout);
+	fwrite(out, 1, array_len(out), stdout);
+	funlockfile(stdout);
+
+#if DEBUG
+	array_free(out);
+#endif
+	return nullptr;
+}
diff --git a/src/tpool.h b/src/tpool.h
new file mode 100644
index 0000000..04f0486
--- /dev/null
+++ b/src/tpool.h
@@ -0,0 +1,19 @@
+#ifndef GRAB_TPOOL_H
+#define GRAB_TPOOL_H
+
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stddef.h>
+
+/* State of a thread pool that distributes a list of files among workers */
+typedef struct {
+ int tcnt; /* Number of worker threads */
+ pthread_t *thrds; /* Handles of the worker threads */
+ atomic_ptrdiff_t wi; /* Index of the next file to be claimed by a worker */
+ const char **files; /* Names of the files to process */
+ ptrdiff_t filecnt; /* Number of entries in ‘files’ */
+} tpool_t;
+
+/* Set up ‘tp’ to process the ‘filecnt’ entries of ‘files’ and start the worker
+   threads.  Returns the number of threads that were successfully created. */
+int tpinit(tpool_t *tp, const char **files, ptrdiff_t filecnt);
+/* Join the worker threads of ‘tp’ */
+void tpfree(tpool_t *tp);
+
+#endif /* !GRAB_TPOOL_H */
diff --git a/src/work.c b/src/work.c
new file mode 100644
index 0000000..37fd8b8
--- /dev/null
+++ b/src/work.c
@@ -0,0 +1,453 @@
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+#include <stdatomic.h>
+#include <stdckdint.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <alloc.h>
+#include <array.h>
+#include <errors.h>
+#include <macros.h>
+#include <mbstring.h>
+#include <pcre2.h>
+#include <unicode/string.h>
+
+#include "exitcodes.h"
+#include "flags.h"
+#include "work.h"
+
+/* Produce the signature of the operator function ‘operator_<fn>’, for use in
+   both declarations and definitions.  Every operator receives the index
+   ‘opi’ of the current operation, the view ‘sv’ it acts on, and a pointer
+   ‘hl’ to the array of views to highlight. */
+#define DEFINE_OPERATOR(fn) \
+ void operator_##fn(ptrdiff_t opi, u8view_t sv, u8view_t **hl)
+/* Append the bytes of the view ‘sv’ to the array ‘xs’ */
+#define array_extend_sv(xs, sv) \
+ array_extend((xs), (sv).p, (ptrdiff_t)(sv).len)
+
+/* A position within a file as computed by compute_pos().  Both fields are
+   zero-based; 1 is added when they are printed. */
+typedef struct {
+ ptrdiff_t row, col;
+} pos_t;
+
+/* Forward declarations of file-local helpers */
+static pos_t compute_pos(const char8_t *p);
+static bool islbrk(u8view_t g);
+static int svposcmp(const void *a, const void *b);
+static void write_match_to_buffer(u8view_t sv, u8view_t *hl);
+
+/* The dispatcher and one handler per operator character */
+static DEFINE_OPERATOR(dispatch);
+static DEFINE_OPERATOR(g);
+static DEFINE_OPERATOR(G);
+static DEFINE_OPERATOR(h);
+static DEFINE_OPERATOR(H);
+static DEFINE_OPERATOR(x);
+static DEFINE_OPERATOR(X);
+
+/* Per-thread state describing the file currently being processed; set up by
+   process_file() */
+static thread_local const char *filename; /* Name of the file */
+static thread_local char8_t *baseptr; /* Start of the file’s contents */
+static thread_local const char8_t *last_match; /* Where compute_pos() resumes counting */
+static thread_local unsigned char **buf; /* Output buffer that matches are appended to */
+
+/* Operator handlers, indexed by the operator’s character */
+static typeof(operator_dispatch) *operators[] = {
+ ['g'] = operator_g,
+ ['G'] = operator_G,
+ ['h'] = operator_h,
+ ['H'] = operator_H,
+ ['x'] = operator_x,
+ ['X'] = operator_X,
+};
+
+/* Globals defined in other translation units */
+extern atomic_int rv; /* Status shared by all threads; EXIT_* values are stored into it */
+extern op_t *ops; /* The operations to apply, in order */
+extern bool cflag;
+extern typeof(pcre2_match) *pcre2_match_fn; /* Match function used by all operators */
+
+
+
+/* Read the file ‘locl_filename’ (or standard input when it is ‘-’) into the
+   thread-local contents buffer, run the operator chain over it, and append
+   any resulting matches to ‘*locl_buf’.
+
+   A file that can’t be opened or read is reported with a warning and sets
+   the shared exit status to EXIT_WARNING; no operators are run on it. */
+void
+process_file(const char *locl_filename, unsigned char **locl_buf)
+{
+	filename = locl_filename;
+	buf = locl_buf;
+
+	FILE *fp = streq(filename, "-") ? stdin : fopen(filename, "r");
+	if (fp == nullptr) {
+		warn("fopen: %s:", filename);
+		atomic_store(&rv, EXIT_WARNING);
+		/* Nothing was opened, so there is nothing to close */
+		return;
+	}
+
+	allocator_t mem = init_heap_allocator(nullptr);
+	if (baseptr == nullptr)
+		baseptr = array_new(mem, char8_t, 0x1000);
+	size_t bufsz = array_cap(baseptr);
+
+	/* Stop on a read error as well as on end-of-file; a stream in the error
+	   state may never report EOF (e.g. when ‘filename’ names a directory) */
+	do {
+		static_assert(sizeof(char8_t) == 1, "sizeof(char8_t) != 1; wtf?");
+		baseptr = array_resz(baseptr, bufsz += BUFSIZ); /* TODO: Bounds checking */
+		size_t n = fread(baseptr + array_len(baseptr), 1, BUFSIZ, fp);
+		array_hdr(baseptr)->len += n;
+	} while (!feof(fp) && !ferror(fp));
+
+	if (ferror(fp)) {
+		warn("fread: %s:", filename);
+		atomic_store(&rv, EXIT_WARNING);
+		/* Discard the partial read so that it can’t end up in front of the
+		   next file read into this buffer */
+		array_hdr(baseptr)->len = 0;
+		goto out;
+	}
+
+	/* Record the start of the contents only now: array_resz() hands back the
+	   buffer’s address on every iteration, so it may have changed while
+	   reading */
+	last_match = baseptr;
+
+	/* Shouldn’t need more than 32 ever… */
+	static thread_local u8view_t *hl = nullptr;
+	if (hl == nullptr)
+		hl = array_new(mem, typeof(*hl), 32);
+
+	operator_dispatch(0, (u8view_t){baseptr, array_len(baseptr)}, &hl);
+#if DEBUG
+	array_free(baseptr);
+	baseptr = nullptr;
+	array_free(hl);
+	hl = nullptr;
+#else
+	/* Keep the allocations around for the next file, but empty them */
+	array_hdr(baseptr)->len = 0;
+	array_hdr(hl)->len = 0;
+#endif
+
+out:
+	if (fp != stdin)
+		(void)fclose(fp);
+}
+
+
+
+/* Run the operation with index ‘opi’ on ‘sv’.  Once every operation has been
+   applied, ‘sv’ is a match: with flags.p set the process exits successfully
+   right away, otherwise the exit status is changed from EXIT_NOMATCH to
+   EXIT_SUCCESS and the match is written to the output buffer. */
+DEFINE_OPERATOR(dispatch)
+{
+	if (opi != array_len(ops)) {
+		/* Cast to silence GCC warning */
+		unsigned char c = (unsigned char)ops[opi].c;
+		operators[c](opi, sv, hl);
+		return;
+	}
+
+	if (flags.p)
+		exit(EXIT_SUCCESS);
+
+	int expected = EXIT_NOMATCH;
+	atomic_compare_exchange_strong(&rv, &expected, EXIT_SUCCESS);
+	write_match_to_buffer(sv, *hl);
+}
+
+/* Operator ‘g’: continue with the next operation only if the regex of this
+   operation matches somewhere in ‘sv’.  The view is passed on unchanged.
+   A matching error is reported and treated as “no match”. */
+DEFINE_OPERATOR(g)
+{
+	pcre2_match_data *md =
+		pcre2_match_data_create_from_pattern(ops[opi].re, nullptr);
+	int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY,
+	                       md, nullptr);
+	pcre2_match_data_free(md);
+
+	/* This should never happen */
+	if (n == 0)
+		cerr(EXIT_FATAL, "PCRE2 match data too small");
+	if (n == PCRE2_ERROR_NOMATCH)
+		return;
+	if (n < 0) {
+		/* Any other negative value is a genuine matching error, not a
+		   match, so it must not fall through to the next operation */
+		warn("%s: regex matching failed (PCRE2 error %d)", filename, n);
+		atomic_store(&rv, EXIT_WARNING);
+		return;
+	}
+
+	operator_dispatch(opi + 1, sv, hl);
+}
+
+/* Operator ‘G’: continue with the next operation only if the regex of this
+   operation does NOT match anywhere in ‘sv’.  The view is passed on
+   unchanged.  A matching error is reported and the view is dropped. */
+DEFINE_OPERATOR(G)
+{
+	/* TODO: Can we reuse match data? */
+	pcre2_match_data *md =
+		pcre2_match_data_create_from_pattern(ops[opi].re, nullptr);
+	int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY,
+	                       md, nullptr);
+	pcre2_match_data_free(md);
+
+	/* This should never happen */
+	if (n == 0)
+		cerr(EXIT_FATAL, "PCRE2 match data too small");
+	if (n == PCRE2_ERROR_NOMATCH) {
+		operator_dispatch(opi + 1, sv, hl);
+		return;
+	}
+	if (n < 0) {
+		/* A matching error tells us nothing about whether the regex
+		   matches, so report it instead of staying silent */
+		warn("%s: regex matching failed (PCRE2 error %d)", filename, n);
+		atomic_store(&rv, EXIT_WARNING);
+	}
+}
+
+/* Operator ‘h’: record every match of this operation’s regex within ‘sv’ as a
+   region to highlight, then continue with the next operation on the
+   unchanged view.  The regions recorded here are removed again before
+   returning.  When flags.p is set the search is skipped and the view is
+   passed straight on. */
+DEFINE_OPERATOR(h)
+{
+	if (flags.p) {
+		operator_dispatch(opi + 1, sv, hl);
+		return;
+	}
+
+	pcre2_match_data *md =
+		pcre2_match_data_create_from_pattern(ops[opi].re, nullptr);
+	u8view_t sv_save = sv;
+	ptrdiff_t origlen = array_len(*hl);
+	for (;;) {
+		int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0,
+		                       PCRE2_NOTEMPTY, md, nullptr);
+		/* This should never happen */
+		if (n == 0)
+			cerr(EXIT_FATAL, "PCRE2 match data too small");
+		if (n == PCRE2_ERROR_NOMATCH)
+			break;
+		if (n < 0) {
+			/* The offset vector holds no valid match after an error,
+			   so stop searching and keep the regions found so far */
+			warn("%s: regex matching failed (PCRE2 error %d)", filename, n);
+			atomic_store(&rv, EXIT_WARNING);
+			break;
+		}
+
+		size_t *ov = pcre2_get_ovector_pointer(md);
+		array_push(hl, ((u8view_t){sv.p + ov[0], ov[1] - ov[0]}));
+		/* Resume the search right after this match */
+		VSHFT(&sv, ov[1]);
+	}
+	pcre2_match_data_free(md);
+	operator_dispatch(opi + 1, sv_save, hl);
+	array_hdr(*hl)->len = origlen;
+}
+
+/* Operator ‘H’: record the text in front of every match of this operation’s
+   regex within ‘sv’ as a region to highlight, then continue with the next
+   operation on the unchanged view.  The regions recorded here are removed
+   again before returning.  When flags.p is set the search is skipped and the
+   view is passed straight on.
+
+   NOTE(review): unlike operator ‘X’, the text following the last match is
+   not recorded — confirm whether that is intended. */
+DEFINE_OPERATOR(H)
+{
+	if (flags.p) {
+		operator_dispatch(opi + 1, sv, hl);
+		return;
+	}
+
+	pcre2_match_data *md =
+		pcre2_match_data_create_from_pattern(ops[opi].re, nullptr);
+	u8view_t sv_save = sv;
+	ptrdiff_t origlen = array_len(*hl);
+	for (;;) {
+		int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY,
+		                       md, nullptr);
+		/* This should never happen */
+		if (n == 0)
+			cerr(EXIT_FATAL, "PCRE2 match data too small");
+		if (n == PCRE2_ERROR_NOMATCH)
+			break;
+		if (n < 0) {
+			/* The offset vector holds no valid match after an error,
+			   so stop searching and keep the regions found so far */
+			warn("%s: regex matching failed (PCRE2 error %d)", filename, n);
+			atomic_store(&rv, EXIT_WARNING);
+			break;
+		}
+
+		size_t *ov = pcre2_get_ovector_pointer(md);
+		array_push(hl, ((u8view_t){sv.p, ov[0]}));
+		/* Resume the search right after this match */
+		VSHFT(&sv, ov[1]);
+	}
+	pcre2_match_data_free(md);
+	operator_dispatch(opi + 1, sv_save, hl);
+	array_hdr(*hl)->len = origlen;
+}
+
+/* Operator ‘x’: run the next operation once for every match of this
+   operation’s regex within ‘sv’, passing it the matched text.  Matches are
+   found from left to right and don’t overlap.  A matching error is reported
+   and ends the search. */
+DEFINE_OPERATOR(x)
+{
+	pcre2_match_data *md =
+		pcre2_match_data_create_from_pattern(ops[opi].re, nullptr);
+	for (;;) {
+		int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY,
+		                       md, nullptr);
+		/* This should never happen */
+		if (n == 0)
+			cerr(EXIT_FATAL, "PCRE2 match data too small");
+		if (n == PCRE2_ERROR_NOMATCH)
+			break;
+		if (n < 0) {
+			/* The offset vector holds no valid match after an error */
+			warn("%s: regex matching failed (PCRE2 error %d)", filename, n);
+			atomic_store(&rv, EXIT_WARNING);
+			break;
+		}
+
+		size_t *ov = pcre2_get_ovector_pointer(md);
+		operator_dispatch(opi + 1, (u8view_t){sv.p + ov[0], ov[1] - ov[0]}, hl);
+		/* Resume the search right after this match */
+		VSHFT(&sv, ov[1]);
+	}
+	pcre2_match_data_free(md);
+}
+
+/* Operator ‘X’: run the next operation once for every non-empty stretch of
+   ‘sv’ that lies between matches of this operation’s regex, including the
+   text in front of the first match and after the last one.  A matching
+   error is reported and ends the operation; the text that could not be
+   searched is not passed on. */
+DEFINE_OPERATOR(X)
+{
+	pcre2_match_data *md =
+		pcre2_match_data_create_from_pattern(ops[opi].re, nullptr);
+	for (;;) {
+		int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY,
+		                       md, nullptr);
+		/* This should never happen */
+		if (n == 0)
+			cerr(EXIT_FATAL, "PCRE2 match data too small");
+		if (n == PCRE2_ERROR_NOMATCH)
+			break;
+		if (n < 0) {
+			/* The offset vector holds no valid match after an error, and
+			   we can’t tell which parts of the rest are free of matches */
+			warn("%s: regex matching failed (PCRE2 error %d)", filename, n);
+			atomic_store(&rv, EXIT_WARNING);
+			pcre2_match_data_free(md);
+			return;
+		}
+
+		size_t *ov = pcre2_get_ovector_pointer(md);
+		if (ov[0] != 0)
+			operator_dispatch(opi + 1, (u8view_t){sv.p, ov[0]}, hl);
+		/* Resume the search right after this match */
+		VSHFT(&sv, ov[1]);
+	}
+	if (sv.len != 0)
+		operator_dispatch(opi + 1, sv, hl);
+	pcre2_match_data_free(md);
+}
+
+
+
+/* Report whether the views ‘a’ and ‘b’ overlap or directly adjoin each other,
+   i.e. whether together they cover one contiguous range.  The test is
+   symmetric and also holds when one view lies entirely within the other. */
+static inline bool
+views_overlap(const u8view_t a, const u8view_t b)
+{
+	const char8_t *a_end = a.p + a.len;
+	const char8_t *b_end = b.p + b.len;
+	return a.p <= b_end && b.p <= a_end;
+}
+
+/* Append the match ‘sv’ to the thread’s output buffer.
+
+   If a header is wanted (always under GIT_GRAB, otherwise when
+   flags.do_header is set) the match is preceded by the filename and either
+   its 1-based row and column (flags.l) or its byte offset within the file,
+   each followed by a separator: NUL with flags.z, ‘:’ otherwise.
+
+   ‘hl’ holds the regions to highlight.  They are sorted, merged and clipped
+   to the match, and are wrapped in color escape sequences when flags.c is
+   set.  The match is terminated by NUL with flags.z; otherwise by a newline,
+   which is omitted if flags.s is set and the output already ends in one. */
+void
+write_match_to_buffer(u8view_t sv, u8view_t *hl)
+{
+	const u8view_t COL_FN = !flags.c ? U8("") : U8("\33[35m");
+	const u8view_t COL_HL = !flags.c ? U8("") : U8("\33[01;31m");
+	const u8view_t COL_LN = !flags.c ? U8("") : U8("\33[32m");
+	const u8view_t COL_SE = !flags.c ? U8("") : U8("\33[36m");
+	const u8view_t COL_RS = !flags.c ? U8("") : U8("\33[0m");
+
+	if (
+#if GIT_GRAB
+		true
+#else
+		flags.do_header
+#endif
+	) {
+		char sep = flags.z ? 0 : ':';
+
+		size_t filenamesz = strlen(filename);
+
+		array_extend_sv(buf, COL_FN);
+		array_extend(buf, filename, (ptrdiff_t)filenamesz);
+		array_extend_sv(buf, COL_RS);
+
+		array_extend_sv(buf, COL_SE);
+		array_push(buf, sep);
+		array_extend_sv(buf, COL_RS);
+
+		/* A byte contributes at most three decimal digits; on top of that
+		   we need room for a sign and the terminating NUL */
+		int offsetsz;
+		char offset[sizeof(ptrdiff_t) * 3 + 2];
+		if (flags.l) {
+			pos_t p = compute_pos(sv.p);
+
+			offsetsz = snprintf(offset, sizeof(offset), "%td", p.row + 1);
+			array_extend_sv(buf, COL_LN);
+			array_extend(buf, offset, offsetsz);
+			array_extend_sv(buf, COL_RS);
+
+			array_extend_sv(buf, COL_SE);
+			array_push(buf, sep);
+			array_extend_sv(buf, COL_RS);
+
+			offsetsz = snprintf(offset, sizeof(offset), "%td", p.col + 1);
+			array_extend_sv(buf, COL_LN);
+			array_extend(buf, offset, offsetsz);
+			array_extend_sv(buf, COL_RS);
+		} else {
+			offsetsz = snprintf(offset, sizeof(offset), "%td",
+			                    sv.p - baseptr);
+			array_extend_sv(buf, COL_LN);
+			array_extend(buf, offset, offsetsz);
+			array_extend_sv(buf, COL_RS);
+		}
+
+		array_extend_sv(buf, COL_SE);
+		array_push(buf, sep);
+		array_extend_sv(buf, COL_RS);
+	}
+
+	/* Here we need to take all the views of regions to highlight, and try
+	   to merge them into a simpler form.  This happens in two steps:
+
+	   1. Sort the views by their starting position in the matched text.
+	   2. Merge overlapping views.
+
+	   After this process we should have the most reduced possible set of
+	   views.  The next part is to actually print the highlighted regions,
+	   which requires bounds-checking as highlighted regions may begin
+	   before or end after the matched text when using patterns such as
+	   ‘h/.+/ x/.$/’. */
+
+	static thread_local u8view_t *sorted = nullptr;
+	if (sorted == nullptr) {
+		allocator_t mem = init_heap_allocator(nullptr);
+		ptrdiff_t buflen = array_len(hl);
+		buflen = MAX(buflen, 16);
+		sorted = array_new(mem, typeof(*sorted), buflen);
+	} else
+		array_hdr(sorted)->len = 0;
+
+	array_extend(&sorted, hl, array_len(hl));
+	qsort(sorted, array_len(sorted), sizeof(*sorted), svposcmp);
+
+	/* Merge in place within ‘sorted’ (the caller’s ‘hl’ is left untouched):
+	   ‘w’ is the view being grown, ‘r’ the one under inspection.  Since the
+	   views are ordered by start, sorted[r] touches sorted[w] exactly when
+	   it begins at or before the end of sorted[w]. */
+	ptrdiff_t nviews = array_len(sorted);
+	if (nviews > 0) {
+		ptrdiff_t w = 0;
+		for (ptrdiff_t r = 1; r < nviews; r++) {
+			const char8_t *w_end = sorted[w].p + sorted[w].len;
+			const char8_t *r_end = sorted[r].p + sorted[r].len;
+			if (sorted[r].p > w_end)
+				sorted[++w] = sorted[r];
+			else if (r_end > w_end) /* Never shrink the merged view */
+				sorted[w].len = r_end - sorted[w].p;
+		}
+		nviews = w + 1;
+		array_hdr(sorted)->len = nviews;
+	}
+
+	const char8_t *sv_end = sv.p + sv.len;
+	for (ptrdiff_t i = 0; i < nviews; i++) {
+		/* Clip the region to what is left of the match and skip it if
+		   nothing remains */
+		const char8_t *hs = sorted[i].p;
+		const char8_t *he = hs + sorted[i].len;
+		if (hs < sv.p)
+			hs = sv.p;
+		if (he > sv_end)
+			he = sv_end;
+		if (hs >= he)
+			continue;
+
+		array_extend(buf, sv.p, hs - sv.p);
+		array_extend_sv(buf, COL_HL);
+		array_extend(buf, hs, he - hs);
+		array_extend_sv(buf, COL_RS);
+
+		/* Computed up front in case VSHFT() evaluates its argument more
+		   than once */
+		ptrdiff_t Δ = he - sv.p;
+		VSHFT(&sv, Δ);
+	}
+	array_extend_sv(buf, sv);
+
+#if DEBUG
+	array_free(sorted);
+	sorted = nullptr;
+#endif
+
+	if (flags.z)
+		array_push(buf, 0);
+	else {
+		ptrdiff_t bufsz = array_len(*buf);
+		if (!flags.s || bufsz == 0 || (*buf)[bufsz - 1] != '\n')
+			array_push(buf, '\n');
+	}
+}
+
+/* Return the zero-based row and column of ‘ptr’ within the current file.
+
+   The position is computed incrementally: counting resumes at ‘last_match’
+   with the row and column reached by the previous call, and starts over at
+   0:0 when ‘last_match’ is the start of the file.  Rows advance on the line
+   breaks recognised by islbrk(); columns advance by the display width of
+   each grapheme, with a tab size of 8. */
+pos_t
+compute_pos(const char8_t *ptr)
+{
+	static thread_local pos_t p;
+	if (last_match == baseptr)
+		p.row = p.col = 0;
+
+	/* Limit the view to the file’s contents so that the grapheme iterator
+	   is never told that more data exists than was actually read */
+	u8view_t g, sv = {last_match, baseptr + array_len(baseptr) - last_match};
+	while (sv.p < ptr) {
+		ucsgnext(&g, &sv);
+		if (islbrk(g)) {
+			p.row++;
+			p.col = 0;
+		} else
+			p.col = ucswdth(g, p.col, 8); /* TODO: Configurable tabsize? */
+	}
+	last_match = sv.p;
+	return p;
+}
+
+/* Report whether the grapheme ‘g’ is a line break: LF, VT, FF, CR LF, NEL
+   (U+0085), LINE SEPARATOR (U+2028) or PARAGRAPH SEPARATOR (U+2029). */
+bool
+islbrk(u8view_t g)
+{
+	return ucseq(g, U8("\n"))
+	    || ucseq(g, U8("\v"))
+	    || ucseq(g, U8("\f"))
+	    || ucseq(g, U8("\r\n"))
+	    /* U+0085 is encoded as C2 85 in UTF-8; a lone 0x85 byte is merely a
+	       stray continuation byte */
+	    || ucseq(g, U8("\xC2\x85"))
+	    || ucseq(g, U8("\u2028"))
+	    || ucseq(g, U8("\u2029"));
+}
+
+/* qsort() comparison function for u8view_t: order views by their starting
+   address and, for equal starts, by ascending length.
+
+   The result is derived from comparisons rather than from a subtraction, as
+   a pointer or length difference need not fit in an int. */
+int
+svposcmp(const void *a_, const void *b_)
+{
+	const u8view_t *a = a_,
+	               *b = b_;
+	if (a->p != b->p)
+		return a->p < b->p ? -1 : 1;
+
+	ptrdiff_t alen = (ptrdiff_t)a->len,
+	          blen = (ptrdiff_t)b->len;
+	return (alen > blen) - (alen < blen);
+}
diff --git a/src/work.h b/src/work.h
new file mode 100644
index 0000000..644a244
--- /dev/null
+++ b/src/work.h
@@ -0,0 +1,16 @@
+#ifndef GRAB_WORK_H
+#define GRAB_WORK_H
+
+#include <pcre2.h>
+
+/* A single operation of a pattern: an operator and its regular expression */
+typedef struct {
+ char c; /* Operator character, e.g. ‘g’ or ‘x’ */
+ pcre2_code *re; /* Compiled regular expression */
+#if DEBUG
+ /* Set when ‘re’ was compiled for this operation; presumably marks it as
+    needing to be freed — TODO confirm */
+ bool free_me;
+#endif
+} op_t;
+
+/* Process the file ‘filename’ (‘-’ means standard input) and append the
+   resulting output to the array ‘*buf’ */
+void process_file(const char *filename, unsigned char **buf);
+
+#endif /* !GRAB_WORK_H */