diff options
author | Thomas Voss <mail@thomasvoss.com> | 2024-10-30 01:51:14 +0100 |
---|---|---|
committer | Thomas Voss <mail@thomasvoss.com> | 2024-10-30 01:51:14 +0100 |
commit | 042e43247f396a9000fead59d9bff87bf12806d6 (patch) | |
tree | e902784464cbe9ce3c5114d513b016523e7e4b29 /src | |
parent | 170b8a92434233241c990c3e9432786de3262bcd (diff) |
Completely revamp the grab source code
Some of the (many) changes are:
- Multithreading for significantly faster performance
- The -p/--predicate flag
- Byte offsets as the default
- No customizable colors (maybe this will come back later)
- Newer edition of mlib (formerly librune)
Diffstat (limited to 'src')
-rw-r--r-- | src/da.h | 121 | ||||
-rw-r--r-- | src/exitcodes.h | 9 | ||||
-rw-r--r-- | src/flags.h | 23 | ||||
-rw-r--r-- | src/grab.c | 859 | ||||
-rw-r--r-- | src/main.c | 372 | ||||
-rw-r--r-- | src/tpool.c | 127 | ||||
-rw-r--r-- | src/tpool.h | 19 | ||||
-rw-r--r-- | src/work.c | 453 | ||||
-rw-r--r-- | src/work.h | 16 |
9 files changed, 1019 insertions, 980 deletions
diff --git a/src/da.h b/src/da.h deleted file mode 100644 index 8891971..0000000 --- a/src/da.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Simple & stupid dynamic array single-header implementation. You can use the - * macros defined in this file with any structure that has the following fields: - * - * struct dyn_array { - * T *buf // Array of items - * N len // Length of array - * N cap // Capacity of array - * } - * - * The type ‘T’ is whatever type you want to store. The type ‘N’ is any numeric - * type — most likely ‘size_t’ — but it could be sized as well. - * - * The daremove() macro also doesn’t bother with shrinking your array when the - * length is far lower than the capacity. If you care about that, do it - * yourself. - * - * Remember to call free() on your dynamic arrays ‘buf’ field after use. - * - * - * Macro Overview - * ―――――――――――――― - * The argument ‘a’ to all of the below macros is a pointer to the dynamic array - * structure. - * - * dainit(a, n) Initialize the array with a capacity of ‘n’ items. - * dapush(a, x) Append the item ‘x’ to the array - * daremove(a, x) Remove the item at index ‘x’ from the array - * da_remove_range(a, x, y) Remove the items between the range [x, y) - * da_foreach(a, p) Iterate the pointer ‘p’ over each element of the - * array. The type of ‘p’ is inferred. - * - * The ‘dapush()’ macro will double the arrays capacity when it gets full. If - * you would like your arrays to grow with a different scale, edit this file. 
- * - * - * Example - * ――――――― - * - * struct { - * int *buf; - * size_t len, cap; - * } nums; - * - * // Initialize nums with capacity == 4 - * dainit(&nums, 4); - * - * // Append 69, 1337, and 420 to nums - * dapush(&nums, 69); - * dapush(&nums, 1337); - * dapush(&nums, 420); - * - * da_foreach (&nums, n) { - * int x = *n << 1; - * printf("n = %d; n² = %d\n", *n, x); - * } - * - * // Remove 1337 and 420 from nums - * da_remove_range(&nums, 1, 3); - * - * // Remove 69 from nums - * daremove(&nums, 0); - */ - -#ifndef MANGO_DA_H -#define MANGO_DA_H - -#include <err.h> -#include <errno.h> -#include <stddef.h> -#include <stdint.h> -#include <stdlib.h> -#include <string.h> - -#if __STDC_VERSION__ >= 202311L -# define DA_NULL nullptr -#else -# define DA_NULL NULL -#endif - -#define DA_ALLOC(p, n) \ - do { \ - if ((n) && SIZE_MAX / (n) < sizeof(*(p))) { \ - errno = EOVERFLOW; \ - err(EXIT_FAILURE, "realloc"); \ - } \ - if (!((p) = realloc((p), (n) * sizeof(*(p))))) \ - err(EXIT_FAILURE, "realloc"); \ - } while (0) - -#define dainit(a, n) \ - do { \ - (a)->buf = DA_NULL; \ - (a)->cap = (n); \ - (a)->len = 0; \ - if (n) \ - DA_ALLOC((a)->buf, (a)->cap); \ - } while (0) - -#define dapush(a, x) \ - do { \ - if ((a)->len >= (a)->cap) { \ - (a)->cap = (a)->cap ? (a)->cap * 2 : 1; \ - DA_ALLOC((a)->buf, (a)->cap); \ - } \ - (a)->buf[(a)->len++] = (x); \ - } while (0) - -#define daremove(a, i) da_remove_range((a), (i), (i) + 1) - -#define da_remove_range(a, i, j) \ - do { \ - memmove((a)->buf + (i), (a)->buf + (j), \ - ((a)->len - (j)) * sizeof(*(a)->buf)); \ - (a)->len -= j - i; \ - } while (0) - -#define da_foreach(a, p) \ - for (typeof((a)->buf) p = (a)->buf; (size_t)(p - (a)->buf) < (a)->len; p++) - -#endif /* !MANGO_DA_H */ diff --git a/src/exitcodes.h b/src/exitcodes.h new file mode 100644 index 0000000..00b455c --- /dev/null +++ b/src/exitcodes.h @@ -0,0 +1,9 @@ +#ifndef GRAB_EXITCODES_H +#define GRAB_EXITCODES_H + +/* These values should never be changed! 
Scripts may depend on them. */ +constexpr int EXIT_NOMATCH = 1; +constexpr int EXIT_WARNING = 2; +constexpr int EXIT_FATAL = 3; + +#endif /* !GRAB_EXITCODES_H */ diff --git a/src/flags.h b/src/flags.h new file mode 100644 index 0000000..a4a4709 --- /dev/null +++ b/src/flags.h @@ -0,0 +1,23 @@ +#ifndef GRAB2_FLAGS_H +#define GRAB2_FLAGS_H + +typedef struct { + bool c : 1; + bool i : 1; + bool l : 1; + bool p : 1; + bool s : 1; + bool U : 1; + bool z : 1; + +#if !GIT_GRAB + bool do_header : 1; +#endif +} flags_t; + +#if !MAIN_C +extern +#endif +flags_t flags; + +#endif /* !GRAB2_FLAGS_H */ diff --git a/src/grab.c b/src/grab.c deleted file mode 100644 index bf26a79..0000000 --- a/src/grab.c +++ /dev/null @@ -1,859 +0,0 @@ -#include <err.h> -#include <getopt.h> -#include <libgen.h> -#include <limits.h> -#include <locale.h> -#include <stddef.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> - -#if GRAB_DO_PCRE -# include <pcre2posix.h> -#else -# include <regex.h> -# ifndef REG_DOTALL -# define REG_DOTALL 0 -# endif -# define REG_UCP 0 -# define REG_UTF 0 -# ifndef REG_STARTEND -# error "REG_STARTEND not defined" -# endif -#endif - -#include <gbrk.h> -#include <rune.h> -#include <utf8.h> - -#include "da.h" - -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - -#define FLAGMSK(f) ((uint64_t)1 << ((f) - ((f) < 'a' ? 'A' : 'G'))) -#define FLAGSET(f) (flags & FLAGMSK(f)) - -#define lengthof(a) (sizeof(a) / sizeof(*(a))) - -#define die(...) err(3, __VA_ARGS__) -#define diex(...) errx(3, __VA_ARGS__) -#define warn(...) \ - do { \ - warn(__VA_ARGS__); \ - rv = 3; \ - } while (0) -#define warnx(...) 
\ - do { \ - warnx(__VA_ARGS__); \ - rv = 3; \ - } while (0) - -#define streq(a, b) (!strcmp(a, b)) -#define memeq(a, b, n) (!memcmp(a, b, n)) - -#define DEFCOL_FN "35" -#define DEFCOL_HL "01;31" -#define DEFCOL_LN "32" -#define DEFCOL_SE "36" - -struct matches { - struct sv *buf; - size_t len, cap; -}; - -struct op { - char c; - regex_t pat; -#ifdef GRAB_DEBUG - bool alloced; -#endif -}; - -struct ops { - struct op *buf; - size_t len, cap; -}; - -struct sv { - char8_t *p; - size_t len; -}; - -typedef unsigned char uchar; -typedef void cmd_func(struct sv, struct matches *, struct ops, size_t, - const char *); -typedef void put_func(struct sv, struct matches *, const char *); - -static cmd_func cmdg, cmdh, cmdH, cmdx, cmdX; -static put_func putm, putm_nc; - -#if GIT_GRAB -static FILE *getfstream(int n, char *v[n]); -#endif -static void grab(struct ops, FILE *, const char *); -static struct ops comppat(char8_t *); -static regex_t mkregex(char8_t *, size_t); -static bool islbrk(struct u8view); -static bool sgrvalid(const char *); -static bool xisspace(char); -static int svposcmp(const void *, const void *); -static char *env_or_default(const char *, const char *); - -static int filecnt, rv; -static bool got_match; -static uint64_t flags = FLAGMSK('f') * GIT_GRAB; -static put_func *putf; - -static struct { - const char8_t *p, *bp; - size_t col, row; -} pos; - -static cmd_func *op_table[UCHAR_MAX] = { - ['g'] = cmdg, ['G'] = cmdg, ['h'] = cmdh, - ['H'] = cmdH, ['x'] = cmdx, ['X'] = cmdX, -}; - -[[noreturn]] static void -usage(const char *s) -{ - fprintf(stderr, -#if GIT_GRAB - "Usage: %s [-s | -z] [-bcinU] pattern [glob ...]\n" -#else - "Usage: %s [-s | -z] [-bcfinU] pattern [file ...]\n" -#endif - " %s -h\n", - s, s); - exit(EXIT_FAILURE); -} - -int -main(int argc, char **argv) -{ - int opt; - struct ops ops; - struct option longopts[] = { - {"byte-offset", no_argument, nullptr, 'b'}, - {"color", no_argument, nullptr, 'c'}, -#if GIT_GRAB - {"filenames", no_argument, 
nullptr, 'f'}, -#endif - {"help", no_argument, nullptr, 'h'}, - {"ignore-case", no_argument, nullptr, 'i'}, - {"newline", no_argument, nullptr, 'n'}, - {"strip-newline", no_argument, nullptr, 's'}, - {"no-unicode", no_argument, nullptr, 'U'}, - {"zero", no_argument, nullptr, 'z'}, - {nullptr, 0, nullptr, 0 }, - }; - -#if GIT_GRAB - char *entry = nullptr; - size_t len; - ssize_t nr; - FILE *flist; - const char *opts = "bchinsUz"; -#else - const char *opts = "bcfhinsUz"; -#endif - - argv[0] = basename(argv[0]); - if (argc < 2) - usage(argv[0]); - - setlocale(LC_ALL, ""); - - while ((opt = getopt_long(argc, argv, opts, longopts, nullptr)) != -1) { - switch (opt) { - case '?': - usage(argv[0]); - case 'h': - execlp("man", "man", "1", argv[0], nullptr); - die("execlp: man 1 %s", argv[0]); -#if !GRAB_DO_PCRE - case 'U': - errx(2, "program not built with PCRE support"); -#endif - default: - flags |= FLAGMSK(opt); - } - } - - if (FLAGSET('s') && FLAGSET('z')) - usage(argv[0]); - - argc -= optind; - argv += optind; - filecnt = argc - 1; - - if (!FLAGSET('c') && isatty(STDOUT_FILENO) == 1 - && !env_or_default("NO_COLOR", nullptr)) - { - flags |= FLAGMSK('c') * !streq(env_or_default("TERM", ""), "dumb"); - } - - putf = FLAGSET('c') ? 
putm : putm_nc; - ops = comppat(argv[0]); - -#if GIT_GRAB - if (!(flist = getfstream(argc - 1, argv + 1))) - die("getfstream"); - while ((nr = getdelim(&entry, &len, '\0', flist)) > 0) { - FILE *fp; - - if (!(fp = fopen(entry, "r"))) - warn("fopen: %s", entry); - else { - grab(ops, fp, entry); - fclose(fp); - } - } - if (ferror(flist)) - warn("getdelim"); - fclose(flist); -#else - if (argc == 1) - grab(ops, stdin, "-"); - else { - for (int i = 1; i < argc; i++) { - FILE *fp; - - if (streq(argv[i], "-")) { - grab(ops, stdin, "-"); - } else if (!(fp = fopen(argv[i], "r"))) { - warn("fopen: %s", argv[i]); - } else { - grab(ops, fp, argv[i]); - fclose(fp); - } - } - } -#endif - -#ifdef GRAB_DEBUG -# if GIT_GRAB - free(entry); -# endif - da_foreach (&ops, op) { - if (op->alloced) - regfree(&op->pat); - } - free(ops.buf); -#endif - - return got_match ? rv : EXIT_FAILURE; -} - -struct ops -comppat(char8_t *s) -{ - struct ops ops; - - dainit(&ops, 8); - while (*s && xisspace(*s)) - s++; - if (!*s) - diex("input string terminated prematurely"); - - do { - int w; - rune ch; - size_t len; - char8_t *p; - struct op op; - - /* Grab the operator and delimiter. All operators are ASCII, but - u8tor() is used to parse it so that we get properly formed error - messages when someone uses a non-ASCII operator. */ - w = u8tor(&ch, s); - if (ch == RUNE_ERROR) - diex("invalid UTF-8 sequence near ‘%02hhX’", s[-1]); - if (w > 1 || !op_table[ch]) - diex("invalid operator ‘%.*s’", w, s); - op.c = *s++; - - s += u8tor(&ch, s); - if (ch == RUNE_ERROR) - diex("invalid UTF-8 sequence near ‘%02hhX’", s[-1]); - if (ch == '\0') - diex("input string terminated prematurely"); - - /* Find the closing delimiter. The user is allowed to omit the closing - delimiter if this is the last operation in the query pattern. 
*/ - p = s; - len = strlen(s); - if (!(s = u8chr(s, ch, len))) - s = p + len; - - if (s - p == 0) { - if (op.c != 'h') - diex("empty regex given to ‘%c’", op.c); - if (ops.len == 0) - diex("empty ‘h’ is not allowed as the first operator"); - op.pat = ops.buf[ops.len - 1].pat; - } else - op.pat = mkregex(p, s - p); - -#if GRAB_DEBUG - op.alloced = s - p == 0; -#endif - - dapush(&ops, op); - - if (*s) { - s += u8tor(&ch, s); - if (ch == RUNE_ERROR) - diex("invalid UTF-8 sequence near ‘%02hhX’", s[-1]); - } - while (*s && xisspace(*s)) - s++; - } while (*s); - - return ops; -} - -void -grab(struct ops ops, FILE *stream, const char *filename) -{ - size_t n; - struct { - char *buf; - size_t len, cap; - } chars = {0}; - - do { - static_assert(sizeof(char) == 1, "sizeof(char) != 1; wtf?"); - chars.cap += BUFSIZ; - if (!(chars.buf = realloc(chars.buf, chars.cap))) - die("realloc"); - chars.len += n = fread(chars.buf + chars.len, 1, BUFSIZ, stream); - } while (n == BUFSIZ); - - if (ferror(stream)) { - warn("fread: %s", filename); - goto out; - } - - const char8_t *p; - struct sv sv = { - .p = chars.buf, - .len = chars.len, - }; - struct matches ms; - - if (p = u8chk(chars.buf, chars.len)) { - warnx("%s: invalid UTF-8 near ‘%02X’", filename, *p); - goto out; - } - - dainit(&ms, 4); - pos.col = pos.row = 1; - pos.bp = pos.p = chars.buf; - op_table[(uchar)ops.buf[0].c](sv, &ms, ops, 0, filename); - free(ms.buf); - -out: - free(chars.buf); -} - -void -cmdg(struct sv sv, struct matches *ms, struct ops ops, size_t i, - const char *filename) -{ - int r; - regmatch_t rm = { - .rm_so = 0, - .rm_eo = sv.len, - }; - struct op op = ops.buf[i]; - - r = regexec(&op.pat, sv.p, 1, &rm, REG_STARTEND); - if ((r == REG_NOMATCH && op.c == 'g') || (r != REG_NOMATCH && op.c == 'G')) - return; - - if (i + 1 == ops.len) - putf(sv, ms, filename); - else - op_table[(uchar)ops.buf[i + 1].c](sv, ms, ops, i + 1, filename); -} - -void -cmdh(struct sv sv, struct matches *ms, struct ops ops, size_t i, - 
const char *filename) -{ - regmatch_t rm = { - .rm_so = 0, - .rm_eo = sv.len, - }; - struct op op = ops.buf[i]; - - do { - if (regexec(&op.pat, sv.p, 1, &rm, REG_STARTEND) == REG_NOMATCH) - break; - - if (rm.rm_so < rm.rm_eo) - dapush(ms, ((struct sv){sv.p + rm.rm_so, rm.rm_eo - rm.rm_so})); - else { - rune unused; - rm.rm_eo += u8tor_uc(&unused, sv.p + rm.rm_eo); - } - - rm = (regmatch_t){ - .rm_so = rm.rm_eo, - .rm_eo = sv.len, - }; - } while (rm.rm_so < rm.rm_eo); - - if (i + 1 == ops.len) - putf(sv, ms, filename); - else { - size_t save = ms->len; - op_table[(uchar)ops.buf[i + 1].c](sv, ms, ops, i + 1, filename); - ms->len = save; - } -} - -void -cmdH(struct sv sv, struct matches *ms, struct ops ops, size_t i, - const char *filename) -{ - regmatch_t rm = { - .rm_so = 0, - .rm_eo = sv.len, - }; - regmatch_t prev = { - .rm_so = 0, - .rm_eo = 0, - }; - struct op op = ops.buf[i]; - - do { - struct sv nsv; - - if (regexec(&op.pat, sv.p, 1, &rm, REG_STARTEND) == REG_NOMATCH) - break; - - if (prev.rm_so || prev.rm_eo || rm.rm_so) { - nsv = (struct sv){ - .p = sv.p + prev.rm_eo, - .len = rm.rm_so - prev.rm_eo, - }; - if (nsv.len) - dapush(ms, nsv); - } - - prev = rm; - if (rm.rm_so == rm.rm_eo) { - rune unused; - rm.rm_eo += u8tor_uc(&unused, sv.p + rm.rm_eo); - } - rm = (regmatch_t){ - .rm_so = rm.rm_eo, - .rm_eo = sv.len, - }; - } while (rm.rm_so < rm.rm_eo); - - if (prev.rm_eo < rm.rm_eo) - dapush(ms, ((struct sv){sv.p + rm.rm_so, rm.rm_eo - rm.rm_so})); - - if (i + 1 == ops.len) - putf(sv, ms, filename); - else - op_table[(uchar)ops.buf[i + 1].c](sv, ms, ops, i + 1, filename); -} - -void -cmdx(struct sv sv, struct matches *ms, struct ops ops, size_t i, - const char *filename) -{ - regmatch_t rm = { - .rm_so = 0, - .rm_eo = sv.len, - }; - struct op op = ops.buf[i]; - - do { - struct sv nsv; - - if (regexec(&op.pat, sv.p, 1, &rm, REG_STARTEND) == REG_NOMATCH) - break; - if (rm.rm_so < rm.rm_eo) { - nsv = (struct sv){ - .p = sv.p + rm.rm_so, - .len = rm.rm_eo - 
rm.rm_so, - }; - if (i + 1 == ops.len) - putf(nsv, ms, filename); - else { - size_t save = ms->len; - op_table[(uchar)ops.buf[i + 1].c](nsv, ms, ops, i + 1, - filename); - ms->len = save; - } - } else { - rune unused; - rm.rm_eo += u8tor_uc(&unused, sv.p + rm.rm_eo); - } - rm = (regmatch_t){ - .rm_so = rm.rm_eo, - .rm_eo = sv.len, - }; - } while (rm.rm_so < rm.rm_eo); -} - -void -cmdX(struct sv sv, struct matches *ms, struct ops ops, size_t i, - const char *filename) -{ - regmatch_t rm = { - .rm_so = 0, - .rm_eo = sv.len, - }; - regmatch_t prev = { - .rm_so = 0, - .rm_eo = 0, - }; - struct op op = ops.buf[i]; - - do { - struct sv nsv; - - if (regexec(&op.pat, sv.p, 1, &rm, REG_STARTEND) == REG_NOMATCH) - break; - - if (prev.rm_so || prev.rm_eo || rm.rm_so) { - nsv = (struct sv){ - .p = sv.p + prev.rm_eo, - .len = rm.rm_so - prev.rm_eo, - }; - if (nsv.len) { - if (i + 1 == ops.len) - putf(nsv, ms, filename); - else - op_table[(uchar)ops.buf[i + 1].c](nsv, ms, ops, i + 1, - filename); - } - } - - prev = rm; - if (rm.rm_so == rm.rm_eo) { - rune unused; - rm.rm_eo += u8tor_uc(&unused, sv.p + rm.rm_eo); - } - rm = (regmatch_t){ - .rm_so = rm.rm_eo, - .rm_eo = sv.len, - }; - } while (rm.rm_so < rm.rm_eo); - - if (prev.rm_eo < rm.rm_eo) { - struct sv nsv = { - .p = sv.p + rm.rm_so, - .len = rm.rm_eo - rm.rm_so, - }; - if (i + 1 == ops.len) - putf(nsv, ms, filename); - else - op_table[(uchar)ops.buf[i + 1].c](nsv, ms, ops, i + 1, filename); - } -} - -int -svposcmp(const void *a, const void *b) -{ - struct sv *A, *B; - A = (struct sv *)a; - B = (struct sv *)b; - return A->p != B->p ? A->p - B->p : A->len < B->len ? 
-1 : A->len != B->len; -} - -void -putm(struct sv sv, struct matches *ms, const char *filename) -{ - const char8_t *p; - struct matches valid; - static const char *fn, *hl, *ln, *se; - - got_match = true; - - if (FLAGSET('c') && !fn) { - char *optstr; - if ((optstr = env_or_default("GRAB_COLORS", nullptr))) { - enum { - OPT_FN, - OPT_HL, - OPT_LN, - OPT_SE, - }; - /* clang-format off */ - static char *const tokens[] = { - [OPT_FN] = "fn", - [OPT_HL] = "hl", - [OPT_LN] = "ln", - [OPT_SE] = "se", - nullptr - }; - /* clang-format on */ - - while (*optstr) { - char *val; - switch (getsubopt(&optstr, tokens, &val)) { - case OPT_FN: - if (sgrvalid(val)) - fn = val; - break; - case OPT_HL: - if (sgrvalid(val)) - hl = val; - break; - case OPT_LN: - if (sgrvalid(val)) - fn = val; - break; - case OPT_SE: - if (sgrvalid(val)) - se = val; - break; - default: - warnx("invalid color value -- '%s'", val); - } - } - } - - if (!fn) - fn = DEFCOL_FN; - if (!hl) - hl = DEFCOL_HL; - if (!ln) - ln = DEFCOL_LN; - if (!se) - se = DEFCOL_SE; - } - - if (FLAGSET('f') || filecnt > 1) { - char sep = FLAGSET('z') ? '\0' : ':'; - printf("\33[%sm%s\33[0m" /* filename */ - "\33[%sm%c\33[0m", /* separator */ - fn, filename, se, sep); - - if (FLAGSET('b')) { - printf("\33[%sm%td\33[0m" /* byte offset */ - "\33[%sm%c\33[0m", /* separator */ - ln, sv.p - pos.bp, se, sep); - } else { - struct u8view v; - size_t len = sv.p - pos.p; - - while (u8gnext(&v, &pos.p, &len)) { - if (islbrk(v)) { - pos.col = 1; - pos.row++; - } else - pos.col++; - } - - printf("\33[%sm%zu\33[0m" /* row */ - "\33[%sm%c\33[0m" /* separator */ - "\33[%sm%zu\33[0m" /* column */ - "\33[%sm%c\33[0m", /* separator */ - ln, pos.row, se, sep, ln, pos.col, se, sep); - } - } - - /* Here we need to take all the views of regions to highlight, and try - to merge them into a simpler form. This happens in two steps: - - 1. Sort the views by their starting position in the matched text. - 2. Merge overlapping views. 
- - After this process we should have the most reduced possible set of - views. The next part is to actually print the highlighted regions - possible which requires bounds-checking as highlighted regions may - begin before or end after the matched text when using patterns such - as ‘h/.+/ x/.$/’. */ - - dainit(&valid, ms->len); - qsort(ms->buf, ms->len, sizeof(*ms->buf), svposcmp); - memcpy(valid.buf, ms->buf, ms->len * sizeof(*ms->buf)); - valid.len = ms->len; - - for (size_t i = 0; i + 1 < valid.len;) { - ptrdiff_t d; - struct sv *a, *b; - - a = valid.buf + i; - b = valid.buf + i + 1; - d = a->p + a->len - b->p; - - if (d >= 0) { - a->len += MAX(b->len - d, 0); - daremove(&valid, i + 1); - } else - i++; - } - - for (size_t i = 0; i < valid.len; i++) { - struct sv *m = valid.buf + i; - if (m->p + m->len < sv.p || m->p >= sv.p + sv.len) { - daremove(&valid, i); - i--; - continue; - } - - if (m->p < sv.p) { - m->len -= sv.p - m->p; - m->p = sv.p; - } - m->len = MIN(m->len, (size_t)(sv.p + sv.len - m->p)); - } - - p = sv.p; - da_foreach (&valid, m) { - printf("%.*s\33[%sm%.*s\33[0m", (int)(m->p - p), p, hl, (int)m->len, - m->p); - p = m->p + m->len; - } - fwrite(p, 1, sv.p + sv.len - p, stdout); - - if (!(FLAGSET('s') && sv.p[sv.len - 1] == '\n')) - putchar(FLAGSET('z') ? '\0' : '\n'); - free(valid.buf); -} - -void -putm_nc(struct sv sv, struct matches *ms, const char *filename) -{ - (void)ms; - - got_match = true; - - if (FLAGSET('f') || filecnt > 1) { - char sep = FLAGSET('z') ? '\0' : ':'; - printf("%s%c", filename, sep); - - if (FLAGSET('b')) - printf("%td%c", sv.p - pos.bp, sep); - else { - struct u8view v; - size_t len = sv.p - pos.p; - - while (u8gnext(&v, &pos.p, &len)) { - if (islbrk(v)) { - pos.col = 1; - pos.row++; - } else - pos.col++; - } - - printf("%zu%c%zu%c", pos.row, sep, pos.col, sep); - } - } - fwrite(sv.p, 1, sv.len, stdout); - if (!(FLAGSET('s') && sv.p[sv.len - 1] == '\n')) - putchar(FLAGSET('z') ? 
'\0' : '\n'); -} - -bool -islbrk(struct u8view v) -{ - return *v.p == '\n' || (v.len == 2 && memeq(v.p, "\r\n", 2)); -} - -bool -sgrvalid(const char *s) -{ - if (!s || !*s) - return false; - do { - if ((*s < '0' || *s > '9') && *s != ';') - return false; - } while (*++s); - - return true; -} - -regex_t -mkregex(char8_t *s, size_t n) -{ - int ret, cflags; - regex_t r; - char8_t c = s[n]; - - s[n] = 0; - cflags = REG_EXTENDED | REG_UTF | (FLAGSET('n') ? REG_NEWLINE : REG_DOTALL); - if (FLAGSET('i')) - cflags |= REG_ICASE; - if (!FLAGSET('U')) - cflags |= REG_UCP; - if (ret = regcomp(&r, s, cflags)) { - char emsg[256]; - regerror(ret, &r, emsg, sizeof(emsg)); - diex("failed to compile regex ‘%s’: %s", s, emsg); - } - s[n] = c; - - return r; -} - -#if GIT_GRAB -FILE * -getfstream(int argc, char *argv[argc]) -{ - pid_t pid; - int fds[2]; - enum { - FD_R, - FD_W, - }; - - if (pipe(fds) == -1) - die("pipe"); - - switch (pid = fork()) { - case -1: - die("fork"); - case 0:; - size_t len; - char **args; - static const char *git_grep_args[] = { - "git", "grep", "--cached", "-Ilz", "", - }; - - len = argc + lengthof(git_grep_args) + 1; - - close(fds[FD_R]); - if (dup2(fds[FD_W], STDOUT_FILENO) == -1) - die("dup2"); - close(fds[FD_W]); - - if (!(args = malloc(len * sizeof(char *)))) - die("malloc"); - memcpy(args, git_grep_args, sizeof(git_grep_args)); - memcpy(args + 5, argv, argc * sizeof(char *)); - args[len - 1] = nullptr; - - execvp("git", args); - die("execvp: git grep --cached -Ilz ''"); - } - - close(fds[FD_W]); - return fdopen(fds[FD_R], "r"); -} -#endif - -char * -env_or_default(const char *e, const char *d) -{ - const char *s = getenv(e); - return (char *)(s && *s ? 
s : d); -} - -bool -xisspace(char c) -{ - return c == ' ' || c == '\t' || c == '\n'; -} diff --git a/src/main.c b/src/main.c new file mode 100644 index 0000000..5b2efff --- /dev/null +++ b/src/main.c @@ -0,0 +1,372 @@ +#include <langinfo.h> +#include <locale.h> +#include <stdatomic.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <alloc.h> +#include <array.h> +#include <cli.h> +#include <errors.h> +#include <macros.h> +#include <mbstring.h> +#include <pcre2.h> +#include <unicode/prop.h> +#include <unicode/string.h> + +#include "exitcodes.h" +#include "tpool.h" +#include "work.h" + +#define MAIN_C 1 +#include "flags.h" + +static bool use_color_p(void); +static op_t *pattern_comp(u8view_t pat); +#if GIT_GRAB +static FILE *getfstream(int globc, char **globv); +#endif + +atomic_int rv = EXIT_NOMATCH; +op_t *ops; +/* For use in diagnostic messages */ +const char *lquot = "`", *rquot = "'"; + +/* We need to use different matching functions depending on if we’re using JIT + matching or not */ +typeof(pcre2_match) *pcre2_match_fn; + +static char emsg[256]; /* Buffer for PCRE2 error messages */ +/* TODO: Use the LUT in work.c */ +static const bool opchars[] = { + ['g'] = true, + ['G'] = true, + ['h'] = true, + ['H'] = true, + ['x'] = true, + ['X'] = true, +}; + +int +main(int argc, char **argv) +{ + mlib_setprogname(argv[0]); + setlocale(LC_ALL, ""); + + if (streq(nl_langinfo(CODESET), "UTF-8")) { + lquot = u8"‘"; + rquot = u8"’"; + } + + optparser_t parser = mkoptparser(argv); + static const cli_opt_t opts[] = { + {'c', U8C("color"), CLI_NONE}, + {'h', U8C("help"), CLI_NONE}, + {'i', U8C("ignore-case"), CLI_NONE}, + {'l', U8C("line"), CLI_OPT}, + {'p', U8C("predicate"), CLI_NONE}, + {'s', U8C("strip-newline"), CLI_NONE}, + {'U', U8C("no-unicode"), CLI_NONE}, + {'z', U8C("zero"), CLI_NONE}, + }; + + for (;;) { + rune opt = optparse(&parser, opts, lengthof(opts)); + if (opt == 0) + break; + switch (opt) { 
+ case 'c': + flags.c = true; + break; + case 'h': + execlp("man", "man", "1", mlib_progname(), nullptr); + err("execlp: man 1 %s:", mlib_progname()); + case 'i': + flags.i = true; + break; + case 'l': + flags.l = true; + break; + case 'p': + flags.p = true; + break; + case 's': + flags.s = true; + break; + case 'U': + flags.U = true; + break; + case 'z': + flags.z = true; + break; + case -1: + warn(parser.errmsg); + goto usage; + } + } + + if (flags.p && flags.s) { + warn("-p and -s are mutually exclusive"); + goto usage; + } + if (flags.p && flags.z) { + warn("-p and -z are mutually exclusive"); + goto usage; + } + if (flags.s && flags.z) { + warn("-s and -z are mutually exclusive"); + goto usage; + } + + argc -= parser.optind; + argv += parser.optind; + + if (argc == 0) { + usage: + usage("[-p | -s | -z] [-cilU] pattern [file ...]", "-h"); + exit(EXIT_FATAL); + } + + flags.c = flags.c || use_color_p(); + ops = pattern_comp((u8view_t){*argv, strlen(*argv)}); + + allocator_t mem = init_heap_allocator(nullptr); + +#if GIT_GRAB + argc--; + argv++; + + FILE *fstream = getfstream(argc, argv); + if (fstream == nullptr) + cerr(EXIT_FATAL, "getfstream:"); + + const char **filenames = array_new(mem, typeof(*filenames), 1024); + + size_t len; + ssize_t nr; + char *file = nullptr; + while ((nr = getdelim(&file, &len, 0, fstream)) > 0) { + /* TODO: Would an arena improve performance? 
*/ + const char *s = strdup(file); + if (s == nullptr) + cerr(EXIT_FATAL, "strdup:"); + array_push(&filenames, s); + } +#else + if (argc == 1) + argv = (static char *[]){"-"}; + else { + argc--; + argv++; + flags.do_header = true; + } +#endif + + tpool_t tp; + int thrds = tpinit(&tp, +#if GIT_GRAB + filenames, array_len(filenames) +#else + (const char **)argv, argc +#endif + ); + + /* Failed to spawn threads */ + if (thrds == 0) { + unsigned char *buf = array_new(mem, typeof(*buf), 4096); + for (int i = 0; i < argc; i++) { + process_file(argv[i], &buf); + fwrite(buf, 1, array_len(buf), stdout); + array_hdr(buf)->len = 0; + } +#if DEBUG + array_free(buf); +#endif + } + + if (thrds != 0) + tpfree(&tp); +#if DEBUG + pcre2_jit_free_unused_memory(nullptr); + array_foreach (ops, op) { + if (op->free_me) + pcre2_code_free(op->re); + } + array_free(ops); +#if GIT_GRAB + array_foreach (filenames, f) + free(f); + array_free(filenames); +#endif +#endif + return rv; +} + +op_t * +pattern_comp(u8view_t pat) +{ + allocator_t mem = init_heap_allocator(nullptr); + op_t *ops = array_new(mem, op_t, 16); + + for (;;) { + int w; + rune ch; + + while ((w = ucsnext(&ch, &pat)) != 0) { + if (!uprop_is_pat_ws(ch)) { + VSHFT(&pat, -w); + break; + } + } + if (pat.len == 0) + break; + + /* Grab the operator. We grab the entire next grapheme for + better error messages in the case that someone tries to use a + non-ASCII grapheme as an operator for whatever reason. */ + + op_t op; + u8view_t g; + + (void)ucsgnext(&g, &pat); + if (g.len != 1 || *g.p >= lengthof(opchars) || !opchars[*g.p]) { + cerr(EXIT_FATAL, "Invalid operator %s%.*s%s", + lquot, SV_PRI_ARGS(g), rquot); + } + op.c = (char)*g.p; + + /* Unlike with the operator, we parse the delimeter as a rune + instead of a grapheme. This makes it easier for users to + write patterns that match combining characters. This _may_ be + subject to change in the future but for now this is the + rationale. 
Alongside standard delimeters, if the opening + delimeter is a bracket or some other form of paired-bracket + (as determined by Unicode) then the closing delimeter is set + to the right-hand form of the bracket. This means that the + following are both valid delimeted patterns: + + /regex/ + 「regex」 */ + + rune ldelim, rdelim; + if ((w = ucsnext(&ldelim, &pat)) == 0) + cerr(EXIT_FATAL, "Premature end of pattern"); + rdelim = uprop_get_bpb(ldelim); + + /* Find the right delimeter, which is optional for the last + operator */ + /* TODO: Change u8view_t.len to ptrdiff_t and use -1 here */ + u8view_t re = {pat.p, (size_t)-1}; + while ((w = ucsnext(&ch, &pat)) != 0) { + if (ch == rdelim) { + re.len = pat.p - re.p - w; + break; + } + } + if (re.len == (size_t)-1) + re.len = pat.p - re.p; + if (re.len == 0) { + if (op.c != 'h') { + cerr(EXIT_FATAL, "%s%c%s operator given empty regex", + lquot, op.c, rquot); + } + if (array_len(ops) == 0) { + cerr(EXIT_FATAL, + "%sh%s operator given empty regex as the first operator", + lquot, rquot); + } + op.re = ops[array_len(ops) - 1].re; +#if DEBUG + op.free_me = false; +#endif + } else { + int ec; + size_t eoff; + uint32_t reopts = PCRE2_DOTALL | PCRE2_MATCH_INVALID_UTF | PCRE2_UTF; + if (flags.i) + reopts |= PCRE2_CASELESS; + if (!flags.U) + reopts |= PCRE2_UCP; + op.re = pcre2_compile(re.p, re.len, reopts, &ec, &eoff, nullptr); + if (op.re == nullptr) { + /* TODO: Ensure the buffer is large enough for the error message */ + (void)pcre2_get_error_message(ec, emsg, sizeof(emsg)); + cerr(EXIT_FATAL, "Failed to compile regex: %s", emsg); + } + if ((ec = pcre2_jit_compile(op.re, PCRE2_JIT_COMPLETE)) != 0) { + /* TODO: Ensure the buffer is large enough for the error message */ + (void)pcre2_get_error_message(ec, emsg, sizeof(emsg)); + warn("Failed to JIT compile regex: %s", emsg); + rv = EXIT_WARNING; + pcre2_match_fn = pcre2_match; + } else + pcre2_match_fn = pcre2_jit_match; +#if DEBUG + op.free_me = true; +#endif + } + 
array_push(&ops, op); + } + + if (array_len(ops) == 0) + err("Empty pattern"); + + return ops; +} + +bool +use_color_p(void) +{ + const char *ev = getenv("TERM"); + if (ev != nullptr && streq(ev, "dumb")) + return false; + if ((ev = getenv("NO_COLOR")) != nullptr && *ev != 0) + return false; + if ((ev = getenv("CLICOLOR_FORCE")) != nullptr && *ev != 0) + return true; + return isatty(STDOUT_FILENO); +} + +#if GIT_GRAB +FILE * +getfstream(int globc, char **globv) +{ + pid_t pid; + int fds[2]; + enum { R, W }; + + if (pipe(fds) == 1) + cerr(EXIT_FATAL, "pipe:"); + + switch (pid = fork()) { + case -1: + cerr(EXIT_FATAL, "fork:"); + case 0: + static const char *git_grep_argv[] = { + "git", "grep", "-Ilz", "", + }; + + close(fds[R]); + if (dup2(fds[W], STDOUT_FILENO) == -1) + cerr(EXIT_FATAL, "dup2:"); + close(fds[W]); + + size_t argc = globc + lengthof(git_grep_argv) + 1; + char **argv = malloc(argc * sizeof(char *)); + if (argv == nullptr) + cerr(EXIT_FATAL, "malloc:"); + memcpy(argv, git_grep_argv, sizeof(git_grep_argv)); + memcpy(argv + lengthof(git_grep_argv), globv, globc * sizeof(char *)); + argv[argc - 1] = nullptr; + + execvp("git", argv); + cerr(EXIT_FATAL, "execvp: git grep -Ilz '':"); + } + + close(fds[W]); + return fdopen(fds[R], "r"); +} +#endif diff --git a/src/tpool.c b/src/tpool.c new file mode 100644 index 0000000..f30bccc --- /dev/null +++ b/src/tpool.c @@ -0,0 +1,127 @@ +/* Thread pool implementation mostly copied from cbs.h */ + +#include <errno.h> +#include <pthread.h> +#include <stdatomic.h> +#include <stddef.h> +#include <stdlib.h> +#include <unistd.h> + +#include <alloc.h> +#include <array.h> +#include <errors.h> +#include <macros.h> + +#include "tpool.h" +#include "work.h" + +#include <stdio.h> + +static int nproc(void); +static void *tpwork(void *); + +static pthread_t thread_buffer[32]; + +extern const char *lquot, *rquot; + +int +nproc(void) +{ + errno = 0; + + /* Grab the number of processors available on the users system. 
If we can + we query sysconf() but fallback to 1 for systems that don’t support the + sysconf() method. The user can also override this via the GRAB_NPROCS + environment variable, and if that’s invalid then we just issue a + diagnostic and default to 1. + + We don’t want to error on an invalid value for GRAB_NPROCS because we + might be running this tool as part of an editor plugin for example where + finding the root cause of your regexp-search failing may not be so + trivial. */ + + const char *ev = getenv("GRAB_NPROCS"); + if (ev != nullptr && *ev != 0) { + const char *endptr; + long n = strtol(ev, (char **)&endptr, 10); + if (errno == 0 && *endptr == 0) + return (int)n; + if (errno != 0) + warn("strtol: %s:", ev); + if (*endptr != 0) + warn("Invalid value for %s%s%s for GRAB_NPROCS", lquot, ev, rquot); + return 1; + } + +#ifdef _SC_NPROCESSORS_ONLN + return (int)sysconf(_SC_NPROCESSORS_ONLN); +#else + return 1; +#endif +} + +int +tpinit(tpool_t *tp, const char **files, ptrdiff_t filecnt) +{ + tp->files = files; + tp->filecnt = filecnt; + tp->tcnt = nproc(); + tp->tcnt = MIN(tp->tcnt, filecnt); + tp->wi = 0; + + if (tp->tcnt <= 32) + tp->thrds = thread_buffer; + else if ((tp->thrds = malloc(sizeof(*tp->thrds) * tp->tcnt)) == nullptr) + err("malloc:"); + + /* If for whatever reason some threads fail to be created, we don’t + panic but instead just continue using the threads that were able + to spawn. If all threads fail to spawn we return 0 and the caller + will resort to single-threaded behaviour. 
*/ + + int n = 0; + for (int i = 0; i < tp->tcnt; i++) { + if ((errno = pthread_create(tp->thrds + n, nullptr, tpwork, tp)) != 0) + warn("failed to create thread:"); + else + n++; + } + return n; +} + +void +tpfree(tpool_t *tp) +{ + for (int i = 0; i < tp->tcnt; i++) + pthread_join(tp->thrds[i], nullptr); + +#if DEBUG + if (tp->thrds != thread_buffer) + free(tp->thrds); +#endif +} + +void * +tpwork(void *arg) +{ + tpool_t *tp = arg; + + allocator_t mem = init_heap_allocator(nullptr); + unsigned char *buf = array_new(mem, typeof(*buf), 4096); + + for (;;) { + ptrdiff_t i = atomic_fetch_add(&tp->wi, 1); + if (i >= tp->filecnt) + break; + process_file(tp->files[i], &buf); + } + + flockfile(stdout); + fwrite(buf, 1, array_len(buf), stdout); + funlockfile(stdout); + +#if DEBUG + array_free(buf); +#endif + return nullptr; +} diff --git a/src/tpool.h b/src/tpool.h new file mode 100644 index 0000000..04f0486 --- /dev/null +++ b/src/tpool.h @@ -0,0 +1,19 @@ +#ifndef GRAB_TPOOL_H +#define GRAB_TPOOL_H + +#include <pthread.h> +#include <stdatomic.h> +#include <stddef.h> + +typedef struct { + int tcnt; + pthread_t *thrds; + atomic_ptrdiff_t wi; + const char **files; + ptrdiff_t filecnt; +} tpool_t; + +int tpinit(tpool_t *tp, const char **files, ptrdiff_t filecnt); +void tpfree(tpool_t *tp); + +#endif /* !GRAB_TPOOL_H */ diff --git a/src/work.c b/src/work.c new file mode 100644 index 0000000..37fd8b8 --- /dev/null +++ b/src/work.c @@ -0,0 +1,453 @@ +#include <sys/mman.h> +#include <sys/stat.h> + +#include <stdatomic.h> +#include <stdckdint.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <alloc.h> +#include <array.h> +#include <errors.h> +#include <macros.h> +#include <mbstring.h> +#include <pcre2.h> +#include <unicode/string.h> + +#include "exitcodes.h" +#include "flags.h" +#include "work.h" + +#define DEFINE_OPERATOR(fn) \ + void operator_##fn(ptrdiff_t opi, u8view_t sv, u8view_t **hl) 
+#define array_extend_sv(xs, sv) \ + array_extend((xs), (sv).p, (ptrdiff_t)(sv).len) + +typedef struct { + ptrdiff_t row, col; +} pos_t; + +static pos_t compute_pos(const char8_t *p); +static bool islbrk(u8view_t g); +static int svposcmp(const void *a, const void *b); +static void write_match_to_buffer(u8view_t sv, u8view_t *hl); + +static DEFINE_OPERATOR(dispatch); +static DEFINE_OPERATOR(g); +static DEFINE_OPERATOR(G); +static DEFINE_OPERATOR(h); +static DEFINE_OPERATOR(H); +static DEFINE_OPERATOR(x); +static DEFINE_OPERATOR(X); + +static thread_local const char *filename; +static thread_local char8_t *baseptr; +static thread_local const char8_t *last_match; +static thread_local unsigned char **buf; + +static typeof(operator_dispatch) *operators[] = { + ['g'] = operator_g, + ['G'] = operator_G, + ['h'] = operator_h, + ['H'] = operator_H, + ['x'] = operator_x, + ['X'] = operator_X, +}; + +extern atomic_int rv; +extern op_t *ops; +extern bool cflag; +extern typeof(pcre2_match) *pcre2_match_fn; + + + +void +process_file(const char *locl_filename, unsigned char **locl_buf) +{ + filename = locl_filename; + buf = locl_buf; + + FILE *fp = streq(filename, "-") ? 
stdin : fopen(filename, "r"); + if (fp == nullptr) { + warn("fopen: %s:", filename); + atomic_store(&rv, EXIT_WARNING); + goto out; + } + + allocator_t mem = init_heap_allocator(nullptr); + if (baseptr == nullptr) + baseptr = array_new(mem, char8_t, 0x1000); + size_t bufsz = array_cap(baseptr); + last_match = baseptr; + + do { + static_assert(sizeof(char8_t) == 1, "sizeof(char8_t) != 1; wtf?"); + baseptr = array_resz(baseptr, bufsz += BUFSIZ); /* TODO: Bounds checking */ + size_t n = fread(baseptr + array_len(baseptr), 1, BUFSIZ, fp); + array_hdr(baseptr)->len += n; + } while (!feof(fp)); + + if (ferror(fp)) { + warn("fread: %s:", filename); + atomic_store(&rv, EXIT_WARNING); + goto out; + } + + /* Shouldn’t need more than 32 ever… */ + static thread_local u8view_t *hl = nullptr; + if (hl == nullptr) + hl = array_new(mem, typeof(*hl), 32); + + operator_dispatch(0, (u8view_t){baseptr, array_len(baseptr)}, &hl); +#if DEBUG + array_free(baseptr); + baseptr = nullptr; + array_free(hl); + hl = nullptr; +#else + array_hdr(baseptr)->len = 0; + array_hdr(hl)->len = 0; +#endif + +out: + if (fp != stdin) + (void)fclose(fp); +} + + + +DEFINE_OPERATOR(dispatch) +{ + if (array_len(ops) == opi) { + if (flags.p) + exit(EXIT_SUCCESS); + atomic_compare_exchange_strong(&rv, &(int){EXIT_NOMATCH}, EXIT_SUCCESS); + write_match_to_buffer(sv, *hl); + } else /* Cast to silence GCC warning */ + operators[(unsigned char)ops[opi].c](opi, sv, hl); +} + +DEFINE_OPERATOR(g) +{ + pcre2_match_data *md = + pcre2_match_data_create_from_pattern(ops[opi].re, nullptr); + int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY, + md, nullptr); + pcre2_match_data_free(md); + + /* This should never happen */ + if (n == 0) + cerr(EXIT_FATAL, "PCRE2 match data too small"); + if (n == PCRE2_ERROR_NOMATCH) + return; + if (n < 0) + ; /* TODO: Handle error */ + + operator_dispatch(opi + 1, sv, hl); +} + +DEFINE_OPERATOR(G) +{ + /* TODO: Can we reuse match data? 
*/ + pcre2_match_data *md = + pcre2_match_data_create_from_pattern(ops[opi].re, nullptr); + int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY, + md, nullptr); + pcre2_match_data_free(md); + + /* This should never happen */ + if (n == 0) + cerr(EXIT_FATAL, "PCRE2 match data too small"); + if (n == PCRE2_ERROR_NOMATCH) + operator_dispatch(opi + 1, sv, hl); + if (n < 0) + ; /* TODO: Handle error */ +} + +DEFINE_OPERATOR(h) +{ + if (flags.p) { + operator_dispatch(opi + 1, sv, hl); + return; + } + + pcre2_match_data *md = + pcre2_match_data_create_from_pattern(ops[opi].re, nullptr); + u8view_t sv_save = sv; + ptrdiff_t origlen = array_len(*hl); + for (;;) { + int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0, + PCRE2_NOTEMPTY, md, nullptr); + /* This should never happen */ + if (n == 0) + cerr(EXIT_FATAL, "PCRE2 match data too small"); + if (n == PCRE2_ERROR_NOMATCH) + break; + if (n < 0) + ; /* TODO: Handle error */ + + size_t *ov = pcre2_get_ovector_pointer(md); + array_push(hl, ((u8view_t){sv.p + ov[0], ov[1] - ov[0]})); + VSHFT(&sv, ov[1]); + } + pcre2_match_data_free(md); + operator_dispatch(opi + 1, sv_save, hl); + array_hdr(*hl)->len = origlen; +} + +DEFINE_OPERATOR(H) +{ + if (flags.p) { + operator_dispatch(opi + 1, sv, hl); + return; + } + + pcre2_match_data *md = + pcre2_match_data_create_from_pattern(ops[opi].re, nullptr); + u8view_t sv_save = sv; + ptrdiff_t origlen = array_len(*hl); + for (;;) { + int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY, + md, nullptr); + /* This should never happen */ + if (n == 0) + cerr(EXIT_FATAL, "PCRE2 match data too small"); + if (n == PCRE2_ERROR_NOMATCH) + break; + if (n < 0) + ; /* TODO: Handle error */ + + size_t *ov = pcre2_get_ovector_pointer(md); + array_push(hl, ((u8view_t){sv.p, ov[0]})); + VSHFT(&sv, ov[1]); + } + pcre2_match_data_free(md); + operator_dispatch(opi + 1, sv_save, hl); + array_hdr(*hl)->len = origlen; +} + +DEFINE_OPERATOR(x) +{ + pcre2_match_data *md = + 
pcre2_match_data_create_from_pattern(ops[opi].re, nullptr); + for (;;) { + int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY, + md, nullptr); + /* This should never happen */ + if (n == 0) + cerr(EXIT_FATAL, "PCRE2 match data too small"); + if (n == PCRE2_ERROR_NOMATCH) + break; + if (n < 0) + ; /* TODO: Handle error */ + + size_t *ov = pcre2_get_ovector_pointer(md); + operator_dispatch(opi + 1, (u8view_t){sv.p + ov[0], ov[1] - ov[0]}, hl); + VSHFT(&sv, ov[1]); + } + pcre2_match_data_free(md); +} + +DEFINE_OPERATOR(X) +{ + pcre2_match_data *md = + pcre2_match_data_create_from_pattern(ops[opi].re, nullptr); + for (;;) { + int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY, + md, nullptr); + /* This should never happen */ + if (n == 0) + cerr(EXIT_FATAL, "PCRE2 match data too small"); + if (n == PCRE2_ERROR_NOMATCH) + break; + if (n < 0) + ; /* TODO: Handle error */ + + size_t *ov = pcre2_get_ovector_pointer(md); + if (ov[0] != 0) + operator_dispatch(opi + 1, (u8view_t){sv.p, ov[0]}, hl); + VSHFT(&sv, ov[1]); + } + if (sv.len != 0) + operator_dispatch(opi + 1, sv, hl); + pcre2_match_data_free(md); +} + + + +static inline bool +views_overlap(const u8view_t a, const u8view_t b) +{ + const char8_t *p = a.p + a.len; + return p >= b.p && p <= b.p + b.len; +} + +void +write_match_to_buffer(u8view_t sv, u8view_t *hl) +{ + const u8view_t COL_FN = !flags.c ? U8("") : U8("\33[35m"); + const u8view_t COL_HL = !flags.c ? U8("") : U8("\33[01;31m"); + const u8view_t COL_LN = !flags.c ? U8("") : U8("\33[32m"); + const u8view_t COL_SE = !flags.c ? U8("") : U8("\33[36m"); + const u8view_t COL_RS = !flags.c ? U8("") : U8("\33[0m"); + + if ( +#if GIT_GRAB + true +#else + flags.do_header +#endif + ) { + char sep = flags.z ? 
0 : ':'; + + size_t filenamesz = strlen(filename); + + array_extend_sv(buf, COL_FN); + array_extend(buf, filename, (ptrdiff_t)filenamesz); + array_extend_sv(buf, COL_RS); + + array_extend_sv(buf, COL_SE); + array_push(buf, sep); + array_extend_sv(buf, COL_RS); + + /* GCC things ‘offset’ can overflow because our offsets have type + ptrdiff_t which if negative would have a ‘-’ in the front, but + we know that the match positions can’t be negative so it’s + safe to ignore. */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat-overflow" + + int offsetsz; + char offset[/* len(INT64_MAX - 1) */ 19]; + if (flags.l) { + pos_t p = compute_pos(sv.p); + + offsetsz = sprintf(offset, "%td", p.row + 1); + array_extend_sv(buf, COL_LN); + array_extend(buf, offset, offsetsz); + array_extend_sv(buf, COL_RS); + + array_extend_sv(buf, COL_SE); + array_push(buf, sep); + array_extend_sv(buf, COL_RS); + + offsetsz = sprintf(offset, "%td", p.col + 1); + array_extend_sv(buf, COL_LN); + array_extend(buf, offset, offsetsz); + array_extend_sv(buf, COL_RS); + } else { + offsetsz = sprintf(offset, "%td", sv.p - baseptr); + array_extend_sv(buf, COL_LN); + array_extend(buf, offset, offsetsz); + array_extend_sv(buf, COL_RS); + } + + array_extend_sv(buf, COL_SE); + array_push(buf, sep); + array_extend_sv(buf, COL_RS); + } + +#pragma GCC diagnostic pop + + /* Here we need to take all the views of regions to highlight, and try + to merge them into a simpler form. This happens in two steps: + + 1. Sort the views by their starting position in the matched text. + 2. Merge overlapping views. + + After this process we should have the most reduced possible set of + views. The next part is to actually print the highlighted regions + possible which requires bounds-checking as highlighted regions may + begin before or end after the matched text when using patterns such + as ‘h/.+/ x/.$/’. 
*/ + + static thread_local u8view_t *sorted = nullptr; + if (sorted == nullptr) { + allocator_t mem = init_heap_allocator(nullptr); + ptrdiff_t buflen = array_len(hl); + buflen = MAX(buflen, 16); + sorted = array_new(mem, typeof(*sorted), buflen); + } else + array_hdr(sorted)->len = 0; + + array_extend(&sorted, hl, array_len(hl)); + qsort(sorted, array_len(sorted), sizeof(*sorted), svposcmp); + + for (ptrdiff_t i = 0, len = array_len(sorted); i < len - 1;) { + if (views_overlap(sorted[i], sorted[i + 1])) { + sorted[i].len = sorted[i + 1].p + sorted[i + 1].len - sorted[i].p; + memmove(hl + i + 1, hl + i + 2, sizeof(*hl) * (len - i - 1)); + array_hdr(sorted)->len = --len; + } else + i++; + } + + for (ptrdiff_t i = 0, len = array_len(sorted); i < len; i++) { + if (i < len - 1 && sorted[i].p == sorted[i + 1].p) + continue; + array_extend(buf, sv.p, sorted[i].p - sv.p); + array_extend_sv(buf, COL_HL); + array_extend_sv(buf, sorted[i]); + array_extend_sv(buf, COL_RS); + ptrdiff_t Δ = sorted[i].p - sv.p + sorted[i].len; + VSHFT(&sv, Δ); + } + array_extend_sv(buf, sv); + +#if DEBUG + array_free(sorted); + sorted = nullptr; +#endif + + if (flags.z) + array_push(buf, 0); + else { + ptrdiff_t bufsz = array_len(*buf); + if (!flags.s || bufsz == 0 || (*buf)[bufsz - 1] != '\n') + array_push(buf, '\n'); + } +} + +pos_t +compute_pos(const char8_t *ptr) +{ + static thread_local pos_t p; + if (last_match == baseptr) + p.row = p.col = 0; + u8view_t g, sv = {last_match, PTRDIFF_MAX}; + while (sv.p < ptr) { + ucsgnext(&g, &sv); + if (islbrk(g)) { + p.row++; + p.col = 0; + } else + p.col = ucswdth(g, p.col, 8); /* TODO: Configurable tabsize? 
*/ + } + last_match = sv.p; + return p; +} + +bool +islbrk(u8view_t g) +{ + return ucseq(g, U8("\n")) + || ucseq(g, U8("\v")) + || ucseq(g, U8("\f")) + || ucseq(g, U8("\r\n")) + || ucseq(g, U8("\x85")) + || ucseq(g, U8("\u2028")) + || ucseq(g, U8("\u2029")); +} + +int +svposcmp(const void *a_, const void *b_) +{ + const u8view_t *a = a_, + *b = b_; + ptrdiff_t Δ = a->p - b->p; + return Δ == 0 ? (ptrdiff_t)a->len - (ptrdiff_t)b->len : Δ; +} diff --git a/src/work.h b/src/work.h new file mode 100644 index 0000000..644a244 --- /dev/null +++ b/src/work.h @@ -0,0 +1,16 @@ +#ifndef GRAB_WORK_H +#define GRAB_WORK_H + +#include <pcre2.h> + +typedef struct { + char c; + pcre2_code *re; +#if DEBUG + bool free_me; +#endif +} op_t; + +void process_file(const char *filename, unsigned char **buf); + +#endif /* !GRAB_WORK_H */ |