From 042e43247f396a9000fead59d9bff87bf12806d6 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Wed, 30 Oct 2024 01:51:14 +0100 Subject: Completely revamp the grab source code Some of the (many) few changes are: - Multithreading for significantly faster performance - The -p/--predicate flag - Byte offsets as the default - No customizable colors (maybe this will come back later) - Newer edition of mlib (formerly librune) --- src/grab.c | 859 ------------------------------------------------------------- 1 file changed, 859 deletions(-) delete mode 100644 src/grab.c (limited to 'src/grab.c') diff --git a/src/grab.c b/src/grab.c deleted file mode 100644 index bf26a79..0000000 --- a/src/grab.c +++ /dev/null @@ -1,859 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if GRAB_DO_PCRE -# include -#else -# include -# ifndef REG_DOTALL -# define REG_DOTALL 0 -# endif -# define REG_UCP 0 -# define REG_UTF 0 -# ifndef REG_STARTEND -# error "REG_STARTEND not defined" -# endif -#endif - -#include -#include -#include - -#include "da.h" - -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - -#define FLAGMSK(f) ((uint64_t)1 << ((f) - ((f) < 'a' ? 'A' : 'G'))) -#define FLAGSET(f) (flags & FLAGMSK(f)) - -#define lengthof(a) (sizeof(a) / sizeof(*(a))) - -#define die(...) err(3, __VA_ARGS__) -#define diex(...) errx(3, __VA_ARGS__) -#define warn(...) \ - do { \ - warn(__VA_ARGS__); \ - rv = 3; \ - } while (0) -#define warnx(...) \ - do { \ - warnx(__VA_ARGS__); \ - rv = 3; \ - } while (0) - -#define streq(a, b) (!strcmp(a, b)) -#define memeq(a, b, n) (!memcmp(a, b, n)) - -#define DEFCOL_FN "35" -#define DEFCOL_HL "01;31" -#define DEFCOL_LN "32" -#define DEFCOL_SE "36" - -struct matches { - struct sv *buf; - size_t len, cap; -}; - -struct op { - char c; - regex_t pat; -#ifdef GRAB_DEBUG - bool alloced; -#endif -}; - -struct ops { - struct op *buf; - size_t len, cap; -}; - -struct sv { - char8_t *p; - size_t len; -}; - -typedef unsigned char uchar; -typedef void cmd_func(struct sv, struct matches *, struct ops, size_t, - const char *); -typedef void put_func(struct sv, struct matches *, const char *); - -static cmd_func cmdg, cmdh, cmdH, cmdx, cmdX; -static put_func putm, putm_nc; - -#if GIT_GRAB -static FILE *getfstream(int n, char *v[n]); -#endif -static void grab(struct ops, FILE *, const char *); -static struct ops comppat(char8_t *); -static regex_t mkregex(char8_t *, size_t); -static bool islbrk(struct u8view); -static bool sgrvalid(const char *); -static bool xisspace(char); -static int svposcmp(const void *, const void *); -static char *env_or_default(const char *, const char *); - -static int filecnt, rv; -static bool got_match; -static uint64_t flags = FLAGMSK('f') * GIT_GRAB; -static put_func *putf; - -static struct { - const char8_t *p, *bp; - size_t col, row; -} pos; - -static cmd_func *op_table[UCHAR_MAX] = { - ['g'] = cmdg, ['G'] = cmdg, ['h'] = cmdh, - ['H'] = cmdH, ['x'] = cmdx, ['X'] = cmdX, -}; - -[[noreturn]] static void -usage(const char *s) -{ - fprintf(stderr, -#if GIT_GRAB - "Usage: %s [-s | -z] [-bcinU] pattern [glob ...]\n" -#else - "Usage: %s [-s | -z] [-bcfinU] pattern [file ...]\n" -#endif - " %s -h\n", - s, s); - exit(EXIT_FAILURE); -} - -int -main(int argc, char **argv) -{ - int opt; - struct ops ops; - struct option longopts[] = { - {"byte-offset", no_argument, nullptr, 'b'}, - {"color", no_argument, nullptr, 'c'}, -#if GIT_GRAB - {"filenames", no_argument, nullptr, 'f'}, -#endif - {"help", no_argument, nullptr, 'h'}, - {"ignore-case", no_argument, nullptr, 'i'}, - {"newline", no_argument, nullptr, 'n'}, - {"strip-newline", no_argument, nullptr, 's'}, - {"no-unicode", no_argument, nullptr, 'U'}, - {"zero", no_argument, nullptr, 'z'}, - {nullptr, 0, nullptr, 0 }, - }; - -#if GIT_GRAB - char *entry = nullptr; - size_t len; - ssize_t nr; - FILE *flist; - const char *opts = "bchinsUz"; -#else - const char *opts = "bcfhinsUz"; -#endif - - argv[0] = basename(argv[0]); - if (argc < 2) - usage(argv[0]); - - setlocale(LC_ALL, ""); - - while ((opt = getopt_long(argc, argv, opts, longopts, nullptr)) != -1) { - switch (opt) { - case '?': - usage(argv[0]); - case 'h': - execlp("man", "man", "1", argv[0], nullptr); - die("execlp: man 1 %s", argv[0]); -#if !GRAB_DO_PCRE - case 'U': - errx(2, "program not built with PCRE support"); -#endif - default: - flags |= FLAGMSK(opt); - } - } - - if (FLAGSET('s') && FLAGSET('z')) - usage(argv[0]); - - argc -= optind; - argv += optind; - filecnt = argc - 1; - - if (!FLAGSET('c') && isatty(STDOUT_FILENO) == 1 - && !env_or_default("NO_COLOR", nullptr)) - { - flags |= FLAGMSK('c') * !streq(env_or_default("TERM", ""), "dumb"); - } - - putf = FLAGSET('c') ? putm : putm_nc; - ops = comppat(argv[0]); - -#if GIT_GRAB - if (!(flist = getfstream(argc - 1, argv + 1))) - die("getfstream"); - while ((nr = getdelim(&entry, &len, '\0', flist)) > 0) { - FILE *fp; - - if (!(fp = fopen(entry, "r"))) - warn("fopen: %s", entry); - else { - grab(ops, fp, entry); - fclose(fp); - } - } - if (ferror(flist)) - warn("getdelim"); - fclose(flist); -#else - if (argc == 1) - grab(ops, stdin, "-"); - else { - for (int i = 1; i < argc; i++) { - FILE *fp; - - if (streq(argv[i], "-")) { - grab(ops, stdin, "-"); - } else if (!(fp = fopen(argv[i], "r"))) { - warn("fopen: %s", argv[i]); - } else { - grab(ops, fp, argv[i]); - fclose(fp); - } - } - } -#endif - -#ifdef GRAB_DEBUG -# if GIT_GRAB - free(entry); -# endif - da_foreach (&ops, op) { - if (op->alloced) - regfree(&op->pat); - } - free(ops.buf); -#endif - - return got_match ? rv : EXIT_FAILURE; -} - -struct ops -comppat(char8_t *s) -{ - struct ops ops; - - dainit(&ops, 8); - while (*s && xisspace(*s)) - s++; - if (!*s) - diex("input string terminated prematurely"); - - do { - int w; - rune ch; - size_t len; - char8_t *p; - struct op op; - - /* Grab the operator and delimiter. All operators are ASCII, but - u8tor() is used to parse it so that we get properly formed error - messages when someone uses a non-ASCII operator. */ - w = u8tor(&ch, s); - if (ch == RUNE_ERROR) - diex("invalid UTF-8 sequence near ‘%02hhX’", s[-1]); - if (w > 1 || !op_table[ch]) - diex("invalid operator ‘%.*s’", w, s); - op.c = *s++; - - s += u8tor(&ch, s); - if (ch == RUNE_ERROR) - diex("invalid UTF-8 sequence near ‘%02hhX’", s[-1]); - if (ch == '\0') - diex("input string terminated prematurely"); - - /* Find the closing delimiter. The user is allowed to omit the closing - delimiter if this is the last operation in the query pattern. */ - p = s; - len = strlen(s); - if (!(s = u8chr(s, ch, len))) - s = p + len; - - if (s - p == 0) { - if (op.c != 'h') - diex("empty regex given to ‘%c’", op.c); - if (ops.len == 0) - diex("empty ‘h’ is not allowed as the first operator"); - op.pat = ops.buf[ops.len - 1].pat; - } else - op.pat = mkregex(p, s - p); - -#if GRAB_DEBUG - op.alloced = s - p == 0; -#endif - - dapush(&ops, op); - - if (*s) { - s += u8tor(&ch, s); - if (ch == RUNE_ERROR) - diex("invalid UTF-8 sequence near ‘%02hhX’", s[-1]); - } - while (*s && xisspace(*s)) - s++; - } while (*s); - - return ops; -} - -void -grab(struct ops ops, FILE *stream, const char *filename) -{ - size_t n; - struct { - char *buf; - size_t len, cap; - } chars = {0}; - - do { - static_assert(sizeof(char) == 1, "sizeof(char) != 1; wtf?"); - chars.cap += BUFSIZ; - if (!(chars.buf = realloc(chars.buf, chars.cap))) - die("realloc"); - chars.len += n = fread(chars.buf + chars.len, 1, BUFSIZ, stream); - } while (n == BUFSIZ); - - if (ferror(stream)) { - warn("fread: %s", filename); - goto out; - } - - const char8_t *p; - struct sv sv = { - .p = chars.buf, - .len = chars.len, - }; - struct matches ms; - - if (p = u8chk(chars.buf, chars.len)) { - warnx("%s: invalid UTF-8 near ‘%02X’", filename, *p); - goto out; - } - - dainit(&ms, 4); - pos.col = pos.row = 1; - pos.bp = pos.p = chars.buf; - op_table[(uchar)ops.buf[0].c](sv, &ms, ops, 0, filename); - free(ms.buf); - -out: - free(chars.buf); -} - -void -cmdg(struct sv sv, struct matches *ms, struct ops ops, size_t i, - const char *filename) -{ - int r; - regmatch_t rm = { - .rm_so = 0, - .rm_eo = sv.len, - }; - struct op op = ops.buf[i]; - - r = regexec(&op.pat, sv.p, 1, &rm, REG_STARTEND); - if ((r == REG_NOMATCH && op.c == 'g') || (r != REG_NOMATCH && op.c == 'G')) - return; - - if (i + 1 == ops.len) - putf(sv, ms, filename); - else - op_table[(uchar)ops.buf[i + 1].c](sv, ms, ops, i + 1, filename); -} - -void -cmdh(struct sv sv, struct matches *ms, struct ops ops, size_t i, - const char *filename) -{ - regmatch_t rm = { - .rm_so = 0, - .rm_eo = sv.len, - }; - struct op op = ops.buf[i]; - - do { - if (regexec(&op.pat, sv.p, 1, &rm, REG_STARTEND) == REG_NOMATCH) - break; - - if (rm.rm_so < rm.rm_eo) - dapush(ms, ((struct sv){sv.p + rm.rm_so, rm.rm_eo - rm.rm_so})); - else { - rune unused; - rm.rm_eo += u8tor_uc(&unused, sv.p + rm.rm_eo); - } - - rm = (regmatch_t){ - .rm_so = rm.rm_eo, - .rm_eo = sv.len, - }; - } while (rm.rm_so < rm.rm_eo); - - if (i + 1 == ops.len) - putf(sv, ms, filename); - else { - size_t save = ms->len; - op_table[(uchar)ops.buf[i + 1].c](sv, ms, ops, i + 1, filename); - ms->len = save; - } -} - -void -cmdH(struct sv sv, struct matches *ms, struct ops ops, size_t i, - const char *filename) -{ - regmatch_t rm = { - .rm_so = 0, - .rm_eo = sv.len, - }; - regmatch_t prev = { - .rm_so = 0, - .rm_eo = 0, - }; - struct op op = ops.buf[i]; - - do { - struct sv nsv; - - if (regexec(&op.pat, sv.p, 1, &rm, REG_STARTEND) == REG_NOMATCH) - break; - - if (prev.rm_so || prev.rm_eo || rm.rm_so) { - nsv = (struct sv){ - .p = sv.p + prev.rm_eo, - .len = rm.rm_so - prev.rm_eo, - }; - if (nsv.len) - dapush(ms, nsv); - } - - prev = rm; - if (rm.rm_so == rm.rm_eo) { - rune unused; - rm.rm_eo += u8tor_uc(&unused, sv.p + rm.rm_eo); - } - rm = (regmatch_t){ - .rm_so = rm.rm_eo, - .rm_eo = sv.len, - }; - } while (rm.rm_so < rm.rm_eo); - - if (prev.rm_eo < rm.rm_eo) - dapush(ms, ((struct sv){sv.p + rm.rm_so, rm.rm_eo - rm.rm_so})); - - if (i + 1 == ops.len) - putf(sv, ms, filename); - else - op_table[(uchar)ops.buf[i + 1].c](sv, ms, ops, i + 1, filename); -} - -void -cmdx(struct sv sv, struct matches *ms, struct ops ops, size_t i, - const char *filename) -{ - regmatch_t rm = { - .rm_so = 0, - .rm_eo = sv.len, - }; - struct op op = ops.buf[i]; - - do { - struct sv nsv; - - if (regexec(&op.pat, sv.p, 1, &rm, REG_STARTEND) == REG_NOMATCH) - break; - if (rm.rm_so < rm.rm_eo) { - nsv = (struct sv){ - .p = sv.p + rm.rm_so, - .len = rm.rm_eo - rm.rm_so, - }; - if (i + 1 == ops.len) - putf(nsv, ms, filename); - else { - size_t save = ms->len; - op_table[(uchar)ops.buf[i + 1].c](nsv, ms, ops, i + 1, - filename); - ms->len = save; - } - } else { - rune unused; - rm.rm_eo += u8tor_uc(&unused, sv.p + rm.rm_eo); - } - rm = (regmatch_t){ - .rm_so = rm.rm_eo, - .rm_eo = sv.len, - }; - } while (rm.rm_so < rm.rm_eo); -} - -void -cmdX(struct sv sv, struct matches *ms, struct ops ops, size_t i, - const char *filename) -{ - regmatch_t rm = { - .rm_so = 0, - .rm_eo = sv.len, - }; - regmatch_t prev = { - .rm_so = 0, - .rm_eo = 0, - }; - struct op op = ops.buf[i]; - - do { - struct sv nsv; - - if (regexec(&op.pat, sv.p, 1, &rm, REG_STARTEND) == REG_NOMATCH) - break; - - if (prev.rm_so || prev.rm_eo || rm.rm_so) { - nsv = (struct sv){ - .p = sv.p + prev.rm_eo, - .len = rm.rm_so - prev.rm_eo, - }; - if (nsv.len) { - if (i + 1 == ops.len) - putf(nsv, ms, filename); - else - op_table[(uchar)ops.buf[i + 1].c](nsv, ms, ops, i + 1, - filename); - } - } - - prev = rm; - if (rm.rm_so == rm.rm_eo) { - rune unused; - rm.rm_eo += u8tor_uc(&unused, sv.p + rm.rm_eo); - } - rm = (regmatch_t){ - .rm_so = rm.rm_eo, - .rm_eo = sv.len, - }; - } while (rm.rm_so < rm.rm_eo); - - if (prev.rm_eo < rm.rm_eo) { - struct sv nsv = { - .p = sv.p + rm.rm_so, - .len = rm.rm_eo - rm.rm_so, - }; - if (i + 1 == ops.len) - putf(nsv, ms, filename); - else - op_table[(uchar)ops.buf[i + 1].c](nsv, ms, ops, i + 1, filename); - } -} - -int -svposcmp(const void *a, const void *b) -{ - struct sv *A, *B; - A = (struct sv *)a; - B = (struct sv *)b; - return A->p != B->p ? A->p - B->p : A->len < B->len ? -1 : A->len != B->len; -} - -void -putm(struct sv sv, struct matches *ms, const char *filename) -{ - const char8_t *p; - struct matches valid; - static const char *fn, *hl, *ln, *se; - - got_match = true; - - if (FLAGSET('c') && !fn) { - char *optstr; - if ((optstr = env_or_default("GRAB_COLORS", nullptr))) { - enum { - OPT_FN, - OPT_HL, - OPT_LN, - OPT_SE, - }; - /* clang-format off */ - static char *const tokens[] = { - [OPT_FN] = "fn", - [OPT_HL] = "hl", - [OPT_LN] = "ln", - [OPT_SE] = "se", - nullptr - }; - /* clang-format on */ - - while (*optstr) { - char *val; - switch (getsubopt(&optstr, tokens, &val)) { - case OPT_FN: - if (sgrvalid(val)) - fn = val; - break; - case OPT_HL: - if (sgrvalid(val)) - hl = val; - break; - case OPT_LN: - if (sgrvalid(val)) - fn = val; - break; - case OPT_SE: - if (sgrvalid(val)) - se = val; - break; - default: - warnx("invalid color value -- '%s'", val); - } - } - } - - if (!fn) - fn = DEFCOL_FN; - if (!hl) - hl = DEFCOL_HL; - if (!ln) - ln = DEFCOL_LN; - if (!se) - se = DEFCOL_SE; - } - - if (FLAGSET('f') || filecnt > 1) { - char sep = FLAGSET('z') ? '\0' : ':'; - printf("\33[%sm%s\33[0m" /* filename */ - "\33[%sm%c\33[0m", /* separator */ - fn, filename, se, sep); - - if (FLAGSET('b')) { - printf("\33[%sm%td\33[0m" /* byte offset */ - "\33[%sm%c\33[0m", /* separator */ - ln, sv.p - pos.bp, se, sep); - } else { - struct u8view v; - size_t len = sv.p - pos.p; - - while (u8gnext(&v, &pos.p, &len)) { - if (islbrk(v)) { - pos.col = 1; - pos.row++; - } else - pos.col++; - } - - printf("\33[%sm%zu\33[0m" /* row */ - "\33[%sm%c\33[0m" /* separator */ - "\33[%sm%zu\33[0m" /* column */ - "\33[%sm%c\33[0m", /* separator */ - ln, pos.row, se, sep, ln, pos.col, se, sep); - } - } - - /* Here we need to take all the views of regions to highlight, and try - to merge them into a simpler form. This happens in two steps: - - 1. Sort the views by their starting position in the matched text. - 2. Merge overlapping views. - - After this process we should have the most reduced possible set of - views. The next part is to actually print the highlighted regions - possible which requires bounds-checking as highlighted regions may - begin before or end after the matched text when using patterns such - as ‘h/.+/ x/.$/’. */ - - dainit(&valid, ms->len); - qsort(ms->buf, ms->len, sizeof(*ms->buf), svposcmp); - memcpy(valid.buf, ms->buf, ms->len * sizeof(*ms->buf)); - valid.len = ms->len; - - for (size_t i = 0; i + 1 < valid.len;) { - ptrdiff_t d; - struct sv *a, *b; - - a = valid.buf + i; - b = valid.buf + i + 1; - d = a->p + a->len - b->p; - - if (d >= 0) { - a->len += MAX(b->len - d, 0); - daremove(&valid, i + 1); - } else - i++; - } - - for (size_t i = 0; i < valid.len; i++) { - struct sv *m = valid.buf + i; - if (m->p + m->len < sv.p || m->p >= sv.p + sv.len) { - daremove(&valid, i); - i--; - continue; - } - - if (m->p < sv.p) { - m->len -= sv.p - m->p; - m->p = sv.p; - } - m->len = MIN(m->len, (size_t)(sv.p + sv.len - m->p)); - } - - p = sv.p; - da_foreach (&valid, m) { - printf("%.*s\33[%sm%.*s\33[0m", (int)(m->p - p), p, hl, (int)m->len, - m->p); - p = m->p + m->len; - } - fwrite(p, 1, sv.p + sv.len - p, stdout); - - if (!(FLAGSET('s') && sv.p[sv.len - 1] == '\n')) - putchar(FLAGSET('z') ? '\0' : '\n'); - free(valid.buf); -} - -void -putm_nc(struct sv sv, struct matches *ms, const char *filename) -{ - (void)ms; - - got_match = true; - - if (FLAGSET('f') || filecnt > 1) { - char sep = FLAGSET('z') ? '\0' : ':'; - printf("%s%c", filename, sep); - - if (FLAGSET('b')) - printf("%td%c", sv.p - pos.bp, sep); - else { - struct u8view v; - size_t len = sv.p - pos.p; - - while (u8gnext(&v, &pos.p, &len)) { - if (islbrk(v)) { - pos.col = 1; - pos.row++; - } else - pos.col++; - } - - printf("%zu%c%zu%c", pos.row, sep, pos.col, sep); - } - } - fwrite(sv.p, 1, sv.len, stdout); - if (!(FLAGSET('s') && sv.p[sv.len - 1] == '\n')) - putchar(FLAGSET('z') ? '\0' : '\n'); -} - -bool -islbrk(struct u8view v) -{ - return *v.p == '\n' || (v.len == 2 && memeq(v.p, "\r\n", 2)); -} - -bool -sgrvalid(const char *s) -{ - if (!s || !*s) - return false; - do { - if ((*s < '0' || *s > '9') && *s != ';') - return false; - } while (*++s); - - return true; -} - -regex_t -mkregex(char8_t *s, size_t n) -{ - int ret, cflags; - regex_t r; - char8_t c = s[n]; - - s[n] = 0; - cflags = REG_EXTENDED | REG_UTF | (FLAGSET('n') ? REG_NEWLINE : REG_DOTALL); - if (FLAGSET('i')) - cflags |= REG_ICASE; - if (!FLAGSET('U')) - cflags |= REG_UCP; - if (ret = regcomp(&r, s, cflags)) { - char emsg[256]; - regerror(ret, &r, emsg, sizeof(emsg)); - diex("failed to compile regex ‘%s’: %s", s, emsg); - } - s[n] = c; - - return r; -} - -#if GIT_GRAB -FILE * -getfstream(int argc, char *argv[argc]) -{ - pid_t pid; - int fds[2]; - enum { - FD_R, - FD_W, - }; - - if (pipe(fds) == -1) - die("pipe"); - - switch (pid = fork()) { - case -1: - die("fork"); - case 0:; - size_t len; - char **args; - static const char *git_grep_args[] = { - "git", "grep", "--cached", "-Ilz", "", - }; - - len = argc + lengthof(git_grep_args) + 1; - - close(fds[FD_R]); - if (dup2(fds[FD_W], STDOUT_FILENO) == -1) - die("dup2"); - close(fds[FD_W]); - - if (!(args = malloc(len * sizeof(char *)))) - die("malloc"); - memcpy(args, git_grep_args, sizeof(git_grep_args)); - memcpy(args + 5, argv, argc * sizeof(char *)); - args[len - 1] = nullptr; - - execvp("git", args); - die("execvp: git grep --cached -Ilz ''"); - } - - close(fds[FD_W]); - return fdopen(fds[FD_R], "r"); -} -#endif - -char * -env_or_default(const char *e, const char *d) -{ - const char *s = getenv(e); - return (char *)(s && *s ? s : d); -} - -bool -xisspace(char c) -{ - return c == ' ' || c == '\t' || c == '\n'; -} -- cgit v1.2.3