From 042e43247f396a9000fead59d9bff87bf12806d6 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Wed, 30 Oct 2024 01:51:14 +0100 Subject: Completely revamp the grab source code Some of the (many) few changes are: - Multithreading for significantly faster performance - The -p/--predicate flag - Byte offsets as the default - No customizable colors (maybe this will come back later) - Newer edition of mlib (formerly librune) --- src/work.c | 453 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 453 insertions(+) create mode 100644 src/work.c (limited to 'src/work.c') diff --git a/src/work.c b/src/work.c new file mode 100644 index 0000000..37fd8b8 --- /dev/null +++ b/src/work.c @@ -0,0 +1,453 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "exitcodes.h" +#include "flags.h" +#include "work.h" + +#define DEFINE_OPERATOR(fn) \ + void operator_##fn(ptrdiff_t opi, u8view_t sv, u8view_t **hl) +#define array_extend_sv(xs, sv) \ + array_extend((xs), (sv).p, (ptrdiff_t)(sv).len) + +typedef struct { + ptrdiff_t row, col; +} pos_t; + +static pos_t compute_pos(const char8_t *p); +static bool islbrk(u8view_t g); +static int svposcmp(const void *a, const void *b); +static void write_match_to_buffer(u8view_t sv, u8view_t *hl); + +static DEFINE_OPERATOR(dispatch); +static DEFINE_OPERATOR(g); +static DEFINE_OPERATOR(G); +static DEFINE_OPERATOR(h); +static DEFINE_OPERATOR(H); +static DEFINE_OPERATOR(x); +static DEFINE_OPERATOR(X); + +static thread_local const char *filename; +static thread_local char8_t *baseptr; +static thread_local const char8_t *last_match; +static thread_local unsigned char **buf; + +static typeof(operator_dispatch) *operators[] = { + ['g'] = operator_g, + ['G'] = operator_G, + ['h'] = operator_h, + ['H'] = operator_H, + ['x'] = operator_x, + ['X'] = operator_X, +}; + +extern atomic_int rv; +extern op_t *ops; +extern bool cflag; +extern typeof(pcre2_match) *pcre2_match_fn; + + + +void +process_file(const char *locl_filename, unsigned char **locl_buf) +{ + filename = locl_filename; + buf = locl_buf; + + FILE *fp = streq(filename, "-") ? stdin : fopen(filename, "r"); + if (fp == nullptr) { + warn("fopen: %s:", filename); + atomic_store(&rv, EXIT_WARNING); + goto out; + } + + allocator_t mem = init_heap_allocator(nullptr); + if (baseptr == nullptr) + baseptr = array_new(mem, char8_t, 0x1000); + size_t bufsz = array_cap(baseptr); + last_match = baseptr; + + do { + static_assert(sizeof(char8_t) == 1, "sizeof(char8_t) != 1; wtf?"); + baseptr = array_resz(baseptr, bufsz += BUFSIZ); /* TODO: Bounds checking */ + size_t n = fread(baseptr + array_len(baseptr), 1, BUFSIZ, fp); + array_hdr(baseptr)->len += n; + } while (!feof(fp)); + + if (ferror(fp)) { + warn("fread: %s:", filename); + atomic_store(&rv, EXIT_WARNING); + goto out; + } + + /* Shouldn’t need more than 32 ever… */ + static thread_local u8view_t *hl = nullptr; + if (hl == nullptr) + hl = array_new(mem, typeof(*hl), 32); + + operator_dispatch(0, (u8view_t){baseptr, array_len(baseptr)}, &hl); +#if DEBUG + array_free(baseptr); + baseptr = nullptr; + array_free(hl); + hl = nullptr; +#else + array_hdr(baseptr)->len = 0; + array_hdr(hl)->len = 0; +#endif + +out: + if (fp != stdin) + (void)fclose(fp); +} + + + +DEFINE_OPERATOR(dispatch) +{ + if (array_len(ops) == opi) { + if (flags.p) + exit(EXIT_SUCCESS); + atomic_compare_exchange_strong(&rv, &(int){EXIT_NOMATCH}, EXIT_SUCCESS); + write_match_to_buffer(sv, *hl); + } else /* Cast to silence GCC warning */ + operators[(unsigned char)ops[opi].c](opi, sv, hl); +} + +DEFINE_OPERATOR(g) +{ + pcre2_match_data *md = + pcre2_match_data_create_from_pattern(ops[opi].re, nullptr); + int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY, + md, nullptr); + pcre2_match_data_free(md); + + /* This should never happen */ + if (n == 0) + cerr(EXIT_FATAL, "PCRE2 match data too small"); + if (n == PCRE2_ERROR_NOMATCH) + return; + if (n < 0) + ; /* TODO: Handle error */ + + operator_dispatch(opi + 1, sv, hl); +} + +DEFINE_OPERATOR(G) +{ + /* TODO: Can we reuse match data? */ + pcre2_match_data *md = + pcre2_match_data_create_from_pattern(ops[opi].re, nullptr); + int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY, + md, nullptr); + pcre2_match_data_free(md); + + /* This should never happen */ + if (n == 0) + cerr(EXIT_FATAL, "PCRE2 match data too small"); + if (n == PCRE2_ERROR_NOMATCH) + operator_dispatch(opi + 1, sv, hl); + if (n < 0) + ; /* TODO: Handle error */ +} + +DEFINE_OPERATOR(h) +{ + if (flags.p) { + operator_dispatch(opi + 1, sv, hl); + return; + } + + pcre2_match_data *md = + pcre2_match_data_create_from_pattern(ops[opi].re, nullptr); + u8view_t sv_save = sv; + ptrdiff_t origlen = array_len(*hl); + for (;;) { + int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0, + PCRE2_NOTEMPTY, md, nullptr); + /* This should never happen */ + if (n == 0) + cerr(EXIT_FATAL, "PCRE2 match data too small"); + if (n == PCRE2_ERROR_NOMATCH) + break; + if (n < 0) + ; /* TODO: Handle error */ + + size_t *ov = pcre2_get_ovector_pointer(md); + array_push(hl, ((u8view_t){sv.p + ov[0], ov[1] - ov[0]})); + VSHFT(&sv, ov[1]); + } + pcre2_match_data_free(md); + operator_dispatch(opi + 1, sv_save, hl); + array_hdr(*hl)->len = origlen; +} + +DEFINE_OPERATOR(H) +{ + if (flags.p) { + operator_dispatch(opi + 1, sv, hl); + return; + } + + pcre2_match_data *md = + pcre2_match_data_create_from_pattern(ops[opi].re, nullptr); + u8view_t sv_save = sv; + ptrdiff_t origlen = array_len(*hl); + for (;;) { + int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY, + md, nullptr); + /* This should never happen */ + if (n == 0) + cerr(EXIT_FATAL, "PCRE2 match data too small"); + if (n == PCRE2_ERROR_NOMATCH) + break; + if (n < 0) + ; /* TODO: Handle error */ + + size_t *ov = pcre2_get_ovector_pointer(md); + array_push(hl, ((u8view_t){sv.p, ov[0]})); + VSHFT(&sv, ov[1]); + } + pcre2_match_data_free(md); + operator_dispatch(opi + 1, sv_save, hl); + array_hdr(*hl)->len = origlen; +} + +DEFINE_OPERATOR(x) +{ + pcre2_match_data *md = + pcre2_match_data_create_from_pattern(ops[opi].re, nullptr); + for (;;) { + int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY, + md, nullptr); + /* This should never happen */ + if (n == 0) + cerr(EXIT_FATAL, "PCRE2 match data too small"); + if (n == PCRE2_ERROR_NOMATCH) + break; + if (n < 0) + ; /* TODO: Handle error */ + + size_t *ov = pcre2_get_ovector_pointer(md); + operator_dispatch(opi + 1, (u8view_t){sv.p + ov[0], ov[1] - ov[0]}, hl); + VSHFT(&sv, ov[1]); + } + pcre2_match_data_free(md); +} + +DEFINE_OPERATOR(X) +{ + pcre2_match_data *md = + pcre2_match_data_create_from_pattern(ops[opi].re, nullptr); + for (;;) { + int n = pcre2_match_fn(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY, + md, nullptr); + /* This should never happen */ + if (n == 0) + cerr(EXIT_FATAL, "PCRE2 match data too small"); + if (n == PCRE2_ERROR_NOMATCH) + break; + if (n < 0) + ; /* TODO: Handle error */ + + size_t *ov = pcre2_get_ovector_pointer(md); + if (ov[0] != 0) + operator_dispatch(opi + 1, (u8view_t){sv.p, ov[0]}, hl); + VSHFT(&sv, ov[1]); + } + if (sv.len != 0) + operator_dispatch(opi + 1, sv, hl); + pcre2_match_data_free(md); +} + + + +static inline bool +views_overlap(const u8view_t a, const u8view_t b) +{ + const char8_t *p = a.p + a.len; + return p >= b.p && p <= b.p + b.len; +} + +void +write_match_to_buffer(u8view_t sv, u8view_t *hl) +{ + const u8view_t COL_FN = !flags.c ? U8("") : U8("\33[35m"); + const u8view_t COL_HL = !flags.c ? U8("") : U8("\33[01;31m"); + const u8view_t COL_LN = !flags.c ? U8("") : U8("\33[32m"); + const u8view_t COL_SE = !flags.c ? U8("") : U8("\33[36m"); + const u8view_t COL_RS = !flags.c ? U8("") : U8("\33[0m"); + + if ( +#if GIT_GRAB + true +#else + flags.do_header +#endif + ) { + char sep = flags.z ? 0 : ':'; + + size_t filenamesz = strlen(filename); + + array_extend_sv(buf, COL_FN); + array_extend(buf, filename, (ptrdiff_t)filenamesz); + array_extend_sv(buf, COL_RS); + + array_extend_sv(buf, COL_SE); + array_push(buf, sep); + array_extend_sv(buf, COL_RS); + + /* GCC things ‘offset’ can overflow because our offsets have type + ptrdiff_t which if negative would have a ‘-’ in the front, but + we know that the match positions can’t be negative so it’s + safe to ignore. */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat-overflow" + + int offsetsz; + char offset[/* len(INT64_MAX - 1) */ 19]; + if (flags.l) { + pos_t p = compute_pos(sv.p); + + offsetsz = sprintf(offset, "%td", p.row + 1); + array_extend_sv(buf, COL_LN); + array_extend(buf, offset, offsetsz); + array_extend_sv(buf, COL_RS); + + array_extend_sv(buf, COL_SE); + array_push(buf, sep); + array_extend_sv(buf, COL_RS); + + offsetsz = sprintf(offset, "%td", p.col + 1); + array_extend_sv(buf, COL_LN); + array_extend(buf, offset, offsetsz); + array_extend_sv(buf, COL_RS); + } else { + offsetsz = sprintf(offset, "%td", sv.p - baseptr); + array_extend_sv(buf, COL_LN); + array_extend(buf, offset, offsetsz); + array_extend_sv(buf, COL_RS); + } + + array_extend_sv(buf, COL_SE); + array_push(buf, sep); + array_extend_sv(buf, COL_RS); + } + +#pragma GCC diagnostic pop + + /* Here we need to take all the views of regions to highlight, and try + to merge them into a simpler form. This happens in two steps: + + 1. Sort the views by their starting position in the matched text. + 2. Merge overlapping views. + + After this process we should have the most reduced possible set of + views. The next part is to actually print the highlighted regions + possible which requires bounds-checking as highlighted regions may + begin before or end after the matched text when using patterns such + as ‘h/.+/ x/.$/’. */ + + static thread_local u8view_t *sorted = nullptr; + if (sorted == nullptr) { + allocator_t mem = init_heap_allocator(nullptr); + ptrdiff_t buflen = array_len(hl); + buflen = MAX(buflen, 16); + sorted = array_new(mem, typeof(*sorted), buflen); + } else + array_hdr(sorted)->len = 0; + + array_extend(&sorted, hl, array_len(hl)); + qsort(sorted, array_len(sorted), sizeof(*sorted), svposcmp); + + for (ptrdiff_t i = 0, len = array_len(sorted); i < len - 1;) { + if (views_overlap(sorted[i], sorted[i + 1])) { + sorted[i].len = sorted[i + 1].p + sorted[i + 1].len - sorted[i].p; + memmove(hl + i + 1, hl + i + 2, sizeof(*hl) * (len - i - 1)); + array_hdr(sorted)->len = --len; + } else + i++; + } + + for (ptrdiff_t i = 0, len = array_len(sorted); i < len; i++) { + if (i < len - 1 && sorted[i].p == sorted[i + 1].p) + continue; + array_extend(buf, sv.p, sorted[i].p - sv.p); + array_extend_sv(buf, COL_HL); + array_extend_sv(buf, sorted[i]); + array_extend_sv(buf, COL_RS); + ptrdiff_t Δ = sorted[i].p - sv.p + sorted[i].len; + VSHFT(&sv, Δ); + } + array_extend_sv(buf, sv); + +#if DEBUG + array_free(sorted); + sorted = nullptr; +#endif + + if (flags.z) + array_push(buf, 0); + else { + ptrdiff_t bufsz = array_len(*buf); + if (!flags.s || bufsz == 0 || (*buf)[bufsz - 1] != '\n') + array_push(buf, '\n'); + } +} + +pos_t +compute_pos(const char8_t *ptr) +{ + static thread_local pos_t p; + if (last_match == baseptr) + p.row = p.col = 0; + u8view_t g, sv = {last_match, PTRDIFF_MAX}; + while (sv.p < ptr) { + ucsgnext(&g, &sv); + if (islbrk(g)) { + p.row++; + p.col = 0; + } else + p.col = ucswdth(g, p.col, 8); /* TODO: Configurable tabsize? */ + } + last_match = sv.p; + return p; +} + +bool +islbrk(u8view_t g) +{ + return ucseq(g, U8("\n")) + || ucseq(g, U8("\v")) + || ucseq(g, U8("\f")) + || ucseq(g, U8("\r\n")) + || ucseq(g, U8("\x85")) + || ucseq(g, U8("\u2028")) + || ucseq(g, U8("\u2029")); +} + +int +svposcmp(const void *a_, const void *b_) +{ + const u8view_t *a = a_, + *b = b_; + ptrdiff_t Δ = a->p - b->p; + return Δ == 0 ? (ptrdiff_t)a->len - (ptrdiff_t)b->len : Δ; +} -- cgit v1.2.3