diff options
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | LICENSE | 14 | ||||
-rw-r--r-- | Makefile | 15 | ||||
-rw-r--r-- | README.md | 55 | ||||
-rw-r--r-- | da.h | 64 | ||||
-rw-r--r-- | grab.1 | 25 | ||||
-rw-r--r-- | grab.c | 274 |
7 files changed, 449 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..aa21f6a --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.clang-format +grab @@ -0,0 +1,14 @@ +BSD Zero Clause License + +Copyright © 2023 Thomas Voss + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH +REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, +INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..344fb30 --- /dev/null +++ b/Makefile @@ -0,0 +1,15 @@ +.POSIX: + +WARNINGS = -Wall -Wextra -Werror -Wpedantic + +CC = cc +CFLAGS = $(WARNINGS) -pipe -O3 -march=native -mtune=native + +all: grab +grab: grab.c + +debug: + $(CC) $(WARNINGS) -DGRAB_DEBUG -g -ggdb3 -o grab grab.c + +clean: + rm -f grab diff --git a/README.md b/README.md new file mode 100644 index 0000000..14e5546 --- /dev/null +++ b/README.md @@ -0,0 +1,55 @@ +# Grab — A better grep + +Grab is a more powerful version of the well-known Grep utility, making +use of structural regular expressions as described by Rob Pike in [this +paper][1]. Grab allows you to be far more precise with your searching +than Grep, as it doesn’t constrain itself to working only on individual +lines. + +Grab invocations must include a pattern string which specifies which text +to match. A pattern string consists of one or more commands. A command +is an operator followed by a delimiter, a regular expression (regex), and +then terminated by the same delimiter. The last delimiter of the last +command is optional.
+ +For example, a pattern string may look like ‘`x/[a-z]+/ g.foo. v/bar/`’. + +The available operators are ‘g’, ‘v’, and ‘x’. The ‘x’ operator iterates +over all matches of the corresponding regex. This means that to print +all numbers in a file, you can use the pattern string ‘`x/[0-9]+/`’. The +‘g’ and ‘v’ operators are filters. The ‘g’ operator discards all results +that don’t match the given regex, while the ‘v’ operator discards all +results that *do* match the given regex. This means that to select all +numbers in a file that contain a ‘3’ but are not ‘1337’, you can use the +pattern string ‘`x/[0-9]+/ g/3/ v/^1337$/`’. + + +## Examples + +Get a list of your CPU flags. + +```sh +# With Grep +grep '^flags' /proc/cpuinfo \ +| sed 's/flags:\t*: //; y/ /\n/' \ +| sort \ +| uniq + +# With Grab +grab 'x/^flags.*/ x/\w+/ v/flags/' /proc/cpuinfo \ +| sort \ +| uniq +``` + +1) Select lines that start with ‘flags’: `x/^flags.*/` +2) Select all the words: `x/\w+/` +3) Filter out the word ‘flags’: `v/flags/` + + +## Additional Options + +The Grab utility has a few options that may be helpful for your use case. +For more detailed documentation, see the Grab manual with `man grab`. + + +[1]: https://doc.cat-v.org/bell_labs/structural_regexps/se.pdf @@ -0,0 +1,64 @@ +/* + * Simple & stupid dynamic array single-header implementation. You can use the + * macros defined in this file with any structure that has the following fields: + * + * struct dyn_array { + * T *buf // Array of items + * N len // Length of array + * N cap // Capacity of array + * } + * + * The type ‘T’ is whatever type you want to store. The type ‘N’ is any numeric + * type, most likely ‘size_t’. + * + * You should include ‘err.h’ and ‘stdlib.h’ along with this file. If you want + * to use da_remove(), include ‘string.h’. The da_remove() macro also doesn’t + * bother with shrinking your array when the length is far lower than the + * capacity. If you care about that, do it yourself.
+ * + * + * Macro Overview + * ―――――――――――――― + * The argument ‘a’ to all of the below macros is a pointer to the dynamic array + * structure. + * + * da_init(a, n) Initialize the array with a capacity of ‘n’ items. + * da_append(a, x) Append the item ‘x’ to the array + * da_remove(a, x) Remove the item ‘x’ from the array + * da_remove_range(a, x, y) Remove the items between the range [x, y) + */ + +#ifndef MANGO_DA_H +#define MANGO_DA_H + +#define __da_s(a) (sizeof(*(a)->buf)) + +#define da_init(a, n) \ + do { \ + (a)->cap = n; \ + (a)->len = 0; \ + (a)->buf = malloc((a)->cap * __da_s(a)); \ + if ((a)->buf == NULL) \ + err(EXIT_FAILURE, "malloc"); \ + } while (0) + +#define da_append(a, x) \ + do { \ + if ((a)->len >= (a)->cap) { \ + (a)->cap = (a)->cap * 2 + 1; \ + (a)->buf = realloc((a)->buf, (a)->cap * __da_s(a)); \ + if ((a)->buf == NULL) \ + err(EXIT_FAILURE, "realloc"); \ + } \ + (a)->buf[(a)->len++] = (x); \ + } while (0) + +#define da_remove(a, i) da_remove_range((a), (i), (i) + 1) + +#define da_remove_range(a, i, j) \ + do { \ + memmove((a)->buf + (i), (a)->buf + (j), ((a)->len - (j)) * __da_s(a)); \ + (a)->len -= j - i; \ + } while (0) + +#endif /* !MANGO_DA_H */ @@ -0,0 +1,25 @@ +.Dd December 9 2023 +.Dt GRAB 1 +.Os +.Sh NAME +.Nm grab +.Nd search for patterns in files +.Sh SYNOPSIS +.Nm +.Op Fl d Ar string +.Ar pattern +.Op Ar +.Sh DESCRIPTION +.Sh EXIT STATUS +.Ex -std +.Sh EXAMPLES +.Sh SEE ALSO +.Xr grep 1 +.Rs +.%A Rob Pike +.%D 1987 +.%T Structural Regular Expressions +.%U https://doc.cat-v.org/bell_labs/structural_regexps/se.pdf +.Re +.Sh AUTHORS +.An Thomas Voss Aq Mt mail@thomasvoss.com @@ -0,0 +1,274 @@ +#include <assert.h> +#include <err.h> +#include <limits.h> +#include <locale.h> +#include <regex.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "da.h" + +#ifndef REG_STARTEND +# error "REG_STARTEND not defined" +#endif + +#define die(...) 
err(EXIT_FAILURE, __VA_ARGS__); +#define diex(...) errx(EXIT_FAILURE, __VA_ARGS__); + +#define EEARLY "Input string terminated prematurely" + +struct op { + char c; + regex_t pat; +}; + +struct ops { + struct op *buf; + size_t len, cap; +}; + +struct chars { + char *buf; + size_t len, cap; +}; + +struct sv { + char *p; + size_t len; +}; + +typedef unsigned char uchar; +typedef void (*cmd_func)(struct sv, struct ops, size_t); + +static void cmdg(struct sv, struct ops, size_t); +static void cmdx(struct sv, struct ops, size_t); + +static void grab(struct ops, FILE *, const char *); +static void putsv(struct sv); +static regex_t mkregex(char *, size_t); +static struct ops comppat(char *); + +static bool xisspace(char); +static char *xstrchrnul(const char *, char); + +static int rv = EXIT_SUCCESS; +static const char *delim = "\n"; +static const cmd_func op_table[UCHAR_MAX] = { + ['g'] = cmdg, + ['v'] = cmdg, + ['x'] = cmdx, + // ['y'] = cmdy, +}; + +static void +usage(const char *s) +{ + fprintf(stderr, "Usage: %s [-d string] pattern [file ...]\n", s); + exit(EXIT_FAILURE); +} + +int +main(int argc, char **argv) +{ + int rv, opt; + struct ops ops; + + if (argc < 2) + usage(argv[0]); + + setlocale(LC_ALL, ""); + + while ((opt = getopt(argc, argv, "d:")) != -1) { + switch (opt) { + case 'd': + delim = optarg; + break; + default: + usage(argv[0]); + } + } + + argc -= optind; + argv += optind; + + ops = comppat(argv[0]); + if (argc == 1) + grab(ops, stdin, "-"); + else { + for (int i = 1; i < argc; i++) { + FILE *fp; + + if (strcmp(argv[i], "-") == 0) { + grab(ops, stdin, "-"); + } else if ((fp = fopen(argv[i], "r")) == NULL) { + warn("fopen: %s", argv[i]); + rv = EXIT_FAILURE; + } else { + grab(ops, fp, argv[i]); + fclose(fp); + } + } + } + +#ifdef GRAB_DEBUG + for (size_t i = 0; i < ops.len; i++) + regfree(&ops.buf[i].pat); + free(ops.buf); +#endif + + return rv; +} + +struct ops +comppat(char *s) +{ +#define skip_ws(p) for (; *(p) && xisspace(*(p)); (p)++) + struct ops 
ops; + + da_init(&ops, 8); + skip_ws(s); + if (!*s) + diex(EEARLY); + + do { + char delim; + char *p; + struct op op; + + op.c = *s; + if (!op_table[(uchar)op.c]) + diex("Invalid operator ‘%c’", *s); + if (!(delim = *++s)) + diex(EEARLY); + + p = ++s; + s = xstrchrnul(s, delim); + op.pat = mkregex(p, s - p); + da_append(&ops, op); + + if (*s) + s++; + skip_ws(s); + } while (*s && *(s + 1)); + + return ops; +#undef skip_ws +} + +void +grab(struct ops ops, FILE *stream, const char *filename) +{ + size_t n; + struct chars chars = {0}; + + do { + static_assert(sizeof(char) == 1, "sizeof(char) != 1; wtf?"); + chars.cap += BUFSIZ; + if ((chars.buf = realloc(chars.buf, chars.cap)) == NULL) + die("realloc"); + chars.len += n = fread(chars.buf + chars.len, 1, BUFSIZ, stream); + } while (n == BUFSIZ); + + if (ferror(stream)) { + warn("fread: %s", filename); + rv = EXIT_FAILURE; + } else + cmdx((struct sv){.p = chars.buf, .len = chars.len}, ops, 0); + + free(chars.buf); +} + +void +cmdx(struct sv sv, struct ops ops, size_t i) +{ + regmatch_t pm = { + .rm_so = 0, + .rm_eo = sv.len, + }; + struct op op = ops.buf[i]; + + do { + struct sv nsv; + + if (regexec(&op.pat, sv.p, 1, &pm, REG_STARTEND) == REG_NOMATCH) + break; + nsv = (struct sv){.p = sv.p + pm.rm_so, .len = pm.rm_eo - pm.rm_so}; + if (i + 1 == ops.len) + putsv(nsv); + else + op_table[(uchar)ops.buf[i + 1].c](nsv, ops, i + 1); + + if (pm.rm_so == pm.rm_eo) + pm.rm_eo++; + pm = (regmatch_t){ + .rm_so = pm.rm_eo, + .rm_eo = sv.len, + }; + } while (pm.rm_so < pm.rm_eo); +} + +void +cmdg(struct sv sv, struct ops ops, size_t i) +{ + int r; + regmatch_t pm = { + .rm_so = 0, + .rm_eo = sv.len, + }; + struct op op = ops.buf[i]; + + r = regexec(&op.pat, sv.p, 1, &pm, REG_STARTEND); + if ((r == REG_NOMATCH && op.c == 'g') || (r != REG_NOMATCH && op.c == 'v')) + return; + + if (i + 1 == ops.len) + putsv(sv); + else + op_table[(uchar)ops.buf[i + 1].c](sv, ops, i + 1); +} + +void +putsv(struct sv sv) +{ + fwrite(sv.p, 1, sv.len, 
stdout); + fputs(delim, stdout); +} + +regex_t +mkregex(char *s, size_t n) +{ + char c = s[n]; + int ret; + regex_t r; + + s[n] = 0; + if ((ret = regcomp(&r, s, REG_EXTENDED | REG_NEWLINE)) != 0) { + char emsg[128]; + regerror(ret, &r, emsg, sizeof(emsg)); + diex("Failed to compile regex: %s", emsg); + } + s[n] = c; + + return r; +} + +bool +xisspace(char c) +{ + return c == ' ' || c == '\t' || c == '\n'; +} + +char * +xstrchrnul(const char *s, char c) +{ + for (; *s; s++) { + if (*s == '\\') + s++; + else if (*s == c) + break; + } + return (char *)s; +} |