aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore2
-rw-r--r--LICENSE14
-rw-r--r--Makefile15
-rw-r--r--README.md55
-rw-r--r--da.h64
-rw-r--r--grab.125
-rw-r--r--grab.c274
7 files changed, 449 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..aa21f6a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.clang-format
+grab
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..276994d
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,14 @@
+BSD Zero Clause License
+
+Copyright © 2023 Thomas Voss
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
+REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
+INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..344fb30
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,15 @@
+# Build rules for grab.  No recipe is given for the ‘grab’ target itself; it
+# is left to make's built-in rule for building a program from a ‘.c’ file.
+.POSIX:
+
+WARNINGS = -Wall -Wextra -Werror -Wpedantic
+
+CC = cc
+CFLAGS = $(WARNINGS) -pipe -O3 -march=native -mtune=native
+
+all: grab
+grab: grab.c
+
+# Build with debug information and GRAB_DEBUG defined, which enables the
+# regfree()/free() cleanup at the end of main().  CFLAGS is not used here.
+debug:
+ $(CC) $(WARNINGS) -DGRAB_DEBUG -g -ggdb3 -o grab grab.c
+
+clean:
+ rm -f grab
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..14e5546
--- /dev/null
+++ b/README.md
@@ -0,0 +1,55 @@
+# Grab — A better grep
+
+Grab is a more powerful version of the well-known Grep utility, making
+use of structural regular expressions as described by Rob Pike in [this
+paper][1]. Grab allows you to be far more precise with your searching
+than Grep, as it doesn’t constrain itself to working only on individual
+lines.
+
+Grab invocations must include a pattern string which specifies which text
+to match. A pattern string consists of one or more commands. A command
+is an operator followed by a delimiter, a regular expression (regex), and
+then terminated by the same delimiter. The last delimiter of the last
+command is optional.
+
+For example, a pattern string may look like ‘`x/[a-z]+/ g.foo. v/bar/`’.
+
+The available operators are ‘g’, ‘v’, and ‘x’. The ‘x’ operator iterates
+over all matches of the corresponding regex. This means that to print
+all numbers in a file, you can use the pattern string ‘`x/[0-9]+/`’. The
+‘g’ and ‘v’ operators are filters. The ‘g’ operator discards all results
+that don’t match the given regex, while the ‘v’ operator discards all
+results that *do* match the given regex. This means that to select all
+numbers in a file that contain a ‘3’ but are not ‘1337’, you can use the
+pattern string ‘`x/[0-9]+/ g/3/ v/^1337$/`’.
+
+
+## Examples
+
+Get a list of your CPU flags.
+
+```sh
+# With Grep
+grep '^flags' /proc/cpuinfo \
+| sed 's/flags:\t*: //; y/ /\n/' \
+| sort \
+| uniq
+
+# With Grab
+grab 'x/^flags.*/ x/\w+/ v/flags/' /proc/cpuinfo \
+| sort \
+| uniq
+```
+
+1) Select lines that start with ‘flags’: `x/^flags.*/`
+2) Select all the words: `x/\w+/`
+3) Filter out the word ‘flags’: `v/flags/`
+
+
+## Additional Options
+
+The Grab utility has a few options that may be helpful for your use case.
+For more detailed documentation, see the Grab manual with `man grab`.
+
+
+[1]: https://doc.cat-v.org/bell_labs/structural_regexps/se.pdf
diff --git a/da.h b/da.h
new file mode 100644
index 0000000..1587ece
--- /dev/null
+++ b/da.h
@@ -0,0 +1,64 @@
+/*
+ * Simple & stupid dynamic array single-header implementation. You can use the
+ * macros defined in this file with any structure that has the following fields:
+ *
+ * struct dyn_array {
+ * T *buf // Array of items
+ * N len // Length of array
+ * N cap // Capacity of array
+ * }
+ *
+ * The type ‘T’ is whatever type you want to store. The type ‘N’ is any numeric
+ * type, most likely ‘size_t’.
+ *
+ * You should include ‘err.h’ and ‘stdlib.h’ along with this file. If you want
+ * to use da_remove(), include ‘string.h’. The da_remove() macro also doesn’t
+ * bother with shrinking your array when the length is far lower than the
+ * capacity. If you care about that, do it yourself.
+ *
+ *
+ * Macro Overview
+ * ――――――――――――――
+ * The argument ‘a’ to all of the below macros is a pointer to the dynamic array
+ * structure.
+ *
+ * da_init(a, n) Initialize the array with a capacity of ‘n’ items.
+ * da_append(a, x) Append the item ‘x’ to the array
+ * da_remove(a, i) Remove the item at index ‘i’ from the array
+ * da_remove_range(a, i, j) Remove the items at the indices in the range [i, j)
+ */
+
+#ifndef MANGO_DA_H
+#define MANGO_DA_H
+
+/*
+ * Size in bytes of one element of the array ‘a’.  Internal helper for the
+ * macros below.  NOTE(review): identifiers containing ‘__’ are reserved for
+ * the implementation; consider renaming together with all of its users.
+ */
+#define __da_s(a) (sizeof(*(a)->buf))
+
+/*
+ * Initialize ‘a’ as an empty array with room for ‘n’ items.  Terminates the
+ * program via err() if the allocation fails.  ‘a’ is evaluated several times
+ * and must therefore be free of side effects; ‘n’ is parenthesized so that
+ * any expression can be passed as the capacity.
+ */
+#define da_init(a, n) \
+	do { \
+		(a)->cap = (n); \
+		(a)->len = 0; \
+		(a)->buf = malloc((a)->cap * __da_s(a)); \
+		if ((a)->buf == NULL) \
+			err(EXIT_FAILURE, "malloc"); \
+	} while (0)
+
+/*
+ * Append the item ‘x’ to the array ‘a’.  When the array is full its capacity
+ * is first grown to twice the old capacity plus one; a failed realloc()
+ * terminates the program via err().  ‘a’ is evaluated several times and must
+ * therefore be free of side effects.
+ */
+#define da_append(a, x) \
+ do { \
+ if ((a)->len >= (a)->cap) { \
+ (a)->cap = (a)->cap * 2 + 1; \
+ (a)->buf = realloc((a)->buf, (a)->cap * __da_s(a)); \
+ if ((a)->buf == NULL) \
+ err(EXIT_FAILURE, "realloc"); \
+ } \
+ (a)->buf[(a)->len++] = (x); \
+ } while (0)
+
+/* Remove the single item at index ‘i’ from the array ‘a’. */
+#define da_remove(a, i) da_remove_range((a), (i), (i) + 1)
+
+/*
+ * Remove the items at the indices [i, j) from the array ‘a’ by moving the
+ * tail of the array down over them.  The capacity is left untouched.  All
+ * arguments are evaluated more than once and must be free of side effects;
+ * they are parenthesized so that expressions such as ‘n + 1’ compute the
+ * removed count correctly.
+ */
+#define da_remove_range(a, i, j) \
+	do { \
+		memmove((a)->buf + (i), (a)->buf + (j), \
+		        ((a)->len - (j)) * __da_s(a)); \
+		(a)->len -= (j) - (i); \
+	} while (0)
+
+#endif /* !MANGO_DA_H */
diff --git a/grab.1 b/grab.1
new file mode 100644
index 0000000..8349fc4
--- /dev/null
+++ b/grab.1
@@ -0,0 +1,25 @@
+.Dd December 9, 2023
+.Dt GRAB 1
+.Os
+.Sh NAME
+.Nm grab
+.Nd search for patterns in files
+.Sh SYNOPSIS
+.Nm
+.Op Fl d Ar string
+.Ar pattern
+.Op Ar
+.Sh DESCRIPTION
+.Sh EXIT STATUS
+.Ex -std
+.Sh EXAMPLES
+.Sh SEE ALSO
+.Xr grep 1
+.Rs
+.%A Rob Pike
+.%D 1987
+.%T Structural Regular Expressions
+.%U https://doc.cat-v.org/bell_labs/structural_regexps/se.pdf
+.Re
+.Sh AUTHORS
+.An Thomas Voss Aq Mt mail@thomasvoss.com
diff --git a/grab.c b/grab.c
new file mode 100644
index 0000000..9fdfbd0
--- /dev/null
+++ b/grab.c
@@ -0,0 +1,274 @@
+#include <assert.h>
+#include <err.h>
+#include <limits.h>
+#include <locale.h>
+#include <regex.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "da.h"
+
+#ifndef REG_STARTEND
+# error "REG_STARTEND not defined"
+#endif
+
+/*
+ * Print a formatted error message and exit with EXIT_FAILURE.  die() goes
+ * through err(3) and diex() through errx(3).  The expansions deliberately
+ * carry no trailing semicolon, so that ‘if (x) die(...); else ...’ parses as
+ * a single if/else statement.
+ */
+#define die(...) err(EXIT_FAILURE, __VA_ARGS__)
+#define diex(...) errx(EXIT_FAILURE, __VA_ARGS__)
+
+#define EEARLY "Input string terminated prematurely"
+
+/*
+ * One command of a pattern string: the operator character (used as an index
+ * into op_table) together with the regex compiled for it by mkregex().
+ */
+struct op {
+ char c;
+ regex_t pat;
+};
+
+/*
+ * Dynamic array of commands in the buf/len/cap layout that the da.h macros
+ * operate on; filled by comppat().
+ */
+struct ops {
+ struct op *buf;
+ size_t len, cap;
+};
+
+/*
+ * Growable byte buffer: ‘len’ bytes of ‘buf’ are in use out of ‘cap’
+ * allocated.  Used by grab() to hold the contents of an input stream.
+ */
+struct chars {
+ char *buf;
+ size_t len, cap;
+};
+
+/*
+ * String view: ‘len’ bytes starting at ‘p’.  The length is always passed
+ * along explicitly (see putsv() and the REG_STARTEND regexec() calls), so the
+ * bytes do not need to be NUL-terminated.
+ */
+struct sv {
+ char *p;
+ size_t len;
+};
+
+/* Shorthand used when casting operator characters to table indices */
+typedef unsigned char uchar;
+/*
+ * Signature of the operator implementations stored in op_table: the current
+ * selection, the compiled commands, and the index of the command to execute.
+ */
+typedef void (*cmd_func)(struct sv, struct ops, size_t);
+
+static void cmdg(struct sv, struct ops, size_t);
+static void cmdx(struct sv, struct ops, size_t);
+
+static void grab(struct ops, FILE *, const char *);
+static void putsv(struct sv);
+static regex_t mkregex(char *, size_t);
+static struct ops comppat(char *);
+
+static bool xisspace(char);
+static char *xstrchrnul(const char *, char);
+
+static int rv = EXIT_SUCCESS;
+static const char *delim = "\n";
+/*
+ * Dispatch table mapping an operator character (cast to uchar) to the
+ * function implementing it.  Entries that are not listed are NULL, which
+ * comppat() uses to reject invalid operators.  The table has UCHAR_MAX + 1
+ * slots so that every uchar value, UCHAR_MAX included, is a valid index.
+ */
+static const cmd_func op_table[UCHAR_MAX + 1] = {
+	['g'] = cmdg,
+	['v'] = cmdg,
+	['x'] = cmdx,
+	// ['y'] = cmdy,
+};
+
+/*
+ * Print the usage line to the standard error, with ‘s’ as the program name,
+ * and exit with EXIT_FAILURE.  Does not return.
+ */
+static void
+usage(const char *s)
+{
+ fprintf(stderr, "Usage: %s [-d string] pattern [file ...]\n", s);
+ exit(EXIT_FAILURE);
+}
+
+/*
+ * Entry point.  Parses the ‘-d string’ option (the delimiter printed after
+ * each result), compiles the pattern argument, and runs it over every named
+ * file.  The standard input is used when no files are given or when a file
+ * is named ‘-’.
+ *
+ * Returns the file-scope ‘rv’, which starts as EXIT_SUCCESS and is set to
+ * EXIT_FAILURE here when a file cannot be opened, and by grab() when a stream
+ * cannot be read.
+ */
+int
+main(int argc, char **argv)
+{
+	/*
+	 * Deliberately no local ‘rv’: it would shadow the file-scope status
+	 * that grab() updates, and would be returned uninitialized whenever
+	 * every fopen() succeeds.
+	 */
+	int opt;
+	const char *argv0 = argv[0];
+	struct ops ops;
+
+	if (argc < 2)
+		usage(argv0);
+
+	setlocale(LC_ALL, "");
+
+	while ((opt = getopt(argc, argv, "d:")) != -1) {
+		switch (opt) {
+		case 'd':
+			delim = optarg;
+			break;
+		default:
+			usage(argv0);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* Options alone (e.g. ‘grab -d x’) leave no pattern to compile */
+	if (argc < 1)
+		usage(argv0);
+
+	ops = comppat(argv[0]);
+	if (argc == 1)
+		grab(ops, stdin, "-");
+	else {
+		for (int i = 1; i < argc; i++) {
+			FILE *fp;
+
+			if (strcmp(argv[i], "-") == 0) {
+				grab(ops, stdin, "-");
+			} else if ((fp = fopen(argv[i], "r")) == NULL) {
+				/* Keep going with the remaining files */
+				warn("fopen: %s", argv[i]);
+				rv = EXIT_FAILURE;
+			} else {
+				grab(ops, fp, argv[i]);
+				fclose(fp);
+			}
+		}
+	}
+
+	/* Only debug builds release the compiled commands before exiting */
+#ifdef GRAB_DEBUG
+	for (size_t i = 0; i < ops.len; i++)
+		regfree(&ops.buf[i].pat);
+	free(ops.buf);
+#endif
+
+	return rv;
+}
+
+/*
+ * Compile the pattern string ‘s’ into an array of commands.  Each command is
+ * an operator character that has an entry in op_table, followed by a
+ * delimiter character, a regex, and the same delimiter again; commands may be
+ * separated by whitespace as defined by xisspace().  Exits via diex() on an
+ * empty pattern, an unknown operator, or an operator without a delimiter.
+ *
+ * The regexes are compiled in place through mkregex(), which temporarily
+ * writes to ‘s’.  The returned array is allocated with da_init()/da_append().
+ */
+struct ops
+comppat(char *s)
+{
+#define skip_ws(p) for (; *(p) && xisspace(*(p)); (p)++)
+ struct ops ops;
+
+ da_init(&ops, 8);
+ skip_ws(s);
+ if (!*s)
+ diex(EEARLY);
+
+ do {
+ /* Regex delimiter of this command; shadows the file-scope ‘delim’ */
+ char delim;
+ char *p;
+ struct op op;
+
+ op.c = *s;
+ if (!op_table[(uchar)op.c])
+ diex("Invalid operator ‘%c’", *s);
+ if (!(delim = *++s))
+ diex(EEARLY);
+
+ /* The regex spans from ‘p’ up to the closing delimiter or the NUL */
+ p = ++s;
+ s = xstrchrnul(s, delim);
+ op.pat = mkregex(p, s - p);
+ da_append(&ops, op);
+
+ /* Step over the closing delimiter, if there was one */
+ if (*s)
+ s++;
+ skip_ws(s);
+ /* Another command needs at least an operator and a delimiter, so a
+  * single leftover character ends the loop and is ignored */
+ } while (*s && *(s + 1));
+
+ return ops;
+#undef skip_ws
+}
+
+/*
+ * Read all of ‘stream’ into memory and run the commands in ‘ops’ over the
+ * contents.  ‘filename’ is only used in the warning printed when reading
+ * fails, in which case the file-scope ‘rv’ is set to EXIT_FAILURE and nothing
+ * is matched.  The stream is not closed here.
+ */
+void
+grab(struct ops ops, FILE *stream, const char *filename)
+{
+ size_t n;
+ struct chars chars = {0};
+
+ /* Grow the buffer by BUFSIZ bytes per iteration; a short read ends the
+  * loop (end of file or error, told apart by ferror() below) */
+ do {
+ static_assert(sizeof(char) == 1, "sizeof(char) != 1; wtf?");
+ chars.cap += BUFSIZ;
+ if ((chars.buf = realloc(chars.buf, chars.cap)) == NULL)
+ die("realloc");
+ chars.len += n = fread(chars.buf + chars.len, 1, BUFSIZ, stream);
+ } while (n == BUFSIZ);
+
+ if (ferror(stream)) {
+ warn("fread: %s", filename);
+ rv = EXIT_FAILURE;
+ } else
+ /* NOTE(review): the first command always runs through cmdx(), whatever
+  * its operator character is; confirm that this is intended */
+ cmdx((struct sv){.p = chars.buf, .len = chars.len}, ops, 0);
+
+ free(chars.buf);
+}
+
+/*
+ * Run command ‘i’ of ‘ops’ over the selection ‘sv’: iterate over the
+ * successive matches of the command's regex within ‘sv’.  Each match is
+ * printed with putsv() when this is the last command, and otherwise handed
+ * to the function that op_table holds for the next command.
+ */
+void
+cmdx(struct sv sv, struct ops ops, size_t i)
+{
+ /* With REG_STARTEND, ‘pm’ is also an input: the byte range to search */
+ regmatch_t pm = {
+ .rm_so = 0,
+ .rm_eo = sv.len,
+ };
+ struct op op = ops.buf[i];
+
+ do {
+ struct sv nsv;
+
+ /* NOTE(review): only REG_NOMATCH ends the loop; any other non-zero
+  * return value of regexec() is handled like a match */
+ if (regexec(&op.pat, sv.p, 1, &pm, REG_STARTEND) == REG_NOMATCH)
+ break;
+ nsv = (struct sv){.p = sv.p + pm.rm_so, .len = pm.rm_eo - pm.rm_so};
+ if (i + 1 == ops.len)
+ putsv(nsv);
+ else
+ op_table[(uchar)ops.buf[i + 1].c](nsv, ops, i + 1);
+
+ /* After an empty match, resume one byte later so that the search
+  * always moves forward */
+ if (pm.rm_so == pm.rm_eo)
+ pm.rm_eo++;
+ /* Continue after the end of this match, up to the end of ‘sv’ */
+ pm = (regmatch_t){
+ .rm_so = pm.rm_eo,
+ .rm_eo = sv.len,
+ };
+ } while (pm.rm_so < pm.rm_eo);
+}
+
+/*
+ * Filter shared by the ‘g’ and ‘v’ operators.  Command ‘i’ of ‘ops’ is tested
+ * against the selection ‘sv’: ‘g’ keeps the selection only when the regex
+ * matches, ‘v’ keeps it only when the regex does not match.  A kept selection
+ * is printed with putsv() when this is the last command, and otherwise passed
+ * on unchanged to the function that op_table holds for the next command.
+ */
+void
+cmdg(struct sv sv, struct ops ops, size_t i)
+{
+	struct op cur = ops.buf[i];
+	size_t next = i + 1;
+	/* With REG_STARTEND this is the byte range that gets searched */
+	regmatch_t span = {.rm_so = 0, .rm_eo = sv.len};
+	bool nomatch;
+
+	nomatch = regexec(&cur.pat, sv.p, 1, &span, REG_STARTEND) == REG_NOMATCH;
+
+	if (cur.c == 'g' && nomatch)
+		return;
+	if (cur.c == 'v' && !nomatch)
+		return;
+
+	if (next == ops.len) {
+		putsv(sv);
+		return;
+	}
+	op_table[(uchar)ops.buf[next].c](sv, ops, next);
+}
+
+/*
+ * Write the bytes of the selection ‘sv’ to the standard output, followed by
+ * the file-scope delimiter string ‘delim’.
+ */
+void
+putsv(struct sv sv)
+{
+	fwrite(sv.p, sizeof(*sv.p), sv.len, stdout);
+	fputs(delim, stdout);
+}
+
+/*
+ * Compile the first ‘n’ bytes of ‘s’ as an extended regex with REG_NEWLINE
+ * set and return the compiled regex.  The byte at s[n] is overwritten with a
+ * NUL for the duration of the regcomp() call and restored afterwards, so ‘s’
+ * must be writable and s[n] must be addressable.  Exits via diex() with the
+ * regerror() message when compilation fails.
+ */
+regex_t
+mkregex(char *s, size_t n)
+{
+	regex_t re;
+	int rc;
+	char saved = s[n];
+
+	s[n] = 0;
+	rc = regcomp(&re, s, REG_EXTENDED | REG_NEWLINE);
+	if (rc != 0) {
+		char msg[128];
+
+		regerror(rc, &re, msg, sizeof(msg));
+		diex("Failed to compile regex: %s", msg);
+	}
+	s[n] = saved;
+
+	return re;
+}
+
+/*
+ * Report whether ‘c’ is one of the whitespace characters recognized in
+ * pattern strings: space, horizontal tab, or newline.
+ */
+bool
+xisspace(char c)
+{
+	switch (c) {
+	case ' ':
+	case '\t':
+	case '\n':
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * Return a pointer to the first occurrence of ‘c’ in ‘s’ that is not preceded
+ * by a backslash, or a pointer to the terminating NUL when there is none.  A
+ * backslash makes the scan skip the character that follows it.
+ */
+char *
+xstrchrnul(const char *s, char c)
+{
+	for (; *s; s++) {
+		if (*s == '\\') {
+			/* A trailing backslash has nothing to escape: stop at the
+			 * NUL rather than stepping past the end of the string */
+			if (*++s == '\0')
+				break;
+		} else if (*s == c)
+			break;
+	}
+	return (char *)s;
+}