From 1256660e1f0cea877b6d453704343f07d73d6224 Mon Sep 17 00:00:00 2001
From: Thomas Voss <mail@thomasvoss.com>
Date: Tue, 23 Jan 2024 01:57:39 +0100
Subject: Properly support UTF-8 in patterns

---
 make.c     |  5 +++--
 man/grab.1 | 17 ++++++++++++----
 src/grab.c | 66 +++++++++++++++++++++++++++++++++++---------------------------
 3 files changed, 53 insertions(+), 35 deletions(-)

diff --git a/make.c b/make.c
index 6a6dbb3..834b426 100644
--- a/make.c
+++ b/make.c
@@ -17,8 +17,9 @@
 #include "cbs.h"
 #include "src/compat.h"
 
-#define CC           "cc"
-#define CFLAGS       "-Wall", "-Wextra", "-Wpedantic", "-Werror", "-pipe"
+#define CC "cc"
+#define CFLAGS \
+	"-Wall", "-Wextra", "-Wpedantic", "-Werror", "-Wno-pointer-sign", "-pipe"
 #define CFLAGS_DEBUG "-DGRAB_DEBUG", "-g", "-ggdb3"
 #ifdef __APPLE__
 #	define CFLAGS_RELEASE "-O3"
diff --git a/man/grab.1 b/man/grab.1
index cdcacfa..5e36c53 100644
--- a/man/grab.1
+++ b/man/grab.1
@@ -1,4 +1,4 @@
-.Dd 22 January, 2024
+.Dd 23 January, 2024
 .Dt GRAB 1
 .Os Grab 2.0.1
 .Sh NAME
@@ -206,9 +206,17 @@ and
 .Sq G/^1337$/
 filters out the specific number 1337.
 .Pp
-As you may use whichever delimiter you like, the following is also valid:
+The delimiter used for each given operator can be any Unicode codepoint
+encoded as valid UTF-8.
+As a result,
+the following pattern using the delimiters
+.Sq | ,
+.Sq \&. ,
+and
+.Sq ä
+is well-formed:
 .Pp
-.Dl x|[0\-9]+| g.3. G#^1337#
+.Dl x|[0\-9]+| g.3. Gä^1337ä
 .Pp
 Operators are not allowed to take empty regular expression arguments with
 one exception:
@@ -337,6 +345,7 @@ the newline will
 be matched by
 .Ql [^a] .
 .Sh BUGS
-Input files must be encoded as UTF-8.
+The pattern string provided as a command-line argument as well as the
+provided input files must be encoded as UTF-8.
 No other encodings are supported unless they are UTF-8 compatible,
 such as ASCII.
diff --git a/src/grab.c b/src/grab.c
index 8117648..4031540 100644
--- a/src/grab.c
+++ b/src/grab.c
@@ -24,6 +24,8 @@
 #endif
 
 #include <gbrk.h>
+#include <rune.h>
+#include <utf8.h>
 
 #include "compat.h"
 #include "da.h"
@@ -84,12 +86,11 @@ static put_func putm, putm_nc;
 static FILE *getfstream(int n, char *v[n]);
 #endif
 static void grab(struct ops, FILE *, const char *);
-static struct ops comppat(char *);
-static regex_t mkregex(char *, size_t);
+static struct ops comppat(char8_t *);
+static regex_t mkregex(char8_t *, size_t);
 static bool islbrk(struct u8view);
 static bool sgrvalid(const char *);
 static bool xisspace(char);
-static char *xstrchrnul(const char *, char);
 static int svposcmp(const void *, const void *);
 static char *env_or_default(const char *, const char *);
 
@@ -260,7 +261,7 @@ main(int argc, char **argv)
 }
 
 struct ops
-comppat(char *s)
+comppat(char8_t *s)
 {
 	struct ops ops;
 
@@ -271,18 +272,35 @@ comppat(char *s)
 		diex(EEARLY);
 
 	do {
-		char delim;
-		char *p;
+		int w;
+		rune ch;
+		size_t len;
+		char8_t *p;
 		struct op op;
 
-		op.c = *s;
-		if (!op_table[(uchar)op.c])
-			diex("Invalid operator ‘%c’", *s);
-		if (!(delim = *++s))
+		/* Grab the operator.  All operators are ASCII, but u8tor() is
+		   used so that a non-ASCII operator yields a well-formed error.
+		   ‘s’ still points at the operator here, so report ‘*s’. */
+		w = u8tor(&ch, s);
+		if (ch == RUNE_ERROR)
+			diex("Invalid UTF-8 sequence near ‘%02hhX’", *s);
+		if (w > 1 || !op_table[ch])
+			diex("Invalid operator ‘%.*s’", w, s);
+		op.c = *s++;
+
+		s += u8tor(&ch, s);
+		if (ch == RUNE_ERROR)
+			diex("Invalid UTF-8 sequence near ‘%02hhX’", s[-1]);
+		if (ch == '\0')
 			diex(EEARLY);
 
-		p = ++s;
-		s = xstrchrnul(s, delim);
+		/* Find the closing delimiter.  The user is allowed to omit the closing
+		   delimiter if this is the last operation in the query pattern. */
+		p = s;
+		len = strlen(s);
+		if (!(s = (char8_t *)u8chr(s, ch, len)))
+			s = p + len;
+
 		if (s - p == 0) {
 			if (op.c != 'h')
 				diex("Empty regex given to ‘%c’", op.c);
@@ -298,11 +316,13 @@ comppat(char *s)
 		}
 		dapush(&ops, op);
 
-		if (*s)
-			s++;
+		if (*s) {
+			if (s += u8tor(&ch, s), ch == RUNE_ERROR)
+				diex("Invalid UTF-8 sequence near ‘%02hhX’", s[-1]);
+		}
 		while (*s && xisspace(*s))
 			s++;
-	} while (*s && *(s + 1));
+	} while (*s);
 
 	return ops;
 }
@@ -747,11 +767,11 @@ sgrvalid(const char *s)
 }
 
 regex_t
-mkregex(char *s, size_t n)
+mkregex(char8_t *s, size_t n)
 {
-	char c = s[n];
 	int ret, cflags;
 	regex_t r;
+	char8_t c = s[n];
 
 	s[n] = 0;
 	cflags = REG_EXTENDED | REG_UTF | (nflag ? REG_NEWLINE : REG_DOTALL);
@@ -823,15 +843,3 @@ xisspace(char c)
 {
 	return c == ' ' || c == '\t' || c == '\n';
 }
-
-char *
-xstrchrnul(const char *s, char c)
-{
-	for (; *s; s++) {
-		if (*s == '\\')
-			s++;
-		else if (*s == c)
-			break;
-	}
-	return (char *)s;
-}
-- 
cgit v1.2.3