aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThomas Voss <mail@thomasvoss.com> 2024-01-23 01:57:39 +0100
committerThomas Voss <mail@thomasvoss.com> 2024-01-23 01:57:39 +0100
commit1256660e1f0cea877b6d453704343f07d73d6224 (patch)
treefff760729e37626fc3749e587a29be5e94184db2
parent52cde82ece54db9796e43f9ee7497d50414a216f (diff)
Properly support UTF-8 in patterns
-rw-r--r--make.c5
-rw-r--r--man/grab.117
-rw-r--r--src/grab.c66
3 files changed, 53 insertions, 35 deletions
diff --git a/make.c b/make.c
index 6a6dbb3..834b426 100644
--- a/make.c
+++ b/make.c
@@ -17,8 +17,9 @@
#include "cbs.h"
#include "src/compat.h"
-#define CC "cc"
-#define CFLAGS "-Wall", "-Wextra", "-Wpedantic", "-Werror", "-pipe"
+#define CC "cc"
+#define CFLAGS \
+ "-Wall", "-Wextra", "-Wpedantic", "-Werror", "-Wno-pointer-sign", "-pipe"
#define CFLAGS_DEBUG "-DGRAB_DEBUG", "-g", "-ggdb3"
#ifdef __APPLE__
# define CFLAGS_RELEASE "-O3"
diff --git a/man/grab.1 b/man/grab.1
index cdcacfa..5e36c53 100644
--- a/man/grab.1
+++ b/man/grab.1
@@ -1,4 +1,4 @@
-.Dd 22 January, 2024
+.Dd 23 January, 2024
.Dt GRAB 1
.Os Grab 2.0.1
.Sh NAME
@@ -206,9 +206,17 @@ and
.Sq G/^1337$/
filters out the specific number 1337.
.Pp
-As you may use whichever delimiter you like, the following is also valid:
+The delimiter used for each given operator can be any valid Unicode
+codepoint encoded as UTF-8.
+As a result,
+the following pattern using the delimiters
+.Sq | ,
+.Sq \&. ,
+and
+.Sq ä
+is well-formed:
.Pp
-.Dl x|[0\-9]+| g.3. G#^1337#
+.Dl x|[0\-9]+| g.3. Gä^1337ä
.Pp
Operators are not allowed to take empty regular expression arguments with
one exception:
@@ -337,6 +345,7 @@ the newline will
be matched by
.Ql [^a] .
.Sh BUGS
-Input files must be encoded as UTF-8.
+The pattern string provided as a command-line argument as well as the
+provided input files must be encoded as UTF-8.
No other encodings are supported unless they are UTF-8 compatible,
such as ASCII.
diff --git a/src/grab.c b/src/grab.c
index 8117648..4031540 100644
--- a/src/grab.c
+++ b/src/grab.c
@@ -24,6 +24,8 @@
#endif
#include <gbrk.h>
+#include <rune.h>
+#include <utf8.h>
#include "compat.h"
#include "da.h"
@@ -84,12 +86,11 @@ static put_func putm, putm_nc;
static FILE *getfstream(int n, char *v[n]);
#endif
static void grab(struct ops, FILE *, const char *);
-static struct ops comppat(char *);
-static regex_t mkregex(char *, size_t);
+static struct ops comppat(char8_t *);
+static regex_t mkregex(char8_t *, size_t);
static bool islbrk(struct u8view);
static bool sgrvalid(const char *);
static bool xisspace(char);
-static char *xstrchrnul(const char *, char);
static int svposcmp(const void *, const void *);
static char *env_or_default(const char *, const char *);
@@ -260,7 +261,7 @@ main(int argc, char **argv)
}
struct ops
-comppat(char *s)
+comppat(char8_t *s)
{
struct ops ops;
@@ -271,18 +272,35 @@ comppat(char *s)
diex(EEARLY);
do {
- char delim;
- char *p;
+ int w;
+ rune ch;
+ size_t len;
+ char8_t *p;
struct op op;
- op.c = *s;
- if (!op_table[(uchar)op.c])
- diex("Invalid operator ‘%c’", *s);
- if (!(delim = *++s))
+ /* Grab the operator and delimiter. All operators are ASCII, but
+ u8tor() is used to parse the operator so that we get well-formed
+ error messages when someone uses a non-ASCII operator. */
+ w = u8tor(&ch, s);
+ if (ch == RUNE_ERROR)
+ diex("Invalid UTF-8 sequence near ‘%02hhX’", s[-1]);
+ if (w > 1 || !op_table[ch])
+ diex("Invalid operator ‘%.*s’", w, s);
+ op.c = *s++;
+
+ s += u8tor(&ch, s);
+ if (ch == RUNE_ERROR)
+ diex("Invalid UTF-8 sequence near ‘%02hhX’", s[-1]);
+ if (ch == '\0')
diex(EEARLY);
- p = ++s;
- s = xstrchrnul(s, delim);
+ /* Find the closing delimiter. The user is allowed to omit the closing
+ delimiter if this is the last operation in the query pattern. */
+ p = s;
+ len = strlen(s);
+ if (!(s = (char8_t *)u8chr(s, ch, len)))
+ s = p + len;
+
if (s - p == 0) {
if (op.c != 'h')
diex("Empty regex given to ‘%c’", op.c);
@@ -298,11 +316,13 @@ comppat(char *s)
}
dapush(&ops, op);
- if (*s)
- s++;
+ if (*s) {
+ if (s += u8tor(&ch, s), ch == RUNE_ERROR)
+ diex("Invalid UTF-8 sequence near ‘%02hhX’", s[-1]);
+ }
while (*s && xisspace(*s))
s++;
- } while (*s && *(s + 1));
+ } while (*s);
return ops;
}
@@ -747,11 +767,11 @@ sgrvalid(const char *s)
}
regex_t
-mkregex(char *s, size_t n)
+mkregex(char8_t *s, size_t n)
{
- char c = s[n];
int ret, cflags;
regex_t r;
+ char8_t c = s[n];
s[n] = 0;
cflags = REG_EXTENDED | REG_UTF | (nflag ? REG_NEWLINE : REG_DOTALL);
@@ -823,15 +843,3 @@ xisspace(char c)
{
return c == ' ' || c == '\t' || c == '\n';
}
-
-char *
-xstrchrnul(const char *s, char c)
-{
- for (; *s; s++) {
- if (*s == '\\')
- s++;
- else if (*s == c)
- break;
- }
- return (char *)s;
-}