From 1256660e1f0cea877b6d453704343f07d73d6224 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Tue, 23 Jan 2024 01:57:39 +0100 Subject: Properly support UTF-8 in patterns --- make.c | 5 +++-- man/grab.1 | 17 ++++++++++++---- src/grab.c | 66 +++++++++++++++++++++++++++++++++++--------------------------- 3 files changed, 53 insertions(+), 35 deletions(-) diff --git a/make.c b/make.c index 6a6dbb3..834b426 100644 --- a/make.c +++ b/make.c @@ -17,8 +17,9 @@ #include "cbs.h" #include "src/compat.h" -#define CC "cc" -#define CFLAGS "-Wall", "-Wextra", "-Wpedantic", "-Werror", "-pipe" +#define CC "cc" +#define CFLAGS \ + "-Wall", "-Wextra", "-Wpedantic", "-Werror", "-Wno-pointer-sign", "-pipe" #define CFLAGS_DEBUG "-DGRAB_DEBUG", "-g", "-ggdb3" #ifdef __APPLE__ # define CFLAGS_RELEASE "-O3" diff --git a/man/grab.1 b/man/grab.1 index cdcacfa..5e36c53 100644 --- a/man/grab.1 +++ b/man/grab.1 @@ -1,4 +1,4 @@ -.Dd 22 January, 2024 +.Dd 23 January, 2024 .Dt GRAB 1 .Os Grab 2.0.1 .Sh NAME @@ -206,9 +206,17 @@ and .Sq G/^1337$/ filters out the specific number 1337. .Pp -As you may use whichever delimiter you like, the following is also valid: +The delimiter used for each given operator can be any valid UTF-8 +codepoint. +As a result, +the following pattern using the delimiters +.Sq | , +.Sq \&. , +and +.Sq ä +is well-formed: .Pp -.Dl x|[0\-9]+| g.3. G#^1337# +.Dl x|[0\-9]+| g.3. Gä^1337ä .Pp Operators are not allowed to take empty regular expression arguments with one exception: @@ -337,6 +345,7 @@ the newline will be matched by .Ql [^a] . .Sh BUGS -Input files must be encoded as UTF-8. +The pattern string provided as a command-line argument as well as the +provided input files must be encoded as UTF-8. No other encodings are supported unless they are UTF-8 compatible, such as ASCII. diff --git a/src/grab.c b/src/grab.c index 8117648..4031540 100644 --- a/src/grab.c +++ b/src/grab.c @@ -24,6 +24,8 @@ #endif #include +#include +#include #include "compat.h" #include "da.h" @@ -84,12 +86,11 @@ static put_func putm, putm_nc; static FILE *getfstream(int n, char *v[n]); #endif static void grab(struct ops, FILE *, const char *); -static struct ops comppat(char *); -static regex_t mkregex(char *, size_t); +static struct ops comppat(char8_t *); +static regex_t mkregex(char8_t *, size_t); static bool islbrk(struct u8view); static bool sgrvalid(const char *); static bool xisspace(char); -static char *xstrchrnul(const char *, char); static int svposcmp(const void *, const void *); static char *env_or_default(const char *, const char *); @@ -260,7 +261,7 @@ main(int argc, char **argv) } struct ops -comppat(char *s) +comppat(char8_t *s) { struct ops ops; @@ -271,18 +272,35 @@ comppat(char *s) diex(EEARLY); do { - char delim; - char *p; + int w; + rune ch; + size_t len; + char8_t *p; struct op op; - op.c = *s; - if (!op_table[(uchar)op.c]) - diex("Invalid operator ‘%c’", *s); - if (!(delim = *++s)) + /* Grab the operator and delimiter. All operators are ASCII, but + u8tor() is used to parse it so that we get properly formed error + messages when someone uses a non-ASCII operator. */ + w = u8tor(&ch, s); + if (ch == RUNE_ERROR) + diex("Invalid UTF-8 sequence near ‘%02hhX’", s[-1]); + if (w > 1 || !op_table[ch]) + diex("Invalid operator ‘%.*s’", w, s); + op.c = *s++; + + s += u8tor(&ch, s); + if (ch == RUNE_ERROR) + diex("Invalid UTF-8 sequence near ‘%02hhX’", s[-1]); + if (ch == '\0') diex(EEARLY); - p = ++s; - s = xstrchrnul(s, delim); + /* Find the closing delimiter. The user is allowed to omit the closing + delimiter if this is the last operation in the query pattern. */ + p = s; + len = strlen(s); + if (!(s = (char8_t *)u8chr(s, ch, len))) + s = p + len; + if (s - p == 0) { if (op.c != 'h') diex("Empty regex given to ‘%c’", op.c); @@ -298,11 +316,13 @@ comppat(char *s) } dapush(&ops, op); - if (*s) - s++; + if (*s) { + if (s += u8tor(&ch, s), ch == RUNE_ERROR) + diex("Invalid UTF-8 sequence near ‘%02hhX’", s[-1]); + } while (*s && xisspace(*s)) s++; - } while (*s && *(s + 1)); + } while (*s); return ops; } @@ -747,11 +767,11 @@ sgrvalid(const char *s) } regex_t -mkregex(char *s, size_t n) +mkregex(char8_t *s, size_t n) { - char c = s[n]; int ret, cflags; regex_t r; + char8_t c = s[n]; s[n] = 0; cflags = REG_EXTENDED | REG_UTF | (nflag ? REG_NEWLINE : REG_DOTALL); @@ -823,15 +843,3 @@ xisspace(char c) { return c == ' ' || c == '\t' || c == '\n'; } - -char * -xstrchrnul(const char *s, char c) -{ - for (; *s; s++) { - if (*s == '\\') - s++; - else if (*s == c) - break; - } - return (char *)s; -} -- cgit v1.2.3