From ac56f0167d0e26c35adc4639015e0cbbeecf6262 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Thu, 19 Mar 2026 16:37:58 +0100 Subject: Fix pattern matching on anchors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous code would find all matches by searching for a match, chopping it off of the string view, and then matching again in a loop. This caused bugs with anchors such as ^, because x/^a/ would match *every* instance of ‘a’ instead of just ‘a’s at the start of the matching context. This PR switches the matching engine to use the offset parameter of pcre2_jit_match() to correctly handle anchors. --- src/work.c | 68 +++++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/src/work.c b/src/work.c index 7013726..3a138fb 100644 --- a/src/work.c +++ b/src/work.c @@ -228,10 +228,9 @@ DEFINE_OPERATOR(h) pcre2_match_data *md = pcre2_match_data_create_from_pattern(ops[opi].re, nullptr); - u8view_t sv_save = sv; ptrdiff_t origlen = array_len(*hl); - for (;;) { - int n = pcre2_jit_match(ops[opi].re, sv.p, sv.len, 0, + for (ptrdiff_t off = 0;;) { + int n = pcre2_jit_match(ops[opi].re, sv.p, sv.len, off, PCRE2_NOTEMPTY, md, nullptr); if (n == PCRE2_ERROR_NOMATCH) break; @@ -240,10 +239,10 @@ DEFINE_OPERATOR(h) size_t *ov = pcre2_get_ovector_pointer(md); array_push(hl, ((u8view_t){sv.p + ov[0], ov[1] - ov[0]})); - VSHFT(&sv, ov[1]); + off = ov[1]; } pcre2_match_data_free(md); - operator_dispatch(opi + 1, sv_save, hl); + operator_dispatch(opi + 1, sv, hl); array_hdr(*hl)->len = origlen; } @@ -256,22 +255,29 @@ DEFINE_OPERATOR(H) pcre2_match_data *md = pcre2_match_data_create_from_pattern(ops[opi].re, nullptr); - u8view_t sv_save = sv; - ptrdiff_t origlen = array_len(*hl); - for (;;) { - int n = pcre2_jit_match(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY, - md, nullptr); + ptrdiff_t prvend = 0, origlen = array_len(*hl); + + for (ptrdiff_t off = 0;;) { + int n = pcre2_jit_match(ops[opi].re, sv.p, sv.len, off, 0, md, nullptr); if (n == PCRE2_ERROR_NOMATCH) break; if (n < 0) pcre2_bitch_and_die(n, "failed to match regex"); size_t *ov = pcre2_get_ovector_pointer(md); - array_push(hl, ((u8view_t){sv.p, ov[0]})); - VSHFT(&sv, ov[1]); + if (ov[0] - prvend != 0) + array_push(hl, ((u8view_t){sv.p + prvend, ov[0] - prvend})); + + prvend = off = ov[1]; + if (ov[0] == ov[1]) + off++; } + + if (prvend < sv.len) + array_push(hl, ((u8view_t){sv.p + prvend, sv.len - prvend})); + pcre2_match_data_free(md); - operator_dispatch(opi + 1, sv_save, hl); + operator_dispatch(opi + 1, sv, hl); array_hdr(*hl)->len = origlen; } @@ -279,9 +285,9 @@ DEFINE_OPERATOR(x) { pcre2_match_data *md = pcre2_match_data_create_from_pattern(ops[opi].re, nullptr); - for (;;) { - int n = pcre2_jit_match(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY, - md, nullptr); + + for (ptrdiff_t off = 0;;) { + int n = pcre2_jit_match(ops[opi].re, sv.p, sv.len, off, 0, md, nullptr); if (n == PCRE2_ERROR_NOMATCH) break; if (n < 0) @@ -289,7 +295,9 @@ DEFINE_OPERATOR(x) size_t *ov = pcre2_get_ovector_pointer(md); operator_dispatch(opi + 1, (u8view_t){sv.p + ov[0], ov[1] - ov[0]}, hl); - VSHFT(&sv, ov[1]); + off = ov[1]; + if (ov[0] == ov[1]) + off++; } pcre2_match_data_free(md); } @@ -298,21 +306,31 @@ DEFINE_OPERATOR(X) { pcre2_match_data *md = pcre2_match_data_create_from_pattern(ops[opi].re, nullptr); - for (;;) { - int n = pcre2_jit_match(ops[opi].re, sv.p, sv.len, 0, PCRE2_NOTEMPTY, - md, nullptr); + + ptrdiff_t prvend = 0; + + for (ptrdiff_t off = 0;;) { + int n = pcre2_jit_match(ops[opi].re, sv.p, sv.len, off, 0, md, nullptr); if (n == PCRE2_ERROR_NOMATCH) break; if (n < 0) pcre2_bitch_and_die(n, "failed to match regex"); size_t *ov = pcre2_get_ovector_pointer(md); - if (ov[0] != 0) - operator_dispatch(opi + 1, (u8view_t){sv.p, ov[0]}, hl); - VSHFT(&sv, ov[1]); + if (ov[0] > (size_t)prvend) { + u8view_t sub = {sv.p + prvend, ov[0] - prvend}; + operator_dispatch(opi + 1, sub, hl); + } + + prvend = off = ov[1]; + if (ov[0] == ov[1]) + off++; + } + + if (prvend < sv.len) { + u8view_t sub = {sv.p + prvend, sv.len - prvend}; + operator_dispatch(opi + 1, sub, hl); } - if (sv.len != 0) - operator_dispatch(opi + 1, sv, hl); pcre2_match_data_free(md); } -- cgit v1.2.3