From c0a983a29af17415ef29058d72f1a9cd99ddd83f Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Mon, 22 Apr 2024 21:06:52 +0200 Subject: Fix various bugs in word segmentation --- lib/unicode/string/u8wnext.c | 340 +++++++++++++++++++++++++++---------------- 1 file changed, 214 insertions(+), 126 deletions(-) (limited to 'lib/unicode') diff --git a/lib/unicode/string/u8wnext.c b/lib/unicode/string/u8wnext.c index 4236cff..5e893c6 100644 --- a/lib/unicode/string/u8wnext.c +++ b/lib/unicode/string/u8wnext.c @@ -1,161 +1,249 @@ +/* The approach of this implementation is heavily inspired by libgrapheme + (written by Laslo Hunhold ), and my email-correspondance with + Laslo. */ + +#include "_bsearch.h" #include "macros.h" #include "mbstring.h" -#include "unicode/prop.h" +#include "unicode/_wbrk.h" #include "unicode/string.h" -#define IS_AHLETTER(cp) ((cp) == WB_LE || (cp) == WB_HL) -#define IS_MIDNUMLETQ(cp) ((cp) == WB_MB || (cp) == WB_SQ) +_MLIB_DEFINE_BSEARCH(enum wbrk_prop, wbrk_lookup, WBRK_XX) -#define RET(x) \ - do { \ - ws->prev_ap = ap; \ - return (x); \ - } while (false) +#define IS_MIDNUMLETQ(xp) ((xp) == WBRK_MB || (xp) == WBRK_SQ) +#define IS_AHLETTER(xp) \ + ((xp) == WBRK_LE || (xp) == WBRK_EXTPICT_LE || (xp) == WBRK_HL) +#define IS_IGNORE(xp) \ + ((xp) == WBRK_EXTEND || (xp) == WBRK_FO || (xp) == WBRK_ZWJ) struct wbrk_state { - int ri_parity; - enum uprop_wb prev_ap; + struct { + enum wbrk_prop prev[2], next[2]; + } raw, skip; + struct u8view raw_v, skip_v, mid_v; + int ri_parity : 1; }; -static bool u8iswbrk(const char8_t **, size_t *, struct wbrk_state *); +static bool advance(struct wbrk_state *); +static size_t findwbrk(struct u8view); +static struct wbrk_state mkwbrkstate(struct u8view); size_t u8wnext(struct u8view *w, const char8_t **s, size_t *n) { - ASSUME(s != nullptr); ASSUME(n != nullptr); + ASSUME(s != nullptr); + ASSUME(*s != nullptr); if (*n == 0) return 0; - const char8_t *p = *s; + size_t off = findwbrk((struct u8view){*s, *n}); if (w != nullptr) - w->p = p; - - size_t m = *n; - struct wbrk_state ws = {}; - while (!u8iswbrk(&p, &m, &ws)) - ; - - ptrdiff_t d = p - *s; - *n -= d; - *s = p; - if (w) - w->len = d; - return d; + *w = (struct u8view){*s, off}; + + ASSUME(*n >= off); + *s += off; + *n -= off; + return off; } -bool -u8iswbrk(const char8_t **s, size_t *n, struct wbrk_state *ws) +size_t +findwbrk(struct u8view sv) { - ASSUME(s != nullptr); - ASSUME(n != nullptr); - ASSUME(ws != nullptr); - - rune a, b, c; - enum uprop_wb ap, bp, cp; - a = b = c = ap = bp = cp = 0; - - u8next(&a, s, n); - - { - const char8_t *s_cpy = *s; - size_t n_cpy = *n; - u8next(&b, &s_cpy, &n_cpy); - u8next(&c, &s_cpy, &n_cpy); + ASSUME(sv.p != nullptr); + + struct wbrk_state ws = mkwbrkstate(sv); + + while (advance(&ws)) { +#define prev ws.raw.prev +#define next ws.raw.next + /* WB3 */ + if (prev[0] == WBRK_CR && next[0] == WBRK_LF) + continue; + + /* WB3a */ + if (prev[0] == WBRK_NL || prev[0] == WBRK_CR || prev[0] == WBRK_LF) + break; + + /* WB3b */ + if (next[0] == WBRK_NL || next[0] == WBRK_CR || next[0] == WBRK_LF) + break; + + /* WB3c */ + if (prev[0] == WBRK_ZWJ + && (next[0] == WBRK_EXTPICT || next[0] == WBRK_EXTPICT_LE)) + { + continue; + } + + /* WB3d */ + if (prev[0] == WBRK_WSEGSPACE && next[0] == WBRK_WSEGSPACE) + continue; + + /* WB4 */ + if (next[0] == WBRK_EXTEND || next[0] == WBRK_FO || next[0] == WBRK_ZWJ) + continue; + +#undef prev +#undef next +#define prev ws.skip.prev +#define next ws.skip.next + + /* WB5 */ + if (IS_AHLETTER(prev[0]) && IS_AHLETTER(next[0])) + continue; + + /* WB6 */ + if (IS_AHLETTER(prev[0]) + && (next[0] == WBRK_ML || IS_MIDNUMLETQ(next[0])) + && IS_AHLETTER(next[1])) + { + continue; + } + + /* WB7 */ + if (IS_AHLETTER(prev[1]) + && (prev[0] == WBRK_ML || IS_MIDNUMLETQ(prev[0])) + && IS_AHLETTER(next[0])) + { + continue; + } + + /* WB7a */ + if (prev[0] == WBRK_HL && next[0] == WBRK_SQ) + continue; + + /* WB7b */ + if (prev[0] == WBRK_HL && next[0] == WBRK_DQ && next[1] == WBRK_HL) + continue; + + /* WB7c */ + if (prev[1] == WBRK_HL && prev[0] == WBRK_DQ && next[0] == WBRK_HL) + continue; + + /* WB8 */ + if (prev[0] == WBRK_NU && next[0] == WBRK_NU) + continue; + + /* WB9 */ + if (IS_AHLETTER(prev[0]) && next[0] == WBRK_NU) + continue; + + /* WB10 */ + if (prev[0] == WBRK_NU && IS_AHLETTER(next[0])) + continue; + + /* WB11 */ + if (prev[1] == WBRK_NU && (prev[0] == WBRK_MN || IS_MIDNUMLETQ(prev[0])) + && next[0] == WBRK_NU) + { + continue; + } + + /* WB12 */ + if (prev[0] == WBRK_NU && (next[0] == WBRK_MN || IS_MIDNUMLETQ(next[0])) + && next[1] == WBRK_NU) + { + continue; + } + + /* WB13 */ + if (prev[0] == WBRK_KA && next[0] == WBRK_KA) + continue; + + /* WB13a */ + if ((IS_AHLETTER(prev[0]) || prev[0] == WBRK_NU || prev[0] == WBRK_KA + || prev[0] == WBRK_EX) + && next[0] == WBRK_EX) + { + continue; + } + + /* WB13b */ + if (prev[0] == WBRK_EX + && (IS_AHLETTER(next[0]) || next[0] == WBRK_NU + || next[0] == WBRK_KA)) + { + continue; + } + + /* WB15 & WB16 */ + if (next[0] == WBRK_RI && ws.ri_parity) + continue; + + /* WB999 */ + break; +#undef prev +#undef next } - ws->ri_parity = ws->ri_parity == 0 && uprop_is_ri(a); - - /* WB1 & WB2 */ - if (!a || !b) - RET(true); - - /* WB3 */ - if (a == '\r' && b == '\n') - RET(false); - - /* WB3a */ - if (a == '\r' || a == '\n' || (ap = uprop_get_wb(a)) == WB_NL) - RET(true); - - /* WB3b */ - if (b == '\r' || b == '\n' || (bp = uprop_get_wb(b)) == WB_NL) - RET(true); - - /* WB3c */ - if (ap == WB_ZWJ && uprop_is_extpict(b)) - RET(false); - - /* WB3d */ - if (ap == WB_WSEGSPACE && bp == WB_WSEGSPACE) - RET(false); - - /* WB4 */ - if (bp == WB_FO || bp == WB_EXTEND || bp == WB_ZWJ) - RET(false); - - /* WB5 */ - if (IS_AHLETTER(ap) && IS_AHLETTER(bp)) - RET(false); + return ws.mid_v.p - sv.p; +} - /* WB6 */ - cp = uprop_get_wb(c); - if (IS_AHLETTER(ap) && (bp == WB_ML || IS_MIDNUMLETQ(bp)) - && IS_AHLETTER(cp)) +struct wbrk_state +mkwbrkstate(struct u8view sv) +{ + struct wbrk_state ws = { + .raw = {{WBRK_EOT, WBRK_EOT}, {WBRK_EOT, WBRK_EOT}}, + .skip = {{WBRK_EOT, WBRK_EOT}, {WBRK_EOT, WBRK_EOT}}, + .mid_v = sv, + .raw_v = sv, + .skip_v = sv, + }; + + static_assert(sizeof(ws.skip.next) == sizeof(ws.raw.next)); + + rune ch; + for (size_t i = 0; + i < lengthof(ws.raw.next) && u8next(&ch, U8_ARGSP(ws.raw_v)) != 0; i++) { - RET(false); + ws.raw.next[i] = mlib_lookup(ch); } - /* WB7 */ - if (IS_AHLETTER(ws->prev_ap) && (ap == WB_ML || IS_MIDNUMLETQ(ap)) - && IS_AHLETTER(bp)) + for (size_t i = 0; + i < lengthof(ws.raw.next) && u8next(&ch, U8_ARGSP(ws.skip_v)) != 0;) { - RET(false); + ws.skip.next[i] = mlib_lookup(ch); + if (!IS_IGNORE(ws.skip.next[i])) + i++; } - /* WB7a & WB7b */ - if (ap == WB_HL && (bp == WB_SQ || (bp == WB_DQ && cp == WB_HL))) - RET(false); - - /* WB7c */ - if (ws->prev_ap == WB_HL && ap == WB_DQ && bp == WB_HL) - RET(false); - - /* WB8, WB9, & WB10 */ - if ((ap == WB_NU || IS_AHLETTER(ap)) && (bp == WB_NU || IS_AHLETTER(bp))) - RET(false); - - /* WB11 */ - if (ws->prev_ap == WB_NU && (ap == WB_MN || IS_MIDNUMLETQ(ap)) - && bp == WB_NU) - { - RET(false); - } - - /* WB12 */ - if (ap == WB_NU && (bp == WB_MN || IS_MIDNUMLETQ(bp)) && cp == WB_NU) - RET(false); - - /* WB13 */ - if (ap == WB_KA && bp == WB_KA) - RET(false); + return ws; +} - /* WB13a */ - if ((IS_AHLETTER(ap) || ap == WB_NU || ap == WB_KA || ap == WB_EX) - && bp == WB_EX) - { - RET(false); +bool +advance(struct wbrk_state *ws) +{ + if (ws->raw.next[0] == WBRK_EOT) + return false; + + /* Shift the prop window over by 1 */ + rune ch; + ws->raw.prev[1] = ws->raw.prev[0]; + ws->raw.prev[0] = ws->raw.next[0]; + ws->raw.next[0] = ws->raw.next[1]; + ws->raw.next[1] = + u8next(&ch, U8_ARGSP(ws->raw_v)) != 0 ? mlib_lookup(ch) : WBRK_EOT; + + /* Increment the midpoint */ + u8next(nullptr, U8_ARGSP(ws->mid_v)); + + /* Ignore ignorable properties */ + if (!IS_IGNORE(ws->raw.prev[0])) { + ws->skip.prev[1] = ws->skip.prev[0]; + ws->skip.prev[0] = ws->skip.next[0]; + ws->skip.next[0] = ws->skip.next[1]; + ws->ri_parity = ws->ri_parity == 0 && ws->skip.prev[0] == WBRK_RI; + + do { + if (u8next(&ch, U8_ARGSP(ws->skip_v)) == 0) { + ws->skip.next[1] = WBRK_EOT; + break; + } + ws->skip.next[1] = mlib_lookup(ch); + } while (IS_IGNORE(ws->skip.next[1])); } - /* WB13b */ - if (ap == WB_EX && (IS_AHLETTER(bp) || bp == WB_NU || bp == WB_KA)) - RET(false); - - /* WB15 & WB16 */ - if (ap == WB_RI && bp == WB_RI && ws->ri_parity == 1) - RET(false); - - /* WB999 */ - RET(true); + return true; } -- cgit v1.2.3