aboutsummaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorThomas Voss <mail@thomasvoss.com> 2024-04-22 21:06:52 +0200
committerThomas Voss <mail@thomasvoss.com> 2024-04-22 21:08:09 +0200
commitc0a983a29af17415ef29058d72f1a9cd99ddd83f (patch)
tree0c77ccd6905491ab7c39c6c386ba0721e6f0d4e6 /lib
parentff14e4801643f8c69b5d31e183bfb71943ee519f (diff)
Fix various bugs in word segmentation
Diffstat (limited to 'lib')
-rw-r--r--lib/unicode/string/u8wnext.c340
1 files changed, 214 insertions, 126 deletions
diff --git a/lib/unicode/string/u8wnext.c b/lib/unicode/string/u8wnext.c
index 4236cff..5e893c6 100644
--- a/lib/unicode/string/u8wnext.c
+++ b/lib/unicode/string/u8wnext.c
@@ -1,161 +1,249 @@
+/* The approach of this implementation is heavily inspired by libgrapheme
+ (written by Laslo Hunhold <dev@frign.de>), and my email-correspondance with
+ Laslo. */
+
+#include "_bsearch.h"
#include "macros.h"
#include "mbstring.h"
-#include "unicode/prop.h"
+#include "unicode/_wbrk.h"
#include "unicode/string.h"
-#define IS_AHLETTER(cp) ((cp) == WB_LE || (cp) == WB_HL)
-#define IS_MIDNUMLETQ(cp) ((cp) == WB_MB || (cp) == WB_SQ)
+_MLIB_DEFINE_BSEARCH(enum wbrk_prop, wbrk_lookup, WBRK_XX)
-#define RET(x) \
- do { \
- ws->prev_ap = ap; \
- return (x); \
- } while (false)
+#define IS_MIDNUMLETQ(xp) ((xp) == WBRK_MB || (xp) == WBRK_SQ)
+#define IS_AHLETTER(xp) \
+ ((xp) == WBRK_LE || (xp) == WBRK_EXTPICT_LE || (xp) == WBRK_HL)
+#define IS_IGNORE(xp) \
+ ((xp) == WBRK_EXTEND || (xp) == WBRK_FO || (xp) == WBRK_ZWJ)
struct wbrk_state {
- int ri_parity;
- enum uprop_wb prev_ap;
+ struct {
+ enum wbrk_prop prev[2], next[2];
+ } raw, skip;
+ struct u8view raw_v, skip_v, mid_v;
+ int ri_parity : 1;
};
-static bool u8iswbrk(const char8_t **, size_t *, struct wbrk_state *);
+static bool advance(struct wbrk_state *);
+static size_t findwbrk(struct u8view);
+static struct wbrk_state mkwbrkstate(struct u8view);
size_t
u8wnext(struct u8view *w, const char8_t **s, size_t *n)
{
- ASSUME(s != nullptr);
ASSUME(n != nullptr);
+ ASSUME(s != nullptr);
+ ASSUME(*s != nullptr);
if (*n == 0)
return 0;
- const char8_t *p = *s;
+ size_t off = findwbrk((struct u8view){*s, *n});
if (w != nullptr)
- w->p = p;
-
- size_t m = *n;
- struct wbrk_state ws = {};
- while (!u8iswbrk(&p, &m, &ws))
- ;
-
- ptrdiff_t d = p - *s;
- *n -= d;
- *s = p;
- if (w)
- w->len = d;
- return d;
+ *w = (struct u8view){*s, off};
+
+ ASSUME(*n >= off);
+ *s += off;
+ *n -= off;
+ return off;
}
-bool
-u8iswbrk(const char8_t **s, size_t *n, struct wbrk_state *ws)
+size_t
+findwbrk(struct u8view sv)
{
- ASSUME(s != nullptr);
- ASSUME(n != nullptr);
- ASSUME(ws != nullptr);
-
- rune a, b, c;
- enum uprop_wb ap, bp, cp;
- a = b = c = ap = bp = cp = 0;
-
- u8next(&a, s, n);
-
- {
- const char8_t *s_cpy = *s;
- size_t n_cpy = *n;
- u8next(&b, &s_cpy, &n_cpy);
- u8next(&c, &s_cpy, &n_cpy);
+ ASSUME(sv.p != nullptr);
+
+ struct wbrk_state ws = mkwbrkstate(sv);
+
+ while (advance(&ws)) {
+#define prev ws.raw.prev
+#define next ws.raw.next
+ /* WB3 */
+ if (prev[0] == WBRK_CR && next[0] == WBRK_LF)
+ continue;
+
+ /* WB3a */
+ if (prev[0] == WBRK_NL || prev[0] == WBRK_CR || prev[0] == WBRK_LF)
+ break;
+
+ /* WB3b */
+ if (next[0] == WBRK_NL || next[0] == WBRK_CR || next[0] == WBRK_LF)
+ break;
+
+ /* WB3c */
+ if (prev[0] == WBRK_ZWJ
+ && (next[0] == WBRK_EXTPICT || next[0] == WBRK_EXTPICT_LE))
+ {
+ continue;
+ }
+
+ /* WB3d */
+ if (prev[0] == WBRK_WSEGSPACE && next[0] == WBRK_WSEGSPACE)
+ continue;
+
+ /* WB4 */
+ if (next[0] == WBRK_EXTEND || next[0] == WBRK_FO || next[0] == WBRK_ZWJ)
+ continue;
+
+#undef prev
+#undef next
+#define prev ws.skip.prev
+#define next ws.skip.next
+
+ /* WB5 */
+ if (IS_AHLETTER(prev[0]) && IS_AHLETTER(next[0]))
+ continue;
+
+ /* WB6 */
+ if (IS_AHLETTER(prev[0])
+ && (next[0] == WBRK_ML || IS_MIDNUMLETQ(next[0]))
+ && IS_AHLETTER(next[1]))
+ {
+ continue;
+ }
+
+ /* WB7 */
+ if (IS_AHLETTER(prev[1])
+ && (prev[0] == WBRK_ML || IS_MIDNUMLETQ(prev[0]))
+ && IS_AHLETTER(next[0]))
+ {
+ continue;
+ }
+
+ /* WB7a */
+ if (prev[0] == WBRK_HL && next[0] == WBRK_SQ)
+ continue;
+
+ /* WB7b */
+ if (prev[0] == WBRK_HL && next[0] == WBRK_DQ && next[1] == WBRK_HL)
+ continue;
+
+ /* WB7c */
+ if (prev[1] == WBRK_HL && prev[0] == WBRK_DQ && next[0] == WBRK_HL)
+ continue;
+
+ /* WB8 */
+ if (prev[0] == WBRK_NU && next[0] == WBRK_NU)
+ continue;
+
+ /* WB9 */
+ if (IS_AHLETTER(prev[0]) && next[0] == WBRK_NU)
+ continue;
+
+ /* WB10 */
+ if (prev[0] == WBRK_NU && IS_AHLETTER(next[0]))
+ continue;
+
+ /* WB11 */
+ if (prev[1] == WBRK_NU && (prev[0] == WBRK_MN || IS_MIDNUMLETQ(prev[0]))
+ && next[0] == WBRK_NU)
+ {
+ continue;
+ }
+
+ /* WB12 */
+ if (prev[0] == WBRK_NU && (next[0] == WBRK_MN || IS_MIDNUMLETQ(next[0]))
+ && next[1] == WBRK_NU)
+ {
+ continue;
+ }
+
+ /* WB13 */
+ if (prev[0] == WBRK_KA && next[0] == WBRK_KA)
+ continue;
+
+ /* WB13a */
+ if ((IS_AHLETTER(prev[0]) || prev[0] == WBRK_NU || prev[0] == WBRK_KA
+ || prev[0] == WBRK_EX)
+ && next[0] == WBRK_EX)
+ {
+ continue;
+ }
+
+ /* WB13b */
+ if (prev[0] == WBRK_EX
+ && (IS_AHLETTER(next[0]) || next[0] == WBRK_NU
+ || next[0] == WBRK_KA))
+ {
+ continue;
+ }
+
+ /* WB15 & WB16 */
+ if (next[0] == WBRK_RI && ws.ri_parity)
+ continue;
+
+ /* WB999 */
+ break;
+#undef prev
+#undef next
}
- ws->ri_parity = ws->ri_parity == 0 && uprop_is_ri(a);
-
- /* WB1 & WB2 */
- if (!a || !b)
- RET(true);
-
- /* WB3 */
- if (a == '\r' && b == '\n')
- RET(false);
-
- /* WB3a */
- if (a == '\r' || a == '\n' || (ap = uprop_get_wb(a)) == WB_NL)
- RET(true);
-
- /* WB3b */
- if (b == '\r' || b == '\n' || (bp = uprop_get_wb(b)) == WB_NL)
- RET(true);
-
- /* WB3c */
- if (ap == WB_ZWJ && uprop_is_extpict(b))
- RET(false);
-
- /* WB3d */
- if (ap == WB_WSEGSPACE && bp == WB_WSEGSPACE)
- RET(false);
-
- /* WB4 */
- if (bp == WB_FO || bp == WB_EXTEND || bp == WB_ZWJ)
- RET(false);
-
- /* WB5 */
- if (IS_AHLETTER(ap) && IS_AHLETTER(bp))
- RET(false);
+ return ws.mid_v.p - sv.p;
+}
- /* WB6 */
- cp = uprop_get_wb(c);
- if (IS_AHLETTER(ap) && (bp == WB_ML || IS_MIDNUMLETQ(bp))
- && IS_AHLETTER(cp))
+struct wbrk_state
+mkwbrkstate(struct u8view sv)
+{
+ struct wbrk_state ws = {
+ .raw = {{WBRK_EOT, WBRK_EOT}, {WBRK_EOT, WBRK_EOT}},
+ .skip = {{WBRK_EOT, WBRK_EOT}, {WBRK_EOT, WBRK_EOT}},
+ .mid_v = sv,
+ .raw_v = sv,
+ .skip_v = sv,
+ };
+
+ static_assert(sizeof(ws.skip.next) == sizeof(ws.raw.next));
+
+ rune ch;
+ for (size_t i = 0;
+ i < lengthof(ws.raw.next) && u8next(&ch, U8_ARGSP(ws.raw_v)) != 0; i++)
{
- RET(false);
+ ws.raw.next[i] = mlib_lookup(ch);
}
- /* WB7 */
- if (IS_AHLETTER(ws->prev_ap) && (ap == WB_ML || IS_MIDNUMLETQ(ap))
- && IS_AHLETTER(bp))
+ for (size_t i = 0;
+ i < lengthof(ws.raw.next) && u8next(&ch, U8_ARGSP(ws.skip_v)) != 0;)
{
- RET(false);
+ ws.skip.next[i] = mlib_lookup(ch);
+ if (!IS_IGNORE(ws.skip.next[i]))
+ i++;
}
- /* WB7a & WB7b */
- if (ap == WB_HL && (bp == WB_SQ || (bp == WB_DQ && cp == WB_HL)))
- RET(false);
-
- /* WB7c */
- if (ws->prev_ap == WB_HL && ap == WB_DQ && bp == WB_HL)
- RET(false);
-
- /* WB8, WB9, & WB10 */
- if ((ap == WB_NU || IS_AHLETTER(ap)) && (bp == WB_NU || IS_AHLETTER(bp)))
- RET(false);
-
- /* WB11 */
- if (ws->prev_ap == WB_NU && (ap == WB_MN || IS_MIDNUMLETQ(ap))
- && bp == WB_NU)
- {
- RET(false);
- }
-
- /* WB12 */
- if (ap == WB_NU && (bp == WB_MN || IS_MIDNUMLETQ(bp)) && cp == WB_NU)
- RET(false);
-
- /* WB13 */
- if (ap == WB_KA && bp == WB_KA)
- RET(false);
+ return ws;
+}
- /* WB13a */
- if ((IS_AHLETTER(ap) || ap == WB_NU || ap == WB_KA || ap == WB_EX)
- && bp == WB_EX)
- {
- RET(false);
+bool
+advance(struct wbrk_state *ws)
+{
+ if (ws->raw.next[0] == WBRK_EOT)
+ return false;
+
+ /* Shift the prop window over by 1 */
+ rune ch;
+ ws->raw.prev[1] = ws->raw.prev[0];
+ ws->raw.prev[0] = ws->raw.next[0];
+ ws->raw.next[0] = ws->raw.next[1];
+ ws->raw.next[1] =
+ u8next(&ch, U8_ARGSP(ws->raw_v)) != 0 ? mlib_lookup(ch) : WBRK_EOT;
+
+ /* Increment the midpoint */
+ u8next(nullptr, U8_ARGSP(ws->mid_v));
+
+ /* Ignore ignorable properties */
+ if (!IS_IGNORE(ws->raw.prev[0])) {
+ ws->skip.prev[1] = ws->skip.prev[0];
+ ws->skip.prev[0] = ws->skip.next[0];
+ ws->skip.next[0] = ws->skip.next[1];
+ ws->ri_parity = ws->ri_parity == 0 && ws->skip.prev[0] == WBRK_RI;
+
+ do {
+ if (u8next(&ch, U8_ARGSP(ws->skip_v)) == 0) {
+ ws->skip.next[1] = WBRK_EOT;
+ break;
+ }
+ ws->skip.next[1] = mlib_lookup(ch);
+ } while (IS_IGNORE(ws->skip.next[1]));
}
- /* WB13b */
- if (ap == WB_EX && (IS_AHLETTER(bp) || bp == WB_NU || bp == WB_KA))
- RET(false);
-
- /* WB15 & WB16 */
- if (ap == WB_RI && bp == WB_RI && ws->ri_parity == 1)
- RET(false);
-
- /* WB999 */
- RET(true);
+ return true;
}