From fc0dae9c819a4ee64f4610db1f005dfb841030c5 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Tue, 9 Apr 2024 18:49:27 +0200 Subject: Move unicode/gbrk.h to unicode/string.h --- README | 7 +-- include/unicode/gbrk.h | 12 ---- include/unicode/string.h | 12 ++++ lib/unicode/gbrk/u8glen.c | 10 --- lib/unicode/gbrk/u8gnext.c | 147 ------------------------------------------- lib/unicode/string/u8glen.c | 10 +++ lib/unicode/string/u8gnext.c | 147 +++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 170 insertions(+), 175 deletions(-) delete mode 100644 include/unicode/gbrk.h create mode 100644 include/unicode/string.h delete mode 100644 lib/unicode/gbrk/u8glen.c delete mode 100644 lib/unicode/gbrk/u8gnext.c create mode 100644 lib/unicode/string/u8glen.c create mode 100644 lib/unicode/string/u8gnext.c diff --git a/README b/README index f878f10..bb62e45 100644 --- a/README +++ b/README @@ -18,13 +18,8 @@ The headers as of now are: • mbstring.h — multibyte-strings • optparse.h — option parsing functions • rune.h — inttypes.h but for runes - • unicode/gbrk.h — grapheme breaking • unicode/prop.h — unicode character properties - -The headers planned for inclusion are: - • unicode/norm.h — text normalization - • unicode/sbrk.h — sentence breaking - • unicode/wbrk.h — word breaking + • unicode/string.h — unicode string functions DISCLAIMER: diff --git a/include/unicode/gbrk.h b/include/unicode/gbrk.h deleted file mode 100644 index 7f21371..0000000 --- a/include/unicode/gbrk.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef MLIB_UNICODE_GBRK_H -#define MLIB_UNICODE_GBRK_H - -#include - -#include "__charN_t.h" -#include "__u8view.h" - -size_t u8glen(const char8_t *, size_t); -size_t u8gnext(struct u8view *, const char8_t **, size_t *); - -#endif /* !MLIB_UNICODE_GBRK_H */ diff --git a/include/unicode/string.h b/include/unicode/string.h new file mode 100644 index 0000000..c2a99c1 --- /dev/null +++ b/include/unicode/string.h @@ -0,0 +1,12 @@ +#ifndef MLIB_UNICODE_STRING_H +#define MLIB_UNICODE_STRING_H + +#include + +#include "__charN_t.h" +#include "__u8view.h" + +size_t u8glen(const char8_t *, size_t); +size_t u8gnext(struct u8view *, const char8_t **, size_t *); + +#endif /* !MLIB_UNICODE_STRING_H */ diff --git a/lib/unicode/gbrk/u8glen.c b/lib/unicode/gbrk/u8glen.c deleted file mode 100644 index 763a834..0000000 --- a/lib/unicode/gbrk/u8glen.c +++ /dev/null @@ -1,10 +0,0 @@ -#include "unicode/gbrk.h" - -size_t -u8glen(const char8_t *s, size_t n) -{ - size_t m = 0; - while (u8gnext(nullptr, &s, &n)) - m++; - return m; -} diff --git a/lib/unicode/gbrk/u8gnext.c b/lib/unicode/gbrk/u8gnext.c deleted file mode 100644 index f3f7afc..0000000 --- a/lib/unicode/gbrk/u8gnext.c +++ /dev/null @@ -1,147 +0,0 @@ -#include - -#include "__bsearch.h" -#include "mbstring.h" -#include "unicode/__gbrk.h" -#include "unicode/gbrk.h" - -struct gbrk_state { - enum { - GB9C_NONE, - GB9C_CNSNT, - GB9C_LNK, - } gb9c; - bool gb11 : 1; - bool gb12 : 1; -}; - -static bool u8isgbrk(rune, rune, struct gbrk_state *); - -__MLIB_DEFINE_BSEARCH(gbrk_prop, gbrk_prop_tbl, GBP_OTHER) - -size_t -u8gnext(struct u8view *g, const char8_t **s, size_t *n) -{ - int m; - rune ch1; - const char8_t *p; - struct gbrk_state gs = {0}; - - if (*n == 0) - return 0; - - p = *s; - if (g) - g->p = p; - p += u8tor(&ch1, p); - - for (;;) { - rune ch2; - - if ((size_t)(p - *s) >= *n) - ch2 = 0; - else - m = u8tor(&ch2, p); - if (u8isgbrk(ch1, ch2, &gs)) { - ptrdiff_t d = p - *s; - *n -= d; - *s = p; - if (g) - g->len = d; - return d; - } - - ch1 = ch2; - p += m; - } -} - -bool -u8isgbrk(rune a, rune b, struct gbrk_state *gs) -{ - gbrk_prop ap, bp; - - /* GB1 & GB2 */ - if (!a || !b) - goto do_break; - - /* GB3 & ASCII fast-track */ - if ((a | b) < 0x300) { - if (a == '\r' && b == '\n') - return false; - goto do_break; - } - - /* GB4 */ - if (a == '\r' || a == '\n' || ((ap = mlib_lookup(a)) & GBP_CTRL)) - goto do_break; - - /* GB5 */ - if (b == '\r' || b == '\n' || ((bp = mlib_lookup(b)) & GBP_CTRL)) - goto do_break; - - /* Setting flags for GB9c */ - if (ap & GBP_INDC_CNSNT) - gs->gb9c = GB9C_CNSNT; - else if ((ap & GBP_INDC_LNK) && gs->gb9c == GB9C_CNSNT) - gs->gb9c = GB9C_LNK; - - /* GB6 */ - if ((ap & GBP_HNGL_L) - && (bp & (GBP_HNGL_L | GBP_HNGL_V | GBP_HNGL_LV | GBP_HNGL_LVT))) - { - return false; - } - - /* GB7 */ - if ((ap & (GBP_HNGL_LV | GBP_HNGL_V)) && (bp & (GBP_HNGL_V | GBP_HNGL_T))) - return false; - - /* GB8 */ - if ((ap & (GBP_HNGL_LVT | GBP_HNGL_T)) && (bp & GBP_HNGL_T)) - return false; - - /* GB9 */ - if (bp & (GBP_EXT | GBP_ZWJ)) { - if (ap & GBP_PIC) - gs->gb11 = true; - return false; - } - - /* GB9a */ - if (bp & GBP_SM) - return false; - - /* GB9b */ - if (ap & GBP_PREP) - return false; - - /* GB9c */ - if ((ap & (GBP_INDC_EXT | GBP_INDC_LNK)) && (bp & GBP_INDC_CNSNT) - && gs->gb9c == GB9C_LNK) - { - return false; - } - - /* GB11 */ - if (gs->gb11) { - if ((ap & GBP_EXT) && (bp & (GBP_EXT | GBP_ZWJ))) - return false; - if ((ap & GBP_ZWJ) && (bp & GBP_PIC)) - return false; - } - - /* GB12 & GB13 */ - if (ap & GBP_RI) { - if (gs->gb12 || !(bp & GBP_RI)) - goto do_break; - gs->gb12 = true; - return false; - } - - /* GB999 */ -do_break: - gs->gb9c = GB9C_NONE; - gs->gb11 = gs->gb12 = false; - return true; -} diff --git a/lib/unicode/string/u8glen.c b/lib/unicode/string/u8glen.c new file mode 100644 index 0000000..394a62d --- /dev/null +++ b/lib/unicode/string/u8glen.c @@ -0,0 +1,10 @@ +#include "unicode/string.h" + +size_t +u8glen(const char8_t *s, size_t n) +{ + size_t m = 0; + while (u8gnext(nullptr, &s, &n)) + m++; + return m; +} diff --git a/lib/unicode/string/u8gnext.c b/lib/unicode/string/u8gnext.c new file mode 100644 index 0000000..9824abb --- /dev/null +++ b/lib/unicode/string/u8gnext.c @@ -0,0 +1,147 @@ +#include + +#include "__bsearch.h" +#include "mbstring.h" +#include "unicode/__gbrk.h" +#include "unicode/string.h" + +struct gbrk_state { + enum { + GB9C_NONE, + GB9C_CNSNT, + GB9C_LNK, + } gb9c; + bool gb11 : 1; + bool gb12 : 1; +}; + +static bool u8isgbrk(rune, rune, struct gbrk_state *); + +__MLIB_DEFINE_BSEARCH(gbrk_prop, gbrk_prop_tbl, GBP_OTHER) + +size_t +u8gnext(struct u8view *g, const char8_t **s, size_t *n) +{ + int m; + rune ch1; + const char8_t *p; + struct gbrk_state gs = {0}; + + if (*n == 0) + return 0; + + p = *s; + if (g) + g->p = p; + p += u8tor(&ch1, p); + + for (;;) { + rune ch2; + + if ((size_t)(p - *s) >= *n) + ch2 = 0; + else + m = u8tor(&ch2, p); + if (u8isgbrk(ch1, ch2, &gs)) { + ptrdiff_t d = p - *s; + *n -= d; + *s = p; + if (g) + g->len = d; + return d; + } + + ch1 = ch2; + p += m; + } +} + +bool +u8isgbrk(rune a, rune b, struct gbrk_state *gs) +{ + gbrk_prop ap, bp; + + /* GB1 & GB2 */ + if (!a || !b) + goto do_break; + + /* GB3 & ASCII fast-track */ + if ((a | b) < 0x300) { + if (a == '\r' && b == '\n') + return false; + goto do_break; + } + + /* GB4 */ + if (a == '\r' || a == '\n' || ((ap = mlib_lookup(a)) & GBP_CTRL)) + goto do_break; + + /* GB5 */ + if (b == '\r' || b == '\n' || ((bp = mlib_lookup(b)) & GBP_CTRL)) + goto do_break; + + /* Setting flags for GB9c */ + if (ap & GBP_INDC_CNSNT) + gs->gb9c = GB9C_CNSNT; + else if ((ap & GBP_INDC_LNK) && gs->gb9c == GB9C_CNSNT) + gs->gb9c = GB9C_LNK; + + /* GB6 */ + if ((ap & GBP_HNGL_L) + && (bp & (GBP_HNGL_L | GBP_HNGL_V | GBP_HNGL_LV | GBP_HNGL_LVT))) + { + return false; + } + + /* GB7 */ + if ((ap & (GBP_HNGL_LV | GBP_HNGL_V)) && (bp & (GBP_HNGL_V | GBP_HNGL_T))) + return false; + + /* GB8 */ + if ((ap & (GBP_HNGL_LVT | GBP_HNGL_T)) && (bp & GBP_HNGL_T)) + return false; + + /* GB9 */ + if (bp & (GBP_EXT | GBP_ZWJ)) { + if (ap & GBP_PIC) + gs->gb11 = true; + return false; + } + + /* GB9a */ + if (bp & GBP_SM) + return false; + + /* GB9b */ + if (ap & GBP_PREP) + return false; + + /* GB9c */ + if ((ap & (GBP_INDC_EXT | GBP_INDC_LNK)) && (bp & GBP_INDC_CNSNT) + && gs->gb9c == GB9C_LNK) + { + return false; + } + + /* GB11 */ + if (gs->gb11) { + if ((ap & GBP_EXT) && (bp & (GBP_EXT | GBP_ZWJ))) + return false; + if ((ap & GBP_ZWJ) && (bp & GBP_PIC)) + return false; + } + + /* GB12 & GB13 */ + if (ap & GBP_RI) { + if (gs->gb12 || !(bp & GBP_RI)) + goto do_break; + gs->gb12 = true; + return false; + } + + /* GB999 */ +do_break: + gs->gb9c = GB9C_NONE; + gs->gb11 = gs->gb12 = false; + return true; +} -- cgit v1.2.3