From eda8550f79f7f836a78f5909f1dccc008511d4f8 Mon Sep 17 00:00:00 2001
From: Thomas Voss
Date: Mon, 22 Apr 2024 21:27:07 +0200
Subject: Add a test for word breaking

---
Review fixes were folded into this patch; the blob ids on the ‘index’ lines
still refer to the originally committed files.

 test/gen-test-data |  14 +++++
 test/run-tests     |  36 +++++++++++++
 test/wnext_test.c  | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 198 insertions(+)
 create mode 100755 test/gen-test-data
 create mode 100755 test/run-tests
 create mode 100644 test/wnext_test.c

diff --git a/test/gen-test-data b/test/gen-test-data
new file mode 100755
index 0000000..12ee11e
--- /dev/null
+++ b/test/gen-test-data
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+set -e
+
+download()
+{
+	curl -fsS "https://www.unicode.org/Public/15.1.0/ucd/$1"
+}
+
+# Download before filtering: in a pipeline only sed's exit status counts, so
+# a failed download would go unnoticed despite ‘set -e’.
+data="$(download 'auxiliary/WordBreakTest.txt')"
+printf '%s\n' "$data" \
+| sed -En 's/\s+//g; s/÷?#.*//g; /./p' >wnext.in
diff --git a/test/run-tests b/test/run-tests
new file mode 100755
index 0000000..e19b611
--- /dev/null
+++ b/test/run-tests
@@ -0,0 +1,36 @@
+#!/bin/sh
+
+set -e
+cd "${0%/*}"
+
+readonly FLAGS='
+	-std=c23 -I../include
+	-Og -ggdb3
+	-Wall -Wextra -Wpedantic
+	-Wno-pointer-sign
+	-Wno-attributes
+'
+
+(cd ..; ./make)
+
+./gen-test-data
+
+for src in *.c
+do
+	dst="${src%.*}"
+	gcc $FLAGS -o "$dst" "$src" ../libmlib.a
+done
+
+s="$(find . -type f -executable \
+	-not -name gen-test-data \
+	-not -name run-tests \
+	-exec echo rm "*.in" {} +)"
+trap "$s" EXIT
+
+# ‘-exec … \;’ never influences find's exit status, so failing tests used to
+# go unnoticed by the caller.  ‘-exec … +’ makes find exit non-zero when the
+# invoked command does, and the inline loop still runs every test binary.
+find . -type f -executable \
+	-not -name gen-test-data \
+	-not -name run-tests \
+	-exec sh -c 'rv=0; for t; do "$t" || rv=1; done; exit $rv' sh {} +
diff --git a/test/wnext_test.c b/test/wnext_test.c
new file mode 100644
index 0000000..a137f7d
--- /dev/null
+++ b/test/wnext_test.c
@@ -0,0 +1,148 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "alloc.h"
+#include "dynarr.h"
+#include "errors.h"
+#include "macros.h"
+#include "mbstring.h"
+#include "rune.h"
+#include "unicode/string.h"
+
+#define TESTFILE "wnext.in"
+
+static bool test(const char8_t *, size_t, int);
+static int hexdigits(rune);
+
+/* Run every line of TESTFILE through test(), stopping at the first failing
+   case.  Returns EXIT_FAILURE if a case failed and EXIT_SUCCESS otherwise. */
+int
+main(int, char **argv)
+{
+	int rv;
+	size_t n;
+	ssize_t nr;
+	char *line;
+	FILE *fp;
+
+	rv = EXIT_SUCCESS;
+	line = nullptr;
+	mlib_setprogname(argv[0]);
+
+	if ((fp = fopen(TESTFILE, "r")) == nullptr)
+		err("fopen: %s:", TESTFILE);
+
+	for (int id = 1; (nr = getline(&line, &n, fp)) > 0; id++) {
+		/* Strip the trailing newline so test() only sees the test case */
+		if (line[nr - 1] == '\n')
+			line[--nr] = '\0';
+
+		if (!test(line, (size_t)nr, id)) {
+			rv = EXIT_FAILURE;
+			break;
+		}
+	}
+	if (ferror(fp))
+		err("getline: %s:", TESTFILE);
+
+	free(line);
+	fclose(fp);
+	return rv;
+}
+
+/* Check a single test case.  LINE holds N bytes of alternating break markers
+   and hexadecimal code points (e.g. ‘÷0061×0062÷0020’), where ‘÷’ starts a
+   new word and any other marker continues the current one.  The code points
+   are encoded as UTF-8 and concatenated, and the result of u8wcnt() and
+   u8wnext() on that buffer is compared against the expected words.  ID is
+   the case number used in diagnostics.  Returns true if the case passed. */
+bool
+test(const char8_t *line, size_t n, int id)
+{
+	bool ok = false;
+	size_t total = 0;
+	char8_t *buf = nullptr;
+	const char8_t *line2 = line;
+
+	typedef dynarr(char8_t) word;
+	dynarr(word) words = {};
+
+	do {
+		rune op, ch;
+
+		u8next(&op, &line2, &n);
+		if (sscanf(line2, "%" SCNxRUNE, &ch) != 1)
+			goto malformed;
+
+		/* The input pads code points to at least 4 hex digits */
+		int off = hexdigits(ch);
+		off = MAX(4, off);
+		if ((size_t)off > n)
+			goto malformed;
+		line2 += off, n -= off;
+
+		char8_t enc[U8_LEN_MAX] = {};
+		int w = rtou8(enc, sizeof(enc), ch);
+		total += w;
+
+		if (op == U'÷')
+			DAPUSH(&words, (word){});
+		if (words.len == 0)
+			goto malformed;
+		DAEXTEND(&words.buf[words.len - 1], enc, w);
+	} while (n > 0);
+
+	size_t off = 0;
+	buf = bufalloc(nullptr, 1, total);
+	da_foreach (&words, wd) {
+		memcpy(buf + off, wd->buf, wd->len);
+		off += wd->len;
+	}
+
+	/* Assert the word count is correct */
+	size_t words_got = u8wcnt(buf, total);
+	if (words_got != words.len) {
+		warn("case %d: expected %zu word(s) but got %zu: ‘%s’", id, words.len,
+		     words_got, line);
+		goto out;
+	}
+
+	/* Assert the individual words are correct */
+	struct u8view wd;
+	const char8_t *buf_cpy = buf;
+	for (size_t i = 0; u8wnext(&wd, &buf_cpy, &total); i++) {
+		word wd2 = words.buf[i];
+		if (!u8eq(wd.p, wd.len, wd2.buf, wd2.len)) {
+			warn("case %d: expected word ‘%.*s’ but got ‘%.*s’", id,
+			     (int)wd2.len, wd2.buf, (int)wd.len, wd.p);
+			goto out;
+		}
+	}
+
+	ok = true;
+	goto out;
+
+malformed:
+	warn("case %d: malformed test line: ‘%s’", id, line);
+out:
+	/* Release everything on every path, including the failing ones */
+	da_foreach (&words, wd)
+		free(wd->buf);
+	free(words.buf);
+	free(buf);
+	return ok;
+}
+
+/* Return the number of hexadecimal digits needed to write CH; 0 takes one. */
+int
+hexdigits(rune ch)
+{
+	int n = 0;
+	do {
+		ch /= 16;
+		n++;
+	} while (ch != 0);
+	return n;
+}
-- 
cgit v1.2.3