From eda8550f79f7f836a78f5909f1dccc008511d4f8 Mon Sep 17 00:00:00 2001
From: Thomas Voss <mail@thomasvoss.com>
Date: Mon, 22 Apr 2024 21:27:07 +0200
Subject: Add a test for word breaking

---
 test/gen-test-data |  11 +++++
 test/run-tests     |  33 +++++++++++++++
 test/wnext_test.c  | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 166 insertions(+)
 create mode 100755 test/gen-test-data
 create mode 100755 test/run-tests
 create mode 100644 test/wnext_test.c

diff --git a/test/gen-test-data b/test/gen-test-data
new file mode 100755
index 0000000..12ee11e
--- /dev/null
+++ b/test/gen-test-data
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+# Fetch the Unicode word-break conformance tests and reduce them to the
+# compact one-case-per-line format that wnext_test.c reads from wnext.in.
+
+set -e
+
+# Write the Unicode 15.1.0 UCD file at relative path $1 to standard output.
+download()
+{
+	# -f: fail on HTTP errors rather than emitting the server's error page
+	curl -fsS "https://www.unicode.org/Public/15.1.0/ucd/$1"
+}
+
+# Download to a temporary file first.  In a pipeline ‘set -e’ only sees the
+# exit status of the last command, so a failed download would go unnoticed.
+tmp="$(mktemp)"
+trap 'rm -f "$tmp"' EXIT
+download 'auxiliary/WordBreakTest.txt' >"$tmp"
+
+# Delete all whitespace, delete comments together with the ‘÷’ directly
+# before them, and print only the lines that are not empty afterwards.  The
+# ‘÷’ is grouped so that ‘?’ covers the whole character in non-UTF-8 locales.
+sed -En 's/\s+//g; s/(÷)?#.*//g; /./p' "$tmp" >wnext.in
diff --git a/test/run-tests b/test/run-tests
new file mode 100755
index 0000000..e19b611
--- /dev/null
+++ b/test/run-tests
@@ -0,0 +1,52 @@
+#!/bin/sh
+
+# Build the library, generate the test data, and then compile and run every
+# C test program in this directory.  Exits non-zero if any test fails.
+
+set -e
+
+# dirname also copes with $0 having no slash (e.g. ‘sh run-tests’), where
+# ${0%/*} would expand to the script name and make cd fail.
+cd "$(dirname "$0")"
+
+readonly FLAGS='
+	-std=c23 -I../include
+	-Og -ggdb3
+	-Wall -Wextra -Wpedantic
+	-Wno-pointer-sign
+	-Wno-attributes
+'
+
+# Remove the generated test data and the compiled test programs.
+cleanup()
+{
+	rm -f ./*.in
+	for src in *.c
+	do
+		rm -f "${src%.*}"
+	done
+}
+
+# Install the handler before anything is generated so that a failed build
+# or download does not leave artifacts behind.
+trap cleanup EXIT
+
+(cd ..; ./make)
+
+./gen-test-data
+
+for src in *.c
+do
+	# $FLAGS is deliberately unquoted so that it splits into separate options
+	gcc $FLAGS -o "${src%.*}" "$src" ../libmlib.a
+done
+
+# Run every test even after a failure, but remember that one failed.  The
+# exit status of ‘find -exec … \;’ does not reflect the status of the
+# commands it runs, so failures would otherwise be silently dropped.
+rv=0
+for src in *.c
+do
+	"./${src%.*}" || rv=1
+done
+exit "$rv"
diff --git a/test/wnext_test.c b/test/wnext_test.c
new file mode 100644
index 0000000..a137f7d
--- /dev/null
+++ b/test/wnext_test.c
@@ -0,0 +1,154 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "alloc.h"
+#include "dynarr.h"
+#include "errors.h"
+#include "macros.h"
+#include "mbstring.h"
+#include "rune.h"
+#include "unicode/string.h"
+
+#define TESTFILE "wnext.in"
+
+static bool test(const char8_t *, size_t, int);
+static int hexdigits(rune);
+
+/* Run every test case in TESTFILE (one case per line), stopping at the
+   first failure.  Returns EXIT_SUCCESS if every case passed and
+   EXIT_FAILURE otherwise. */
+int
+main(int, char **argv)
+{
+	int rv = EXIT_SUCCESS;
+	size_t n = 0; /* Defined size to go with the null buffer for getline() */
+	ssize_t nr;
+	char *line = nullptr;
+	FILE *fp;
+
+	mlib_setprogname(argv[0]);
+
+	if ((fp = fopen(TESTFILE, "r")) == nullptr)
+		err("fopen: %s:", TESTFILE);
+
+	for (int id = 1; (nr = getline(&line, &n, fp)) > 0; id++) {
+		if (line[nr - 1] == '\n')
+			line[--nr] = '\0';
+
+		if (!test(line, (size_t)nr, id)) {
+			rv = EXIT_FAILURE;
+			break;
+		}
+	}
+	if (ferror(fp))
+		err("getline: %s:", TESTFILE);
+
+	free(line);
+	fclose(fp);
+	return rv;
+}
+
+/* Run test case number ID, described by the N-byte string LINE.  LINE is a
+   sequence of ‘<op><hex>’ pairs where <hex> is a codepoint in hexadecimal;
+   an <op> of ‘÷’ makes the codepoint start a new word and any other <op>
+   makes it continue the current word.  Returns true if u8wcnt() and
+   u8wnext() split the decoded string into exactly those words, and false
+   after printing a diagnostic otherwise. */
+bool
+test(const char8_t *line, size_t n, int id)
+{
+	bool ok = false;
+	size_t off = 0, total = 0, rem, words_got;
+	char8_t *buf = nullptr;
+	const char8_t *p, *line2 = line;
+	struct u8view got;
+
+	typedef dynarr(char8_t) word;
+	dynarr(word) words = {};
+
+	do {
+		rune op, ch;
+
+		u8next(&op, &line2, &n);
+		if (sscanf(line2, "%" SCNxRUNE, &ch) != 1)
+			goto malformed;
+
+		/* The test data pads codepoints to at least 4 hex digits */
+		int ndigits = hexdigits(ch);
+		ndigits = MAX(4, ndigits);
+		if ((size_t)ndigits > n)
+			goto malformed;
+		line2 += ndigits, n -= ndigits;
+
+		char8_t enc[U8_LEN_MAX] = {};
+		int w = rtou8(enc, sizeof(enc), ch);
+		total += w;
+
+		if (op == U'÷')
+			DAPUSH(&words, (word){});
+		if (words.len == 0) /* No word to append to yet */
+			goto malformed;
+		DAEXTEND(&words.buf[words.len - 1], enc, w);
+	} while (n > 0);
+
+	buf = bufalloc(nullptr, 1, total);
+	da_foreach (&words, wd) {
+		memcpy(buf + off, wd->buf, wd->len);
+		off += wd->len;
+	}
+
+	/* Assert the word count is correct */
+	words_got = u8wcnt(buf, total);
+	if (words_got != words.len) {
+		warn("case %d: expected %zu word(s) but got %zu: ‘%s’", id, words.len,
+		     words_got, line);
+		goto out;
+	}
+
+	/* Assert the individual words are correct.  u8wnext() takes its string
+	   by reference, so it is handed copies to leave BUF and TOTAL intact. */
+	p = buf;
+	rem = total;
+	for (size_t i = 0; u8wnext(&got, &p, &rem); i++) {
+		if (i >= words.len) {
+			warn("case %d: got more than the expected %zu word(s)", id,
+			     words.len);
+			goto out;
+		}
+
+		word want = words.buf[i];
+		if (!u8eq(got.p, got.len, want.buf, want.len)) {
+			warn("case %d: expected word ‘%.*s’ but got ‘%.*s’", id,
+			     (int)want.len, want.buf, (int)got.len, got.p);
+			goto out;
+		}
+	}
+
+	ok = true;
+	goto out;
+
+malformed:
+	warn("case %d: malformed test case: ‘%s’", id, line);
+out:
+	/* Runs on every path so that failing cases do not leak */
+	da_foreach (&words, wd)
+		free(wd->buf);
+	free(words.buf);
+	free(buf);
+	return ok;
+}
+
+/* Return the number of hexadecimal digits needed to write CH, which is 1
+   for a CH of 0. */
+int
+hexdigits(rune ch)
+{
+	int n = 0;
+	do {
+		ch /= 16;
+		n++;
+	} while (ch != 0);
+	return n;
+}
-- 
cgit v1.2.3