diff options
| author | Thomas Voss <mail@thomasvoss.com> | 2024-05-04 01:50:09 +0200 | 
|---|---|---|
| committer | Thomas Voss <mail@thomasvoss.com> | 2024-05-04 01:50:09 +0200 | 
| commit | ba56b5fa8344847077b49268d2ab215f3e73d10e (patch) | |
| tree | ffcb52ba22c5f1d154d76184c542b3e13971153f | |
| parent | cc49c8c0f7791c38f578a1698f3e7a6cc4ed630e (diff) | |
Add tests for u8title() and fix ‘ς’ bug
| -rw-r--r-- | lib/unicode/string/u8title.c | 2 | ||||
| -rw-r--r-- | test/data/TitlecaseTest | 48 | ||||
| -rwxr-xr-x | test/run-tests | 1 | ||||
| -rw-r--r-- | test/title-test.c | 93 | 
4 files changed, 143 insertions, 1 deletion
| diff --git a/lib/unicode/string/u8title.c b/lib/unicode/string/u8title.c index 4545fda..01e9d2e 100644 --- a/lib/unicode/string/u8title.c +++ b/lib/unicode/string/u8title.c @@ -68,7 +68,7 @@ u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,  		}  		if (final_sigma.after == 0) { -			rune ch; +			rune ch = 0;  			struct u8view cpy = {src, srcn};  			do diff --git a/test/data/TitlecaseTest b/test/data/TitlecaseTest new file mode 100644 index 0000000..24256a5 --- /dev/null +++ b/test/data/TitlecaseTest @@ -0,0 +1,48 @@ +# Empty input +;; + +# Short input +a;A; + +# If CF_ẞ gets passed for whatever reason… don’t turn into ẞ +ß;Ss;ẞ + +# Latin alphabet +LOREM IPSUM DOLOR SIT AMET, CONSECTETUR ADIPISCING ELIT.;Lorem Ipsum Dolor Sit Amet, Consectetur Adipiscing Elit.; +lorem ipsum dolor sit amet, consectetur adipiscing elit.;Lorem Ipsum Dolor Sit Amet, Consectetur Adipiscing Elit.; + +# Random punctuation and numbers +COMPLEX-LANGUAGE AND -SCRIPT;Complex-Language And -Script; +complex-language and -script;Complex-Language And -Script; + +# Greek alphabet; handle sigma properly +ΤΟ ΓΡΆΜΜΑ ΣΊΓΜΑ ΈΧΕΙ ΔΎΟ ΠΕΖΟΎΣ ΤΎΠΟΥΣ;Το Γράμμα Σίγμα Έχει Δύο Πεζούς Τύπους; +το γράμμα σίγμα έχει δύο πεζούς τύπους;Το Γράμμα Σίγμα Έχει Δύο Πεζούς Τύπους; + +# Cyrillic alphabet +СЛАВА УКРАЇНІ ПРОТИ РОСІЙСЬКОЇ АГРЕСІЇ!;Слава Україні Проти Російської Агресії!; +слава україні проти російської агресії!;Слава Україні Проти Російської Агресії!; + +# In lithuanian we need to retain the dot above ‘i’ and ‘j’ when there’s an +# accent above the uppercased variant.  Also test with both single-codepoint +# variants (i.e. U+00CC LATIN CAPITAL I WITH GRAVE) and variants that use +# combining-characters. 
+i̇̀;İ̀; +i̇̀;Ì;LT +RÀSTI, MÈSTI, KÌLO;Ràsti, Mèsti, Kìlo; +RÀSTI, MÈSTI, KÌLO;Ràsti, Mèsti, Ki̇̀lo;LT + +# Croatian has 3 cases +LJUDEVIT GAJ;Ljudevit Gaj; +ljudevit gaj;Ljudevit Gaj; + +# Dutch IJ needs special handling +ijsberg en onderzeeër in de ijssel;Ijsberg En Onderzeeër In De Ijssel; +ijsberg en onderzeeër in de ijssel;IJsberg En Onderzeeër In De IJssel;NL + +# Uppercase ‘i’ to ‘İ’ in Azeri/Turkish +istanbul’luyum;Istanbul’luyum; +istanbul’luyum;İstanbul’luyum;AZ + +# Uncased language +안녕하세요, 월드!;안녕하세요, 월드!; diff --git a/test/run-tests b/test/run-tests index 1d6cf56..f191c33 100755 --- a/test/run-tests +++ b/test/run-tests @@ -24,6 +24,7 @@ download 'auxiliary/GraphemeBreakTest.txt'  download 'auxiliary/WordBreakTest.txt'  grep '^[^#]'                         data/LowercaseTest     >lower.in +grep '^[^#]'                         data/TitlecaseTest     >title.in  grep '^[^#]'                         data/UppercaseTest     >upper.in  sed -En 's/\s+//g; s/÷?#.*//g; /./p' data/GraphemeBreakTest >gnext.in  sed -En 's/\s+//g; s/÷?#.*//g; /./p' data/WordBreakTest     >wnext.in diff --git a/test/title-test.c b/test/title-test.c new file mode 100644 index 0000000..bea1ce7 --- /dev/null +++ b/test/title-test.c @@ -0,0 +1,93 @@ +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <alloc.h> +#include <errors.h> +#include <macros.h> +#include <mbstring.h> +#include <unicode/string.h> + +#define TESTFILE "title.in" + +static bool test(const char8_t *, int); + +int +main(int, char **argv) +{ +	int rv; +	size_t n; +	ssize_t nr; +	char *line; +	FILE *fp; + +	rv = EXIT_SUCCESS; +	line = nullptr; +	mlib_setprogname(argv[0]); + +	if ((fp = fopen(TESTFILE, "r")) == nullptr) +		err("fopen: %s:", TESTFILE); + +	for (int id = 1; (nr = getline(&line, &n, fp)) > 0; id++) { +		if (line[nr - 1] == '\n') +			line[--nr] = '\0'; + +		if (!test(line, id)) +			rv = EXIT_FAILURE; +	} +	if (ferror(fp)) +		err("getline: %s:", TESTFILE); + +	
free(line); +	fclose(fp); +	return rv; +} + +bool +test(const char8_t *line, int id) +{ +	struct u8view before, after, flags; +	before.p = line; +	after.p = strchr(line, ';') + 1; +	before.len = after.p - before.p - 1; +	flags.p = strchr(after.p, ';') + 1; +	after.len = flags.p - after.p - 1; +	flags.len = strlen(flags.p); + +	enum caseflags cf = 0; +	if (u8eq(U8_ARGS(flags), U8_ARGS(U8("ẞ")))) +		cf |= CF_ẞ; +	else if (u8eq(U8_ARGS(flags), U8_ARGS(U8("AZ")))) +		cf |= CF_LANG_AZ; +	else if (u8eq(U8_ARGS(flags), U8_ARGS(U8("LT")))) +		cf |= CF_LANG_LT; +	else if (u8eq(U8_ARGS(flags), U8_ARGS(U8("NL")))) +		cf |= CF_LANG_NL; + +	char8_t *buf = bufalloc(nullptr, 1, after.len); +	size_t bufsz = u8title(nullptr, 0, U8_ARGS(before), cf); +	if (bufsz != after.len) { +		warn("case %d: expected titlecased buffer size of %zu but got %zu " +		     "(flags=‘%.*s’)", +		     id, after.len, bufsz, U8_PRI_ARGS(flags)); +		return false; +	} + +	bufsz = u8title(buf, bufsz, U8_ARGS(before), cf); +	if (bufsz != after.len) { +		warn("case %d: expected titlecased length of %zu but got %zu " +		     "(flags=‘%.*s’)", +		     id, after.len, bufsz, U8_PRI_ARGS(flags)); +		return false; +	} + +	if (!memeq(buf, after.p, bufsz)) { +		warn("case %d: expected ‘%.*s’ but got ‘%.*s’ (flags=‘%.*s’)", id, +		     U8_PRI_ARGS(after), (int)bufsz, buf, U8_PRI_ARGS(flags)); +		return false; +	} + +	free(buf); +	return true; +} |