diff options
-rw-r--r-- | lib/unicode/string/u8lower.c | 6 | ||||
-rw-r--r-- | test/data/LowercaseTest | 35 | ||||
-rwxr-xr-x | test/gen-test-data | 1 | ||||
-rw-r--r-- | test/lower-test.c | 91 |
4 files changed, 131 insertions, 2 deletions
diff --git a/lib/unicode/string/u8lower.c b/lib/unicode/string/u8lower.c index f59cf39..63fdae4 100644 --- a/lib/unicode/string/u8lower.c +++ b/lib/unicode/string/u8lower.c @@ -26,7 +26,9 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, struct { bool before; size_t after; - } final_sigma = {}; + } final_sigma = { + .before = true, + }; n = before_dot_cnt = more_above_cnt = 0; @@ -57,7 +59,7 @@ u8lower(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn, } if (final_sigma.after == 0) { - rune ch; + rune ch = 0; struct u8view cpy = {src, srcn}; do diff --git a/test/data/LowercaseTest b/test/data/LowercaseTest new file mode 100644 index 0000000..0deb9c4 --- /dev/null +++ b/test/data/LowercaseTest @@ -0,0 +1,35 @@ +# Empty input +;; + +# Latin alphabet +LOREM IPSUM DOLOR SIT AMET, CONSECTETUR ADIPISCING ELIT.;lorem ipsum dolor sit amet, consectetur adipiscing elit.; + +# Greek alphabet; handle sigma properly +Σ;ς; +ΤΟ ΓΡΆΜΜΑ ΣΊΓΜΑ ΈΧΕΙ ΔΎΟ ΠΕΖΟΎΣ ΤΎΠΟΥΣ;το γράμμα σίγμα έχει δύο πεζούς τύπους; + +# Cyrillic alphabet +СЛАВА УКРАЇНІ ПРОТИ РОСІЙСЬКОЇ АГРЕСІЇ!;слава україні проти російської агресії!; + +# In lithuanian we need to retain the dot above ‘i’ and ‘j’ when there’s an +# accent above the uppercased variant. Also test with both single-codepoint +# variants (i.e. U+00CC LATIN CAPITAL I WITH GRAVE) and variants that use +# combining-characters. +Į̃;į̃; +Į̃;į̇̃;LT +J́;j́; +J́;j̇́;LT +Į̃J́;į̃j́; +Į̃J́;į̇̃j̇́;LT +RÀSTI, MÈSTI, KÌLO;ràsti, mèsti, kìlo; +RÀSTI, MÈSTI, KÌLO;ràsti, mèsti, ki̇̀lo;LT +RÀSTI, MÈSTI, KÌLO;ràsti, mèsti, kìlo; +RÀSTI, MÈSTI, KÌLO;ràsti, mèsti, ki̇̀lo;LT + +# Add U+0307 COMBINING DOT ABOVE after ‘i’ when lowercasing ‘İ’ in non-Azeri and +# -Turkish locales +İSTANBUL’LUYUM;i̇stanbul’luyum; +İSTANBUL’LUYUM;istanbul’luyum;AZ + +# Uncased language +안녕하세요, 월드!;안녕하세요, 월드!; diff --git a/test/gen-test-data b/test/gen-test-data index 2f9e5a2..50028b8 100755 --- a/test/gen-test-data +++ b/test/gen-test-data @@ -12,6 +12,7 @@ download() download 'auxiliary/GraphemeBreakTest.txt' download 'auxiliary/WordBreakTest.txt' +grep '^[^#]' data/LowercaseTest >lower.in grep '^[^#]' data/UppercaseTest >upper.in sed -En 's/\s+//g; s/÷?#.*//g; /./p' data/GraphemeBreakTest >gnext.in sed -En 's/\s+//g; s/÷?#.*//g; /./p' data/WordBreakTest >wnext.in diff --git a/test/lower-test.c b/test/lower-test.c new file mode 100644 index 0000000..f6069ba --- /dev/null +++ b/test/lower-test.c @@ -0,0 +1,91 @@ +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <alloc.h> +#include <errors.h> +#include <macros.h> +#include <mbstring.h> +#include <unicode/string.h> + +#define TESTFILE "lower.in" + +static bool test(const char8_t *, int); + +int +main(int, char **argv) +{ + int rv; + size_t n; + ssize_t nr; + char *line; + FILE *fp; + + rv = EXIT_SUCCESS; + line = nullptr; + mlib_setprogname(argv[0]); + + if ((fp = fopen(TESTFILE, "r")) == nullptr) + err("fopen: %s:", TESTFILE); + + for (int id = 1; (nr = getline(&line, &n, fp)) > 0; id++) { + if (line[nr - 1] == '\n') + line[--nr] = '\0'; + + if (!test(line, id)) + rv = EXIT_FAILURE; + } + if (ferror(fp)) + err("getline: %s:", TESTFILE); + + free(line); + fclose(fp); + return rv; +} + +bool +test(const char8_t *line, int id) +{ + struct u8view before, after, flags; + before.p = line; + after.p = strchr(line, ';') + 1; + before.len = after.p - before.p - 1; + flags.p = strchr(after.p, ';') + 1; + after.len = flags.p - after.p - 1; + flags.len = strlen(flags.p); + + enum caseflags cf = 0; + if (u8eq(U8_ARGS(flags), U8_ARGS(U8("ẞ")))) + cf |= CF_ẞ; + else if (u8eq(U8_ARGS(flags), U8_ARGS(U8("AZ")))) + cf |= CF_LANG_AZ; + else if (u8eq(U8_ARGS(flags), U8_ARGS(U8("LT")))) + cf |= CF_LANG_LT; + + char8_t *buf = bufalloc(nullptr, 1, after.len); + size_t bufsz = u8lower(nullptr, 0, U8_ARGS(before), cf); + if (bufsz != after.len) { + warn("case %d: expected lowercased buffer size of %zu but got %zu " + "(flags=‘%.*s’)", + id, after.len, bufsz, U8_PRI_ARGS(flags)); + return false; + } + + bufsz = u8lower(buf, bufsz, U8_ARGS(before), cf); + if (bufsz != after.len) { + warn("case %d: expected lowercased length of %zu but got %zu " + "(flags=‘%.*s’)", + id, after.len, bufsz, U8_PRI_ARGS(flags)); + return false; + } + + if (!memeq(buf, after.p, bufsz)) { + warn("case %d: expected ‘%.*s’ but got ‘%.*s’ (flags=‘%.*s’)", id, + U8_PRI_ARGS(after), (int)bufsz, buf, U8_PRI_ARGS(flags)); + return false; + } + + free(buf); + return true; +} |