diff options
author | Thomas Voss <mail@thomasvoss.com> | 2024-05-04 12:31:27 +0200 |
---|---|---|
committer | Thomas Voss <mail@thomasvoss.com> | 2024-05-04 12:31:27 +0200 |
commit | 2d5d218072575ed19ce7429a0b1a2e601f0c1346 (patch) | |
tree | ed5a78cdc552d944ad43fea2e99306e66f4483ac | |
parent | de5f416d60f7331b6c86b97ffe5c18176791780f (diff) |
Add tests for u8casefold()
-rw-r--r-- | test/casefold-test.c | 3 | ||||
-rw-r--r-- | test/data/CasefoldTest | 55 | ||||
-rwxr-xr-x | test/run-tests | 1 |
3 files changed, 59 insertions, 0 deletions
diff --git a/test/casefold-test.c b/test/casefold-test.c new file mode 100644 index 0000000..bc3789b --- /dev/null +++ b/test/casefold-test.c @@ -0,0 +1,3 @@ +#define CASETYPE casefold +#define CASETYPE_VERB casefolded +#include "_case-test.h" diff --git a/test/data/CasefoldTest b/test/data/CasefoldTest new file mode 100644 index 0000000..92c9b44 --- /dev/null +++ b/test/data/CasefoldTest @@ -0,0 +1,55 @@ +# Empty input +;; + +# Latin alphabet +LOREM IPSUM DOLOR SIT AMET, CONSECTETUR ADIPISCING ELIT.;lorem ipsum dolor sit amet, consectetur adipiscing elit.; + +# Greek alphabet; when casefolding we don’t use ‘ς’ +Σ;σ; +ς;σ; +ΤΟ ΓΡΆΜΜΑ ΣΊΓΜΑ ΈΧΕΙ ΔΎΟ ΠΕΖΟΎΣ ΤΎΠΟΥΣ;το γράμμα σίγμα έχει δύο πεζούσ τύπουσ; + +# Cyrillic alphabet +СЛАВА УКРАЇНІ ПРОТИ РОСІЙСЬКОЇ АГРЕСІЇ!;слава україні проти російської агресії!; + +# Croatian has 3 cases +LJUDEVIT GAJ;ljudevit gaj; +Ljudevit Gaj;ljudevit gaj; + +# Ignore the Lithuanian case completely +Į̃;į̃; +Į̃;į̃;LT +J́;j́; +J́;j́;LT +Į̃J́;į̃j́; +Į̃J́;į̃j́;LT +RÀSTI, MÈSTI, KÌLO;ràsti, mèsti, kìlo; +RÀSTI, MÈSTI, KÌLO;ràsti, mèsti, kìlo;LT +RÀSTI, MÈSTI, KÌLO;ràsti, mèsti, kìlo; +RÀSTI, MÈSTI, KÌLO;ràsti, mèsti, kìlo;LT + +# Azeri/Turkish ‘ı’ and ‘i’ are different letters +I;i; +I;ı;AZ + +# Add U+0307 COMBINING DOT ABOVE after ‘i’ when lowercasing ‘İ’ in +# non-Azeri and -Turkish locales +İSTANBUL’LUYUM;i̇stanbul’luyum; +İSTANBUL’LUYUM;istanbul’luyum;AZ + +# Composite characters should be expanded, including +# U+00DF LATIN SMALL LETTER SHARP S for some reason… +FLUẞ;fluss; +fluß;fluss; +Waffle;waffle; +stab;stab; + +# …but not U+0133 LATIN SMALL LIGATURE IJ or the capital variant? 
+ijssel;ijssel; +IJSSEL;ijssel; + +# In Cherokee we want to uppercase our strings +ꭳꮝꮣ ꮢꭿᏸᏹꮧꮲ;ᎣᏍᏓ ᏒᎯᏰᏱᏗᏢ; + +# Uncased language +안녕하세요, 월드!;안녕하세요, 월드!; diff --git a/test/run-tests b/test/run-tests index 82d3e00..f4c53c4 100755 --- a/test/run-tests +++ b/test/run-tests @@ -23,6 +23,7 @@ readonly FLAGS=' download 'auxiliary/GraphemeBreakTest.txt' download 'auxiliary/WordBreakTest.txt' +grep '^[^#]' data/CasefoldTest >casefold.in grep '^[^#]' data/LowercaseTest >lower.in grep '^[^#]' data/TitlecaseTest >title.in grep '^[^#]' data/UppercaseTest >upper.in |