diff options
author | Thomas Voss <mail@thomasvoss.com> | 2024-04-05 14:26:33 +0200 |
---|---|---|
committer | Thomas Voss <mail@thomasvoss.com> | 2024-04-05 14:26:33 +0200 |
commit | 450fd004b8f8358dc46e1bcc1bceae821f7ae158 (patch) | |
tree | c3e59514c6f6a9d5d94419f025748ce7c51a9e29 /gen | |
parent | 0abf844cd9c22623e22f462c91f380f16524c7e8 (diff) |
Add boolean properties to unicode/prop.h
Diffstat (limited to 'gen')
-rwxr-xr-x | gen/prop/bool-props | 114 | ||||
-rw-r--r-- | gen/prop/bool-props.awk | 79 |
2 files changed, 193 insertions, 0 deletions
diff --git a/gen/prop/bool-props b/gen/prop/bool-props
new file mode 100755
index 0000000..261caa6
--- /dev/null
+++ b/gen/prop/bool-props
@@ -0,0 +1,114 @@
+#!/bin/sh
+
+set -e
+cd "${0%/*}/../.."
+
+props1='
+ahex=ASCII_Hex_Digit
+bidi_c=Bidi_Control
+dash=Dash
+dep=Deprecated
+dia=Diacritic
+ext=Extender
+hex=Hex_Digit
+idbo=IDS_Binary_Operator
+id_compat_math_continue=ID_Compat_Math_Continue
+id_compat_math_start=ID_Compat_Math_Start
+ideo=Ideographic
+loe=Logical_Order_Exception
+pat_syn=Pattern_Syntax
+pcm=Prepended_Concatenation_Mark
+qmark=Quotation_Mark
+radical=Radical
+sd=Soft_Dotted
+sterm=Sentence_Terminal
+term=Terminal_Punctuation
+uideo=Unified_Ideograph
+vs=Variation_Selector
+wspace=White_Space
+'
+
+props2='
+alpha=Alphabetic
+cased=Cased
+ci=Case_Ignorable
+cwcf=Changes_When_Casefolded
+cwcm=Changes_When_Casemapped
+cwl=Changes_When_Lowercased
+cwt=Changes_When_Titlecased
+cwu=Changes_When_Uppercased
+di=Default_Ignorable_Code_Point
+gr_base=Grapheme_Base
+gr_ext=Grapheme_Extend
+idc=ID_Continue
+ids=ID_Start
+incb=Indic_Conjunct_Break
+lower=Lowercase
+math=Math
+upper=Uppercase
+xidc=XID_Continue
+xids=XID_Start
+'
+
+props3='
+ebase=Emoji_Modifier_Base
+ecomp=Emoji_Component
+emod=Emoji_Modifier
+emoji=Emoji
+epres=Emoji_Presentation
+extpic=Extended_Pictographic
+'
+
+props4='
+cwkcf=Changes_When_NFKC_Casefolded
+'
+
+props5='
+bidi_m=Bidi_Mirrored
+'
+
+manual='
+idst=IDS_Trinary_Operator
+idsu=IDS_Unary_Operator
+join_c=Join_Control
+nchar=Noncharacter_Code_Point
+pat_ws=Pattern_White_Space
+ri=Regional_Indicator
+'
+
+gen()
+{
+	local p=${1%%=*}
+	gawk -M -v prop=${1#*=} -v short=$p \
+		-f gen/prop/bool-props.awk data/$2 \
+		>lib/unicode/prop/uprop_is_${p}.c
+	printf 'DONE uprop_is_%s()\n' $p >&2
+}
+
+for prop in $props1; do gen $prop PropList.txt & done
+for prop in $props2; do gen $prop DerivedCoreProperties.txt & done
+for prop in $props3; do gen $prop emoji-data.txt & done
+for prop in $props4; do gen $prop DerivedNormalizationProps.txt & done
+for prop in $props5; do gen $prop DerivedBinaryProperties.txt & done
+
+printf '[[__nodiscard__, __unsequenced__]] bool uprop_is_%s(rune);\n' \
+	$(printf '%s\n' $props1 $props2 $props3 $props4 $props5 | cut -d= -f1) \
+| gawk '
+	/PROP PREDICATES END/ { no = 0 }
+	FILENAME != "-" && !no { print }
+	FILENAME == "-" { funcs[++i] = $0 }
+
+	/PROP PREDICATES START/ {
+		no = 1
+		asort(funcs)
+		for (i = 1; i <= length(funcs); i++)
+			print funcs[i]
+	}
+' - include/unicode/prop.h | sponge include/unicode/prop.h
+
+wait
+for prop in $manual
+do
+	shrt=${prop%%=*}
+	printf 'Function uprop_is_%s() implemented manually\n' $shrt >&2
+done
diff --git a/gen/prop/bool-props.awk b/gen/prop/bool-props.awk
new file mode 100644
index 0000000..d9c6299
--- /dev/null
+++ b/gen/prop/bool-props.awk
@@ -0,0 +1,79 @@
+BEGIN {
+	FS = "( *#.*| +; +)"
+
+	print "/* This file is autogenerated by gen/prop/bool-props; DO NOT EDIT. */"
+	print ""
+	print "#include \"__bsearch.h\""
+	print "#include \"bitset.h\""
+	print "#include \"rune.h\""
+	print "#include \"unicode/prop.h\""
+	print ""
+	print "/* clang-format off */"
+	print ""
+}
+
+$2 == prop || (prop == "Indic_Conjunct_Break" && $2 ~ /InCB;/) {
+	n = split($1, a, /\.\./)
+	lo = strtonum("0x" a[1])
+	hi = strtonum("0x" a[n])
+
+	for (i = lo; i <= hi; i++)
+		xs[i] = 1
+}
+
+END {
+	for (i = 0; i <= 0xFF; i++) {
+		if (xs[i])
+			mask = or(mask, lshift(1, i))
+	}
+
+	if (mask > 0) {
+		print "static constexpr bitset(bs, LATIN1_MAX) = {"
+		for (i = 0; i < 32; i++) {
+			if (i % 8 == 0)
+				printf "\t"
+			printf "0x%02X,", and(rshift(mask, 8 * i), 0xFF)
+			printf((i % 8 == 7) ? "\n" : " ")
+		}
+		print "};"
+		print ""
+	}
+
+	for (i = 0x100; i <= 0x10FFFF; i++) {
+		if (xs[i]) {
+			need_big_lookup = 1
+			break
+		}
+	}
+
+	if (need_big_lookup) {
+		print "static const struct {"
+		print "\trune lo, hi;"
+		print "} lookup[] = {"
+
+		for (i = 0x100; i <= 0x10FFFF; i++) {
+			if (!xs[i])
+				continue
+			lo = i
+			while (xs[i + 1])
+				i++
+			printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X)},\n", lo, i
+		}
+
+		print "};"
+		print ""
+		print "__MLIB_DEFINE_BSEARCH_CONTAINS(lookup)"
+		print ""
+	}
+
+	print "bool"
+	printf "uprop_is_%s(rune ch)\n", short
+	print "{"
+	if (mask > 0 && need_big_lookup)
+		print "\treturn ch <= LATIN1_MAX ? TESTBIT(bs, ch) : mlib_lookup_contains(ch);"
+	else if (need_big_lookup)
+		print "\treturn mlib_lookup_contains(ch);"
+	else
+		print "\treturn ch <= LATIN1_MAX && TESTBIT(bs, ch);"
+	print "}"
+}