aboutsummaryrefslogtreecommitdiff
path: root/gen
diff options
context:
space:
mode:
author Thomas Voss <mail@thomasvoss.com> 2024-04-05 14:26:33 +0200
committer Thomas Voss <mail@thomasvoss.com> 2024-04-05 14:26:33 +0200
commit 450fd004b8f8358dc46e1bcc1bceae821f7ae158 (patch)
tree c3e59514c6f6a9d5d94419f025748ce7c51a9e29 /gen
parent 0abf844cd9c22623e22f462c91f380f16524c7e8 (diff)
Add boolean properties to unicode/prop.h
Diffstat (limited to 'gen')
-rwxr-xr-x gen/prop/bool-props 114
-rw-r--r-- gen/prop/bool-props.awk 79
2 files changed, 193 insertions, 0 deletions
diff --git a/gen/prop/bool-props b/gen/prop/bool-props
new file mode 100755
index 0000000..261caa6
--- /dev/null
+++ b/gen/prop/bool-props
@@ -0,0 +1,114 @@
+#!/bin/sh
+
+set -e
+cd "${0%/*}/../.."
+
+props1='
+ahex=ASCII_Hex_Digit
+bidi_c=Bidi_Control
+dash=Dash
+dep=Deprecated
+dia=Diacritic
+ext=Extender
+hex=Hex_Digit
+idbo=IDS_Binary_Operator
+id_compat_math_continue=ID_Compat_Math_Continue
+id_compat_math_start=ID_Compat_Math_Start
+ideo=Ideographic
+loe=Logical_Order_Exception
+pat_syn=Pattern_Syntax
+pcm=Prepended_Concatenation_Mark
+qmark=Quotation_Mark
+radical=Radical
+sd=Soft_Dotted
+sterm=Sentence_Terminal
+term=Terminal_Punctuation
+uideo=Unified_Ideograph
+vs=Variation_Selector
+wspace=White_Space
+'
+
+props2='
+alpha=Alphabetic
+cased=Cased
+ci=Case_Ignorable
+cwcf=Changes_When_Casefolded
+cwcm=Changes_When_Casemapped
+cwl=Changes_When_Lowercased
+cwt=Changes_When_Titlecased
+cwu=Changes_When_Uppercased
+di=Default_Ignorable_Code_Point
+gr_base=Grapheme_Base
+gr_ext=Grapheme_Extend
+idc=ID_Continue
+ids=ID_Start
+incb=Indic_Conjunct_Break
+lower=Lowercase
+math=Math
+upper=Uppercase
+xidc=XID_Continue
+xids=XID_Start
+'
+
+props3='
+ebase=Emoji_Modifier_Base
+ecomp=Emoji_Component
+emod=Emoji_Modifier
+emoji=Emoji
+epres=Emoji_Presentation
+extpic=Extended_Pictographic
+'
+
+props4='
+cwkcf=Changes_When_NFKC_Casefolded
+'
+
+props5='
+bidi_m=Bidi_Mirrored
+'
+
+manual='
+idst=IDS_Trinary_Operator
+idsu=IDS_Unary_Operator
+join_c=Join_Control
+nchar=Noncharacter_Code_Point
+pat_ws=Pattern_White_Space
+ri=Regional_Indicator
+'
+
+gen()
+{
+ local p=${1%%=*}
+ gawk -M -v prop=${1#*=} -v short=$p \
+ -f gen/prop/bool-props.awk data/$2 \
+ >lib/unicode/prop/uprop_is_${p}.c
+ printf 'DONE uprop_is_%s()\n' $p >&2
+}
+
+# Start one generator job per property, all in parallel.  The PID of every
+# job is recorded because ‘set -e’ does not apply to background jobs and a
+# bare ‘wait’ always exits 0; failures are only visible when each job is
+# waited for individually.
+pids=
+for prop in $props1; do gen "$prop" PropList.txt & pids="$pids $!"; done
+for prop in $props2; do gen "$prop" DerivedCoreProperties.txt & pids="$pids $!"; done
+for prop in $props3; do gen "$prop" emoji-data.txt & pids="$pids $!"; done
+for prop in $props4; do gen "$prop" DerivedNormalizationProps.txt & pids="$pids $!"; done
+for prop in $props5; do gen "$prop" DerivedBinaryProperties.txt & pids="$pids $!"; done
+
+# While the jobs run, rewrite include/unicode/prop.h: the lines after the
+# one containing ‘PROP PREDICATES START’ and before the one containing
+# ‘PROP PREDICATES END’ are replaced by one sorted prototype per generated
+# property.  The prototypes arrive on standard input (read first, as ‘-’),
+# the header is read second; sponge buffers the result so the header can be
+# overwritten in place.
+printf '[[__nodiscard__, __unsequenced__]] bool uprop_is_%s(rune);\n' \
+	$(printf '%s\n' $props1 $props2 $props3 $props4 $props5 | cut -d= -f1) \
+| gawk '
+	/PROP PREDICATES END/ { no = 0 }
+	FILENAME != "-" && !no { print }
+	FILENAME == "-" { funcs[++i] = $0 }
+
+	/PROP PREDICATES START/ {
+		no = 1
+		asort(funcs)
+		for (i = 1; i <= length(funcs); i++)
+			print funcs[i]
+	}
+' - include/unicode/prop.h | sponge include/unicode/prop.h
+
+# Reap every generator job and fail the whole run if any of them failed.
+failed=0
+for pid in $pids
+do
+	wait "$pid" || failed=1
+done
+if [ "$failed" -ne 0 ]
+then
+	printf 'bool-props: one or more generators failed\n' >&2
+	exit 1
+fi
+
+# Remind the user which predicates are not produced by this script.
+for prop in $manual
+do
+	shrt=${prop%%=*}
+	printf 'Function uprop_is_%s() implemented manually\n' "$shrt" >&2
+done
diff --git a/gen/prop/bool-props.awk b/gen/prop/bool-props.awk
new file mode 100644
index 0000000..d9c6299
--- /dev/null
+++ b/gen/prop/bool-props.awk
@@ -0,0 +1,79 @@
+# Set up field splitting and emit the fixed preamble of the generated C file.
+BEGIN {
+	# A field separator is either a trailing ‘#’ comment (with any spaces
+	# before it) or a ‘;’ surrounded by at least one space on each side, so
+	# $1 is the code point (range) and $2 the property column.
+	FS = "( *#.*| +; +)"
+
+	print "/* This file is autogenerated by gen/prop/bool-props; DO NOT EDIT. */"
+	print ""
+
+	n_headers = split("__bsearch.h bitset.h rune.h unicode/prop.h", headers, " ")
+	for (h = 1; h <= n_headers; h++)
+		printf "#include \"%s\"\n", headers[h]
+
+	print ""
+	print "/* clang-format off */"
+	print ""
+}
+
+# Record every code point of a matching line in xs.  A line matches when its
+# second field is exactly the requested property; for Indic_Conjunct_Break
+# any line whose second field contains ‘InCB;’ matches instead.  The first
+# field is either a single hexadecimal code point or a ‘LO..HI’ range.
+$2 == prop || (prop == "Indic_Conjunct_Break" && index($2, "InCB;") > 0) {
+	nparts = split($1, parts, /\.\./)
+	first = strtonum("0x" parts[1])
+	last = strtonum("0x" parts[nparts])
+
+	for (cp = last; cp >= first; cp--)
+		xs[cp] = 1
+}
+
+# Emit the lookup tables and the uprop_is_<short>() predicate for the set of
+# code points collected in xs.
+#
+# Code points up to U+00FF are packed into a 256-bit bitset named ‘bs’; code
+# points above that are emitted in ascending order as inclusive {lo, hi}
+# ranges in ‘lookup’, which the generated code queries through
+# mlib_lookup_contains().
+#
+# Membership is tested with ‘in’ throughout: merely referencing xs[i] creates
+# the element in awk, which would fill the array with over a million empty
+# entries while scanning the code space.
+END {
+	# The mask needs 256 bits, hence the script must be run with gawk -M.
+	mask = 0
+	for (i = 0; i <= 0xFF; i++) {
+		if (i in xs)
+			mask = or(mask, lshift(1, i))
+	}
+
+	need_big_lookup = 0
+	for (cp in xs) {
+		if (cp + 0 > 0xFF) {
+			need_big_lookup = 1
+			break
+		}
+	}
+
+	# With no matching code point at all, neither table would be emitted and
+	# the predicate below would reference an undefined ‘bs’.  That almost
+	# certainly means a misspelt property or the wrong data file, so fail
+	# here instead of producing C code that does not compile.
+	if (mask == 0 && !need_big_lookup) {
+		printf "bool-props.awk: no code points found for property '%s'\n", \
+			prop > "/dev/stderr"
+		exit 1
+	}
+
+	if (mask > 0) {
+		print "static constexpr bitset(bs, LATIN1_MAX) = {"
+		# 32 bytes of 8 bits each, least significant byte first, 8 per line
+		for (i = 0; i < 32; i++) {
+			if (i % 8 == 0)
+				printf "\t"
+			printf "0x%02X,", and(rshift(mask, 8 * i), 0xFF)
+			printf((i % 8 == 7) ? "\n" : " ")
+		}
+		print "};"
+		print ""
+	}
+
+	if (need_big_lookup) {
+		print "static const struct {"
+		print "\trune lo, hi;"
+		print "} lookup[] = {"
+
+		for (i = 0x100; i <= 0x10FFFF; i++) {
+			if (!(i in xs))
+				continue
+			# Extend the range over all directly following members.
+			lo = i
+			while ((i + 1) in xs)
+				i++
+			printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X)},\n", lo, i
+		}
+
+		print "};"
+		print ""
+		print "__MLIB_DEFINE_BSEARCH_CONTAINS(lookup)"
+		print ""
+	}
+
+	print "bool"
+	printf "uprop_is_%s(rune ch)\n", short
+	print "{"
+	if (mask > 0 && need_big_lookup)
+		print "\treturn ch <= LATIN1_MAX ? TESTBIT(bs, ch) : mlib_lookup_contains(ch);"
+	else if (need_big_lookup)
+		print "\treturn mlib_lookup_contains(ch);"
+	else
+		print "\treturn ch <= LATIN1_MAX && TESTBIT(bs, ch);"
+	print "}"
+}