aboutsummaryrefslogtreecommitdiff
path: root/gen
diff options
context:
space:
mode:
authorThomas Voss <mail@thomasvoss.com> 2024-04-21 19:46:29 +0200
committerThomas Voss <mail@thomasvoss.com> 2024-04-21 19:46:29 +0200
commita04d1334a968649b1da36eb640d5d9d35eb3f29d (patch)
tree46f2d89c3d40953942eaf70c75a44fe6a69c9e71 /gen
parent5b14562e05457d96a6524b5aa2e533e69cf30fb2 (diff)
Add uprop_get_wb()
Diffstat (limited to 'gen')
-rwxr-xr-xgen/data-files1
-rwxr-xr-xgen/prop/wb86
2 files changed, 87 insertions, 0 deletions
diff --git a/gen/data-files b/gen/data-files
index 00b5f1c..ba795cd 100755
--- a/gen/data-files
+++ b/gen/data-files
@@ -8,6 +8,7 @@ readonly BASE='https://www.unicode.org/Public/15.1.0/ucd'
readonly PATHS='
auxiliary/GraphemeBreakProperty
+ auxiliary/WordBreakProperty
BidiBrackets
BidiMirroring
Blocks
diff --git a/gen/prop/wb b/gen/prop/wb
new file mode 100755
index 0000000..a6b47f2
--- /dev/null
+++ b/gen/prop/wb
@@ -0,0 +1,86 @@
+#!/bin/sh
+
+set -e
+cd "${0%/*}/../.."
+exec >lib/unicode/prop/uprop_get_wb.c
+
+gawk '
+BEGIN {
+ FS = " *(; *|#.*)"
+
+ map["ALetter"] = "LE"
+ map["CR"] = "CR"
+ map["Double_Quote"] = "DQ"
+ map["E_Base"] = "EB"
+ map["E_Base_GAZ"] = "EBG"
+ map["E_Modifier"] = "EM"
+ map["Extend"] = "EXTEND"
+ map["ExtendNumLet"] = "EX"
+ map["Format"] = "FO"
+ map["Glue_After_Zwj"] = "GAZ"
+ map["Hebrew_Letter"] = "HL"
+ map["Katakana"] = "KA"
+ map["LF"] = "LF"
+ map["MidLetter"] = "ML"
+ map["MidNumLet"] = "MB"
+ map["MidNum"] = "MN"
+ map["Newline"] = "NL"
+ map["Numeric"] = "NU"
+ map["Other"] = "XX"
+ map["Regional_Indicator"] = "RI"
+ map["Single_Quote"] = "SQ"
+ map["WSegSpace"] = "WSEGSPACE"
+ map["ZWJ"] = "ZWJ"
+
+ print "/* This file is autogenerated by gen/prop/wb; DO NOT EDIT. */"
+ print ""
+ print "#include \"_bsearch.h\""
+ print "#include \"macros.h\""
+ print "#include \"rune.h\""
+ print "#include \"unicode/prop.h\""
+ print ""
+}
+
+/^[A-F0-9]/ {
+ n = split($1, a, /\.\./)
+ lo = strtonum("0X" a[1])
+ hi = strtonum("0X" a[n])
+
+ for (i = lo; i <= hi; i++)
+ props[i] = "WB_" map[$2]
+}
+
+END {
+ print "static constexpr enum uprop_wb lookup_lat1[] = {"
+ for (i = 0; i < 0x100; i++) {
+ if (i % 4 == 0)
+ printf "\t"
+ printf "%-13s%s", (props[i] ? props[i] : "WB_XX") ",", \
+ i % 4 == 3 ? "\n" : " "
+ }
+ print "};"
+ print ""
+ print "static const struct {"
+ print "\trune lo, hi;"
+ print "\tenum uprop_wb val;"
+ print "} lookup[] = {"
+
+ for (i = 0x100; i <= 0x10FFFF; i++) {
+ if (!props[i])
+ continue
+ for (lo = i; props[lo] == props[i + 1]; i++)
+ ;
+ printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[i]
+ }
+
+ print "};"
+ print ""
+ print "_MLIB_DEFINE_BSEARCH(enum uprop_wb, lookup, WB_XX)"
+ print ""
+ print "enum uprop_wb"
+ print "uprop_get_wb(rune ch)"
+ print "{"
+ print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup(ch);"
+ print "}"
+}
+' data/WordBreakProperty | sed 's/\s*$//'