aboutsummaryrefslogtreecommitdiff
path: root/gen/string
diff options
context:
space:
mode:
author: Thomas Voss <mail@thomasvoss.com> 2024-04-22 21:06:52 +0200
committer: Thomas Voss <mail@thomasvoss.com> 2024-04-22 21:08:09 +0200
commit: c0a983a29af17415ef29058d72f1a9cd99ddd83f (patch)
tree: 0c77ccd6905491ab7c39c6c386ba0721e6f0d4e6 /gen/string
parent: ff14e4801643f8c69b5d31e183bfb71943ee519f (diff)
Fix various bugs in word segmentation
Diffstat (limited to 'gen/string')
-rwxr-xr-x gen/string/wbrk 109
1 files changed, 109 insertions, 0 deletions
diff --git a/gen/string/wbrk b/gen/string/wbrk
new file mode 100755
index 0000000..1966356
--- /dev/null
+++ b/gen/string/wbrk
@@ -0,0 +1,109 @@
+#!/bin/sh
+
+# Abort on the first failing command, change to the directory two levels above
+# the one containing this script, and redirect everything subsequently written
+# to standard output into include/unicode/_wbrk.h.
+set -o errexit
+script_dir=${0%/*}
+cd "$script_dir/../.."
+exec 1>include/unicode/_wbrk.h
+
+gawk '
+# Configure field splitting, build the property-name table and print the fixed
+# preamble of the generated header: the include guard, the includes, the
+# wbrk_prop enumeration and the opening line of the wbrk_lookup table.
+BEGIN {
+	# A semicolon with optional surrounding spaces separates fields; a "#"
+	# comment together with the spaces before it also counts as a separator.
+	# The main rule reads the code point or range from $1 and the property
+	# name from $2.
+	FS = " *(; *|#.*)"
+
+	# Property name as spelled in the data files -> WBRK_ enumerator suffix.
+	# Records whose property is not listed here are skipped by the main rule.
+	map["ALetter"] = "LE"
+	map["CR"] = "CR"
+	map["Double_Quote"] = "DQ"
+	map["E_Base"] = "EB"
+	map["E_Base_GAZ"] = "EBG"
+	map["E_Modifier"] = "EM"
+	map["Extended_Pictographic"] = "EXTPICT"
+	map["Extend"] = "EXTEND"
+	map["ExtendNumLet"] = "EX"
+	map["Format"] = "FO"
+	map["Glue_After_Zwj"] = "GAZ"
+	map["Hebrew_Letter"] = "HL"
+	map["Katakana"] = "KA"
+	map["LF"] = "LF"
+	map["MidLetter"] = "ML"
+	map["MidNumLet"] = "MB"
+	map["MidNum"] = "MN"
+	map["Newline"] = "NL"
+	map["Numeric"] = "NU"
+	map["Other"] = "XX"
+	map["Regional_Indicator"] = "RI"
+	map["Single_Quote"] = "SQ"
+	map["WSegSpace"] = "WSEGSPACE"
+	map["ZWJ"] = "ZWJ"
+
+	print "/* This file is autogenerated by gen/string/wbrk; DO NOT EDIT. */"
+	print ""
+	print "#ifndef MLIB_UNICODE__WBRK_H"
+	print "#define MLIB_UNICODE__WBRK_H"
+	print ""
+	print "#include <inttypes.h>"
+	print ""
+	print "#include \"rune.h\""
+	print ""
+
+	# WBRK_XX is pinned to 0.  WBRK_EOT and WBRK_EXTPICT_LE have no entry in
+	# the map above; the latter is assigned by the main rule to code points
+	# that are both ALetter and Extended_Pictographic.
+	print "enum wbrk_prop : uint_least8_t {"
+	print "\tWBRK_XX = 0, /* Other */"
+	print "\tWBRK_CR, /* CR */"
+	print "\tWBRK_DQ, /* Double Quote */"
+	print "\tWBRK_EB, /* E Base */"
+	print "\tWBRK_EBG, /* E Base GAZ */"
+	print "\tWBRK_EM, /* E Modifier */"
+	print "\tWBRK_EOT, /* End of Text */"
+	print "\tWBRK_EX, /* ExtendNumLet */"
+	print "\tWBRK_EXTEND, /* Extend */"
+	print "\tWBRK_EXTPICT, /* Extended Pictographic */"
+	print "\tWBRK_EXTPICT_LE, /* Extended Pictographic and ALetter */"
+	print "\tWBRK_FO, /* Format */"
+	print "\tWBRK_GAZ, /* Glue After Zwj */"
+	print "\tWBRK_HL, /* Hebrew Letter */"
+	print "\tWBRK_KA, /* Katakana */"
+	print "\tWBRK_LE, /* ALetter */"
+	print "\tWBRK_LF, /* LF */"
+	print "\tWBRK_MB, /* MidNumLet */"
+	print "\tWBRK_ML, /* MidLetter */"
+	print "\tWBRK_MN, /* MidNum */"
+	print "\tWBRK_NL, /* Newline */"
+	print "\tWBRK_NU, /* Numeric */"
+	print "\tWBRK_RI, /* Regional Indicator */"
+	print "\tWBRK_SQ, /* Single Quote */"
+	print "\tWBRK_WSEGSPACE, /* WSegSpace */"
+	print "\tWBRK_ZWJ, /* ZWJ */"
+	print "};"
+	print ""
+
+	# The table is defined in the header itself, so give it internal linkage:
+	# a plain "const" object has external linkage in C and would clash at
+	# link time if the header were included from two translation units.
+	print "static const struct {"
+	print "\trune lo, hi;"
+	print "\tenum wbrk_prop val;"
+	print "} wbrk_lookup[] = {"
+}
+
+# Data records begin with an uppercase hexadecimal digit.  $1 holds either a
+# single code point or a "lo..hi" range and $2 holds the property name.
+/^[A-F0-9]/ {
+	# Skip properties that have no WBRK_ mapping.
+	if (!($2 in map))
+		next
+
+	# A lone code point splits into one piece, so both bounds coincide.
+	pieces = split($1, bounds, /\.\./)
+	first = strtonum("0X" bounds[1])
+	last = strtonum("0X" bounds[pieces])
+	name = "WBRK_" map[$2]
+
+	for (cp = first; cp <= last; cp++) {
+		# A code point already recorded as ALetter that is now seen as
+		# Extended_Pictographic receives the combined property instead.
+		if (name == "WBRK_EXTPICT" && props[cp] == "WBRK_LE")
+			props[cp] = "WBRK_EXTPICT_LE"
+		else
+			props[cp] = name
+	}
+}
+
+# Collapse the per-code-point properties into maximal runs of consecutive code
+# points that share a property, print one table row per run, then close the
+# table and the include guard.  The output of the whole program is piped
+# through sed to strip trailing whitespace; the POSIX [[:space:]] class is
+# used there because the \s escape is a GNU extension.
+END {
+	for (i = 0; i <= 0x10FFFF; i++) {
+		# Test membership with "in": merely referencing props[i] would
+		# create an empty element for every unassigned code point.
+		if (!(i in props))
+			continue
+
+		# Extend the run while the next code point has the same property.
+		lo = i
+		while (((i + 1) in props) && props[i + 1] == props[lo])
+			i++
+
+		printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[i]
+	}
+
+	print "};"
+	print ""
+	print "#endif /* !MLIB_UNICODE__WBRK_H */"
+}
+' data/WordBreakProperty data/emoji-data | sed 's/[[:space:]]*$//'