#!/bin/sh
#
# gen/string/wbrk -- generate include/unicode/_wbrk.h.
#
# Reads data/WordBreakProperty and data/emoji-data and emits a C header
# containing `enum wbrk_prop` and `wbrk_lookup[]`, a table of inclusive
# code point ranges (in ascending order) with the property of each range.
#
# Provenance: added by commit c0a983a29af17415ef29058d72f1a9cd99ddd83f
# ("Fix various bugs in word segmentation", Thomas Voss, 2024-04-22).

set -e
cd "${0%/*}/../.."

out=include/unicode/_wbrk.h
tmp=$out.tmp.$$

# Generate into a temporary file first: a pipeline only reports the exit
# status of its last command, so piping gawk straight into sed would hide a
# gawk failure and leave a truncated header behind.
trap 'rm -f "$tmp"' EXIT
trap 'exit 1' HUP INT TERM

gawk '
BEGIN {
	# Fields are separated by ";" (with optional surrounding spaces); a
	# trailing "# ..." comment is swallowed by the separator as well.  This
	# leaves the code point (or range) in $1 and the property name in $2.
	FS = " *(; *|#.*)"

	# Property name in the data files -> suffix of the WBRK_* enumerator.
	map["ALetter"]               = "LE"
	map["CR"]                    = "CR"
	map["Double_Quote"]          = "DQ"
	map["E_Base"]                = "EB"
	map["E_Base_GAZ"]            = "EBG"
	map["E_Modifier"]            = "EM"
	map["Extended_Pictographic"] = "EXTPICT"
	map["Extend"]                = "EXTEND"
	map["ExtendNumLet"]          = "EX"
	map["Format"]                = "FO"
	map["Glue_After_Zwj"]        = "GAZ"
	map["Hebrew_Letter"]         = "HL"
	map["Katakana"]              = "KA"
	map["LF"]                    = "LF"
	map["MidLetter"]             = "ML"
	map["MidNumLet"]             = "MB"
	map["MidNum"]                = "MN"
	map["Newline"]               = "NL"
	map["Numeric"]               = "NU"
	map["Other"]                 = "XX"
	map["Regional_Indicator"]    = "RI"
	map["Single_Quote"]          = "SQ"
	map["WSegSpace"]             = "WSEGSPACE"
	map["ZWJ"]                   = "ZWJ"

	print "/* This file is autogenerated by gen/string/wbrk; DO NOT EDIT. */"
	print ""
	print "#ifndef MLIB_UNICODE__WBRK_H"
	print "#define MLIB_UNICODE__WBRK_H"
	print ""
	# uint_least8_t, the underlying type of the enum below, is declared
	# in <stdint.h>.
	print "#include <stdint.h>"
	print ""
	print "#include \"rune.h\""
	print ""
	print "enum wbrk_prop : uint_least8_t {"
	print "\tWBRK_XX = 0, /* Other */"
	print "\tWBRK_CR, /* CR */"
	print "\tWBRK_DQ, /* Double Quote */"
	print "\tWBRK_EB, /* E Base */"
	print "\tWBRK_EBG, /* E Base GAZ */"
	print "\tWBRK_EM, /* E Modifier */"
	print "\tWBRK_EOT, /* End of Text */"
	print "\tWBRK_EX, /* ExtendNumLet */"
	print "\tWBRK_EXTEND, /* Extend */"
	print "\tWBRK_EXTPICT, /* Extended Pictographic */"
	print "\tWBRK_EXTPICT_LE, /* Extended Pictographic and ALetter */"
	print "\tWBRK_FO, /* Format */"
	print "\tWBRK_GAZ, /* Glue After Zwj */"
	print "\tWBRK_HL, /* Hebrew Letter */"
	print "\tWBRK_KA, /* Katakana */"
	print "\tWBRK_LE, /* ALetter */"
	print "\tWBRK_LF, /* LF */"
	print "\tWBRK_MB, /* MidNumLet */"
	print "\tWBRK_ML, /* MidLetter */"
	print "\tWBRK_MN, /* MidNum */"
	print "\tWBRK_NL, /* Newline */"
	print "\tWBRK_NU, /* Numeric */"
	print "\tWBRK_RI, /* Regional Indicator */"
	print "\tWBRK_SQ, /* Single Quote */"
	print "\tWBRK_WSEGSPACE, /* WSegSpace */"
	print "\tWBRK_ZWJ, /* ZWJ */"
	print "};"
	print ""
	print "const struct {"
	print "\trune lo, hi;"
	print "\tenum wbrk_prop val;"
	print "} wbrk_lookup[] = {"
}

# Data lines start with a hexadecimal code point: "XXXX" or "XXXX..YYYY".
/^[A-F0-9]/ {
	# Skip properties that have no WBRK_* counterpart.
	if (!($2 in map))
		next

	# For a single code point n is 1, so lo and hi are both a[1].
	n = split($1, a, /\.\./)
	lo = strtonum("0X" a[1])
	hi = strtonum("0X" a[n])
	s = "WBRK_" map[$2]

	for (i = lo; i <= hi; i++) {
		# A code point already recorded as ALetter that is also
		# Extended_Pictographic gets the combined value.  This relies on
		# data/WordBreakProperty being read before data/emoji-data.
		if (s == "WBRK_EXTPICT" && props[i] == "WBRK_LE")
			props[i] = "WBRK_EXTPICT_LE"
		else
			props[i] = s
	}
}

END {
	# Merge runs of consecutive code points that share a property into one
	# {lo, hi, property} entry.  Membership tests (in) are used rather
	# than reading props[...] directly, because reading a missing element
	# would create it.
	for (i = 0; i <= 0x10FFFF; i++) {
		if (!(i in props))
			continue
		lo = i
		while (((i + 1) in props) && props[i + 1] == props[lo])
			i++
		printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[i]
	}

	print "};"
	print ""
	print "#endif /* !MLIB_UNICODE__WBRK_H */"
}
' data/WordBreakProperty data/emoji-data >"$tmp"

# Strip trailing whitespace.  [[:space:]] is the POSIX spelling; \s is a
# GNU sed extension.
sed 's/[[:space:]]*$//' "$tmp" >"$out"