#!/bin/sh

# Generates include/unicode/_wbrk.h, a C header holding the word-break
# property enumeration and a sorted range lookup table, from the data files
# data/WordBreakProperty and data/emoji-data.
#
# Requires gawk (strtonum() and hexadecimal literals are gawk extensions).

set -e

# Run from the directory two levels above the one containing this script, so
# that the relative input and output paths below resolve.
cd "${0%/*}/../.."

# Everything written to stdout from here on goes into the generated header.
exec >include/unicode/_wbrk.h

gawk '
BEGIN {
	# Split on ";" (with optional surrounding spaces) and treat a trailing
	# "# ..." comment as a separator as well, so that $1 is the code point
	# (or range) and $2 is the property name.
	FS = " *(; *|#.*)"

	# Property name as spelled in the data files -> WBRK_* enumerator suffix.
	# Properties that are not listed here are ignored.
	map["ALetter"] = "LE"
	map["CR"] = "CR"
	map["Double_Quote"] = "DQ"
	map["E_Base"] = "EB"
	map["E_Base_GAZ"] = "EBG"
	map["E_Modifier"] = "EM"
	map["Extended_Pictographic"] = "EXTPICT"
	map["Extend"] = "EXTEND"
	map["ExtendNumLet"] = "EX"
	map["Format"] = "FO"
	map["Glue_After_Zwj"] = "GAZ"
	map["Hebrew_Letter"] = "HL"
	map["Katakana"] = "KA"
	map["LF"] = "LF"
	map["MidLetter"] = "ML"
	map["MidNumLet"] = "MB"
	map["MidNum"] = "MN"
	map["Newline"] = "NL"
	map["Numeric"] = "NU"
	map["Other"] = "XX"
	map["Regional_Indicator"] = "RI"
	map["Single_Quote"] = "SQ"
	map["WSegSpace"] = "WSEGSPACE"
	map["ZWJ"] = "ZWJ"

	print "/* This file is autogenerated by gen/string/wbrk; DO NOT EDIT. */"
	print ""
	print "#ifndef MLIB_UNICODE__WBRK_H"
	print "#define MLIB_UNICODE__WBRK_H"
	print ""
	# uint_least8_t, used as the underlying type of the enum below, is
	# declared by <stdint.h>.
	print "#include <stdint.h>"
	print ""
	print "#include \"rune.h\""
	print ""
	print "enum wbrk_prop : uint_least8_t {"
	print "\tWBRK_XX = 0, /* Other */"
	print "\tWBRK_CR, /* CR */"
	print "\tWBRK_DQ, /* Double Quote */"
	print "\tWBRK_EB, /* E Base */"
	print "\tWBRK_EBG, /* E Base GAZ */"
	print "\tWBRK_EM, /* E Modifier */"
	print "\tWBRK_EOT, /* End of Text */"
	print "\tWBRK_EX, /* ExtendNumLet */"
	print "\tWBRK_EXTEND, /* Extend */"
	print "\tWBRK_EXTPICT, /* Extended Pictographic */"
	print "\tWBRK_EXTPICT_LE, /* Extended Pictographic and ALetter */"
	print "\tWBRK_FO, /* Format */"
	print "\tWBRK_GAZ, /* Glue After Zwj */"
	print "\tWBRK_HL, /* Hebrew Letter */"
	print "\tWBRK_KA, /* Katakana */"
	print "\tWBRK_LE, /* ALetter */"
	print "\tWBRK_LF, /* LF */"
	print "\tWBRK_MB, /* MidNumLet */"
	print "\tWBRK_ML, /* MidLetter */"
	print "\tWBRK_MN, /* MidNum */"
	print "\tWBRK_NL, /* Newline */"
	print "\tWBRK_NU, /* Numeric */"
	print "\tWBRK_RI, /* Regional Indicator */"
	print "\tWBRK_SQ, /* Single Quote */"
	print "\tWBRK_WSEGSPACE, /* WSegSpace */"
	print "\tWBRK_ZWJ, /* ZWJ */"
	print "};"
	print ""
	print "const struct {"
	print "\trune lo, hi;"
	print "\tenum wbrk_prop val;"
	print "} wbrk_lookup[] = {"
}

# Data lines start with an uppercase hexadecimal code point; anything else
# (comments, blank lines) is skipped.
/^[A-F0-9]/ {
	if (map[$2] == "")
		next

	# $1 is either a single code point or a "LO..HI" range; for a single
	# code point split() yields one element, so lo and hi coincide.
	n = split($1, a, /\.\./)
	lo = strtonum("0X" a[1])
	hi = strtonum("0X" a[n])

	base = "WBRK_" map[$2]
	for (i = lo; i <= hi; i++) {
		s = base
		# A code point already recorded as ALetter that is also listed as
		# Extended_Pictographic gets the combined property.
		if (props[i] == "WBRK_LE" && s == "WBRK_EXTPICT")
			s = "WBRK_EXTPICT_LE"
		props[i] = s
	}
}

END {
	# Walk every code point in order and emit one table row per maximal run
	# of consecutive code points that share the same property.
	for (i = 0; i <= 0x10FFFF; i++) {
		if (!props[i])
			continue
		# Advance i to the last code point of the run that starts at lo.
		for (lo = i; props[i] == props[i + 1]; i++)
			;
		printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[i]
	}

	print "};"
	print ""
	print "#endif /* !MLIB_UNICODE__WBRK_H */"
}
' data/WordBreakProperty data/emoji-data \
	| sed 's/[[:space:]]*$//'  # strip trailing whitespace from generated lines