1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
|
#!/bin/sh
# Regenerates include/unicode/_wbrk.h from the Unicode data files in data/.
set -e
# Work from the directory two levels above the one containing this script.
# dirname is used rather than "${0%/*}": when $0 contains no slash (for
# example when invoked as "sh wbrk") that expansion yields the script name
# itself, and the cd would then fail.
cd "$(dirname "$0")/../.."
# From here on, everything written to stdout lands in the generated header.
exec >include/unicode/_wbrk.h
gawk '
# Configure field splitting, build the property-name table, and emit the
# fixed preamble of the generated header (include guard, enum, table opener).
BEGIN {
	# $1 is the code point or range, $2 the property name.  The separator
	# also matches a trailing "#..." comment, so comments never reach $2.
	FS = " *(; *|#.*)"

	# Property names as spelled in the data files, each paired with the
	# suffix of the corresponding WBRK_* enumerator.
	names = "ALetter=LE CR=CR Double_Quote=DQ E_Base=EB E_Base_GAZ=EBG " \
		"E_Modifier=EM Extended_Pictographic=EXTPICT Extend=EXTEND " \
		"ExtendNumLet=EX Format=FO Glue_After_Zwj=GAZ Hebrew_Letter=HL " \
		"Katakana=KA LF=LF MidLetter=ML MidNumLet=MB MidNum=MN " \
		"Newline=NL Numeric=NU Other=XX Regional_Indicator=RI " \
		"Single_Quote=SQ WSegSpace=WSEGSPACE ZWJ=ZWJ"
	npairs = split(names, pairs, " ")
	for (k = 1; k <= npairs; k++) {
		split(pairs[k], kv, "=")
		map[kv[1]] = kv[2]
	}

	print "/* This file is autogenerated by gen/string/wbrk; DO NOT EDIT. */\n" \
	      "\n" \
	      "#ifndef MLIB_UNICODE__WBRK_H\n" \
	      "#define MLIB_UNICODE__WBRK_H\n" \
	      "\n" \
	      "#include <inttypes.h>\n" \
	      "\n" \
	      "#include \"rune.h\"\n" \
	      "\n" \
	      "enum wbrk_prop : uint_least8_t {"

	# Enumerators in declaration order as "SUFFIX:description".  Only the
	# first one carries an explicit value.
	enums = "XX:Other;" \
		"CR:CR;" \
		"DQ:Double Quote;" \
		"EB:E Base;" \
		"EBG:E Base GAZ;" \
		"EM:E Modifier;" \
		"EOT:End of Text;" \
		"EX:ExtendNumLet;" \
		"EXTEND:Extend;" \
		"EXTPICT:Extended Pictographic;" \
		"EXTPICT_LE:Extended Pictographic and ALetter;" \
		"FO:Format;" \
		"GAZ:Glue After Zwj;" \
		"HL:Hebrew Letter;" \
		"KA:Katakana;" \
		"LE:ALetter;" \
		"LF:LF;" \
		"MB:MidNumLet;" \
		"ML:MidLetter;" \
		"MN:MidNum;" \
		"NL:Newline;" \
		"NU:Numeric;" \
		"RI:Regional Indicator;" \
		"SQ:Single Quote;" \
		"WSEGSPACE:WSegSpace;" \
		"ZWJ:ZWJ"
	nenums = split(enums, ents, ";")
	for (k = 1; k <= nenums; k++) {
		split(ents[k], kv, ":")
		printf "\tWBRK_%s%s, /* %s */\n", kv[1], (k == 1 ? " = 0" : ""), kv[2]
	}

	print "};\n" \
	      "\n" \
	      "const struct {\n" \
	      "\trune lo, hi;\n" \
	      "\tenum wbrk_prop val;\n" \
	      "} wbrk_lookup[] = {"
}
# Data lines start with a hexadecimal code point, optionally followed by
# "..HHHH" to form a range.  Remember the enumerator for every code point
# covered by the line; lines naming a property absent from map are skipped.
/^[A-F0-9]/ {
	if (!($2 in map))
		next
	tag = "WBRK_" map[$2]
	cnt = split($1, ends, /\.\./)
	first = strtonum("0X" ends[1])
	# For a single code point cnt is 1, so first and last coincide.
	last = strtonum("0X" ends[cnt])
	for (cp = first; cp <= last; cp++) {
		# A code point already recorded as ALetter that is also
		# Extended_Pictographic gets the combined enumerator; in every
		# other case the newest property replaces the previous one.
		props[cp] = (tag == "WBRK_EXTPICT" && props[cp] == "WBRK_LE") \
			? "WBRK_EXTPICT_LE" : tag
	}
}
# After all input: walk the code points in ascending order, merge runs of
# consecutive code points that share a property into one table row, then
# close the table and the include guard.
END {
	for (i = 0; i <= 0x10FFFF; i++) {
		# Test membership with "in": merely referencing props[i] would
		# create an empty element for every unassigned code point.
		if (!(i in props))
			continue
		# Extend the run while the next code point has the same property.
		for (lo = i; ((i + 1) in props) && props[i] == props[i + 1]; i++)
			;
		printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[i]
	}
	print "};"
	print ""
	print "#endif /* !MLIB_UNICODE__WBRK_H */"
}
' data/WordBreakProperty data/emoji-data | sed 's/[[:space:]]*$//'
|