blob: df8e53a6f4c4de439ca98d6b0f0fde1429cebf50 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
|
#!/bin/sh
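# Usage: scale -v utf=X -v mapping=X [-v az=X] [-v lt=X]
#   utf      encoding to measure: 32, 16, or any other value for UTF-8
#   mapping  case mapping to measure: lower, title, or upper
#   az, lt   set to a true value (e.g. 1) to include the entries whose
#            condition starts with az/tr or lt; they are skipped otherwise
# Example: scale -v utf=8 -v mapping=title -v lt=1
# NOTE: Titlecasing is a combination of titlecasing the first rune in a word
# and lowercasing all the other runes in the word.  This script only reports
# the factor for the single mapping requested, so for the titlecase scale
# factor you actually need MAX(TS, LS), where TS is the result for
# mapping=title and LS is the result for mapping=lower.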
set -e
# Change to the directory two levels above this script so that the
# relative data path given to gawk resolves.  When $0 contains no slash
# (e.g. the script was started as "sh scale"), ${0%/*} leaves it unchanged
# and the resulting path would run through the script file itself, so in
# that case assume the script lives in the current directory.
case $0 in
*/*) cd "${0%/*}/../.." ;;
*)   cd ../.. ;;
esac
gawk "$@" '
# bcnt(x): number of bytes needed to encode one code point.
#   x    code point as a hexadecimal string without prefix, e.g. "00DF"
#   utf  (global) 32 or 16 select UTF-32 or UTF-16; any other value is
#        treated as UTF-8
# Always returns a number.
function bcnt(x)
{
	x = strtonum("0X" x)
	if (utf == 32)
		return 4
	if (utf == 16)
		return x < 0x10000 ? 2 : 4
	# Awk has no C-style comments: a slash-star ... star-slash token in
	# the last branch would be parsed as a regex matched against $0 and
	# concatenated with 4, giving the string "04" or "14".
	return x < 0x00080 ? 1 \
	     : x < 0x00800 ? 2 \
	     : x < 0x10000 ? 3 \
	     :               4
}
# max(x, y): the larger of the two arguments; y when they compare equal.
function max(x, y)
{
	if (x > y)
		return x
	return y
}
# Pick the column to measure.  Records are split on semicolons with
# optional surrounding spaces; the code below reads
#   $1 = source code point, $2/$3/$4 = lower/title/upper mapping,
#   $5 = condition list (per the SpecialCasing format; when a record has
#        no condition this field holds the trailing comment instead).
BEGIN {
	FS = " *; *"
	if (mapping == "lower")
		field = 2
	else if (mapping == "title")
		field = 3
	else if (mapping == "upper")
		field = 4
	else {
		# With field unset, $field would mean $0 and the whole record
		# would be measured, printing a meaningless factor.
		printf "scale: mapping must be lower, title or upper (got \"%s\")\n", \
			mapping > "/dev/stderr"
		failed = 1
		exit 1
	}
}

# Language-specific entries only count when explicitly requested.
$5 ~ /^(az|tr)/ && !az { next }
$5 ~ /^lt/ && !lt { next }

# Data records start with a hexadecimal code point.
/^[A-F0-9]/ {
	to = 0
	from = bcnt($1)
	cnt = split($field, xs, / /)
	for (i = 1; i <= cnt; i++)
		to += bcnt(xs[i])
	# Keep a running maximum so that every record, including the first
	# one, takes part in the result.
	n = max(n, to / from)
}

END {
	# exit in BEGIN still runs END; print nothing after a usage error.
	if (failed)
		exit 1
	print n
}
' data/SpecialCasing
|