aboutsummaryrefslogtreecommitdiff
path: root/gen/string/scale
blob: df8e53a6f4c4de439ca98d6b0f0fde1429cebf50 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/bin/sh

# Usage: scale -v utf=X -v mapping=X [-v az=X] [-v lt=X]
# Example: scale -v utf=8 -v mapping=title -v lt=1

# NOTE: Titlecasing is a combination of titlecasing the first rune in a word,
#       and lowercasing all the other runes in the word.  This means that for
#       the titlecase scale factor, you actually need MAX(TS, LS) where TS is
#       the titlecase scale factor and LS is the lowercase scale factor.

set -e

# Run from the directory two levels above the one containing this script so
# that the relative data/ path handed to gawk resolves regardless of the
# caller's working directory.  dirname is used instead of ${0%/*} because the
# latter leaves a slash-less $0 (e.g. `sh scale`) unchanged, which turns the
# target into "scale/../.." and makes the cd fail.  CDPATH is cleared so a
# relative directory is only ever resolved against the current directory.
CDPATH= cd -- "$(dirname -- "$0")/../.."

gawk "$@" '
# bcnt(x): number of bytes needed to encode a single code point.
#
#   x    code point written as bare hexadecimal digits (e.g. "00DF")
#
# The encoding is selected by the global utf: 32 always takes 4 bytes, 16
# takes 2 bytes below 0x10000 and 4 otherwise, and every other value is
# treated as UTF-8 (1 to 4 bytes).
function bcnt(x)
{
	x = strtonum("0X" x)
	if (utf == 32)
		return 4
	if (utf == 16)
		return x < 0x10000 ? 2 : 4
	# NOTE: Keep the final alternative a bare number.  awk has no /* */
	#       comments; text between slashes is a regular expression that
	#       is matched against $0, and its 0/1 result would be
	#       concatenated onto the 4.
	return x < 0x00080 ? 1 \
	     : x < 0x00800 ? 2 \
	     : x < 0x10000 ? 3 \
	     :               4
}

# max(x, y): the greater of the two arguments; y when neither is greater.
function max(x, y)
{
	if (x > y)
		return x
	return y
}

# Set the field separator (a semicolon with optional spaces on either side)
# and translate the requested mapping into the index of the field that holds
# it: lower is field 2, title is field 3 and upper is field 4.
BEGIN {
	FS = " *; *"
	if (mapping == "lower")
		field = 2
	else if (mapping == "title")
		field = 3
	else if (mapping == "upper")
		field = 4
	else {
		# With field left unset, $field would mean $0 and the whole
		# record would be measured, silently yielding a bogus factor.
		# The END rule still runs after exit, but the non-zero status
		# tells the caller that the output must not be used.
		printf "scale: invalid mapping \"%s\" (expected lower, title or upper)\n",
		       mapping > "/dev/stderr"
		exit 1
	}
}

# Drop records whose fifth field begins with a language tag unless the
# matching option is truthy: az enables both "az" and "tr" records, and lt
# enables "lt" records.
{
	if (!az && $5 ~ /^(az|tr)/)
		next
	if (!lt && $5 ~ /^lt/)
		next
}

# For each record that starts with an uppercase hexadecimal digit, store the
# ratio between the encoded size of the space-separated code points in
# $field and the encoded size of the source code point in $1.
/^[A-F0-9]/ {
	srclen = bcnt($1)
	dstlen = 0
	cnt = split($field, cps, / /)
	for (k = 1; k <= cnt; k++)
		dstlen += bcnt(cps[k])
	results[g_i++] = dstlen / srclen
}

# Print the largest ratio collected in results.
END {
	# Entries are stored through results[g_i++], so the valid indices
	# are 0 .. g_i - 1; starting at 1 would skip the first record and
	# read one element that was never stored.
	for (i = 0; i < g_i; i++)
		n = max(n, results[i])
	print n
}
' data/SpecialCasing