diff options
author | Thomas Voss <mail@thomasvoss.com> | 2024-05-05 12:59:50 +0200 |
---|---|---|
committer | Thomas Voss <mail@thomasvoss.com> | 2024-05-05 12:59:50 +0200 |
commit | d34cd53842098bd84d7d6aa775225c66854306da (patch) | |
tree | bdd9fdb12ed378b51d52d47817a4b909da2fba65 /.local/bin | |
parent | 55b4077aff0cc27cca00aa9b3309abf67185296c (diff) |
Make the Unicode data more useful (include U+XXXX)
Diffstat (limited to '.local/bin')
-rwxr-xr-x | .local/bin/uni | 19 |
1 file changed, 11 insertions, 8 deletions
```diff
diff --git a/.local/bin/uni b/.local/bin/uni
index 66b9950..72f9b70 100755
--- a/.local/bin/uni
+++ b/.local/bin/uni
@@ -5,14 +5,17 @@ set -e NOTIFY_SHORT `basename $args[0]
 
 func setup {
     curl 'https://www.unicode.org/Public/UNIDATA/UnicodeData.txt'
-    | sed -E '
-        s/;[^;]*//2g
-        s/\<(.)([A-Z]*)/\1\L\2/2g
-        /^[^;]*;</d
-        /Compatibility/d
-        /Variation Selector/d
-        s/[^;]*/\\u&/
-        s/^\\u([^;]{5})/\\U000\1/
+    | awk '
+        BEGIN { FS = ";" }
+        $2 == "<control>" { $2 = $11 }
+        $2 !~ /(First|Last)>$/ {
+            while (match($2, /[A-Z]{2,}([^)]|$)/)) {
+                car = substr($2, RSTART, 1)
+                cdr = substr($2, RSTART + 1, RLENGTH - 1)
+                sub(/[A-Z]{2,}([^)]|$)/, car tolower(cdr), $2)
+            }
+            printf "\\u%s;U+%s %s\n", $1, $1, $2
+        }
     ' >$DATA
 }
 
```