1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
|
#include "unicode.h"
/* Branchless UTF-8 decoding and validation by Christopher Wellons.
You can find the original source with comments at
https://github.com/skeeto/branchless-utf8. */
/* Sequence length selected by the top five bits of the lead byte
   (s[0] >> 3). A length of 0 marks a byte that cannot start a sequence. */
static const char lengths[] = {
	/* 00000-01111: 0xxxxxxx */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 10000-10111: 10xxxxxx */
	0, 0, 0, 0, 0, 0, 0, 0,
	/* 11000-11011: 110xxxxx */
	2, 2, 2, 2,
	/* 11100-11101: 1110xxxx */
	3, 3,
	/* 11110: 11110xxx */
	4,
	/* 11111: 11111xxx */
	0
};

/* The tables below are all indexed by the sequence length (0 to 4). */

/* Lower bound for the decoded value; anything smaller sets an error bit in
   utf8_validate_off. Entry 0 is larger than any value that can be assembled
   for an invalid lead byte, so that case always fails. */
static const rune mins[] = {
	RUNE_C(4194304),
	0,
	128,
	2048,
	RUNE_C(65536)
};

/* Bits of the lead byte that are kept as payload. */
static const int masks[] = {
	0x00,
	0x7f,
	0x1f,
	0x0f,
	0x07
};

/* Right shift applied to the assembled value to drop the bit positions that
   belong to bytes beyond the sequence. */
static const int shiftc[] = {
	0,
	18,
	12,
	6,
	0
};

/* Right shift applied to the error bits to drop the continuation-byte checks
   for bytes beyond the sequence. */
static const int shifte[] = {
	0,
	6,
	4,
	2,
	0
};
/*
 * Decode the UTF-8 sequence at *buf and advance *buf past it.
 *
 * The sequence length is derived from the lead byte alone. A byte that cannot
 * start a sequence advances *buf by exactly one byte. Nothing is validated
 * here, so the returned value is only meaningful for well-formed input (see
 * utf8_validate_off).
 *
 * The four bytes (*buf)[0] through (*buf)[3] are always loaded, whatever the
 * sequence length, so the caller must make sure all four are readable.
 */
rune
utf8_decode(const char **buf)
{
	/* Work on unsigned bytes: plain char may be signed, and the conversion
	   between the two pointer types needs an explicit cast. */
	const unsigned char *s = (const unsigned char *)*buf;
	int len = lengths[s[0] >> 3];

	/* !len is 1 only for an invalid lead byte, so *buf always advances. */
	*buf = (const char *)(s + len + !len);

	/* Assemble as if the sequence were four bytes long ... */
	rune c = (rune)(s[0] & masks[len]) << 18;
	c |= (rune)(s[1] & 0x3f) << 12;
	c |= (rune)(s[2] & 0x3f) << 6;
	c |= (rune)(s[3] & 0x3f) << 0;

	/* ... then shift out the positions of the bytes that are not part of
	   the sequence. */
	return c >> shiftc[len];
}
/*
 * Check that the len bytes starting at buf are well-formed UTF-8.
 *
 * Returns 0 when every sequence is valid. Otherwise returns the 1-based byte
 * offset of the first byte of the first invalid sequence.
 *
 * Only bytes inside [buf, buf + len) are read. When fewer than four bytes
 * remain they are validated from a zero-padded copy, so a sequence cut short
 * by the end of the range is reported as invalid rather than being completed
 * from whatever happens to follow the buffer.
 */
size_t
utf8_validate_off(const char *buf, size_t len)
{
	const unsigned char *const start = (const unsigned char *)buf;
	const unsigned char *const end = start + len;
	const unsigned char *cur = start;
	unsigned char tail[4];

	while (cur < end) {
		const unsigned char *s = cur;
		size_t left = (size_t)(end - cur);

		/* The checks below always load four bytes. Near the end of the
		   range, stage what is left in a zero-padded scratch buffer; a
		   zero byte is never a valid continuation byte. */
		if (left < 4) {
			for (size_t i = 0; i < 4; i++)
				tail[i] = i < left ? cur[i] : 0;
			s = tail;
		}

		int n = lengths[s[0] >> 3];

		/* Assemble the value as if four bytes long, then drop the
		   positions of the bytes that are not part of the sequence. */
		rune c = (rune)(s[0] & masks[n]) << 18;
		c |= (rune)(s[1] & 0x3f) << 12;
		c |= (rune)(s[2] & 0x3f) << 6;
		c |= (rune)(s[3] & 0x3f) << 0;
		c >>= shiftc[n];

		/* Accumulate error bits:
		   bit 6    value below the minimum for this length
		   bit 7    value in 0xD800-0xDFFF
		   bit 8    value above 0x10FFFF
		   bits 5-0 top two bits of s[1], s[2] and s[3] */
		int e = (c < mins[n]) << 6;
		e |= ((c >> 11) == 0x1B) << 7;
		e |= (c > 0x10FFFF) << 8;
		e |= (s[1] & 0xC0) >> 2;
		e |= (s[2] & 0xC0) >> 4;
		e |= (s[3]) >> 6;
		/* Each continuation byte must start with binary 10; the XOR
		   clears those bit pairs exactly when that holds. */
		e ^= 0x2A;
		/* Ignore the checks for bytes beyond the sequence. */
		e >>= shifte[n];

		if (e != 0)
			return (size_t)(cur - start) + 1;

		/* n is 0 only for an invalid lead byte, which has already been
		   rejected above; !n keeps the step non-zero regardless. */
		cur += n + !n;
	}
	return 0;
}
|