1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
|
#include "common.h"
#include "types.h"
#include "unicode-data.h"
#include "unicode.h"
/*
 * RUNE_IS_GEN -- emit a predicate `bool fn(rune ch)` backed by a two-stage
 * bit table.
 *
 *   stg1   maps the block number (ch / blksz) to a row index into stg2
 *   stg2   holds one bitmap row per distinct block; bit (ch % blksz) of the
 *          row is set when the predicate holds for ch
 *   blksz  number of code points covered by one block
 *
 * The generated function performs no range check on ch; the caller must pass
 * a value whose block number is a valid index into stg1.
 */
#define RUNE_IS_GEN(fn, stg1, stg2, blksz)                                     \
	bool fn(rune ch)                                                           \
	{                                                                          \
		/* Position of ch inside its block. */                                 \
		unsigned bitpos = ch % (blksz);                                        \
		/* Byte bitpos/8 of the row, bit bitpos%8 within that byte. */         \
		return (stg2)[(stg1)[ch / (blksz)]][bitpos >> 3]                       \
		     & (1 << (bitpos & 7));                                            \
	}

/* Presumably the Unicode XID_Start / XID_Continue properties, judging by the
   table names -- confirm against the generator of unicode-data.h. */
RUNE_IS_GEN(rune_is_xids, xids_stage1, xids_stage2, 128)
RUNE_IS_GEN(rune_is_xidc, xidc_stage1, xidc_stage2, 128)
/*
* This is free and unencumbered software released into the public domain.
*
* Anyone is free to copy, modify, publish, use, compile, sell, or
* distribute this software, either in source code form or as a compiled
* binary, for any purpose, commercial or non-commercial, and by any
* means.
*
* In jurisdictions that recognize copyright laws, the author or authors
* of this software dedicate any and all copyright interest in the
* software to the public domain. We make this dedication for the benefit
* of the public at large and to the detriment of our heirs and
* successors. We intend this dedication to be an overt act of
* relinquishment in perpetuity of all present and future rights to this
* software under copyright law.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
* For more information, please refer to <http://unlicense.org/>
*
* Source: https://github.com/skeeto/branchless-utf8
*/
/* Sequence length in bytes, indexed by the top five bits of the lead byte
   (s[0] >> 3).  A value of 0 marks a byte that cannot start a sequence:
   0x80-0xBF (indices 16-23) and 0xF8-0xFF (index 31). */
static const char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
/* Smallest code point that a sequence of the given length may encode; a
   decoded value below this is flagged as an error by the validator.  The
   entry for length 0 is larger than any value the decoder can assemble, so an
   invalid lead byte always trips this check. */
static const rune mins[] = {RUNE_C(4194304), 0, 128, 2048, RUNE_C(65536)};
/* Mask selecting the payload bits of the lead byte, indexed by length. */
static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
/* The decoder always assembles the value as if four bytes were present; this
   is the right shift that discards the bits contributed by bytes beyond the
   actual sequence length. */
static const int shiftc[] = {0, 18, 12, 6, 0};
/* Right shift applied to the accumulated error bits so that the
   continuation-byte checks for bytes beyond the sequence length are
   dropped. */
static const int shifte[] = {0, 6, 4, 2, 0};
/*
 * Decode the UTF-8 sequence at *buf and advance *buf past it.
 *
 * The sequence length is taken from the lead byte alone.  A lead byte that
 * cannot start a sequence (table length 0) still advances *buf by one byte.
 * No validation is performed here: continuation bytes, overlong forms and
 * out-of-range values are not checked.
 *
 * Four bytes starting at *buf are always read, whatever the sequence length,
 * so the caller must make sure that (*buf)[0..3] is readable.
 */
rune
utf8_decode(const uchar **buf)
{
	const uchar *p = *buf;
	const int n = lengths[p[0] >> 3];

	/* Step over the sequence, consuming at least one byte. */
	*buf = p + (n ? n : 1);

	/* Assemble as though all four bytes belonged to the sequence ... */
	rune ch = ((rune)(p[0] & masks[n]) << 18)
	        | ((rune)(p[1] & 0x3f) << 12)
	        | ((rune)(p[2] & 0x3f) << 6)
	        | ((rune)(p[3] & 0x3f) << 0);

	/* ... then drop the bits that came from bytes past the real length. */
	return ch >> shiftc[n];
}
size_t
utf8_validate_off(const uchar *s, size_t len)
{
const uchar *start = s, *end = start + len;
while (likely(s < end)) {
int len = lengths[s[0] >> 3];
const uchar *next = s + len + !len;
rune c = (rune)(s[0] & masks[len]) << 18;
c |= (rune)(s[1] & 0x3f) << 12;
c |= (rune)(s[2] & 0x3f) << 6;
c |= (rune)(s[3] & 0x3f) << 0;
c >>= shiftc[len];
int e = (c < mins[len]) << 6;
e |= ((c >> 11) == 0x1B) << 7;
e |= (c > 0x10FFFF) << 8;
e |= (s[1] & 0xC0) >> 2;
e |= (s[2] & 0xC0) >> 4;
e |= (s[3]) >> 6;
e ^= 0x2A;
e >>= shifte[len];
if (unlikely(e != 0))
return s - start + 1;
s = next;
}
return 0;
}
|