blob: 8b18d4598135158b11dacd2f44afd08f8b91d2de (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
|
#ifndef ORYX_UNICODE_H
#define ORYX_UNICODE_H
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#define RUNE_C(x) UINT32_C(x)
typedef uint32_t rune;
/* Assert that CH has the Unicode Pattern_White_Space property */
bool rune_is_pat_ws(rune ch);
/* Assert that CH has the Unicode XID_Start or XID_Continue property. The
rune_is_xids() function extends XID_Start to include U+005F LOW LINE, and the
rune_is_xidc() function extends XID_Continue to include U+2032 PRIME,
U+2033 DOUBLE PRIME, U+2034 TRIPLE PRIME, and U+2057 QUADRUPLE PRIME. */
bool rune_is_xids(rune ch);
bool rune_is_xidc(rune ch);
/* Decode the first UTF-8 rune in S, and point S to the next rune in the stream.
This function assumes that S points to a buffer that’s padded to a length of
4 bytes, and doesn’t perform any form of input validation. */
rune utf8_decode(const unsigned char **s);
/* Return the offset of the first invalid byte in the UTF-8 string S of length
LEN. This function assumes that S points to a buffer that’s padded to a
length of 4 bytes (although LEN should not reflect any added padding). */
size_t utf8_validate_off(const unsigned char *s, size_t len);
#if ORYX_SIMD
/* Assert whether the UTF-8 string S of length LEN is valid, using SIMD
intristics to speed-up computation. */
bool utf8_validate_simd(const unsigned char *s, size_t len);
#endif
#endif /* !ORYX_UNICODE_H */
|