/* Lexer: converts a buffer of UTF-8 source text into an array of lexemes. */
#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include "errors.h"
#include "lexer.h"
#include "unicode.h"
/* Forward declaration: lexstring() calls skip_comment() before its definition
   further down in this file. */
static bool skip_comment(const unsigned char **, const char *);
struct lexeme *
lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
{
struct {
struct lexeme *buf;
size_t len, cap;
} data = {.cap = 1024};
if ((data.buf = malloc(data.cap)) == NULL)
err("malloc:");
#if ORYX_SIMD
if (!utf8_validate_simd(code, codesz)) {
#endif
size_t loc = utf8_validate_off(code, codesz);
if (loc != 0) {
err("Invalid byte ‘0x%02" PRIx8 "’ in UTF-8 input at byte %zu",
code[loc - 1], loc);
}
#if ORYX_SIMD
}
#endif
const unsigned char *start = code, *end = start + codesz;
while (code < end) {
struct lexeme l;
const unsigned char *spnbeg = code, *spnend;
rune ch = utf8_decode(&code);
switch (ch) {
/* Single-byte literals */
case '&': case '(': case ')': case '*':
case '+': case '-': case ':': case '=':
case ';': case '{': case '|': case '}':
case '~':
l.kind = ch;
break;
/* Single- or double-byte literals */
case '/':
if (code < end && code[0] == '*') {
if (!skip_comment(&code, end))
err("Unterminated comment at byte %td", code - start);
continue;
}
l.kind = ch;
break;
case '<':
case '>':
l.kind = ch;
/* See the comment in lexer.h for where 193 comes from */
if (code < end && code[0] == ch) {
code++;
l.kind += 193;
}
break;
default:
if (!rune_is_xids(ch))
continue;
l.kind = LEXIDENT;
l.p = spnbeg;
spnend = code;
while (code < end && rune_is_xidc(ch)) {
spnend = code;
ch = utf8_decode(&code);
}
if (code < end)
code = spnend;
l.len = spnend - spnbeg;
}
if (data.len == data.cap) {
data.cap *= 2;
if ((data.buf = realloc(data.buf, data.cap)) == NULL)
err("realloc:");
}
data.buf[data.len++] = l;
}
*lcnt = data.len;
return data.buf;
}
/*
 * Skip a (possibly nested) block comment.
 *
 * On entry *PTR must point at the '*' of the opening slash-star and END at
 * one past the last readable byte.  Every further slash-star found inside
 * the comment opens another nesting level, and every star-slash closes one.
 *
 * Returns true and moves *PTR just past the star-slash that closes the
 * outermost level.  Returns false and leaves *PTR untouched if END is
 * reached while the comment is still open.
 */
bool
skip_comment(const unsigned char **ptr, const char *end)
{
	/* Use unsigned char cursors throughout so no implicit conversion
	   between differently-signed char pointers is needed. */
	const unsigned char *stop = (const unsigned char *)end;
	const unsigned char *p = *ptr + 1; /* step over the opening '*' */
	int depth = 1;

	/* Both delimiters are two bytes wide, so stop once fewer remain */
	while (stop - p >= 2) {
		if (p[0] == '*' && p[1] == '/') {
			p += 2;
			if (--depth == 0) {
				*ptr = p;
				return true;
			}
		} else if (p[0] == '/' && p[1] == '*') {
			p += 2;
			depth++;
		} else {
			p++;
		}
	}

	return false;
}