aboutsummaryrefslogtreecommitdiff
path: root/src/lexer.c
blob: 663c8dd608253974f29d0c9b5d3bf9479b142163 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

#include "errors.h"
#include "lexer.h"
#include "unicode.h"

/* Forward declaration: skip_comment() is defined after lexstring(), which
   calls it.  The first argument is updated in place through the pointer; the
   second marks the end of the input. */
static bool skip_comment(const unsigned char **, const char *);

struct lexeme *
lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
{
	struct {
		struct lexeme *buf;
		size_t len, cap;
	} data = {.cap = 1024};
	if ((data.buf = malloc(data.cap)) == NULL)
		err("malloc:");

#if ORYX_SIMD
	if (!utf8_validate_simd(code, codesz)) {
#endif
		size_t loc = utf8_validate_off(code, codesz);
		if (loc != 0) {
			err("Invalid byte ‘0x%02" PRIx8 "’ in UTF-8 input at byte %zu",
			    code[loc - 1], loc);
		}
#if ORYX_SIMD
	}
#endif

	const unsigned char *start = code, *end = start + codesz;
	while (code < end) {
		struct lexeme l;
		const unsigned char *spnbeg = code, *spnend;
		rune ch = utf8_decode(&code);

		switch (ch) {
		/* Single-byte literals */
		case '&': case '(': case ')': case '*':
		case '+': case '-': case ':': case '=':
		case ';': case '{': case '|': case '}':
		case '~':
			l.kind = ch;
			break;

		/* Single- or double-byte literals */
		case '/':
			if (code < end && code[0] == '*') {
				if (!skip_comment(&code, end))
					err("Unterminated comment at byte %td", code - start);
				continue;
			}

			l.kind = ch;
			break;

		case '<':
		case '>':
			l.kind = ch;

			/* See the comment in lexer.h for where 193 comes from */
			if (code < end && code[0] == ch) {
				code++;
				l.kind += 193;
			}
			break;

		default:
			if (!rune_is_xids(ch))
				continue;

			l.kind = LEXIDENT;
			l.p = spnbeg;

			spnend = code;
			while (code < end && rune_is_xidc(ch)) {
				spnend = code;
				ch = utf8_decode(&code);
			}
			if (code < end)
				code = spnend;

			l.len = spnend - spnbeg;
		}

		if (data.len == data.cap) {
			data.cap *= 2;
			if ((data.buf = realloc(data.buf, data.cap)) == NULL)
				err("realloc:");
		}

		data.buf[data.len++] = l;
	}

	*lcnt = data.len;
	return data.buf;
}

/* Skip over a block comment, honouring nested comments.
 *
 * On entry *PTR points at the star of the opening slash-star pair (the slash
 * has already been consumed by the caller) and END points one past the last
 * byte of the input.  The opening star is stepped over before scanning, so it
 * cannot double as the star of a closing pair.
 *
 * Every further slash-star pair raises the nesting depth and every star-slash
 * pair lowers it.  When the depth reaches zero, *PTR is set to the byte just
 * after the closing pair and true is returned.  If the input ends first,
 * *PTR is left untouched and false is returned.
 */
bool
skip_comment(const unsigned char **ptr, const char *end)
{
	/* Work on unsigned bytes throughout; END only has a different pointee
	   signedness, so the cast is a pure reinterpretation */
	const unsigned char *p = *ptr;
	const unsigned char *stop = (const unsigned char *)end;
	size_t depth = 1;

	/* Nothing to step over: avoid forming a pointer beyond one-past-the-end */
	if (stop - p < 1)
		return false;

	/* A delimiter is two bytes wide, so stop once fewer than two remain */
	for (p++; stop - p >= 2; p++) {
		if (p[0] == '*' && p[1] == '/') {
			p++;
			if (--depth == 0) {
				*ptr = p + 1;
				return true;
			}
		} else if (p[0] == '/' && p[1] == '*') {
			p++;
			depth++;
		}
	}

	return false;
}