aboutsummaryrefslogtreecommitdiff
path: root/src/lexer.c
diff options
context:
space:
mode:
author Thomas Voss <mail@thomasvoss.com> 2024-06-08 12:58:04 +0200
committer Thomas Voss <mail@thomasvoss.com> 2024-06-08 12:58:04 +0200
commit baea074f524676a889043779c80fb17cdd38f30d (patch)
tree 4609684954145d092ed684ac38e94b8b04ecd79a /src/lexer.c
parent 4f698ec642547534bac2b37f96de045dd828fd58 (diff)
Do some lexer work
Diffstat (limited to 'src/lexer.c')
-rw-r--r-- src/lexer.c 112
1 files changed, 102 insertions, 10 deletions
diff --git a/src/lexer.c b/src/lexer.c
index 970202a..663c8dd 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -1,34 +1,126 @@
#include <inttypes.h>
+#include <stdbool.h>
#include <stddef.h>
-#include <stdio.h>
+#include <stdlib.h>
#include "errors.h"
#include "lexer.h"
#include "unicode.h"
+static bool skip_comment(const unsigned char **, const char *);
+
+/*
+ * Split the UTF-8 source text in code (codesz bytes long) into lexemes.
+ *
+ * The input is validated first and err() is called on the first invalid
+ * byte.  Punctuation listed in the switch below becomes a lexeme whose kind
+ * is the character itself (doubled ‘<’ or ‘>’ gets the character plus 193),
+ * identifiers become LEXIDENT lexemes whose p/len describe a span inside
+ * code, block comments are skipped, and every other rune that cannot start
+ * an identifier is silently dropped.
+ *
+ * Returns a malloc()ed array of lexemes and stores its length in *lcnt.
+ * Identifier lexemes point into code rather than owning a copy.
+ * NOTE(review): err() is assumed not to return; confirm in errors.h.
+ */
struct lexeme *
-lexstring(const char *code, size_t codesz, size_t *lcnt)
+lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
{
	struct {
-		struct lexeme *p;
-		size_t len, buf;
-	} data = {0};
+		struct lexeme *buf;
+		size_t len, cap;
+	} data = {.cap = 1024};
+	/* len and cap count lexemes, so the byte size must be scaled */
+	if ((data.buf = malloc(data.cap * sizeof(*data.buf))) == NULL)
+		err("malloc:");
#if ORYX_SIMD
	if (!utf8_validate_simd(code, codesz)) {
#endif
-		size_t off = utf8_validate_off(code, codesz);
-		if (off != 0)
-			err("Invalid UTF-8 at byte-offset %zu", off - 1);
+		size_t loc = utf8_validate_off(code, codesz);
+		if (loc != 0) {
+			err("Invalid byte ‘0x%02" PRIx8 "’ in UTF-8 input at byte %zu",
+			    code[loc - 1], loc);
+		}
#if ORYX_SIMD
	}
#endif
-	const char *end = code + codesz;
+	const unsigned char *start = code, *end = start + codesz;
	while (code < end) {
+		struct lexeme l;
+		const unsigned char *spnbeg = code, *spnend;
		rune ch = utf8_decode(&code);
+
+		switch (ch) {
+		/* Single-byte literals */
+		case '&': case '(': case ')': case '*':
+		case '+': case '-': case ':': case '=':
+		case ';': case '{': case '|': case '}':
+		case '~':
+			l.kind = ch;
+			break;
+
+		/* Single- or double-byte literals */
+		case '/':
+			if (code < end && code[0] == '*') {
+				/* skip_comment() takes its end pointer as const char * */
+				if (!skip_comment(&code, (const char *)end))
+					err("Unterminated comment at byte %td", code - start);
+				continue;
+			}
+
+			l.kind = ch;
+			break;
+
+		case '<':
+		case '>':
+			l.kind = ch;
+
+			/* See the comment in lexer.h for where 193 comes from */
+			if (code < end && code[0] == ch) {
+				code++;
+				l.kind += 193;
+			}
+			break;
+
+		default:
+			if (!rune_is_xids(ch))
+				continue;
+
+			l.kind = LEXIDENT;
+			l.p = spnbeg;
+
+			/* spnend always sits just past the last rune accepted into
+			   the identifier, so the length is right even when the
+			   identifier runs up to the very end of the input */
+			spnend = code;
+			while (code < end) {
+				ch = utf8_decode(&code);
+				if (!rune_is_xidc(ch)) {
+					/* Not part of the identifier; rewind so that the
+					   outer loop gets to lex this rune itself */
+					code = spnend;
+					break;
+				}
+				spnend = code;
+			}
+
+			l.len = spnend - spnbeg;
+		}
+
+		if (data.len == data.cap) {
+			data.cap *= 2;
+			data.buf = realloc(data.buf, data.cap * sizeof(*data.buf));
+			if (data.buf == NULL)
+				err("realloc:");
+		}
+
+		data.buf[data.len++] = l;
	}
	*lcnt = data.len;
-	return data.p;
+	return data.buf;
+}
+
+/*
+ * Skip over a block comment, honouring nesting.
+ *
+ * On entry *ptr points at the ‘*’ of the opening delimiter (the caller has
+ * already consumed the ‘/’), and end points one past the last input byte.
+ * Returns true and advances *ptr to the byte following the matching closing
+ * delimiter, or returns false and leaves *ptr untouched if the input ends
+ * before every open comment has been closed.
+ */
+bool
+skip_comment(const unsigned char **ptr, const char *end)
+{
+	/* Scan with the same pointer type as the caller's cursor so that no
+	   implicit conversion between char and unsigned char pointers occurs */
+	const unsigned char *p = *ptr + 1;
+	const unsigned char *stop = (const unsigned char *)end;
+	int depth = 1;
+
+	/* Both delimiters are two bytes wide, so a lone trailing byte can
+	   never open or close anything */
+	while (stop - p >= 2) {
+		if (p[0] == '*' && p[1] == '/') {
+			p += 2;
+			if (--depth == 0) {
+				*ptr = p;
+				return true;
+			}
+		} else if (p[0] == '/' && p[1] == '*') {
+			p += 2;
+			depth++;
+		} else
+			p++;
+	}
+
+	return false;
+}