Do some lexer work

author: Thomas Voss <mail@thomasvoss.com> 2024-06-08 12:58:04 +0200
committer: Thomas Voss <mail@thomasvoss.com> 2024-06-08 12:58:04 +0200
commit: baea074f524676a889043779c80fb17cdd38f30d (patch)
tree: 4609684954145d092ed684ac38e94b8b04ecd79a /src/lexer.c
parent: 4f698ec642547534bac2b37f96de045dd828fd58 (diff)
1 files changed, 102 insertions, 10 deletions
diff --git a/src/lexer.c b/src/lexer.c
index 970202a..663c8dd 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -1,34 +1,126 @@
 #include <inttypes.h>
+#include <stdbool.h>
 #include <stddef.h>
-#include <stdio.h>
+#include <stdlib.h>
 
 #include "errors.h"
 #include "lexer.h"
 #include "unicode.h"
 
+static bool skip_comment(const unsigned char **, const char *);
+
 struct lexeme *
-lexstring(const char *code, size_t codesz, size_t *lcnt)
+lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
 {
 	struct {
-		struct lexeme *p;
-		size_t len, buf;
-	} data = {0};
+		struct lexeme *buf;
+		size_t len, cap;
+	} data = {.cap = 1024};
+	if ((data.buf = malloc(data.cap)) == NULL)
+		err("malloc:");
 
 #if ORYX_SIMD
 	if (!utf8_validate_simd(code, codesz)) {
 #endif
-		size_t off = utf8_validate_off(code, codesz);
-		if (off != 0)
-			err("Invalid UTF-8 at byte-offset %zu", off - 1);
+		size_t loc = utf8_validate_off(code, codesz);
+		if (loc != 0) {
+			err("Invalid byte ‘0x%02" PRIx8 "’ in UTF-8 input at byte %zu",
+			    code[loc - 1], loc);
+		}
 #if ORYX_SIMD
 	}
 #endif
 
-	const char *end = code + codesz;
+	const unsigned char *start = code, *end = start + codesz;
 	while (code < end) {
+		struct lexeme l;
+		const unsigned char *spnbeg = code, *spnend;
 		rune ch = utf8_decode(&code);
+
+		switch (ch) {
+		/* Single-byte literals */
+		case '&': case '(': case ')': case '*':
+		case '+': case '-': case ':': case '=':
+		case ';': case '{': case '|': case '}':
+		case '~':
+			l.kind = ch;
+			break;
+
+		/* Single- or double-byte literals */
+		case '/':
+			if (code < end && code[0] == '*') {
+				if (!skip_comment(&code, end))
+					err("Unterminated comment at byte %td", code - start);
+				continue;
+			}
+
+			l.kind = ch;
+			break;
+
+		case '<':
+		case '>':
+			l.kind = ch;
+
+			/* See the comment in lexer.h for where 193 comes from */
+			if (code < end && code[0] == ch) {
+				code++;
+				l.kind += 193;
+			}
+			break;
+
+		default:
+			if (!rune_is_xids(ch))
+				continue;
+
+			l.kind = LEXIDENT;
+			l.p = spnbeg;
+
+			spnend = code;
+			while (code < end && rune_is_xidc(ch)) {
+				spnend = code;
+				ch = utf8_decode(&code);
+			}
+			if (code < end)
+				code = spnend;
+
+			l.len = spnend - spnbeg;
+		}
+
+		if (data.len == data.cap) {
+			data.cap *= 2;
+			if ((data.buf = realloc(data.buf, data.cap)) == NULL)
+				err("realloc:");
+		}
+
+		data.buf[data.len++] = l;
 	}
 
 	*lcnt = data.len;
-	return data.p;
+	return data.buf;
+}
+
+/*
+ * Skip a block comment, honouring nested comments.
+ *
+ * On entry *PTR points at the '*' of the opening delimiter and END points
+ * one past the last byte of the input.  If the matching closing delimiter
+ * is found, *PTR is advanced to the byte following it and true is
+ * returned.  Otherwise false is returned and *PTR is left untouched.
+ */
+bool
+skip_comment(const unsigned char **ptr, const char *end)
+{
+	size_t depth = 1;
+	const unsigned char *p = *ptr;
+	/* END is declared as a plain char pointer; compare as unsigned bytes */
+	const unsigned char *lim = (const unsigned char *)end;
+
+	/* Step over the opening '*' first so that it cannot double as the
+	   first byte of a closing delimiter.  Each test reads two bytes, so
+	   the loop stops once fewer than two remain. */
+	for (p++; lim - p >= 2; p++) {
+		if (p[0] == '*' && p[1] == '/') {
+			p++;
+			if (--depth == 0) {
+				*ptr = p + 1;
+				return true;
+			}
+		} else if (p[0] == '/' && p[1] == '*') {
+			p++;
+			depth++;
+		}
+	}
+
+	return false;
 }
author	Thomas Voss <mail@thomasvoss.com>	2024-06-08 12:58:04 +0200
committer	Thomas Voss <mail@thomasvoss.com>	2024-06-08 12:58:04 +0200
commit	baea074f524676a889043779c80fb17cdd38f30d (patch)
tree	4609684954145d092ed684ac38e94b8b04ecd79a /src/lexer.c
parent	4f698ec642547534bac2b37f96de045dd828fd58 (diff)