Switch to an SOA for lexemes

author: Thomas Voss <mail@thomasvoss.com> 2024-06-08 15:17:13 +0200
committer: Thomas Voss <mail@thomasvoss.com> 2024-06-08 15:17:13 +0200
commit: d21121e59b21c73afa3622abe256771f1e73ff69 (patch)
tree: d91acbc4fac4062cd5acda237d759e2a51047648 /src
parent: 46e676254e5121f38c5d3668855c34a4496f88ec (diff)
5 files changed, 190 insertions, 67 deletions
diff --git a/src/errors.h b/src/errors.h
index 69c8ea0..c3ccaf2 100644
--- a/src/errors.h
+++ b/src/errors.h
@@ -1,6 +1,8 @@
 #ifndef ORYX_ERRORS_H
 #define ORYX_ERRORS_H
 
-void err(const char *, ...);
+#include <stdnoreturn.h>
+
+noreturn void err(const char *, ...);
 
 #endif /* !ORYX_ERRORS_H */
diff --git a/src/lexer.c b/src/lexer.c
index 663c8dd..77e60f2 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -1,24 +1,26 @@
+#include <assert.h>
+#include <errno.h>
 #include <inttypes.h>
+#include <stdalign.h>
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "errors.h"
 #include "lexer.h"
 #include "unicode.h"
 
-static bool skip_comment(const unsigned char **, const char *);
+#define SIZE_WDTH (sizeof(size_t) * 8)
 
-struct lexeme *
-lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
-{
-	struct {
-		struct lexeme *buf;
-		size_t len, cap;
-	} data = {.cap = 1024};
-	if ((data.buf = malloc(data.cap)) == NULL)
-		err("malloc:");
+static bool skip_comment(const uchar **, const uchar *);
+
+static struct lexemes_soa mk_lexemes_soa(void);
+static void lexemes_soa_resz(struct lexemes_soa *);
 
+struct lexemes_soa
+lexstring(const uchar *code, size_t codesz)
+{
 #if ORYX_SIMD
 	if (!utf8_validate_simd(code, codesz)) {
 #endif
@@ -31,19 +33,31 @@ lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
 	}
 #endif
 
-	const unsigned char *start = code, *end = start + codesz;
+	struct lexemes_soa data = mk_lexemes_soa();
+
+	const uchar *start = code, *end = start + codesz;
 	while (code < end) {
-		struct lexeme l;
-		const unsigned char *spnbeg = code, *spnend;
+		const uchar *spnbeg = code, *spnend;
 		rune ch = utf8_decode(&code);
 
 		switch (ch) {
 		/* Single-byte literals */
-		case '&': case '(': case ')': case '*':
-		case '+': case '-': case ':': case '=':
-		case ';': case '{': case '|': case '}':
+		case '&':
+		case '(':
+		case ')':
+		case '*':
+		case '+':
+		case '-':
+		case ':':
+		case ';':
+		case '=':
+		case '[':
+		case ']':
+		case '{':
+		case '|':
+		case '}':
 		case '~':
-			l.kind = ch;
+			data.kinds[data.len++] = ch;
 			break;
 
 		/* Single- or double-byte literals */
@@ -54,17 +68,17 @@ lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
 				continue;
 			}
 
-			l.kind = ch;
+			data.kinds[data.len++] = ch;
 			break;
 
 		case '<':
 		case '>':
-			l.kind = ch;
+			data.kinds[data.len++] = ch;
 
 			/* See the comment in lexer.h for where 193 comes from */
 			if (code < end && code[0] == ch) {
 				code++;
-				l.kind += 193;
+				data.kinds[data.len - 1] += 193;
 			}
 			break;
 
@@ -72,8 +86,8 @@ lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
 			if (!rune_is_xids(ch))
 				continue;
 
-			l.kind = LEXIDENT;
-			l.p = spnbeg;
+			data.kinds[data.len] = LEXIDENT;
+			data.strs[data.len].p = spnbeg;
 
 			spnend = code;
 			while (code < end && rune_is_xidc(ch)) {
@@ -83,27 +97,21 @@ lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
 			if (code < end)
 				code = spnend;
 
-			l.len = spnend - spnbeg;
+			data.strs[data.len++].len = spnend - spnbeg;
 		}
 
-		if (data.len == data.cap) {
-			data.cap *= 2;
-			if ((data.buf = realloc(data.buf, data.cap)) == NULL)
-				err("realloc:");
-		}
-
-		data.buf[data.len++] = l;
+		if (data.len == data.cap)
+			lexemes_soa_resz(&data);
 	}
 
-	*lcnt = data.len;
-	return data.buf;
+	return data;
 }
 
 bool
-skip_comment(const unsigned char **ptr, const char *end)
+skip_comment(const uchar **ptr, const uchar *end)
 {
 	int nst = 1;
-	const char *p = *ptr;
+	const uchar *p = *ptr;
 
 	for (p++; p < end; p++) {
 		if (p + 1 < end) {
@@ -124,3 +132,60 @@ out:
 	*ptr = ++p;
 	return true;
 }
+
+struct lexemes_soa
+mk_lexemes_soa(void)
+{
+	static_assert(offsetof(struct lexemes_soa, kinds)
+	                  < offsetof(struct lexemes_soa, strs),
+	              "KINDS is not the first field before STRS");
+
+	struct lexemes_soa soa;
+	soa.len = 0;
+	soa.cap = 2048;
+
+	/* Ensure that soa.strs is properly aligned */
+	size_t pad = alignof(*soa.strs)
+	           - soa.cap * sizeof(*soa.kinds) % alignof(*soa.strs);
+	if (pad == 8)
+		pad = 0;
+
+	if ((soa.kinds = malloc(soa.cap * LEXEMES_SOA_BLKSZ + pad)) == NULL)
+		err("malloc:");
+	soa.strs = (void *)((char *)soa.kinds + soa.cap * sizeof(*soa.kinds) + pad);
+
+	return soa;
+}
+
+void
+lexemes_soa_resz(struct lexemes_soa *soa)
+{
+	static_assert(offsetof(struct lexemes_soa, kinds)
+	                  < offsetof(struct lexemes_soa, strs),
+	              "KINDS is not the first field before STRS");
+
+	size_t ncap, pad, newsz;
+	ptrdiff_t off = (char *)soa->strs - (char *)soa->kinds;
+
+	/* The capacity is always going to be a power of 2, so checking for overflow
+	   becomes pretty trivial */
+	if ((soa->cap >> (SIZE_WDTH - 1)) != 0) {
+		errno = EOVERFLOW;
+		err("lexemes_soa_resz:");
+	}
+	ncap = soa->cap << 1;
+
+	/* Ensure that soa->strs is properly aligned */
+	pad = alignof(*soa->strs)
+	    - ncap * sizeof(*soa->kinds) % alignof(*soa->strs);
+	if (pad == 8)
+		pad = 0;
+
+	newsz = ncap * LEXEMES_SOA_BLKSZ + pad;
+
+	if ((soa->kinds = realloc(soa->kinds, newsz)) == NULL)
+		err("realloc:");
+	soa->strs = (void *)((char *)soa->kinds + ncap * sizeof(*soa->kinds) + pad);
+	memmove(soa->strs, (char *)soa->kinds + off, soa->len * sizeof(*soa->strs));
+	soa->cap = ncap;
+}
diff --git a/src/lexer.h b/src/lexer.h
index efd03df..e2862bb 100644
--- a/src/lexer.h
+++ b/src/lexer.h
@@ -4,6 +4,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include "types.h"
+
 enum {
 	LEXIDENT,
 
@@ -12,12 +14,14 @@ enum {
 	LEXEQ = '=',
 	LEXLANGL = '<',
 	LEXLBRACE = '{',
+	LEXLBRKT = '[',
 	LEXLPAR = '(',
 	LEXMINUS = '-',
 	LEXPIPE = '|',
 	LEXPLUS = '+',
 	LEXRANGL = '>',
 	LEXRBRACE = '}',
+	LEXRBRKT = ']',
 	LEXRPAR = ')',
 	LEXSEMI = ';',
 	LEXSLASH = '/',
@@ -33,12 +37,16 @@ enum {
 
 typedef uint8_t lexeme_kind;
 
-struct lexeme {
-	lexeme_kind kind;
-	const char *p;
-	size_t len;
+#define LEXEMES_SOA_BLKSZ (sizeof(lexeme_kind) + sizeof(struct strview))
+
+struct lexemes_soa {
+	lexeme_kind *kinds;
+	struct strview *strs;
+	size_t len, cap;
 };
 
-struct lexeme *lexstring(const unsigned char *, size_t, size_t *);
+#define lexemes_free(x) free(x.kinds)
+
+struct lexemes_soa lexstring(const uchar *, size_t);
 
 #endif /* !ORYX_LEXER_H */
diff --git a/src/main.c b/src/main.c
index 9969401..bef6b0d 100644
--- a/src/main.c
+++ b/src/main.c
@@ -24,40 +24,75 @@ main(int argc, char **argv)
 	} file;
 	file.p = readfile(argv[1], &file.len);
 
-	struct {
-		struct lexeme *p;
-		size_t len;
-	} toks;
-	toks.p = lexstring(file.p, file.len, &toks.len);
+	struct lexemes_soa toks = lexstring(file.p, file.len);
 
 	for (size_t i = 0; i < toks.len; i++) {
-		struct lexeme l = toks.p[i];
-		switch (l.kind) {
-		case LEXIDENT:
-			printf("Identifier: ‘%.*s’\n", (int)l.len, l.p);
-			break;
-		case LEXCOLON: puts("Colon"); break;
-		case LEXLBRACE: puts("Left brace"); break;
-		case LEXLPAR: puts("Left parenthesis"); break;
-		case LEXRBRACE: puts("Right brace"); break;
-		case LEXRPAR: puts("Right parenthesis"); break;
-		case LEXSEMI: puts("Semicolon"); break;
-		case LEXAMP: puts("Ampersand"); break;
-		case LEXEQ: puts("Equals"); break;
-		case LEXLANGL: puts("Left angle bracket"); break;
-		case LEXMINUS: puts("Minus"); break;
-		case LEXPIPE: puts("Pipe"); break;
-		case LEXPLUS: puts("Plus"); break;
-		case LEXRANGL: puts("Right angle bracket"); break;
-		case LEXSLASH: puts("Slash"); break;
-		case LEXSTAR: puts("Asterisk"); break;
-		case LEXTILDE: puts("Tilde"); break;
+		switch (toks.kinds[i]) {
+		case LEXIDENT: {
+			struct strview sv = toks.strs[i];
+			printf("Identifier: ‘%.*s’\n", (int)sv.len, sv.p);
+			break;
+		}
+		case LEXAMP:
+			puts("Ampersand");
+			break;
+		case LEXCOLON:
+			puts("Colon");
+			break;
+		case LEXEQ:
+			puts("Equals");
+			break;
+		case LEXLANGL:
+			puts("Left angle bracket");
+			break;
+		case LEXLBRACE:
+			puts("Left brace");
+			break;
+		case LEXLBRKT:
+			puts("Right bracket");
+			break;
+		case LEXLPAR:
+			puts("Left parenthesis");
+			break;
+		case LEXMINUS:
+			puts("Minus");
+			break;
+		case LEXPIPE:
+			puts("Pipe");
+			break;
+		case LEXPLUS:
+			puts("Plus");
+			break;
+		case LEXRANGL:
+			puts("Right angle bracket");
+			break;
+		case LEXRBRACE:
+			puts("Right brace");
+			break;
+		case LEXRBRKT:
+			puts("Right bracket");
+			break;
+		case LEXRPAR:
+			puts("Right parenthesis");
+			break;
+		case LEXSEMI:
+			puts("Semicolon");
+			break;
+		case LEXSLASH:
+			puts("Slash");
+			break;
+		case LEXSTAR:
+			puts("Asterisk");
+			break;
+		case LEXTILDE:
+			puts("Tilde");
+			break;
 		}
 	}
 
 #if DEBUG
 	free(file.p);
-	free(toks.p);
+	lexemes_free(toks);
 #endif
 	return EXIT_SUCCESS;
 }
diff --git a/src/types.h b/src/types.h
new file mode 100644
index 0000000..c784b26
--- /dev/null
+++ b/src/types.h
@@ -0,0 +1,13 @@
+#ifndef ORYX_TYPES_H
+#define ORYX_TYPES_H
+
+#include <stddef.h>
+
+typedef unsigned char uchar;
+
+struct strview {
+	const uchar *p;
+	size_t len;
+};
+
+#endif /* !ORYX_TYPES_H */
author	Thomas Voss <mail@thomasvoss.com>	2024-06-08 15:17:13 +0200
committer	Thomas Voss <mail@thomasvoss.com>	2024-06-08 15:17:13 +0200
commit	d21121e59b21c73afa3622abe256771f1e73ff69 (patch)
tree	d91acbc4fac4062cd5acda237d759e2a51047648 /src
parent	46e676254e5121f38c5d3668855c34a4496f88ec (diff)