aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorThomas Voss <mail@thomasvoss.com> 2024-06-08 15:17:13 +0200
committerThomas Voss <mail@thomasvoss.com> 2024-06-08 15:17:13 +0200
commitd21121e59b21c73afa3622abe256771f1e73ff69 (patch)
treed91acbc4fac4062cd5acda237d759e2a51047648 /src
parent46e676254e5121f38c5d3668855c34a4496f88ec (diff)
Switch to an SOA for lexemes
Diffstat (limited to 'src')
-rw-r--r--src/errors.h4
-rw-r--r--src/lexer.c133
-rw-r--r--src/lexer.h18
-rw-r--r--src/main.c89
-rw-r--r--src/types.h13
5 files changed, 190 insertions, 67 deletions
diff --git a/src/errors.h b/src/errors.h
index 69c8ea0..c3ccaf2 100644
--- a/src/errors.h
+++ b/src/errors.h
@@ -1,6 +1,8 @@
#ifndef ORYX_ERRORS_H
#define ORYX_ERRORS_H
-void err(const char *, ...);
+#include <stdnoreturn.h>
+
+noreturn void err(const char *, ...);
#endif /* !ORYX_ERRORS_H */
diff --git a/src/lexer.c b/src/lexer.c
index 663c8dd..77e60f2 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -1,24 +1,26 @@
+#include <assert.h>
+#include <errno.h>
#include <inttypes.h>
+#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
+#include <string.h>
#include "errors.h"
#include "lexer.h"
#include "unicode.h"
-static bool skip_comment(const unsigned char **, const char *);
+#define SIZE_WDTH (sizeof(size_t) * 8)
-struct lexeme *
-lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
-{
- struct {
- struct lexeme *buf;
- size_t len, cap;
- } data = {.cap = 1024};
- if ((data.buf = malloc(data.cap)) == NULL)
- err("malloc:");
+static bool skip_comment(const uchar **, const uchar *);
+
+static struct lexemes_soa mk_lexemes_soa(void);
+static void lexemes_soa_resz(struct lexemes_soa *);
+struct lexemes_soa
+lexstring(const uchar *code, size_t codesz)
+{
#if ORYX_SIMD
if (!utf8_validate_simd(code, codesz)) {
#endif
@@ -31,19 +33,31 @@ lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
}
#endif
- const unsigned char *start = code, *end = start + codesz;
+ struct lexemes_soa data = mk_lexemes_soa();
+
+ const uchar *start = code, *end = start + codesz;
while (code < end) {
- struct lexeme l;
- const unsigned char *spnbeg = code, *spnend;
+ const uchar *spnbeg = code, *spnend;
rune ch = utf8_decode(&code);
switch (ch) {
/* Single-byte literals */
- case '&': case '(': case ')': case '*':
- case '+': case '-': case ':': case '=':
- case ';': case '{': case '|': case '}':
+ case '&':
+ case '(':
+ case ')':
+ case '*':
+ case '+':
+ case '-':
+ case ':':
+ case ';':
+ case '=':
+ case '[':
+ case ']':
+ case '{':
+ case '|':
+ case '}':
case '~':
- l.kind = ch;
+ data.kinds[data.len++] = ch;
break;
/* Single- or double-byte literals */
@@ -54,17 +68,17 @@ lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
continue;
}
- l.kind = ch;
+ data.kinds[data.len++] = ch;
break;
case '<':
case '>':
- l.kind = ch;
+ data.kinds[data.len++] = ch;
/* See the comment in lexer.h for where 193 comes from */
if (code < end && code[0] == ch) {
code++;
- l.kind += 193;
+ data.kinds[data.len - 1] += 193;
}
break;
@@ -72,8 +86,8 @@ lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
if (!rune_is_xids(ch))
continue;
- l.kind = LEXIDENT;
- l.p = spnbeg;
+ data.kinds[data.len] = LEXIDENT;
+ data.strs[data.len].p = spnbeg;
spnend = code;
while (code < end && rune_is_xidc(ch)) {
@@ -83,27 +97,21 @@ lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
if (code < end)
code = spnend;
- l.len = spnend - spnbeg;
+ data.strs[data.len++].len = spnend - spnbeg;
}
- if (data.len == data.cap) {
- data.cap *= 2;
- if ((data.buf = realloc(data.buf, data.cap)) == NULL)
- err("realloc:");
- }
-
- data.buf[data.len++] = l;
+ if (data.len == data.cap)
+ lexemes_soa_resz(&data);
}
- *lcnt = data.len;
- return data.buf;
+ return data;
}
bool
-skip_comment(const unsigned char **ptr, const char *end)
+skip_comment(const uchar **ptr, const uchar *end)
{
int nst = 1;
- const char *p = *ptr;
+ const uchar *p = *ptr;
for (p++; p < end; p++) {
if (p + 1 < end) {
@@ -124,3 +132,60 @@ out:
*ptr = ++p;
return true;
}
+
+struct lexemes_soa
+mk_lexemes_soa(void)
+{
+ static_assert(offsetof(struct lexemes_soa, kinds)
+ < offsetof(struct lexemes_soa, strs),
+ "KINDS is not the first field before STRS");
+
+ struct lexemes_soa soa;
+ soa.len = 0;
+ soa.cap = 2048;
+
+ /* Ensure that soa.strs is properly aligned */
+ size_t pad = alignof(*soa.strs)
+ - soa.cap * sizeof(*soa.kinds) % alignof(*soa.strs);
+ if (pad == 8)
+ pad = 0;
+
+ if ((soa.kinds = malloc(soa.cap * LEXEMES_SOA_BLKSZ + pad)) == NULL)
+ err("malloc:");
+ soa.strs = (void *)((char *)soa.kinds + soa.cap * sizeof(*soa.kinds) + pad);
+
+ return soa;
+}
+
+void
+lexemes_soa_resz(struct lexemes_soa *soa)
+{
+ static_assert(offsetof(struct lexemes_soa, kinds)
+ < offsetof(struct lexemes_soa, strs),
+ "KINDS is not the first field before STRS");
+
+ size_t ncap, pad, newsz;
+ ptrdiff_t off = (char *)soa->strs - (char *)soa->kinds;
+
+ /* The capacity is always going to be a power of 2, so checking for overflow
+ becomes pretty trivial */
+ if ((soa->cap >> (SIZE_WDTH - 1)) != 0) {
+ errno = EOVERFLOW;
+ err("lexemes_soa_resz:");
+ }
+ ncap = soa->cap << 1;
+
+ /* Ensure that soa->strs is properly aligned */
+ pad = alignof(*soa->strs)
+ - ncap * sizeof(*soa->kinds) % alignof(*soa->strs);
+ if (pad == 8)
+ pad = 0;
+
+ newsz = ncap * LEXEMES_SOA_BLKSZ + pad;
+
+ if ((soa->kinds = realloc(soa->kinds, newsz)) == NULL)
+ err("realloc:");
+ soa->strs = (void *)((char *)soa->kinds + ncap * sizeof(*soa->kinds) + pad);
+ memmove(soa->strs, (char *)soa->kinds + off, soa->len * sizeof(*soa->strs));
+ soa->cap = ncap;
+}
diff --git a/src/lexer.h b/src/lexer.h
index efd03df..e2862bb 100644
--- a/src/lexer.h
+++ b/src/lexer.h
@@ -4,6 +4,8 @@
#include <stddef.h>
#include <stdint.h>
+#include "types.h"
+
enum {
LEXIDENT,
@@ -12,12 +14,14 @@ enum {
LEXEQ = '=',
LEXLANGL = '<',
LEXLBRACE = '{',
+ LEXLBRKT = '[',
LEXLPAR = '(',
LEXMINUS = '-',
LEXPIPE = '|',
LEXPLUS = '+',
LEXRANGL = '>',
LEXRBRACE = '}',
+ LEXRBRKT = ']',
LEXRPAR = ')',
LEXSEMI = ';',
LEXSLASH = '/',
@@ -33,12 +37,16 @@ enum {
typedef uint8_t lexeme_kind;
-struct lexeme {
- lexeme_kind kind;
- const char *p;
- size_t len;
+#define LEXEMES_SOA_BLKSZ (sizeof(lexeme_kind) + sizeof(struct strview))
+
+struct lexemes_soa {
+ lexeme_kind *kinds;
+ struct strview *strs;
+ size_t len, cap;
};
-struct lexeme *lexstring(const unsigned char *, size_t, size_t *);
+#define lexemes_free(x) free(x.kinds)
+
+struct lexemes_soa lexstring(const uchar *, size_t);
#endif /* !ORYX_LEXER_H */
diff --git a/src/main.c b/src/main.c
index 9969401..bef6b0d 100644
--- a/src/main.c
+++ b/src/main.c
@@ -24,40 +24,75 @@ main(int argc, char **argv)
} file;
file.p = readfile(argv[1], &file.len);
- struct {
- struct lexeme *p;
- size_t len;
- } toks;
- toks.p = lexstring(file.p, file.len, &toks.len);
+ struct lexemes_soa toks = lexstring(file.p, file.len);
for (size_t i = 0; i < toks.len; i++) {
- struct lexeme l = toks.p[i];
- switch (l.kind) {
- case LEXIDENT:
- printf("Identifier: ā€˜%.*sā€™\n", (int)l.len, l.p);
- break;
- case LEXCOLON: puts("Colon"); break;
- case LEXLBRACE: puts("Left brace"); break;
- case LEXLPAR: puts("Left parenthesis"); break;
- case LEXRBRACE: puts("Right brace"); break;
- case LEXRPAR: puts("Right parenthesis"); break;
- case LEXSEMI: puts("Semicolon"); break;
- case LEXAMP: puts("Ampersand"); break;
- case LEXEQ: puts("Equals"); break;
- case LEXLANGL: puts("Left angle bracket"); break;
- case LEXMINUS: puts("Minus"); break;
- case LEXPIPE: puts("Pipe"); break;
- case LEXPLUS: puts("Plus"); break;
- case LEXRANGL: puts("Right angle bracket"); break;
- case LEXSLASH: puts("Slash"); break;
- case LEXSTAR: puts("Asterisk"); break;
- case LEXTILDE: puts("Tilde"); break;
+ switch (toks.kinds[i]) {
+ case LEXIDENT: {
+ struct strview sv = toks.strs[i];
+ printf("Identifier: ā€˜%.*sā€™\n", (int)sv.len, sv.p);
+ break;
+ }
+ case LEXAMP:
+ puts("Ampersand");
+ break;
+ case LEXCOLON:
+ puts("Colon");
+ break;
+ case LEXEQ:
+ puts("Equals");
+ break;
+ case LEXLANGL:
+ puts("Left angle bracket");
+ break;
+ case LEXLBRACE:
+ puts("Left brace");
+ break;
+ case LEXLBRKT:
+ puts("Right bracket");
+ break;
+ case LEXLPAR:
+ puts("Left parenthesis");
+ break;
+ case LEXMINUS:
+ puts("Minus");
+ break;
+ case LEXPIPE:
+ puts("Pipe");
+ break;
+ case LEXPLUS:
+ puts("Plus");
+ break;
+ case LEXRANGL:
+ puts("Right angle bracket");
+ break;
+ case LEXRBRACE:
+ puts("Right brace");
+ break;
+ case LEXRBRKT:
+ puts("Right bracket");
+ break;
+ case LEXRPAR:
+ puts("Right parenthesis");
+ break;
+ case LEXSEMI:
+ puts("Semicolon");
+ break;
+ case LEXSLASH:
+ puts("Slash");
+ break;
+ case LEXSTAR:
+ puts("Asterisk");
+ break;
+ case LEXTILDE:
+ puts("Tilde");
+ break;
}
}
#if DEBUG
free(file.p);
- free(toks.p);
+ lexemes_free(toks);
#endif
return EXIT_SUCCESS;
}
diff --git a/src/types.h b/src/types.h
new file mode 100644
index 0000000..c784b26
--- /dev/null
+++ b/src/types.h
@@ -0,0 +1,13 @@
+#ifndef ORYX_TYPES_H
+#define ORYX_TYPES_H
+
+#include <stddef.h>
+
+typedef unsigned char uchar;
+
+struct strview {
+ const uchar *p;
+ size_t len;
+};
+
+#endif /* !ORYX_TYPES_H */