diff options
author | Thomas Voss <mail@thomasvoss.com> | 2024-06-08 15:17:13 +0200 |
---|---|---|
committer | Thomas Voss <mail@thomasvoss.com> | 2024-06-08 15:17:13 +0200 |
commit | d21121e59b21c73afa3622abe256771f1e73ff69 (patch) | |
tree | d91acbc4fac4062cd5acda237d759e2a51047648 /src | |
parent | 46e676254e5121f38c5d3668855c34a4496f88ec (diff) |
Switch to an SOA for lexemes
Diffstat (limited to 'src')
-rw-r--r-- | src/errors.h | 4 | ||||
-rw-r--r-- | src/lexer.c | 133 | ||||
-rw-r--r-- | src/lexer.h | 18 | ||||
-rw-r--r-- | src/main.c | 89 | ||||
-rw-r--r-- | src/types.h | 13 |
5 files changed, 190 insertions, 67 deletions
diff --git a/src/errors.h b/src/errors.h index 69c8ea0..c3ccaf2 100644 --- a/src/errors.h +++ b/src/errors.h @@ -1,6 +1,8 @@ #ifndef ORYX_ERRORS_H #define ORYX_ERRORS_H -void err(const char *, ...); +#include <stdnoreturn.h> + +noreturn void err(const char *, ...); #endif /* !ORYX_ERRORS_H */ diff --git a/src/lexer.c b/src/lexer.c index 663c8dd..77e60f2 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1,24 +1,26 @@ +#include <assert.h> +#include <errno.h> #include <inttypes.h> +#include <stdalign.h> #include <stdbool.h> #include <stddef.h> #include <stdlib.h> +#include <string.h> #include "errors.h" #include "lexer.h" #include "unicode.h" -static bool skip_comment(const unsigned char **, const char *); +#define SIZE_WDTH (sizeof(size_t) * 8) -struct lexeme * -lexstring(const unsigned char *code, size_t codesz, size_t *lcnt) -{ - struct { - struct lexeme *buf; - size_t len, cap; - } data = {.cap = 1024}; - if ((data.buf = malloc(data.cap)) == NULL) - err("malloc:"); +static bool skip_comment(const uchar **, const uchar *); + +static struct lexemes_soa mk_lexemes_soa(void); +static void lexemes_soa_resz(struct lexemes_soa *); +struct lexemes_soa +lexstring(const uchar *code, size_t codesz) +{ #if ORYX_SIMD if (!utf8_validate_simd(code, codesz)) { #endif @@ -31,19 +33,31 @@ lexstring(const unsigned char *code, size_t codesz, size_t *lcnt) } #endif - const unsigned char *start = code, *end = start + codesz; + struct lexemes_soa data = mk_lexemes_soa(); + + const uchar *start = code, *end = start + codesz; while (code < end) { - struct lexeme l; - const unsigned char *spnbeg = code, *spnend; + const uchar *spnbeg = code, *spnend; rune ch = utf8_decode(&code); switch (ch) { /* Single-byte literals */ - case '&': case '(': case ')': case '*': - case '+': case '-': case ':': case '=': - case ';': case '{': case '|': case '}': + case '&': + case '(': + case ')': + case '*': + case '+': + case '-': + case ':': + case ';': + case '=': + case '[': + case ']': + case '{': + case '|': + case '}': case '~': - l.kind = ch; + data.kinds[data.len++] = ch; break; /* Single- or double-byte literals */ @@ -54,17 +68,17 @@ lexstring(const unsigned char *code, size_t codesz, size_t *lcnt) continue; } - l.kind = ch; + data.kinds[data.len++] = ch; break; case '<': case '>': - l.kind = ch; + data.kinds[data.len++] = ch; /* See the comment in lexer.h for where 193 comes from */ if (code < end && code[0] == ch) { code++; - l.kind += 193; + data.kinds[data.len - 1] += 193; } break; @@ -72,8 +86,8 @@ lexstring(const unsigned char *code, size_t codesz, size_t *lcnt) if (!rune_is_xids(ch)) continue; - l.kind = LEXIDENT; - l.p = spnbeg; + data.kinds[data.len] = LEXIDENT; + data.strs[data.len].p = spnbeg; spnend = code; while (code < end && rune_is_xidc(ch)) { @@ -83,27 +97,21 @@ lexstring(const unsigned char *code, size_t codesz, size_t *lcnt) if (code < end) code = spnend; - l.len = spnend - spnbeg; + data.strs[data.len++].len = spnend - spnbeg; } - if (data.len == data.cap) { - data.cap *= 2; - if ((data.buf = realloc(data.buf, data.cap)) == NULL) - err("realloc:"); - } - - data.buf[data.len++] = l; + if (data.len == data.cap) + lexemes_soa_resz(&data); } - *lcnt = data.len; - return data.buf; + return data; } bool -skip_comment(const unsigned char **ptr, const char *end) +skip_comment(const uchar **ptr, const uchar *end) { int nst = 1; - const char *p = *ptr; + const uchar *p = *ptr; for (p++; p < end; p++) { if (p + 1 < end) { @@ -124,3 +132,60 @@ out: *ptr = ++p; return true; } + +struct lexemes_soa +mk_lexemes_soa(void) +{ + static_assert(offsetof(struct lexemes_soa, kinds) + < offsetof(struct lexemes_soa, strs), + "KINDS is not the first field before STRS"); + + struct lexemes_soa soa; + soa.len = 0; + soa.cap = 2048; + + /* Ensure that soa.strs is properly aligned */ + size_t pad = alignof(*soa.strs) + - soa.cap * sizeof(*soa.kinds) % alignof(*soa.strs); + if (pad == 8) + pad = 0; + + if ((soa.kinds = malloc(soa.cap * LEXEMES_SOA_BLKSZ + pad)) == NULL) + err("malloc:"); + soa.strs = (void *)((char *)soa.kinds + soa.cap * sizeof(*soa.kinds) + pad); + + return soa; +} + +void +lexemes_soa_resz(struct lexemes_soa *soa) +{ + static_assert(offsetof(struct lexemes_soa, kinds) + < offsetof(struct lexemes_soa, strs), + "KINDS is not the first field before STRS"); + + size_t ncap, pad, newsz; + ptrdiff_t off = (char *)soa->strs - (char *)soa->kinds; + + /* The capacity is always going to be a power of 2, so checking for overflow + becomes pretty trivial */ + if ((soa->cap >> (SIZE_WDTH - 1)) != 0) { + errno = EOVERFLOW; + err("lexemes_soa_resz:"); + } + ncap = soa->cap << 1; + + /* Ensure that soa->strs is properly aligned */ + pad = alignof(*soa->strs) + - ncap * sizeof(*soa->kinds) % alignof(*soa->strs); + if (pad == 8) + pad = 0; + + newsz = ncap * LEXEMES_SOA_BLKSZ + pad; + + if ((soa->kinds = realloc(soa->kinds, newsz)) == NULL) + err("realloc:"); + soa->strs = (void *)((char *)soa->kinds + ncap * sizeof(*soa->kinds) + pad); + memmove(soa->strs, (char *)soa->kinds + off, soa->len * sizeof(*soa->strs)); + soa->cap = ncap; +} diff --git a/src/lexer.h b/src/lexer.h index efd03df..e2862bb 100644 --- a/src/lexer.h +++ b/src/lexer.h @@ -4,6 +4,8 @@ #include <stddef.h> #include <stdint.h> +#include "types.h" + enum { LEXIDENT, @@ -12,12 +14,14 @@ enum { LEXEQ = '=', LEXLANGL = '<', LEXLBRACE = '{', + LEXLBRKT = '[', LEXLPAR = '(', LEXMINUS = '-', LEXPIPE = '|', LEXPLUS = '+', LEXRANGL = '>', LEXRBRACE = '}', + LEXRBRKT = ']', LEXRPAR = ')', LEXSEMI = ';', LEXSLASH = '/', @@ -33,12 +37,16 @@ enum { typedef uint8_t lexeme_kind; -struct lexeme { - lexeme_kind kind; - const char *p; - size_t len; +#define LEXEMES_SOA_BLKSZ (sizeof(lexeme_kind) + sizeof(struct strview)) + +struct lexemes_soa { + lexeme_kind *kinds; + struct strview *strs; + size_t len, cap; }; -struct lexeme *lexstring(const unsigned char *, size_t, size_t *); +#define lexemes_free(x) free(x.kinds) + +struct lexemes_soa lexstring(const uchar *, size_t); #endif /* !ORYX_LEXER_H */ @@ -24,40 +24,75 @@ main(int argc, char **argv) } file; file.p = readfile(argv[1], &file.len); - struct { - struct lexeme *p; - size_t len; - } toks; - toks.p = lexstring(file.p, file.len, &toks.len); + struct lexemes_soa toks = lexstring(file.p, file.len); for (size_t i = 0; i < toks.len; i++) { - struct lexeme l = toks.p[i]; - switch (l.kind) { - case LEXIDENT: - printf("Identifier: ā%.*sā\n", (int)l.len, l.p); - break; - case LEXCOLON: puts("Colon"); break; - case LEXLBRACE: puts("Left brace"); break; - case LEXLPAR: puts("Left parenthesis"); break; - case LEXRBRACE: puts("Right brace"); break; - case LEXRPAR: puts("Right parenthesis"); break; - case LEXSEMI: puts("Semicolon"); break; - case LEXAMP: puts("Ampersand"); break; - case LEXEQ: puts("Equals"); break; - case LEXLANGL: puts("Left angle bracket"); break; - case LEXMINUS: puts("Minus"); break; - case LEXPIPE: puts("Pipe"); break; - case LEXPLUS: puts("Plus"); break; - case LEXRANGL: puts("Right angle bracket"); break; - case LEXSLASH: puts("Slash"); break; - case LEXSTAR: puts("Asterisk"); break; - case LEXTILDE: puts("Tilde"); break; + switch (toks.kinds[i]) { + case LEXIDENT: { + struct strview sv = toks.strs[i]; + printf("Identifier: ā%.*sā\n", (int)sv.len, sv.p); + break; + } + case LEXAMP: + puts("Ampersand"); + break; + case LEXCOLON: + puts("Colon"); + break; + case LEXEQ: + puts("Equals"); + break; + case LEXLANGL: + puts("Left angle bracket"); + break; + case LEXLBRACE: + puts("Left brace"); + break; + case LEXLBRKT: + puts("Right bracket"); + break; + case LEXLPAR: + puts("Left parenthesis"); + break; + case LEXMINUS: + puts("Minus"); + break; + case LEXPIPE: + puts("Pipe"); + break; + case LEXPLUS: + puts("Plus"); + break; + case LEXRANGL: + puts("Right angle bracket"); + break; + case LEXRBRACE: + puts("Right brace"); + break; + case LEXRBRKT: + puts("Right bracket"); + break; + case LEXRPAR: + puts("Right parenthesis"); + break; + case LEXSEMI: + puts("Semicolon"); + break; + case LEXSLASH: + puts("Slash"); + break; + case LEXSTAR: + puts("Asterisk"); + break; + case LEXTILDE: + puts("Tilde"); + break; } } #if DEBUG free(file.p); - free(toks.p); + lexemes_free(toks); #endif return EXIT_SUCCESS; } diff --git a/src/types.h b/src/types.h new file mode 100644 index 0000000..c784b26 --- /dev/null +++ b/src/types.h @@ -0,0 +1,13 @@ +#ifndef ORYX_TYPES_H +#define ORYX_TYPES_H + +#include <stddef.h> + +typedef unsigned char uchar; + +struct strview { + const uchar *p; + size_t len; +}; + +#endif /* !ORYX_TYPES_H */ |