aboutsummaryrefslogtreecommitdiff
path: root/src/lexer.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/lexer.c')
-rw-r--r--src/lexer.c133
1 files changed, 99 insertions, 34 deletions
diff --git a/src/lexer.c b/src/lexer.c
index 663c8dd..77e60f2 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -1,24 +1,26 @@
+#include <assert.h>
+#include <errno.h>
#include <inttypes.h>
+#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
+#include <string.h>
#include "errors.h"
#include "lexer.h"
#include "unicode.h"
-static bool skip_comment(const unsigned char **, const char *);
+#define SIZE_WDTH (sizeof(size_t) * 8)
-struct lexeme *
-lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
-{
- struct {
- struct lexeme *buf;
- size_t len, cap;
- } data = {.cap = 1024};
- if ((data.buf = malloc(data.cap)) == NULL)
- err("malloc:");
+static bool skip_comment(const uchar **, const uchar *);
+
+static struct lexemes_soa mk_lexemes_soa(void);
+static void lexemes_soa_resz(struct lexemes_soa *);
+struct lexemes_soa
+lexstring(const uchar *code, size_t codesz)
+{
#if ORYX_SIMD
if (!utf8_validate_simd(code, codesz)) {
#endif
@@ -31,19 +33,31 @@ lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
}
#endif
- const unsigned char *start = code, *end = start + codesz;
+ struct lexemes_soa data = mk_lexemes_soa();
+
+ const uchar *start = code, *end = start + codesz;
while (code < end) {
- struct lexeme l;
- const unsigned char *spnbeg = code, *spnend;
+ const uchar *spnbeg = code, *spnend;
rune ch = utf8_decode(&code);
switch (ch) {
/* Single-byte literals */
- case '&': case '(': case ')': case '*':
- case '+': case '-': case ':': case '=':
- case ';': case '{': case '|': case '}':
+ case '&':
+ case '(':
+ case ')':
+ case '*':
+ case '+':
+ case '-':
+ case ':':
+ case ';':
+ case '=':
+ case '[':
+ case ']':
+ case '{':
+ case '|':
+ case '}':
case '~':
- l.kind = ch;
+ data.kinds[data.len++] = ch;
break;
/* Single- or double-byte literals */
@@ -54,17 +68,17 @@ lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
continue;
}
- l.kind = ch;
+ data.kinds[data.len++] = ch;
break;
case '<':
case '>':
- l.kind = ch;
+ data.kinds[data.len++] = ch;
/* See the comment in lexer.h for where 193 comes from */
if (code < end && code[0] == ch) {
code++;
- l.kind += 193;
+ data.kinds[data.len - 1] += 193;
}
break;
@@ -72,8 +86,8 @@ lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
if (!rune_is_xids(ch))
continue;
- l.kind = LEXIDENT;
- l.p = spnbeg;
+ data.kinds[data.len] = LEXIDENT;
+ data.strs[data.len].p = spnbeg;
spnend = code;
while (code < end && rune_is_xidc(ch)) {
@@ -83,27 +97,21 @@ lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
if (code < end)
code = spnend;
- l.len = spnend - spnbeg;
+ data.strs[data.len++].len = spnend - spnbeg;
}
- if (data.len == data.cap) {
- data.cap *= 2;
- if ((data.buf = realloc(data.buf, data.cap)) == NULL)
- err("realloc:");
- }
-
- data.buf[data.len++] = l;
+ if (data.len == data.cap)
+ lexemes_soa_resz(&data);
}
- *lcnt = data.len;
- return data.buf;
+ return data;
}
bool
-skip_comment(const unsigned char **ptr, const char *end)
+skip_comment(const uchar **ptr, const uchar *end)
{
int nst = 1;
- const char *p = *ptr;
+ const uchar *p = *ptr;
for (p++; p < end; p++) {
if (p + 1 < end) {
@@ -124,3 +132,60 @@ out:
*ptr = ++p;
return true;
}
+
+struct lexemes_soa
+mk_lexemes_soa(void)
+{
+ static_assert(offsetof(struct lexemes_soa, kinds)
+ < offsetof(struct lexemes_soa, strs),
+ "KINDS is not the first field before STRS");
+
+ struct lexemes_soa soa;
+ soa.len = 0;
+ soa.cap = 2048;
+
+ /* Ensure that soa.strs is properly aligned */
+ size_t pad = alignof(*soa.strs)
+ - soa.cap * sizeof(*soa.kinds) % alignof(*soa.strs);
+ if (pad == 8)
+ pad = 0;
+
+ if ((soa.kinds = malloc(soa.cap * LEXEMES_SOA_BLKSZ + pad)) == NULL)
+ err("malloc:");
+ soa.strs = (void *)((char *)soa.kinds + soa.cap * sizeof(*soa.kinds) + pad);
+
+ return soa;
+}
+
+void
+lexemes_soa_resz(struct lexemes_soa *soa)
+{
+ static_assert(offsetof(struct lexemes_soa, kinds)
+ < offsetof(struct lexemes_soa, strs),
+ "KINDS is not the first field before STRS");
+
+ size_t ncap, pad, newsz;
+ ptrdiff_t off = (char *)soa->strs - (char *)soa->kinds;
+
+ /* The capacity is always going to be a power of 2, so checking for overflow
+ becomes pretty trivial */
+ if ((soa->cap >> (SIZE_WDTH - 1)) != 0) {
+ errno = EOVERFLOW;
+ err("lexemes_soa_resz:");
+ }
+ ncap = soa->cap << 1;
+
+ /* Ensure that soa->strs is properly aligned */
+ pad = alignof(*soa->strs)
+ - ncap * sizeof(*soa->kinds) % alignof(*soa->strs);
+ if (pad == 8)
+ pad = 0;
+
+ newsz = ncap * LEXEMES_SOA_BLKSZ + pad;
+
+ if ((soa->kinds = realloc(soa->kinds, newsz)) == NULL)
+ err("realloc:");
+ soa->strs = (void *)((char *)soa->kinds + ncap * sizeof(*soa->kinds) + pad);
+ memmove(soa->strs, (char *)soa->kinds + off, soa->len * sizeof(*soa->strs));
+ soa->cap = ncap;
+}