diff options
author | Thomas Voss <mail@thomasvoss.com> | 2024-06-08 12:58:04 +0200 |
---|---|---|
committer | Thomas Voss <mail@thomasvoss.com> | 2024-06-08 12:58:04 +0200 |
commit | baea074f524676a889043779c80fb17cdd38f30d (patch) | |
tree | 4609684954145d092ed684ac38e94b8b04ecd79a /src/lexer.c | |
parent | 4f698ec642547534bac2b37f96de045dd828fd58 (diff) |
Do some lexer work
Diffstat (limited to 'src/lexer.c')
-rw-r--r-- | src/lexer.c | 112 |
1 files changed, 102 insertions, 10 deletions
```diff
diff --git a/src/lexer.c b/src/lexer.c
index 970202a..663c8dd 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -1,34 +1,126 @@
 #include <inttypes.h>
+#include <stdbool.h>
 #include <stddef.h>
-#include <stdio.h>
+#include <stdlib.h>
 
 #include "errors.h"
 #include "lexer.h"
 #include "unicode.h"
 
+static bool skip_comment(const unsigned char **, const char *);
+
 struct lexeme *
-lexstring(const char *code, size_t codesz, size_t *lcnt)
+lexstring(const unsigned char *code, size_t codesz, size_t *lcnt)
 {
 	struct {
-		struct lexeme *p;
-		size_t len, buf;
-	} data = {0};
+		struct lexeme *buf;
+		size_t len, cap;
+	} data = {.cap = 1024};
+	if ((data.buf = malloc(data.cap)) == NULL)
+		err("malloc:");
 
 #if ORYX_SIMD
 	if (!utf8_validate_simd(code, codesz)) {
 #endif
-		size_t off = utf8_validate_off(code, codesz);
-		if (off != 0)
-			err("Invalid UTF-8 at byte-offset %zu", off - 1);
+		size_t loc = utf8_validate_off(code, codesz);
+		if (loc != 0) {
+			err("Invalid byte ‘0x%02" PRIx8 "’ in UTF-8 input at byte %zu",
+			    code[loc - 1], loc);
+		}
 #if ORYX_SIMD
 	}
 #endif
 
-	const char *end = code + codesz;
+	const unsigned char *start = code, *end = start + codesz;
 	while (code < end) {
+		struct lexeme l;
+		const unsigned char *spnbeg = code, *spnend;
 		rune ch = utf8_decode(&code);
+
+		switch (ch) {
+		/* Single-byte literals */
+		case '&': case '(': case ')': case '*':
+		case '+': case '-': case ':': case '=':
+		case ';': case '{': case '|': case '}':
+		case '~':
+			l.kind = ch;
+			break;
+
+		/* Single- or double-byte literals */
+		case '/':
+			if (code < end && code[0] == '*') {
+				if (!skip_comment(&code, end))
+					err("Unterminated comment at byte %td", code - start);
+				continue;
+			}
+
+			l.kind = ch;
+			break;
+
+		case '<':
+		case '>':
+			l.kind = ch;
+
+			/* See the comment in lexer.h for where 193 comes from */
+			if (code < end && code[0] == ch) {
+				code++;
+				l.kind += 193;
+			}
+			break;
+
+		default:
+			if (!rune_is_xids(ch))
+				continue;
+
+			l.kind = LEXIDENT;
+			l.p = spnbeg;
+
+			spnend = code;
+			while (code < end && rune_is_xidc(ch)) {
+				spnend = code;
+				ch = utf8_decode(&code);
+			}
+			if (code < end)
+				code = spnend;
+
+			l.len = spnend - spnbeg;
+		}
+
+		if (data.len == data.cap) {
+			data.cap *= 2;
+			if ((data.buf = realloc(data.buf, data.cap)) == NULL)
+				err("realloc:");
+		}
+
+		data.buf[data.len++] = l;
 	}
 
 	*lcnt = data.len;
-	return data.p;
+	return data.buf;
+}
+
+bool
+skip_comment(const unsigned char **ptr, const char *end)
+{
+	int nst = 1;
+	const char *p = *ptr;
+
+	for (p++; p < end; p++) {
+		if (p + 1 < end) {
+			if (p[0] == '*' && p[1] == '/') {
+				p++;
+				if (--nst == 0)
+					goto out;
+			} else if (p[0] == '/' && p[1] == '*') {
+				p++;
+				nst++;
+			}
+		}
+	}
+
+	return false;
+
+out:
+	*ptr = ++p;
+	return true;
 }
```