aboutsummaryrefslogtreecommitdiff
path: root/src/c8asm/lexer.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/c8asm/lexer.c')
-rw-r--r--src/c8asm/lexer.c204
1 files changed, 204 insertions, 0 deletions
diff --git a/src/c8asm/lexer.c b/src/c8asm/lexer.c
new file mode 100644
index 0000000..effc32e
--- /dev/null
+++ b/src/c8asm/lexer.c
@@ -0,0 +1,204 @@
+#include <da.h>
+#include <mbstring.h>
+#include <rtype.h>
+
+#include "cerr.h"
+#include "common.h"
+#include "lexer.h"
+
+#define ISDIGIT(n) ((n) >= '0' && (n) <= '9') /* ASCII decimal digit? */
+#define U8MOV(sv, n) ((sv)->p += (n), (sv)->len -= (n)) /* advance view by n bytes; evaluates both arguments twice */
+
+#define E_BASE "integer with invalid base specifier ‘%.*s’" /* E_*: printf-style messages handed to die_with_off() */
+#define E_EXTRA "unknown extraneous character ‘%.*s’"
+#define E_IDENTCHAR "illegal character in identifier ‘%.*s’"
+#define E_IDENTLOST "local label missing identifier"
+#define E_IDENTSCHAR "illegal first character in identifier ‘%.*s’"
+#define E_UNTERMINATED "unterminated string literal ‘%.*s%.*s’"
+#define E_UTF8 "invalid UTF-8 byte near ‘%02X’"
+
+#define EOLS U"\n\v\f\r\x85\u2028\u2029" /* runes that lexfile() treats as line terminators */
+#define NUMCHARS U"'0123456789abcdefABCDEF" /* digit separator first, then digits; lexline() accepts only a prefix of this */
+
+static void lexline(struct tokens *, struct u8view *); /* lex one terminator-free line */
+static bool skipws(struct u8view *); /* skip leading whitespace; false once the view is empty */
+
+/*
+ * Return a human-readable name for the token kind k, for use in diagnostics.
+ * k must be one of the kinds that has an entry in the table below.
+ */
+const char *
+tokrepr(tokkind k)
+{
+	static const char *const names[] = {
+		[T_COLON]  = "colon",
+		[T_EOL]    = "end of line",
+		[T_IDENT]  = "identifier",
+		[T_NUMBER] = "number",
+		[T_STRING] = "string",
+	};
+
+	return names[k];
+}
+
+/*
+ * Lex the whole input sv and return the resulting token array.
+ *
+ * The input is first validated with u8chk(); a non-null result is treated as
+ * a pointer to the offending byte and reported through die_with_off().  The
+ * input is then cut into lines at any of the EOLS runes and each line is
+ * handed to lexline().
+ */
+struct tokens
+lexfile(struct u8view sv)
+{
+	struct tokens toks;
+	const char8_t *bad = u8chk(sv.p, sv.len);
+
+	if (bad)
+		die_with_off(filename, bad - sv.p, E_UTF8, *bad);
+
+	dainit(&toks, 256);
+
+	while (sv.len) {
+		/* Bytes up to, but not including, the next line terminator;
+		   the ‘- 1’ leaves out the string’s terminating null */
+		size_t n = u8cbspn(sv.p, sv.len, EOLS, lengthof(EOLS) - 1);
+		struct u8view line = {
+			.p = sv.p,
+			.len = n,
+		};
+
+		lexline(&toks, &line);
+
+		/* If the line did not run to the end of the input, a
+		   terminator follows; step over that one rune too */
+		if (n < sv.len)
+			n += u8rlen(sv.p + n);
+
+		U8MOV(&sv, n);
+	}
+
+	return toks;
+}
+
+/*
+ * Tokenize one line and append its tokens to toks, followed by a T_EOL token.
+ *
+ * sv must not contain the line terminator; it is advanced as runes are
+ * consumed.  Tokens have to be separated by pattern whitespace, with the
+ * exception of the colon: it may directly follow an identifier and may be
+ * directly followed by another token.  A ‘;’ in token position starts a
+ * comment that runs to the end of the line.
+ *
+ * Lexical errors are reported through die_with_off(), which is assumed not
+ * to return.
+ */
+void
+lexline(struct tokens *toks, struct u8view *sv)
+{
+/*
+ * Within this function, report errors at the rune that was read most recently.
+ * sv->p has already been moved past that rune, hence the ‘- w’.  The macro
+ * deliberately has no trailing semicolon so that every use is exactly one
+ * statement.
+ */
+#define die_with_off(...) \
+	die_with_off(filename, sv->p - baseptr - w, __VA_ARGS__)
+
+	struct token tok;
+
+	for (;;) {
+		int w;
+		rune ch;
+
+		if (!skipws(sv))
+			goto end;
+
+		/* The token starts at the first non-whitespace rune */
+		tok.sv.p = sv->p;
+		tok.sv.len = w = u8next(&ch, &sv->p, &sv->len);
+
+		if (ISDIGIT(ch)) {
+			/* m is the number of digit characters of NUMCHARS that
+			   are valid in the current base */
+			size_t off, m = 10;
+
+			tok.kind = T_NUMBER;
+			tok.base = 10;
+
+			if (ch == '0') {
+				w = u8next(&ch, &sv->p, &sv->len);
+				if (!w || rprop_is_pat_ws(ch)) {
+					/* A lone ‘0’; un-read whatever followed it */
+					sv->p -= w;
+					sv->len += w;
+					goto out;
+				}
+				/* Every rune that gets past the switch below is a
+				   single ASCII byte */
+				tok.sv.len++;
+
+				switch (ch) {
+				case 'b':
+					tok.base = m = 2;
+					break;
+				case 'o':
+					tok.base = m = 8;
+					break;
+				case 'd':
+					/* Implicitly base-10 already */
+					break;
+				case 'x':
+					/* m = 22 because A–F can be both upper- or lowercase */
+					tok.base = 16;
+					m = 22;
+					break;
+				default:
+					/* No base prefix; a plain digit after the leading
+					   zero keeps the number in base 10 */
+					if (!ISDIGIT(ch)) {
+						die_with_off(E_BASE, w, sv->p - w);
+					}
+				}
+			}
+
+out:
+			/* Accept the first m + 1 entries of NUMCHARS; the +1 is
+			   there to support the digit separator */
+			off = u8bspn(sv->p, sv->len, NUMCHARS, m + 1);
+			tok.sv.len += off;
+			U8MOV(sv, off);
+		} else if (ch == '.' || ch == '_' || rprop_is_xids(ch)) {
+			tok.kind = T_IDENT;
+			if (ch == '.') {
+				/* A leading dot must be followed by a regular
+				   identifier start character */
+				if (!sv->len) {
+					die_with_off(E_IDENTLOST);
+				}
+
+				tok.sv.len += w = u8next(&ch, &sv->p, &sv->len);
+				if (rprop_is_pat_ws(ch)) {
+					die_with_off(E_IDENTLOST);
+				}
+				if (ch != '_' && !rprop_is_xids(ch)) {
+					die_with_off(E_IDENTSCHAR, w, sv->p - w);
+				}
+			}
+
+			while ((w = u8next(&ch, &sv->p, &sv->len))) {
+				if (ch == ':' || rprop_is_pat_ws(ch)) {
+					/* Not part of the identifier; un-read it */
+					U8MOV(sv, -w);
+					break;
+				}
+				if (!rprop_is_xidc(ch)) {
+					die_with_off(E_IDENTCHAR, w, sv->p - w);
+				}
+
+				tok.sv.len += w;
+			}
+		} else if (ch == '"') {
+			tok.kind = T_STRING;
+			/* The token includes both quotes; no escape sequences are
+			   recognized here */
+			while ((w = u8next(&ch, &sv->p, &sv->len))) {
+				tok.sv.len += w;
+				if (ch == '"')
+					goto found;
+			}
+			/* Quote at most 20 bytes of the literal, followed by an
+			   ellipsis if it was cut short */
+			die_with_off(E_UNTERMINATED, (int)MIN(tok.sv.len, 20), tok.sv.p,
+			             tok.sv.len > 20 ? (int)lengthof(u8"…") - 1 : 0, u8"…");
+found:;
+		} else if (ch == ':') {
+			tok.kind = T_COLON;
+		} else if (ch == ';') {
+			/* Comment; ignore the rest of the line */
+			goto end;
+		} else {
+			die_with_off(E_EXTRA, w, sv->p - w);
+		}
+
+		/* The colon is the only token that isn’t whitespace separated.
+		   At this point ch is ‘:’ either because the token is a colon or
+		   because an identifier stopped right before one. */
+		if (ch != ':' && sv->len) {
+			w = u8next(&ch, &sv->p, &sv->len);
+			if (!w || !rprop_is_pat_ws(ch)) {
+				die_with_off(E_EXTRA, w, sv->p - w);
+			}
+		}
+
+		dapush(toks, tok);
+	}
+
+end:;
+	/* Every line ends with an empty T_EOL token at the point where lexing
+	   stopped */
+	tok = (struct token){
+		.kind = T_EOL,
+		.sv.p = sv->p,
+		.sv.len = 0,
+	};
+	dapush(toks, tok);
+
+#undef die_with_off
+}
+
+/*
+ * Advance sv past any leading pattern-whitespace runes.
+ *
+ * Returns true if sv now starts with a non-whitespace rune, and false if the
+ * view was empty to begin with or consisted of nothing but whitespace.
+ */
+bool
+skipws(struct u8view *sv)
+{
+	while (sv->len) {
+		rune r;
+		int n = u8tor_uc(&r, sv->p);
+
+		if (!rprop_is_pat_ws(r))
+			return true;
+
+		U8MOV(sv, n);
+	}
+
+	return false;
+}