#include #include #include #include "cerr.h" #include "common.h" #include "lexer.h" #include "macros.h" #define ISDIGIT(n) ((n) >= '0' && (n) <= '9') #define U8MOV(sv, n) ((sv)->p += (n), (sv)->len -= (n)) #define E_BASE "integer with invalid base specifier ‘%.*s’" #define E_EXTRA "unknown extraneous character ‘%.*s’" #define E_IDENTCHAR "illegal character in identifier ‘%.*s’" #define E_IDENTLOST "local label missing identifier" #define E_IDENTSCHAR "illegal first character in identifier ‘%.*s’" #define E_UNTERMINATED "unterminated string literal ‘%.*s%.*s’" #define E_UTF8 "invalid UTF-8 byte near ‘%02X’" #define EOLS U"\n\v\f\r\x85\u2028\u2029" #define NUMCHARS U"'0123456789abcdefABCDEF" static void lexline(struct tokens *, struct u8view *); static bool skipws(struct u8view *); const char * tokrepr(tokkind k) { return (const char *[]){ [T_COLON] = "colon", [T_EOL] = "end of line", [T_IDENT] = "identifier", [T_NUMBER] = "number", [T_STRING] = "string", }[k]; } struct tokens lexfile(struct u8view sv) { const char8_t *s; struct tokens toks; if (s = u8chk(sv.p, sv.len)) die_with_off(filename, s - sv.p, E_UTF8, *s); dainit(&toks, 256); while (sv.len) { size_t len = u8cbspn(sv.p, sv.len, EOLS, lengthof(EOLS) - 1); struct u8view line = { .p = sv.p, .len = len, }; lexline(&toks, &line); /* Skip trailing EOL */ if (sv.len > len) len += u8rlen(sv.p + len); U8MOV(&sv, len); } return toks; } void lexline(struct tokens *toks, struct u8view *sv) { #define die_with_off(...) \ die_with_off(filename, sv->p - baseptr - w, __VA_ARGS__); struct token tok; for (;;) { int w; rune ch; if (!skipws(sv)) goto end; tok.sv.p = sv->p; tok.sv.len = w = u8next(&ch, &sv->p, &sv->len); if (ISDIGIT(ch)) { size_t off, m = 10; tok.kind = T_NUMBER; tok.base = 10; if (ch == '0') { w = u8next(&ch, &sv->p, &sv->len); if (!w || rprop_is_pat_ws(ch)) { U8MOV(sv, -w); goto out; } tok.sv.len++; switch (ch) { case 'b': tok.base = m = 2; break; case 'o': tok.base = m = 8; break; case 'd': /* Implicitly base-10 already */ break; case 'x': /* m = 22 because A–F can be both upper- or lowercase */ tok.base = 16; m = 22; break; default: if (!ISDIGIT(ch)) die_with_off(E_BASE, w, sv->p - w); } } out: /* +1 to support the digit separator */ tok.sv.len += off = u8bspn(sv->p, sv->len, NUMCHARS, m + 1); U8MOV(sv, off); } else if (ch == '.' || ch == '_' || rprop_is_xids(ch)) { tok.kind = T_IDENT; if (ch == '.') { if (!sv->len) die_with_off(E_IDENTLOST); tok.sv.len += w = u8next(&ch, &sv->p, &sv->len); if (rprop_is_pat_ws(ch)) die_with_off(E_IDENTLOST); if (ch != '_' && !rprop_is_xids(ch)) { die_with_off(E_IDENTSCHAR, w, sv->p - w); } } while (w = u8next(&ch, &sv->p, &sv->len)) { if (ch == ':' || rprop_is_pat_ws(ch)) { U8MOV(sv, -w); break; } if (!rprop_is_xidc(ch)) die_with_off(E_IDENTCHAR, w, sv->p - w); tok.sv.len += w; } } else if (ch == '"') { tok.kind = T_STRING; while (w = u8next(&ch, &sv->p, &sv->len)) { tok.sv.len += w; if (ch == '"') goto found; } die_with_off(E_UNTERMINATED, (int)MIN(tok.sv.len, 20), tok.sv.p, tok.sv.len > 20 ? (int)lengthof(u8"…") - 1 : 0, u8"…"); found: } else if (ch == ':') { tok.kind = T_COLON; } else if (ch == ';') { goto end; } else { die_with_off(E_EXTRA, w, sv->p - w); } /* The colon is the only token that isn’t whitespace separated */ if (ch != ':' && sv->len) { w = u8next(&ch, &sv->p, &sv->len); if (!w || !rprop_is_pat_ws(ch)) die_with_off(E_EXTRA, w, sv->p - w); } dapush(toks, tok); } end:; tok = (struct token){ .kind = T_EOL, .sv.p = sv->p, .sv.len = 0, }; dapush(toks, tok); #undef die_with_off } bool skipws(struct u8view *sv) { rune ch; if (!sv->len) return false; for (int w = u8tor_uc(&ch, sv->p); rprop_is_pat_ws(ch); w = u8tor_uc(&ch, sv->p)) { U8MOV(sv, w); if (!sv->len) return false; } return true; }