From 79e6af86ca526d5fb56af6f6ca3da713e3a5e9f9 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Tue, 13 Feb 2024 13:02:28 +0100 Subject: Genesis commit --- src/c8asm/assembler.c | 74 ++++++ src/c8asm/assembler.h | 10 + src/c8asm/common.h | 16 ++ src/c8asm/grammar.ebnf | 46 ++++ src/c8asm/instr.gperf | 33 +++ src/c8asm/lexer.c | 204 ++++++++++++++++ src/c8asm/lexer.h | 28 +++ src/c8asm/lookup.h | 190 +++++++++++++++ src/c8asm/main.c | 109 +++++++++ src/c8asm/parser.c | 627 +++++++++++++++++++++++++++++++++++++++++++++++++ src/c8asm/parser.h | 122 ++++++++++ 11 files changed, 1459 insertions(+) create mode 100644 src/c8asm/assembler.c create mode 100644 src/c8asm/assembler.h create mode 100644 src/c8asm/common.h create mode 100644 src/c8asm/grammar.ebnf create mode 100644 src/c8asm/instr.gperf create mode 100644 src/c8asm/lexer.c create mode 100644 src/c8asm/lexer.h create mode 100644 src/c8asm/lookup.h create mode 100644 src/c8asm/main.c create mode 100644 src/c8asm/parser.c create mode 100644 src/c8asm/parser.h (limited to 'src/c8asm') diff --git a/src/c8asm/assembler.c b/src/c8asm/assembler.c new file mode 100644 index 0000000..0d02555 --- /dev/null +++ b/src/c8asm/assembler.c @@ -0,0 +1,74 @@ +#include +#include +#include +#include + +#include +#include + +#include "assembler.h" +#include "cerr.h" +#include "common.h" +#include "parser.h" + +/* TODO: Remove */ +#ifndef unreachable +# define unreachable() __builtin_unreachable() +#endif + +#define E_LEXISTS "label ‘%.*s’ has already been declared" + +struct label { + uint16_t addr; + struct u8view sv; +}; + +struct labels { + struct label *buf; + size_t len, cap; +}; + +static bool u8eq(struct u8view, struct u8view); +static void pushlabel(struct labels *, struct label); + +static size_t i; + +bool +u8eq(struct u8view x, struct u8view y) +{ + return x.len == y.len && memcmp(x.p, y.p, x.len) == 0; +} + +void +pushlabel(struct labels *dst, struct label lbl) +{ + da_foreach (dst, stored) { + if (u8eq(stored->sv, 
lbl.sv)) { + die_with_off(filename, lbl.sv.p - baseptr, E_LEXISTS, + U8_PRI_ARGS(lbl.sv)); + } + } + + dapush(dst, lbl); +} + +void +assemble([[maybe_unused]] FILE *stream, struct ast ast) +{ + static struct labels locals, globals; + + da_foreach (&ast, node) { + if (node->kind == D_LABEL) { + struct label lbl = { + .addr = i, + .sv = node->name, + }; + pushlabel(node->name.p[0] == '.' ? &locals : &globals, lbl); + } else if (node->kind == D_INSTR) + i += node->instr.kind == I_DB ? node->instr.len : 2; + else + unreachable(); + } + + locals.len = 0; +} diff --git a/src/c8asm/assembler.h b/src/c8asm/assembler.h new file mode 100644 index 0000000..7b37a53 --- /dev/null +++ b/src/c8asm/assembler.h @@ -0,0 +1,10 @@ +#ifndef AHOY_C8ASM_ASSEMBLER_H +#define AHOY_C8ASM_ASSEMBLER_H + +#include + +#include "parser.h" + +void assemble(FILE *, struct ast); + +#endif /* !AHOY_C8ASM_ASSEMBLER_H */ diff --git a/src/c8asm/common.h b/src/c8asm/common.h new file mode 100644 index 0000000..4f905b3 --- /dev/null +++ b/src/c8asm/common.h @@ -0,0 +1,16 @@ +#ifndef AHOY_C8ASM_COMMON_H +#define AHOY_C8ASM_COMMON_H + +#include + +#define MIN(x, y) ((x) < (y) ? (x) : (y)) +#define MAX(x, y) ((x) > (y) ? 
(x) : (y)) + +#define lengthof(a) (sizeof(a) / sizeof(*(a))) +#define streq(x, y) (!strcmp(x, y)) + +extern size_t filesize; +extern const char *filename; +extern const char8_t *baseptr; + +#endif /* !AHOY_C8ASM_COMMON_H */ diff --git a/src/c8asm/grammar.ebnf b/src/c8asm/grammar.ebnf new file mode 100644 index 0000000..59ba9fc --- /dev/null +++ b/src/c8asm/grammar.ebnf @@ -0,0 +1,46 @@ +program = {line}; +line = {label}, [operation], EOL; +label = IDENT, ':'; + +operation = add | and | bcd | call + | cls | drw | hex | jp + | ld | or | ret | rnd + | rstr | se | shl | shr + | sknp | skp | sne | stor + | sub | subn | sys | xor + | db; + +add = "add", ((vreg, vreg) | (vreg, BYTE) | ("i", vreg)); +and = "and", vreg, vreg; +bcd = "bcd", vreg; +call = "call", addr; +cls = "cls"; +drw = "drw", vreg, vreg, NIBBL; +hex = "hex", vreg; +jp = "jp", ["v0"], addr; +ld = "ld", ((vreg, (vreg | BYTE | "dt" | "k")) + | ("i", addr) + | ("dt", vreg) + | ("st", vreg)); +or = "or", vreg, vreg; +ret = "ret"; +rnd = "rnd", vreg, BYTE; +rstr = "rstr", vreg; +se = "se", vreg, (vreg | BYTE); +shl = "shl", vreg; +shr = "shr", vreg; +sknp = "sknp", vreg; +skp = "skp", vreg; +sne = "sne", vreg, (vreg | BYTE); +stor = "stor", vreg; +sub = "sub", vreg, vreg; +subn = "subn", vreg, vreg; +sys = "sys", addr; +xor = "xor", vreg, vreg; +db = "db", {(BYTE | STRING)}; + +addr = ADDR | IDENT; +vreg = "v0" | "v1" | "v2" | "v3" + | "v4" | "v5" | "v6" | "v7" + | "v8" | "v9" | "va" | "vb" + | "vc" | "vd" | "ve" | "vf"; diff --git a/src/c8asm/instr.gperf b/src/c8asm/instr.gperf new file mode 100644 index 0000000..fac4e57 --- /dev/null +++ b/src/c8asm/instr.gperf @@ -0,0 +1,33 @@ +%compare-lengths +%define initializer-suffix ,nullptr +%define lookup-function-name oplookup +%includes +%readonly-tables +%struct-type +struct opf_pair { char *name; void (*pfn)(void); }; +%% +add, parseop_add +and, parseop_and +bcd, parseop_bcd +call, parseop_call +cls, parseop_cls +db, parseop_db +drw, parseop_drw +hex, parseop_hex 
+jp, parseop_jp +ld, parseop_ld +or, parseop_or +ret, parseop_ret +rnd, parseop_rnd +rstr, parseop_rstr +se, parseop_se +shl, parseop_shl +shr, parseop_shr +sknp, parseop_sknp +skp, parseop_skp +sne, parseop_sne +stor, parseop_stor +sub, parseop_sub +subn, parseop_subn +sys, parseop_sys +xor, parseop_xor diff --git a/src/c8asm/lexer.c b/src/c8asm/lexer.c new file mode 100644 index 0000000..effc32e --- /dev/null +++ b/src/c8asm/lexer.c @@ -0,0 +1,204 @@ +#include +#include +#include + +#include "cerr.h" +#include "common.h" +#include "lexer.h" + +#define ISDIGIT(n) ((n) >= '0' && (n) <= '9') +#define U8MOV(sv, n) ((sv)->p += (n), (sv)->len -= (n)) + +#define E_BASE "integer with invalid base specifier ‘%.*s’" +#define E_EXTRA "unknown extraneous character ‘%.*s’" +#define E_IDENTCHAR "illegal character in identifier ‘%.*s’" +#define E_IDENTLOST "local label missing identifier" +#define E_IDENTSCHAR "illegal first character in identifier ‘%.*s’" +#define E_UNTERMINATED "unterminated string literal ‘%.*s%.*s’" +#define E_UTF8 "invalid UTF-8 byte near ‘%02X’" + +#define EOLS U"\n\v\f\r\x85\u2028\u2029" +#define NUMCHARS U"'0123456789abcdefABCDEF" + +static void lexline(struct tokens *, struct u8view *); +static bool skipws(struct u8view *); + +const char * +tokrepr(tokkind k) +{ + return (const char *[]){ + [T_COLON] = "colon", [T_EOL] = "end of line", + [T_IDENT] = "identifier", [T_NUMBER] = "number", + [T_STRING] = "string", + }[k]; +} + +struct tokens +lexfile(struct u8view sv) +{ + const char8_t *s; + struct tokens toks; + + if (s = u8chk(sv.p, sv.len)) + die_with_off(filename, s - sv.p, E_UTF8, *s); + + dainit(&toks, 256); + + while (sv.len) { + size_t len = u8cbspn(sv.p, sv.len, EOLS, lengthof(EOLS) - 1); + struct u8view line = { + .p = sv.p, + .len = len, + }; + + lexline(&toks, &line); + + /* Skip trailing EOL */ + if (sv.len > len) + len += u8rlen(sv.p + len); + + U8MOV(&sv, len); + } + + return toks; +} + +void +lexline(struct tokens *toks, struct u8view *sv) 
+{ +#define die_with_off(...) \ + die_with_off(filename, sv->p - baseptr - w, __VA_ARGS__); + + struct token tok; + + for (;;) { + int w; + rune ch; + + if (!skipws(sv)) + goto end; + + tok.sv.p = sv->p; + tok.sv.len = w = u8next(&ch, &sv->p, &sv->len); + + if (ISDIGIT(ch)) { + size_t off, m = 10; + + tok.kind = T_NUMBER; + tok.base = 10; + + if (ch == '0') { + w = u8next(&ch, &sv->p, &sv->len); + if (!w || rprop_is_pat_ws(ch)) { + sv->p -= w; + sv->len += w; + goto out; + } + tok.sv.len++; + + switch (ch) { + case 'b': + tok.base = m = 2; + break; + case 'o': + tok.base = m = 8; + break; + case 'd': + /* Implicitly base-10 already */ + break; + case 'x': + /* m = 22 because A–F can be both upper- or lowercase */ + tok.base = 16; + m = 22; + break; + default: + if (!ISDIGIT(ch)) + die_with_off(E_BASE, w, sv->p - w); + } + } + +out: + /* +1 to support the digit separator */ + tok.sv.len += off = u8bspn(sv->p, sv->len, NUMCHARS, m + 1); + U8MOV(sv, off); + } else if (ch == '.' || ch == '_' || rprop_is_xids(ch)) { + tok.kind = T_IDENT; + if (ch == '.') { + if (!sv->len) + die_with_off(E_IDENTLOST); + + tok.sv.len += w = u8next(&ch, &sv->p, &sv->len); + if (rprop_is_pat_ws(ch)) + die_with_off(E_IDENTLOST); + if (ch != '_' && !rprop_is_xids(ch)) { + die_with_off(E_IDENTSCHAR, w, sv->p - w); + } + } + + while (w = u8next(&ch, &sv->p, &sv->len)) { + if (ch == ':' || rprop_is_pat_ws(ch)) { + U8MOV(sv, -w); + break; + } + if (!rprop_is_xidc(ch)) + die_with_off(E_IDENTCHAR, w, sv->p - w); + + tok.sv.len += w; + } + } else if (ch == '"') { + tok.kind = T_STRING; + while (w = u8next(&ch, &sv->p, &sv->len)) { + tok.sv.len += w; + if (ch == '"') + goto found; + } + die_with_off(E_UNTERMINATED, (int)MIN(tok.sv.len, 20), tok.sv.p, + tok.sv.len > 20 ? 
(int)lengthof(u8"…") - 1 : 0, u8"…"); +found: + } else if (ch == ':') { + tok.kind = T_COLON; + } else if (ch == ';') { + goto end; + } else { + die_with_off(E_EXTRA, w, sv->p - w); + } + + /* The colon is the only token that isn’t whitespace separated */ + if (ch != ':' && sv->len) { + w = u8next(&ch, &sv->p, &sv->len); + if (!w || !rprop_is_pat_ws(ch)) + die_with_off(E_EXTRA, w, sv->p - w); + } + + dapush(toks, tok); + } + +end:; + tok = (struct token){ + .kind = T_EOL, + .sv.p = sv->p, + .sv.len = 0, + }; + dapush(toks, tok); + +#undef die_with_off +} + +bool +skipws(struct u8view *sv) +{ + rune ch; + + if (!sv->len) + return false; + + for (int w = u8tor_uc(&ch, sv->p); rprop_is_pat_ws(ch); + w = u8tor_uc(&ch, sv->p)) + { + U8MOV(sv, w); + if (!sv->len) + return false; + } + + return true; +} diff --git a/src/c8asm/lexer.h b/src/c8asm/lexer.h new file mode 100644 index 0000000..ef20cef --- /dev/null +++ b/src/c8asm/lexer.h @@ -0,0 +1,28 @@ +#ifndef AHOY_C8ASM_LEXER_H +#define AHOY_C8ASM_LEXER_H + +#include + +typedef enum [[clang::flag_enum]] { + T_COLON = 1 << 0, + T_EOL = 1 << 1, + T_IDENT = 1 << 2, + T_NUMBER = 1 << 3, + T_STRING = 1 << 4, +} tokkind; + +struct token { + tokkind kind; + struct u8view sv; + int base; /* For number literals */ +}; + +struct tokens { + struct token *buf; + size_t len, cap; +}; + +const char *tokrepr(tokkind); +struct tokens lexfile(struct u8view); + +#endif /* !AHOY_C8ASM_LEXER_H */ diff --git a/src/c8asm/lookup.h b/src/c8asm/lookup.h new file mode 100644 index 0000000..26bc141 --- /dev/null +++ b/src/c8asm/lookup.h @@ -0,0 +1,190 @@ +/* ANSI-C code produced by gperf version 3.1 */ +/* Command-line: gperf --output-file src/c8asm/lookup.h src/c8asm/instr.gperf */ +/* Computed positions: -k'1-3' */ + +#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ + && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \ + && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \ + && ('-' == 45) && ('.' 
== 46) && ('/' == 47) && ('0' == 48) \ + && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \ + && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \ + && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \ + && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \ + && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \ + && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \ + && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \ + && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \ + && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \ + && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \ + && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \ + && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \ + && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \ + && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \ + && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \ + && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \ + && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \ + && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \ + && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)) +/* The character set is not based on ISO-646. */ +#error "gperf generated tables don't work with this execution character set. Please report a bug to ." 
+#endif + +#line 7 "src/c8asm/instr.gperf" +struct opf_pair { char *name; void (*pfn)(void); }; +#include + +#define TOTAL_KEYWORDS 25 +#define MIN_WORD_LENGTH 2 +#define MAX_WORD_LENGTH 4 +#define MIN_HASH_VALUE 2 +#define MAX_HASH_VALUE 49 +/* maximum key range = 48, duplicates = 0 */ + +#ifdef __GNUC__ +__inline +#else +#ifdef __cplusplus +inline +#endif +#endif +static unsigned int +hash (register const char *str, register size_t len) +{ + static const unsigned char asso_values[] = + { + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 5, 15, 15, + 10, 0, 50, 50, 10, 50, 0, 5, 20, 50, + 5, 5, 18, 50, 0, 0, 0, 30, 50, 8, + 3, 3, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50 + }; + register unsigned int hval = len; + + switch (hval) + { + default: + hval += asso_values[(unsigned char)str[2]]; + /*FALLTHROUGH*/ + case 2: + hval += asso_values[(unsigned char)str[1]]; + /*FALLTHROUGH*/ + case 1: + hval += asso_values[(unsigned char)str[0]]; + break; + } + return hval; +} + +const struct opf_pair * +oplookup (register const char *str, register size_t len) +{ + static const unsigned char lengthtable[] = + { + 0, 
0, 2, 3, 4, 0, 3, 2, 3, 4, 0, 3, 0, 3, + 4, 0, 3, 0, 3, 0, 2, 3, 0, 3, 0, 0, 3, 2, + 3, 0, 0, 0, 2, 3, 0, 0, 0, 0, 3, 0, 0, 0, + 0, 3, 4, 0, 0, 0, 3, 4 + }; + static const struct opf_pair wordlist[] = + { + {"",nullptr}, {"",nullptr}, +#line 23 "src/c8asm/instr.gperf" + {"se", parseop_se}, +#line 20 "src/c8asm/instr.gperf" + {"ret", parseop_ret}, +#line 22 "src/c8asm/instr.gperf" + {"rstr", parseop_rstr}, + {"",nullptr}, +#line 32 "src/c8asm/instr.gperf" + {"sys", parseop_sys}, +#line 19 "src/c8asm/instr.gperf" + {"or", parseop_or}, +#line 28 "src/c8asm/instr.gperf" + {"sne", parseop_sne}, +#line 29 "src/c8asm/instr.gperf" + {"stor", parseop_stor}, + {"",nullptr}, +#line 33 "src/c8asm/instr.gperf" + {"xor", parseop_xor}, + {"",nullptr}, +#line 25 "src/c8asm/instr.gperf" + {"shr", parseop_shr}, +#line 26 "src/c8asm/instr.gperf" + {"sknp", parseop_sknp}, + {"",nullptr}, +#line 16 "src/c8asm/instr.gperf" + {"hex", parseop_hex}, + {"",nullptr}, +#line 21 "src/c8asm/instr.gperf" + {"rnd", parseop_rnd}, + {"",nullptr}, +#line 17 "src/c8asm/instr.gperf" + {"jp", parseop_jp}, +#line 15 "src/c8asm/instr.gperf" + {"drw", parseop_drw}, + {"",nullptr}, +#line 10 "src/c8asm/instr.gperf" + {"and", parseop_and}, + {"",nullptr}, {"",nullptr}, +#line 27 "src/c8asm/instr.gperf" + {"skp", parseop_skp}, +#line 14 "src/c8asm/instr.gperf" + {"db", parseop_db}, +#line 9 "src/c8asm/instr.gperf" + {"add", parseop_add}, + {"",nullptr}, {"",nullptr}, {"",nullptr}, +#line 18 "src/c8asm/instr.gperf" + {"ld", parseop_ld}, +#line 24 "src/c8asm/instr.gperf" + {"shl", parseop_shl}, + {"",nullptr}, {"",nullptr}, {"",nullptr}, {"",nullptr}, +#line 13 "src/c8asm/instr.gperf" + {"cls", parseop_cls}, + {"",nullptr}, {"",nullptr}, {"",nullptr}, {"",nullptr}, +#line 11 "src/c8asm/instr.gperf" + {"bcd", parseop_bcd}, +#line 12 "src/c8asm/instr.gperf" + {"call", parseop_call}, + {"",nullptr}, {"",nullptr}, {"",nullptr}, +#line 30 "src/c8asm/instr.gperf" + {"sub", parseop_sub}, +#line 31 
"src/c8asm/instr.gperf" + {"subn", parseop_subn} + }; + + if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) + { + register unsigned int key = hash (str, len); + + if (key <= MAX_HASH_VALUE) + if (len == lengthtable[key]) + { + register const char *s = wordlist[key].name; + + if (*str == *s && !memcmp (str + 1, s + 1, len - 1)) + return &wordlist[key]; + } + } + return 0; +} diff --git a/src/c8asm/main.c b/src/c8asm/main.c new file mode 100644 index 0000000..edc595b --- /dev/null +++ b/src/c8asm/main.c @@ -0,0 +1,109 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include "assembler.h" +#include "cerr.h" +#include "common.h" +#include "lexer.h" +#include "parser.h" + +static void asmfile(int, const char *); + +size_t filesize; +const char *filename; +const char8_t *baseptr; + +int +main(int argc, char **argv) +{ + int opt; + const struct option longopts[] = { + {"help", no_argument, nullptr, 'h'}, + {nullptr, no_argument, nullptr, 0 }, + }; + + cerrinit(*argv); + while ((opt = getopt_long(argc, argv, "h", longopts, nullptr)) != -1) { + switch (opt) { + case 'h': + execlp("man", "man", "1", argv[0], nullptr); + die("execlp: man 1 %s", argv[0]); + default: + fprintf(stderr, "Usage: %s [file ...]\n", argv[0]); + exit(EXIT_FAILURE); + } + } + + argc -= optind; + argv += optind; + + if (!argc) + asmfile(STDIN_FILENO, "-"); + for (int i = 0; i < argc; i++) { + if (streq("-", argv[i])) + asmfile(STDIN_FILENO, "-"); + else { + int fd; + if ((fd = open(argv[i], O_RDONLY)) == -1) + die("open: %s", argv[i]); + asmfile(fd, argv[i]); + close(fd); + } + } + + return EXIT_SUCCESS; +} + +void +asmfile(int fd, const char *fn) +{ + char *buf; + size_t blksize; + ssize_t nr; + struct ast ast; + struct stat st; + struct u8str sb; + struct tokens toks; + + filename = fn; + + if (fstat(fd, &st) == -1) + die("fstat: %s", filename); + blksize = MAX(st.st_blksize, BUFSIZ); + if (!(buf = malloc(blksize))) + die("malloc"); + + /* Load the 
contents of the file into sb */ + u8strinit(&sb, S_ISREG(st.st_mode) ? (size_t)st.st_size : blksize); + while ((nr = read(fd, buf, blksize)) > 0) { + struct u8view v = { + .p = buf, + .len = nr, + }; + if (!u8strpush(&sb, v)) + die("u8strpush"); + } + if (nr == -1) + die("read: %s", filename); + + free(buf); + filesize = sb.len; + baseptr = u8strfit(&sb)->p; + assemble(stdout, ast = parsefile(toks = lexfile(u8strtou8(sb)))); + + da_foreach (&ast, node) { + if (node->kind == D_INSTR && node->instr.kind == I_DB) + free(node->instr.buf); + } + + free(toks.buf); + u8strfree(sb); +} diff --git a/src/c8asm/parser.c b/src/c8asm/parser.c new file mode 100644 index 0000000..b746bea --- /dev/null +++ b/src/c8asm/parser.c @@ -0,0 +1,627 @@ +#include +#include +#include +#include + +#include + +#include "cerr.h" +#include "common.h" +#include "lexer.h" +#include "parser.h" + +/* TODO: Remove */ +#ifndef unreachable +# define unreachable() __builtin_unreachable() +#endif + +#define E_BADLABEL "identifier ‘%.*s’ cannot be used as a label" +#define E_EARLY "expected %s but input ended prematurely" +#define E_EXPECTED2 "expected %s but got %s" +#define E_EXPECTED "expected %s but got %s ‘%.*s’" +#define E_INSTR "got unknown instruction ‘%.*s’" +#define E_TOOLARGE "expected %s but got out-of-range integer ‘%.*s’" + +#define die_with_off(P, ...) 
die_with_off(filename, (P)-baseptr, __VA_ARGS__) +#define memeq(X, Y, N) (!memcmp(X, Y, N)) + +enum numsize { + NS_NIBBLE = 0xF, + NS_BYTE = 0xFF, + NS_ADDR = 0xFFF, +}; + +enum regtype { + RT_NONE, + RT_DT, + RT_I, + RT_K, + RT_ST, + RT_VX, +}; + +static bool parselabel(void); +static void parseline(void); +static void parseop(void); +static struct raw_addr parseaddr(struct token); +static struct token reqnext(const char *, tokkind); + +static uint16_t hexval(char); +static uint16_t parsenum(struct token, enum numsize); +static enum regtype regtype(struct u8view); + +static void parseop_add(void), parseop_and(void), parseop_bcd(void), + parseop_call(void), parseop_cls(void), parseop_db(void), parseop_drw(void), + parseop_hex(void), parseop_jp(void), parseop_ld(void), parseop_or(void), + parseop_ret(void), parseop_rnd(void), parseop_rstr(void), parseop_se(void), + parseop_shl(void), parseop_shr(void), parseop_sknp(void), parseop_skp(void), + parseop_sne(void), parseop_stor(void), parseop_sub(void), + parseop_subn(void), parseop_sys(void), parseop_xor(void); +#include "lookup.h" + +static size_t i; +static struct ast ast; +static struct tokens *tokens; + +struct ast +parsefile(struct tokens toks) +{ + ast.len = i = 0; + tokens = &toks; + + while (i < toks.len) + parseline(); + + /* We can safely not do this, but GCC disagrees. This gets GCC to not + complain about dangling pointers. 
*/ + tokens = nullptr; + + return ast; +} + +void +parseline(void) +{ + while (parselabel()) + ; + parseop(); + reqnext("end of line", T_EOL); +} + +bool +parselabel(void) +{ + if (tokens->len - i >= 2 && tokens->buf[i].kind == T_IDENT + && tokens->buf[i + 1].kind == T_COLON) + { + struct dir lbl = { + .kind = D_LABEL, + .name = tokens->buf[i].sv, + }; + if (regtype(lbl.name) != RT_NONE) + die_with_off(lbl.name.p, E_BADLABEL, U8_PRI_ARGS(lbl.name)); + dapush(&ast, lbl); + i += 2; + return true; + } + + return false; +} + +void +parseop(void) +{ + const struct opf_pair *op; + struct token tok = reqnext("instruction or end of line", T_IDENT | T_EOL); + + if (tok.kind == T_EOL) { + i--; + return; + } + + if (!(op = oplookup(tok.sv.p, tok.sv.len))) + die_with_off(tok.sv.p, E_INSTR, U8_PRI_ARGS(tok.sv)); + op->pfn(); +} + +struct raw_addr +parseaddr(struct token tok) +{ + if (tok.kind == T_NUMBER) + return (struct raw_addr){.val = parsenum(tok, NS_ADDR)}; + if (tok.kind == T_STRING) { + if (regtype(tok.sv) != RT_NONE) + die_with_off(tok.sv.p, E_BADLABEL, U8_PRI_ARGS(tok.sv)); + return (struct raw_addr){.label = true, .sv = tok.sv}; + } + unreachable(); +} + +enum regtype +regtype(struct u8view v) +{ + if (v.len == 0 || v.len > 2) + return RT_NONE; + if (v.len == 1) + return v.p[0] == 'i' ? RT_I : v.p[0] == 'k' ? RT_K : RT_NONE; + if (memeq(v.p, "dt", 2)) + return RT_DT; + if (memeq(v.p, "st", 2)) + return RT_ST; + return v.p[0] == 'v' + && ((v.p[1] >= '0' && v.p[1] <= '9') + || (v.p[1] >= 'a' && v.p[1] <= 'f')) + ? RT_VX + : RT_NONE; +} + +uint16_t +hexval(char ch) +{ + return ch >= '0' && ch <= '9' ? ch - '0' + : ch >= 'a' && ch <= 'f' ? 
ch - 'a' + 10 + : (unreachable(), 0); +} + +/* Convert a number token to its value in base tok.base. A leading two-character base prefix ('0' followed by a non-digit) is skipped and ' digit separators are ignored. Dies with E_TOOLARGE when the value would exceed size. */ +uint16_t +parsenum(struct token tok, enum numsize size) +{ + char ch; + uint16_t acc, cutoff, cutlim; + struct u8view v = tok.sv; + + acc = 0; + cutoff = size; + cutlim = cutoff % tok.base; + cutoff /= tok.base; + + if (v.len >= 2 && v.p[0] == '0' && v.p[1] > '9') { + v.p += 2; + v.len -= 2; + } + + for (; v.len; v.p++, v.len--) { + /* Load the digit inside the body so that nothing is read once v.len has reached zero. */ + ch = *v.p; + if (ch == '\'') + continue; + else if (ch >= '0' && ch <= '9') + ch -= '0'; + else if (ch >= 'a' && ch <= 'f') + ch -= 'a' - 10; + else if (ch >= 'A' && ch <= 'F') + ch -= 'A' - 10; + else + unreachable(); + + /* Overflow check before accumulating: cutoff is size / base and cutlim is size % base. */ + if (acc > cutoff || (acc == cutoff && ch > cutlim)) { + const char *s = size == NS_NIBBLE ? "nibble" + : size == NS_BYTE ? "byte" + : size == NS_ADDR ? "address" + : (unreachable(), nullptr); + die_with_off(tok.sv.p, E_TOOLARGE, s, U8_PRI_ARGS(tok.sv)); + } + + acc *= tok.base; + acc += ch; + } + + return acc; +} + +/* Consume and return the next token when its kind is contained in msk; otherwise die with a message that names want. Also dies when no tokens are left. */ +struct token +reqnext(const char *want, tokkind msk) +{ + struct token t; + if (i >= tokens->len) + die_with_off(baseptr + filesize - 1, E_EARLY, want); + + if ((t = tokens->buf[i++]).kind & msk) + return t; + if (t.kind == T_EOL) + die_with_off(t.sv.p, E_EXPECTED2, want, tokrepr(t.kind)); + die_with_off(t.sv.p, E_EXPECTED, want, tokrepr(t.kind), U8_PRI_ARGS(t.sv)); +} + +#define I(...) 
((struct dir){.kind = D_INSTR, .instr = (__VA_ARGS__)}) + +/* Common implementations of instructions that always take 1 or 2 v-registers */ +/* enum regtype values are sequential, not bit flags, so each operand is compared against RT_VX directly; a mask test with ~RT_VX would also accept RT_NONE, RT_DT and RT_ST. */ +#define ONE_VREG(T) \ + do { \ + struct instr ins = {.kind = (T)}; \ + struct token tok = reqnext("v-register", T_IDENT); \ + if (regtype(tok.sv) != RT_VX) { \ + die_with_off(tok.sv.p, E_EXPECTED, "v-register", \ + tokrepr(tok.kind), U8_PRI_ARGS(tok.sv)); \ + } \ + ins.args[ins.len++].val = hexval(tok.sv.p[1]); \ + dapush(&ast, I(ins)); \ + } while (false) +#define TWO_VREG(T) \ + do { \ + struct instr ins = {.kind = (T)}; \ + struct token lhs = reqnext("v-register", T_IDENT); \ + struct token rhs = reqnext("v-register", T_IDENT); \ + if (regtype(lhs.sv) != RT_VX) { \ + die_with_off(lhs.sv.p, E_EXPECTED, "v-register", \ + tokrepr(lhs.kind), U8_PRI_ARGS(lhs.sv)); \ + } \ + if (regtype(rhs.sv) != RT_VX) { \ + die_with_off(rhs.sv.p, E_EXPECTED, "v-register", \ + tokrepr(rhs.kind), U8_PRI_ARGS(rhs.sv)); \ + } \ + ins.args[ins.len++].val = hexval(lhs.sv.p[1]); \ + ins.args[ins.len++].val = hexval(rhs.sv.p[1]); \ + dapush(&ast, I(ins)); \ + } while (false) + +void +parseop_add(void) +{ + enum regtype rt; + struct instr ins = {}; + struct token tok = reqnext("v- or i-register", T_IDENT); + + switch (rt = regtype(tok.sv)) { + case RT_VX: + ins.args[ins.len++].val = hexval(tok.sv.p[1]); + tok = reqnext("byte or v-register", T_IDENT | T_NUMBER); + + if (tok.kind == T_NUMBER) { + ins.kind = I_ADD_VX_B; + ins.args[ins.len++].val = parsenum(tok, NS_BYTE); + } else if (regtype(tok.sv) != RT_VX) { + die_with_off(tok.sv.p, E_EXPECTED, "v-register", tokrepr(tok.kind), + U8_PRI_ARGS(tok.sv)); + } else { + ins.kind = I_ADD_VX_VY; + ins.args[ins.len++].val = hexval(tok.sv.p[1]); + } + break; + case RT_I: + ins.kind = I_ADD_I_VX; + tok = reqnext("v-register", T_IDENT); + if (regtype(tok.sv) != RT_VX) { + die_with_off(tok.sv.p, E_EXPECTED, "v-register", tokrepr(tok.kind), + U8_PRI_ARGS(tok.sv)); + } + ins.args[ins.len++].val = hexval(tok.sv.p[1]); 
+ break; + default: + die_with_off(tok.sv.p, E_EXPECTED, "v- or i-register", + tokrepr(tok.kind), U8_PRI_ARGS(tok.sv)); + } + + dapush(&ast, I(ins)); +} + +void +parseop_and(void) +{ + TWO_VREG(I_AND); +} + +void +parseop_bcd(void) +{ + ONE_VREG(I_BCD); +} + +void +parseop_call(void) +{ + struct instr ins = {.kind = I_CALL}; + struct token tok = reqnext("address", T_IDENT | T_NUMBER); + ins.args[ins.len++] = parseaddr(tok); + dapush(&ast, I(ins)); +} + +void +parseop_cls(void) +{ + dapush(&ast, I((struct instr){.kind = I_CLS})); +} + +void +parseop_db(void) +{ + struct instr ins = {.kind = I_DB}; + do { + struct token tok = reqnext("byte or string", T_NUMBER | T_STRING); + switch (tok.kind) { + case T_NUMBER: + dapush(&ins, parsenum(tok, NS_BYTE)); + break; + case T_STRING: + for (size_t i = 1; i < tok.sv.len - 1; i++) + dapush(&ins, tok.sv.p[i]); + break; + default: + unreachable(); + } + } while (i < tokens->len && tokens->buf[i].kind != T_EOL); + + dapush(&ast, I(ins)); +} + +void +parseop_drw(void) +{ + struct instr ins = {.kind = I_DRW}; + struct token op1, op2, op3; + + op1 = reqnext("v-register", T_IDENT); + op2 = reqnext("v-register", T_IDENT); + op3 = reqnext("nibble", T_NUMBER); + + if (regtype(op1.sv) != RT_VX) { + die_with_off(op1.sv.p, E_EXPECTED, "v-register", tokrepr(op1.kind), + U8_PRI_ARGS(op1.sv)); + } + if (regtype(op2.sv) != RT_VX) { + die_with_off(op2.sv.p, E_EXPECTED, "v-register", tokrepr(op2.kind), + U8_PRI_ARGS(op2.sv)); + } + + ins.args[ins.len++].val = hexval(op1.sv.p[1]); + ins.args[ins.len++].val = hexval(op2.sv.p[1]); + ins.args[ins.len++].val = parsenum(op3, NS_NIBBLE); + dapush(&ast, I(ins)); +} + +void +parseop_hex(void) +{ + ONE_VREG(I_HEX); +} + +void +parseop_jp(void) +{ + enum regtype rt; + struct instr ins = {}; + struct token op = reqnext("v0-register or address", T_IDENT | T_NUMBER); + + if (op.kind == T_IDENT) + rt = regtype(op.sv); + if (op.kind == T_NUMBER || (op.kind == T_IDENT && rt == RT_NONE)) { + ins.kind = I_JP_ADDR; 
+ ins.args[ins.len++] = parseaddr(op); + } else if (op.kind == T_IDENT) { + ins.kind = I_JP_V0_ADDR; + if (op.sv.len != 2 || !memeq(op.sv.p, "v0", 2)) { + die_with_off(op.sv.p, E_EXPECTED, "v0-register or address", + tokrepr(op.kind), U8_PRI_ARGS(op.sv)); + } + ins.args[ins.len++] = parseaddr(reqnext("address", T_NUMBER | T_IDENT)); + } else + unreachable(); + + dapush(&ast, I(ins)); +} + +void +parseop_ld(void) +{ + enum regtype rt; + struct instr ins = {}; + struct token op = reqnext("v-, i-, dt-, or st-register", T_IDENT); + + switch (rt = regtype(op.sv)) { + case RT_DT: + case RT_ST: + ins.kind = rt == RT_DT ? I_LD_DT : I_LD_ST; + op = reqnext("v-register", T_IDENT); + if (regtype(op.sv) != RT_VX) { + die_with_off(op.sv.p, E_EXPECTED, "v-register", tokrepr(op.kind), + U8_PRI_ARGS(op.sv)); + } + ins.args[ins.len++].val = hexval(op.sv.p[1]); + break; + + case RT_I: + ins.kind = I_LD_I; + ins.args[ins.len++] = parseaddr(reqnext("address", T_NUMBER | T_IDENT)); + break; + + case RT_VX: + ins.args[ins.len++].val = hexval(op.sv.p[1]); + op = reqnext("v-, k-, or dt-register, or byte", T_IDENT | T_NUMBER); + + switch (op.kind) { + case T_IDENT: + switch (rt = regtype(op.sv)) { + case RT_DT: + ins.kind = I_LD_VX_DT; + break; + case RT_K: + ins.kind = I_LD_VX_K; + break; + case RT_VX: + ins.kind = I_LD_VX_VY; + ins.args[ins.len++].val = hexval(op.sv.p[1]); + break; + default: + die_with_off(op.sv.p, E_EXPECTED, + "v-, k-, or dt-register, or byte", + tokrepr(op.kind), U8_PRI_ARGS(op.sv)); + } + + break; + case T_NUMBER: + ins.kind = I_LD_VX_BYTE; + ins.args[ins.len++].val = parsenum(op, NS_BYTE); + break; + default: + unreachable(); + } + break; + + default: + die_with_off(op.sv.p, E_EXPECTED, "v-, i-, dt-, or st-register", + tokrepr(op.kind), U8_PRI_ARGS(op.sv)); + } + + dapush(&ast, I(ins)); +} + +void +parseop_or(void) +{ + TWO_VREG(I_OR); +} + +void +parseop_ret(void) +{ + dapush(&ast, I((struct instr){.kind = I_RET})); +} + +void +parseop_rnd(void) +{ + struct instr 
ins = {.kind = I_RND}; + struct token op1, op2; + + op1 = reqnext("v-register", T_IDENT); + op2 = reqnext("byte", T_NUMBER); + + if (regtype(op1.sv) != RT_VX) { + die_with_off(op1.sv.p, E_EXPECTED, "v-register", tokrepr(op1.kind), + U8_PRI_ARGS(op1.sv)); + } + + ins.args[ins.len++].val = hexval(op1.sv.p[1]); + ins.args[ins.len++].val = parsenum(op2, NS_BYTE); + dapush(&ast, I(ins)); +} + +void +parseop_rstr(void) +{ + ONE_VREG(I_RSTR); +} + +void +parseop_se(void) +{ + struct instr ins = {}; + struct token op1, op2; + + op1 = reqnext("v-register", T_IDENT); + op2 = reqnext("byte or v-register", T_IDENT | T_NUMBER); + + if (regtype(op1.sv) != RT_VX) { + die_with_off(op1.sv.p, E_EXPECTED, "v-register", tokrepr(op1.kind), + U8_PRI_ARGS(op1.sv)); + } + ins.args[ins.len++].val = hexval(op1.sv.p[1]); + + switch (op2.kind) { + case T_IDENT: + if (regtype(op2.sv) != RT_VX) { + die_with_off(op2.sv.p, E_EXPECTED, "v-register", tokrepr(op2.kind), + U8_PRI_ARGS(op2.sv)); + } + ins.kind = I_SE_VX_VY; + ins.args[ins.len++].val = hexval(op2.sv.p[1]); + break; + case T_NUMBER: + ins.kind = I_SE_VX_B; + ins.args[ins.len++].val = parsenum(op2, NS_BYTE); + break; + default: + unreachable(); + } + + dapush(&ast, I(ins)); +} + +void +parseop_shl(void) +{ + ONE_VREG(I_SHL); +} + +void +parseop_shr(void) +{ + ONE_VREG(I_SHR); +} + +void +parseop_sknp(void) +{ + ONE_VREG(I_SKNP); +} + +void +parseop_skp(void) +{ + ONE_VREG(I_SKP); +} + +void +parseop_sne(void) +{ + struct instr ins = {}; + struct token op1, op2; + + op1 = reqnext("v-register", T_IDENT); + op2 = reqnext("byte or v-register", T_IDENT | T_NUMBER); + + if (regtype(op1.sv) != RT_VX) { + die_with_off(op1.sv.p, E_EXPECTED, "v-register", tokrepr(op1.kind), + U8_PRI_ARGS(op1.sv)); + } + ins.args[ins.len++].val = hexval(op1.sv.p[1]); + + switch (op2.kind) { + case T_IDENT: + if (regtype(op2.sv) != RT_VX) { + die_with_off(op2.sv.p, E_EXPECTED, "v-register", tokrepr(op2.kind), + U8_PRI_ARGS(op2.sv)); + } + ins.kind = I_SNE_VX_VY; + 
ins.args[ins.len++].val = hexval(op2.sv.p[1]); + break; + case T_NUMBER: + ins.kind = I_SNE_VX_B; + ins.args[ins.len++].val = parsenum(op2, NS_BYTE); + break; + default: + unreachable(); + } + + dapush(&ast, I(ins)); +} + +void +parseop_stor(void) +{ + ONE_VREG(I_STOR); +} + +void +parseop_sub(void) +{ + TWO_VREG(I_SUB); +} + +void +parseop_subn(void) +{ + TWO_VREG(I_SUBN); +} + +void +parseop_sys(void) +{ + struct instr ins = {.kind = I_SYS}; + ins.args[ins.len++] = parseaddr(reqnext("address", T_NUMBER | T_IDENT)); + dapush(&ast, I(ins)); +} + +void +parseop_xor(void) +{ + TWO_VREG(I_XOR); +} diff --git a/src/c8asm/parser.h b/src/c8asm/parser.h new file mode 100644 index 0000000..392b003 --- /dev/null +++ b/src/c8asm/parser.h @@ -0,0 +1,122 @@ +#ifndef AHOY_C8ASM_PARSER_H +#define AHOY_C8ASM_PARSER_H + +#include +#include + +#include + +struct tokens; + +typedef enum { + I_ADD_I_VX, + I_ADD_VX_B, + I_ADD_VX_VY, + I_AND, + I_BCD, + I_CALL, + I_CLS, + I_DB, + I_DRW, + I_HEX, + I_JP_ADDR, + I_JP_V0_ADDR, + I_LD_DT, + I_LD_I, + I_LD_ST, + I_LD_VX_BYTE, + I_LD_VX_DT, + I_LD_VX_K, + I_LD_VX_VY, + I_OR, + I_RET, + I_RND, + I_RSTR, + I_SE_VX_B, + I_SE_VX_VY, + I_SHL, + I_SHR, + I_SKNP, + I_SKP, + I_SNE_VX_B, + I_SNE_VX_VY, + I_STOR, + I_SUB, + I_SUBN, + I_SYS, + I_XOR, +} instrkind; + +typedef enum { + R_V0, + R_V1, + R_V2, + R_V3, + R_V4, + R_V5, + R_V6, + R_V7, + R_V8, + R_V9, + R_VA, + R_VB, + R_VC, + R_VD, + R_VE, + R_VF, + R_I, + R_K, + R_DT, + R_ST, +} reg; + +typedef enum { + D_INSTR, + D_LABEL, +} dirkind; + +/* Arguments can always be represented by a uint16_t, however the parser is not + responsible for assigning addresses to labels. As a result an arg at this + stage can be either a uint16_t or the name of a label. 
*/ +struct raw_addr { + bool label; + union { + uint16_t val; + struct u8view sv; + }; +}; + +struct instr { + instrkind kind; + + /* The most arguments any instruction can take is 3, so it’s more efficient + to just store the arguments in a fixed-size array. The only exception is + the ‘db’ instruction which takes a variable-number of arguments, so in + that case we use a dynamic array. */ + union { + struct raw_addr args[3]; + struct { + uint8_t *buf; + size_t cap; + }; + }; + + size_t len; +}; + +struct dir { + dirkind kind; + union { + struct u8view name; + struct instr instr; + }; +}; + +struct ast { + struct dir *buf; + size_t len, cap; +}; + +struct ast parsefile(struct tokens); + +#endif /* !AHOY_C8ASM_PARSER_H */ -- cgit v1.2.3