#include <da.h>
#include <mbstring.h>
#include <rtype.h>

#include "cerr.h"
#include "common.h"
#include "lexer.h"
#include "macros.h"

/* True when N is one of the ASCII digits ‘0’–‘9’.  N is evaluated twice. */
#define ISDIGIT(n)   ((n) >= '0' && (n) <= '9')
/* Move the view SV by N bytes: the pointer goes up by N and the length goes
   down by N.  lexline() also passes a negated rune width to step back over a
   rune it has just read.  Both arguments are evaluated twice. */
#define U8MOV(sv, n) ((sv)->p += (n), (sv)->len -= (n))

/* Diagnostic format strings.  Each ‘%.*s’ consumes an int byte count followed
   by a pointer to the text; E_UNTERMINATED takes two such pairs and E_UTF8
   takes the offending byte. */
#define E_BASE         "integer with invalid base specifier ‘%.*s’"
#define E_EXTRA        "unknown extraneous character ‘%.*s’"
#define E_IDENTCHAR    "illegal character in identifier ‘%.*s’"
#define E_IDENTLOST    "local label missing identifier"
#define E_IDENTSCHAR   "illegal first character in identifier ‘%.*s’"
#define E_UNTERMINATED "unterminated string literal ‘%.*s%.*s’"
#define E_UTF8         "invalid UTF-8 byte near ‘%02X’"

/* Code points that lexfile() treats as line terminators.  Callers pass
   lengthof(EOLS) - 1 so that the terminating NUL is not part of the set. */
#define EOLS     U"\n\v\f\r\x85\u2028\u2029"
/* The digit separator followed by every digit character.  The ordering lets
   lexline() select the characters allowed in a numeric literal by taking a
   prefix of this string (separator + binary, octal, decimal or hex digits). */
#define NUMCHARS U"'0123456789abcdefABCDEF"

/* File-local helpers; both are defined further down in this file. */
static void lexline(struct tokens *, struct u8view *);
static bool skipws(struct u8view *);

/* Return a human-readable name for the token kind K, suitable for use in
   diagnostics.  A kind that lies within the table but has no entry yields a
   null pointer; K is not range-checked. */
const char *
tokrepr(tokkind k)
{
	static const char *const names[] = {
		[T_COLON] = "colon",
		[T_EOL] = "end of line",
		[T_IDENT] = "identifier",
		[T_NUMBER] = "number",
		[T_STRING] = "string",
	};

	return names[k];
}

struct tokens
lexfile(struct u8view sv)
{
	const char8_t *s;
	struct tokens toks;

	if (s = u8chk(sv.p, sv.len))
		die_with_off(filename, s - sv.p, E_UTF8, *s);

	dainit(&toks, 256);

	while (sv.len) {
		size_t len = u8cbspn(sv.p, sv.len, EOLS, lengthof(EOLS) - 1);
		struct u8view line = {
			.p = sv.p,
			.len = len,
		};

		lexline(&toks, &line);

		/* Skip trailing EOL */
		if (sv.len > len)
			len += u8rlen(sv.p + len);

		U8MOV(&sv, len);
	}

	return toks;
}

void
lexline(struct tokens *toks, struct u8view *sv)
{
#define die_with_off(...) \
	die_with_off(filename, sv->p - baseptr - w, __VA_ARGS__);

	struct token tok;

	for (;;) {
		int w;
		rune ch;

		if (!skipws(sv))
			goto end;

		tok.sv.p = sv->p;
		tok.sv.len = w = u8next(&ch, &sv->p, &sv->len);

		if (ISDIGIT(ch)) {
			size_t off, m = 10;

			tok.kind = T_NUMBER;
			tok.base = 10;

			if (ch == '0') {
				w = u8next(&ch, &sv->p, &sv->len);
				if (!w || rprop_is_pat_ws(ch)) {
					U8MOV(sv, -w);
					goto out;
				}
				tok.sv.len++;

				switch (ch) {
				case 'b':
					tok.base = m = 2;
					break;
				case 'o':
					tok.base = m = 8;
					break;
				case 'd':
					/* Implicitly base-10 already */
					break;
				case 'x':
					/* m = 22 because A–F can be both upper- or lowercase */
					tok.base = 16;
					m = 22;
					break;
				default:
					if (!ISDIGIT(ch))
						die_with_off(E_BASE, w, sv->p - w);
				}
			}

out:
			/* +1 to support the digit separator */
			tok.sv.len += off = u8bspn(sv->p, sv->len, NUMCHARS, m + 1);
			U8MOV(sv, off);
		} else if (ch == '.' || ch == '_' || rprop_is_xids(ch)) {
			tok.kind = T_IDENT;
			if (ch == '.') {
				if (!sv->len)
					die_with_off(E_IDENTLOST);

				tok.sv.len += w = u8next(&ch, &sv->p, &sv->len);
				if (rprop_is_pat_ws(ch))
					die_with_off(E_IDENTLOST);
				if (ch != '_' && !rprop_is_xids(ch)) {
					die_with_off(E_IDENTSCHAR, w, sv->p - w);
				}
			}

			while (w = u8next(&ch, &sv->p, &sv->len)) {
				if (ch == ':' || rprop_is_pat_ws(ch)) {
					U8MOV(sv, -w);
					break;
				}
				if (!rprop_is_xidc(ch))
					die_with_off(E_IDENTCHAR, w, sv->p - w);

				tok.sv.len += w;
			}
		} else if (ch == '"') {
			tok.kind = T_STRING;
			while (w = u8next(&ch, &sv->p, &sv->len)) {
				tok.sv.len += w;
				if (ch == '"')
					goto found;
			}
			die_with_off(E_UNTERMINATED, (int)MIN(tok.sv.len, 20), tok.sv.p,
			             tok.sv.len > 20 ? (int)lengthof(u8"…") - 1 : 0, u8"…");
found:
		} else if (ch == ':') {
			tok.kind = T_COLON;
		} else if (ch == ';') {
			goto end;
		} else {
			die_with_off(E_EXTRA, w, sv->p - w);
		}

		/* The colon is the only token that isn’t whitespace separated */
		if (ch != ':' && sv->len) {
			w = u8next(&ch, &sv->p, &sv->len);
			if (!w || !rprop_is_pat_ws(ch))
				die_with_off(E_EXTRA, w, sv->p - w);
		}

		dapush(toks, tok);
	}

end:;
	tok = (struct token){
		.kind = T_EOL,
		.sv.p = sv->p,
		.sv.len = 0,
	};
	dapush(toks, tok);

#undef die_with_off
}

/* Advance SV past any leading pattern whitespace.  Returns true if SV then
   starts with a non-whitespace rune, and false if SV was (or became) empty. */
bool
skipws(struct u8view *sv)
{
	while (sv->len) {
		rune r;
		int n = u8tor_uc(&r, sv->p);

		if (!rprop_is_pat_ws(r))
			return true;

		U8MOV(sv, n);
	}

	return false;
}