From f09e816aee0513031656734cc3cded9827e0b22b Mon Sep 17 00:00:00 2001
From: Thomas Voss <thomas.voss@humanwave.nl>
Date: Mon, 2 Mar 2026 16:27:17 +0100
Subject: Significantly improve lexer error reporting

---
 oryxc/src/lexer.rs | 309 +++++++++++++++++++++++++++--------------------------
 1 file changed, 155 insertions(+), 154 deletions(-)

(limited to 'oryxc/src/lexer.rs')
diff --git a/oryxc/src/lexer.rs b/oryxc/src/lexer.rs
index 2e1a21c..6f05a9c 100644
--- a/oryxc/src/lexer.rs
+++ b/oryxc/src/lexer.rs
@@ -1,13 +1,9 @@
-use std::borrow::Cow;
-use std::ffi::OsStr;
-use std::fmt::Display;
 use std::iter::Peekable;
 use std::mem;
 use std::str::{
 	self,
 	Chars,
 };
-use std::vec::Vec;
 
 use phf;
 use soa_rs::{
@@ -15,10 +11,8 @@ use soa_rs::{
 	Soars,
 };
 
-use crate::{
-	errors,
-	unicode,
-};
+use crate::errors::OryxError;
+use crate::unicode;
 
 #[allow(dead_code)]
 #[repr(u8)]
@@ -60,17 +54,6 @@ pub enum TokenType {
 }
 
 impl TokenType {
-	pub fn literalp(&self) -> bool {
-		return match self {
-			Self::Identifier
-			| Self::KeywordDef
-			| Self::KeywordFunc
-			| Self::Number
-			| Self::String => true,
-			_ => false,
-		};
-	}
-
 	/* Tokens that start an expression */
 	pub fn exprp(&self) -> bool {
 		return match self {
@@ -100,41 +83,20 @@ pub struct Token {
 	pub view: Span,
 }
 
-pub struct Error {
-	pub pos: usize,
-	pub msg: Cow<'static, str>,
-}
-
-impl Error {
-	fn new<T>(pos: usize, msg: T) -> Self
-	where
-		T: Into<Cow<'static, str>>,
-	{
-		return Self {
-			pos,
-			msg: msg.into(),
-		};
-	}
-}
-
 struct LexerContext<'a> {
-	pos_a:          usize, /* Pos [a]fter char */
-	pos_b:          usize, /* Pos [b]efore char */
-	chars:          Peekable<Chars<'a>>,
-	string:         &'a str,
-	filename:       &'a OsStr,
-	expect_punct_p: bool,
+	pos_a:  usize, /* Pos [a]fter char */
+	pos_b:  usize, /* Pos [b]efore char */
+	chars:  Peekable<Chars<'a>>,
+	string: &'a str,
 }
 
 impl<'a> LexerContext<'a> {
-	fn new(filename: &'a OsStr, string: &'a str) -> Self {
+	fn new(string: &'a str) -> Self {
 		return Self {
 			pos_a: 0,
 			pos_b: 0,
 			chars: string.chars().peekable(),
 			string,
-			filename,
-			expect_punct_p: false,
 		};
 	}
 
@@ -150,22 +112,6 @@ impl<'a> LexerContext<'a> {
 	fn peek(&mut self) -> Option<char> {
 		return self.chars.peek().copied();
 	}
-
-	fn err_at_position<S>(&self, s: S) -> !
-	where
-		S: Display,
-	{
-		errors::err_at_position(self.filename, s);
-	}
-
-	#[inline(always)]
-	fn literal_spacing_guard(&self) {
-		if self.expect_punct_p {
-			self.err_at_position(
-				"Two literals may not be directly adjacent to each other",
-			);
-		}
-	}
 }
 
 static KEYWORDS: phf::Map<&'static str, TokenType> = phf::phf_map! {
@@ -174,16 +120,15 @@ static KEYWORDS: phf::Map<&'static str, TokenType> = phf::phf_map! {
 	"return" => TokenType::KeywordReturn,
 };
 
-pub fn tokenize(filename: &OsStr, s: &str) -> Result<Soa<Token>, Vec<Error>> {
+pub fn tokenize(s: &str) -> Result<Soa<Token>, OryxError> {
 	let mut toks = Soa::<Token>::with_capacity(s.len() / 2);
-	let mut ctx = LexerContext::new(filename, s);
+	let mut ctx = LexerContext::new(s);
 
 	while let Some(c) = ctx.next() {
 		let (i, j) = (ctx.pos_b, ctx.pos_a);
 		if let Some(tok) = match c {
 			'/' if ctx.peek().is_some_and(|c| c == '*') => {
-				skip_comment(&mut ctx);
-				ctx.expect_punct_p = false;
+				skip_comment(&mut ctx)?;
 				None
 			},
 			'<' if ctx.peek().is_some_and(|c| c == '<') => {
@@ -226,34 +171,19 @@ pub fn tokenize(filename: &OsStr, s: &str) -> Result<Soa<Token>, Vec<Error>> {
 					view: Span(i, j),
 				})
 			},
-			'#' => {
-				ctx.literal_spacing_guard();
-				Some(tokenize_number_based(&mut ctx))
-			},
-			'0'..='9' => {
-				ctx.literal_spacing_guard();
-				Some(tokenize_number(&mut ctx, "0123456789"))
-			},
-			'"' => {
-				ctx.literal_spacing_guard();
-				Some(tokenize_string(&mut ctx))
-			},
-			_ if unicode::xid_start_p(c) => {
-				ctx.literal_spacing_guard();
-				Some(tokenize_identifier(&mut ctx))
-			},
-			_ if unicode::pattern_white_space_p(c) => {
-				if !unicode::default_ignorable_code_point_p(c) {
-					ctx.expect_punct_p = false;
-				}
-				None
-			},
+			'#' => Some(tokenize_number_based(&mut ctx)?),
+			'0'..='9' => Some(tokenize_number(&mut ctx, "0123456789")?),
+			'"' => Some(tokenize_string(&mut ctx)?),
+			_ if unicode::xid_start_p(c) => Some(tokenize_identifier(&mut ctx)),
+			_ if unicode::pattern_white_space_p(c) => None,
 			c => {
-				let msg = format!("Invalid character ‘{c}’");
-				ctx.err_at_position(msg.as_str());
+				return Err(OryxError::new(
+					i,
+					j,
+					format!("Invalid character ‘{c}’"),
+				));
 			},
 		} {
-			ctx.expect_punct_p = tok.kind.literalp();
 			toks.push(tok);
 		}
 	}
@@ -265,7 +195,8 @@ pub fn tokenize(filename: &OsStr, s: &str) -> Result<Soa<Token>, Vec<Error>> {
 	return Ok(toks);
 }
 
-fn skip_comment<'a>(ctx: &mut LexerContext<'a>) {
+fn skip_comment<'a>(ctx: &mut LexerContext<'a>) -> Result<(), OryxError> {
+	let beg = ctx.pos_b;
 	ctx.next(); /* Consume ‘*’ */
 	let mut depth = 1;
 	while let Some(c) = ctx.next() {
@@ -278,118 +209,169 @@ fn skip_comment<'a>(ctx: &mut LexerContext<'a>) {
 				depth -= 1;
 				ctx.next(); /* Consume ‘/’ */
 				if depth == 0 {
-					return;
+					return Ok(());
 				}
 			},
 			_ => {},
 		};
 	}
-	ctx.err_at_position("Unterminated comment");
+	return Err(OryxError::new(beg, ctx.pos_a, "Unterminated comment"));
 }
 
-fn tokenize_number_based<'a>(ctx: &mut LexerContext<'a>) -> Token {
+fn tokenize_number_based<'a>(
+	ctx: &mut LexerContext<'a>,
+) -> Result<Token, OryxError> {
 	let i = ctx.pos_b;
 	let alphabet = match ctx.next() {
 		Some('b') => "01",
 		Some('o') => "01234567",
 		Some('d') => "0123456789",
 		Some('x') => "0123456789ABCDEF",
-		Some(c) => {
-			let msg = format!("Invalid number base specifier ‘{c}’");
-			ctx.err_at_position(msg.as_str());
+		Some(c @ 'B') | Some(c @ 'O') | Some(c @ 'D') | Some(c @ 'X') => {
+			return Err(OryxError::new(
+				ctx.pos_b,
+				ctx.pos_a,
+				format!(
+					"Invalid number base specifier ‘{c}’, did you mean ‘{}’?",
+					c.to_ascii_lowercase()
+				),
+			));
+		},
+		Some(c) if c.is_alphanumeric() => {
+			return Err(OryxError::new(
+				ctx.pos_b,
+				ctx.pos_a,
+				format!("Invalid number base specifier ‘{c}’"),
+			));
+		},
+		_ => {
+			return Err(OryxError::new(
+				i,
+				i + 1,
+				"Expected number base specifier after ‘#’",
+			));
 		},
-		None => ctx.err_at_position("Expected number base specifier after ‘#’"),
 	};
+
+	let (beg, end) = (ctx.pos_b, ctx.pos_a);
 	let mut tok = match ctx.next() {
-		Some(c) if alphabet.contains(c) => tokenize_number(ctx, alphabet),
-		Some(c) => {
-			let base = match alphabet.len() {
-				2 => "binary",
-				8 => "octal",
-				10 => "decimal",
-				16 => "hexadecimal",
-				_ => unreachable!(),
-			};
-			let msg = format!("Invalid {base} digit ‘{c}’");
-			ctx.err_at_position(msg.as_str());
+		Some(c) if alphabet.contains(c) => tokenize_number(ctx, alphabet)?,
+		Some(c) if alphabet.len() == 16 && c.is_ascii_hexdigit() => {
+			return Err(OryxError::new(
+				ctx.pos_b,
+				ctx.pos_a,
+				format!("Hexadecimal digits must be uppercase"),
+			));
+		},
+		Some(c) if c.is_alphanumeric() => {
+			let base = base2str(alphabet.len());
+			return Err(OryxError::new(
+				ctx.pos_b,
+				ctx.pos_a,
+				format!("Invalid {base} digit ‘{c}’"),
+			));
+		},
+		Some('\'') => {
+			return Err(OryxError::new(
+				ctx.pos_b,
+				ctx.pos_a,
+				format!(
+					"Numeric literals may not begin with a digit separator"
+				),
+			));
+		},
+		_ => {
+			let base = base2str(alphabet.len());
+			return Err(OryxError::new(
+				beg,
+				end,
+				format!("Expected {base} digit after base specifier"),
+			));
 		},
-		None => ctx.err_at_position("Expected number after base specifier"),
 	};
 	tok.view = Span(i, ctx.pos_a);
-	return tok;
+	return Ok(tok);
 }
 
 fn tokenize_number<'a>(
 	ctx: &mut LexerContext<'a>,
 	alphabet: &'static str,
-) -> Token {
+) -> Result<Token, OryxError> {
 	let i = ctx.pos_b;
-	span_raw_number(ctx, alphabet, true);
+	span_raw_number(ctx, alphabet, true)?;
 
 	/* Fractional part */
 	if ctx.peek().is_some_and(|c| c == '.') {
 		ctx.next();
 		if ctx.peek().is_some_and(|c| alphabet.contains(c)) {
-			span_raw_number(ctx, alphabet, false);
+			span_raw_number(ctx, alphabet, false)?;
 		}
 	}
 
 	/* Exponential part */
 	if ctx.peek().is_some_and(|c| c == 'e') {
 		ctx.next();
-		span_raw_number(ctx, alphabet, false);
+		if ctx.peek().is_some_and(|c| c == '+' || c == '-') {
+			ctx.next();
+		}
+		span_raw_number(ctx, alphabet, false)?;
 	}
 
-	return Token {
+	return Ok(Token {
 		kind: TokenType::Number,
 		view: Span(i, ctx.pos_a),
-	};
+	});
 }
 
 fn span_raw_number<'a>(
 	ctx: &mut LexerContext<'a>,
 	alphabet: &'static str,
 	first_digit_lexed_p: bool,
-) {
+) -> Result<(), OryxError> {
 	if !first_digit_lexed_p {
 		match ctx.next() {
 			Some(c) if alphabet.contains(c) => c,
-			Some(c) => {
-				let base = match alphabet.len() {
-					2 => "binary",
-					8 => "octal",
-					10 => "decimal",
-					16 => "hexadecimal",
-					_ => unreachable!(),
-				};
-				let msg = format!("Invalid {base} digit ‘{c}’");
-				ctx.err_at_position(msg.as_str());
+			Some(c) if alphabet.len() == 16 && c.is_ascii_hexdigit() => {
+				return Err(OryxError::new(
+					ctx.pos_b,
+					ctx.pos_a,
+					format!("Hexadecimal digits must be uppercase"),
+				));
 			},
-			None => {
-				let base = match alphabet.len() {
-					2 => "binary",
-					8 => "octal",
-					10 => "decimal",
-					16 => "hexadecimal",
-					_ => unreachable!(),
-				};
-				let msg = format!(
-					"Expected {base} digit but reached end-of-file instead"
-				);
-				ctx.err_at_position(msg.as_str());
+			Some(c) if c.is_alphanumeric() => {
+				let base = base2str(alphabet.len());
+				return Err(OryxError::new(
+					ctx.pos_b,
+					ctx.pos_a,
+					format!("Invalid {base} digit ‘{c}’"),
+				));
+			},
+			_ => {
+				let base = base2str(alphabet.len());
+				return Err(OryxError::new(
+					ctx.pos_b,
+					ctx.pos_a,
+					format!("Expected {base} digit"),
+				));
 			},
 		};
 	}
 
+	let (mut beg, mut end) = (0, 0);
 	let mut last_was_apos_p = false;
 	while let Some(c) = ctx.peek() {
 		match c {
-			'\'' if last_was_apos_p => ctx.err_at_position(
-				"Multiple concurrent digit separators in numeric literal",
-			),
+			'\'' if last_was_apos_p => {
+				return Err(OryxError::new(
+					ctx.pos_b,
+					ctx.pos_a + 1,
+					"Numeric literals may not have adjacent digit separators",
+				));
+			},
 			'\'' => {
 				last_was_apos_p = true;
 				ctx.next();
+				(beg, end) = (ctx.pos_b, ctx.pos_a);
 			},
 			_ if alphabet.contains(c) => {
 				last_was_apos_p = false;
@@ -400,27 +382,36 @@ fn span_raw_number<'a>(
 	}
 
 	if last_was_apos_p {
-		ctx.err_at_position(
+		return Err(OryxError::new(
+			beg,
+			end,
 			"Numeric literals may not end with a digit separator",
-		);
+		));
 	}
+
+	return Ok(());
 }
 
-fn tokenize_string<'a>(ctx: &mut LexerContext<'a>) -> Token {
+fn tokenize_string<'a>(ctx: &mut LexerContext<'a>) -> Result<Token, OryxError> {
 	let i = ctx.pos_b;
+
 	loop {
-		if let Some(c) = ctx.next() {
-			if c == '"' {
-				break;
-			}
-		} else {
-			ctx.err_at_position("Unterminated string");
+		match ctx.next() {
+			Some(c) if c == '"' => break,
+			Some(_) => {},
+			None => {
+				return Err(OryxError::new(
+					i,
+					ctx.pos_a,
+					"Unterminated string literal",
+				));
+			},
 		}
 	}
-	return Token {
+	return Ok(Token {
 		kind: TokenType::String,
 		view: Span(i, ctx.pos_a),
-	};
+	});
 }
 
 fn tokenize_identifier<'a>(ctx: &mut LexerContext<'a>) -> Token {
@@ -435,3 +426,13 @@ fn tokenize_identifier<'a>(ctx: &mut LexerContext<'a>) -> Token {
 	};
 	return Token { kind, view };
 }
+
+fn base2str(n: usize) -> &'static str {
+	return match n {
+		2 => "binary",
+		8 => "octal",
+		10 => "decimal",
+		16 => "hexadecimal",
+		_ => unreachable!(),
+	};
+}
-- 
cgit v1.2.3