use std::borrow::Cow; use std::ffi::OsStr; use std::fmt::Display; use std::vec::Vec; use std::{ iter, mem, str, }; use phf; use soa_rs::{ Soa, Soars, }; use crate::{ errors, unicode, }; #[allow(dead_code)] #[repr(u8)] #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum TokenType { Eof = 0, Ampersand = '&' as u8, AngleL = '<' as u8, AngleR = '>' as u8, Asterisk = '*' as u8, Bar = '|' as u8, BraceL = '{' as u8, BraceR = '}' as u8, BracketL = '[' as u8, BracketR = ']' as u8, Caret = '^' as u8, Comma = ',' as u8, Equals = '=' as u8, Exclamation = '!' as u8, Minus = '-' as u8, ParenL = '(' as u8, ParenR = ')' as u8, Plus = '+' as u8, Semicolon = ';' as u8, Slash = '/' as u8, Tilde = '~' as u8, AmpersandTilde, AngleL2, AngleL3, AngleR2, AngleR3, Ellipsis, Identifier, KeywordDef, KeywordFunc, KeywordReturn, Number, String, } impl TokenType { pub fn literalp(&self) -> bool { return match self { Self::Identifier | Self::KeywordDef | Self::KeywordFunc | Self::Number | Self::String => true, _ => false, }; } /* Tokens that start an expression */ pub fn exprp(&self) -> bool { return match self { Self::Ampersand | Self::Caret | Self::Exclamation | Self::Identifier | Self::KeywordFunc | Self::Minus | Self::Number | Self::ParenL | Self::Plus | Self::String | Self::Tilde => true, _ => false, }; } } #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub struct Span(usize, usize); #[derive(Soars)] #[soa_derive(Debug)] pub struct Token { pub kind: TokenType, pub view: Span, } pub struct Error { pub pos: usize, pub msg: Cow<'static, str>, } impl Error { fn new(pos: usize, msg: T) -> Self where T: Into>, { return Self { pos, msg: msg.into(), }; } } struct LexerContext<'a> { pos_a: usize, /* Pos [a]fter char */ pos_b: usize, /* Pos [b]efore char */ chars: iter::Peekable>, string: &'a str, filename: &'a OsStr, expect_punct_p: bool, } impl<'a> LexerContext<'a> { fn new(filename: &'a OsStr, string: &'a str) -> Self { return Self { pos_a: 0, pos_b: 0, chars: string.chars().peekable(), string, filename, expect_punct_p: false, }; } #[inline(always)] fn next(&mut self) -> Option { let c = self.chars.next()?; self.pos_b = self.pos_a; self.pos_a += c.len_utf8(); return Some(c); } #[inline(always)] fn peek(&mut self) -> Option { return self.chars.peek().copied(); } fn err_at_position(&self, s: S) -> ! where S: Display, { errors::err_at_position(self.filename, s); } #[inline(always)] fn literal_spacing_guard(&self) { if self.expect_punct_p { self.err_at_position( "Two literals may not be directly adjacent to each other", ); } } } static KEYWORDS: phf::Map<&'static str, TokenType> = phf::phf_map! { "def" => TokenType::KeywordDef, "func" => TokenType::KeywordFunc, "return" => TokenType::KeywordReturn, }; pub fn tokenize(filename: &OsStr, s: &str) -> Result, Vec> { let mut toks = Soa::::with_capacity(s.len() / 2); let mut ctx = LexerContext::new(filename, s); while let Some(c) = ctx.next() { let (i, j) = (ctx.pos_b, ctx.pos_a); if let Some(tok) = match c { '/' if ctx.peek().is_some_and(|c| c == '*') => { skip_comment(&mut ctx); ctx.expect_punct_p = false; None }, '<' if ctx.peek().is_some_and(|c| c == '<') => { ctx.next(); /* Consume ‘<’ */ let kind = if ctx.peek().is_some_and(|c| c == '<') { ctx.next(); /* Consume ‘<’ */ TokenType::AngleL3 } else { TokenType::AngleL2 }; Some(Token { kind, view: Span(i, ctx.pos_a), }) }, '>' if ctx.peek().is_some_and(|c| c == '>') => { ctx.next(); /* Consume ‘>’ */ let kind = if ctx.peek().is_some_and(|c| c == '>') { ctx.next(); /* Consume ‘>’ */ TokenType::AngleR3 } else { TokenType::AngleR2 }; Some(Token { kind, view: Span(i, ctx.pos_a), }) }, '&' if ctx.peek().is_some_and(|c| c == '~') => { ctx.next(); /* Consume ‘~’ */ Some(Token { kind: TokenType::AmpersandTilde, view: Span(i, j + 1), }) }, '!' | '&' | '(' | ')' | '*' | '+' | ',' | '-' | '/' | ';' | '<' | '=' | '>' | '[' | ']' | '^' | '{' | '|' | '}' | '~' | '…' => { Some(Token { kind: unsafe { mem::transmute(c as u8) }, view: Span(i, j), }) }, '#' => { ctx.literal_spacing_guard(); Some(tokenize_number_based(&mut ctx)) }, '0'..='9' => { ctx.literal_spacing_guard(); Some(tokenize_number(&mut ctx, "0123456789")) }, '"' => { ctx.literal_spacing_guard(); Some(tokenize_string(&mut ctx)) }, _ if unicode::xid_start_p(c) => { ctx.literal_spacing_guard(); Some(tokenize_identifier(&mut ctx)) }, _ if unicode::pattern_white_space_p(c) => { if !unicode::default_ignorable_code_point_p(c) { ctx.expect_punct_p = false; } None }, c => { let msg = format!("Invalid character ‘{c}’"); ctx.err_at_position(msg.as_str()); }, } { ctx.expect_punct_p = tok.kind.literalp(); toks.push(tok); } } toks.push(Token { kind: TokenType::Eof, view: Span(s.len() - 1, s.len()), }); return Ok(toks); } fn skip_comment<'a>(ctx: &mut LexerContext<'a>) { ctx.next(); /* Consume ‘*’ */ let mut depth = 1; while let Some(c) = ctx.next() { match c { '/' if ctx.peek().is_some_and(|c| c == '*') => { depth += 1; ctx.next(); /* Consume ‘*’ */ }, '*' if ctx.peek().is_some_and(|c| c == '/') => { depth -= 1; ctx.next(); /* Consume ‘/’ */ if depth == 0 { return; } }, _ => {}, }; } ctx.err_at_position("Unterminated comment"); } fn tokenize_number_based<'a>(ctx: &mut LexerContext<'a>) -> Token { let i = ctx.pos_b; let alphabet = match ctx.next() { Some('b') => "01", Some('o') => "01234567", Some('d') => "0123456789", Some('x') => "0123456789ABCDEF", Some(c) => { let msg = format!("Invalid number base specifier ‘{c}’"); ctx.err_at_position(msg.as_str()); }, None => ctx.err_at_position("Expected number base specifier after ‘#’"), }; let mut tok = match ctx.next() { Some(c) if alphabet.contains(c) => tokenize_number(ctx, alphabet), Some(c) => { let base = match alphabet.len() { 2 => "binary", 8 => "octal", 10 => "decimal", 16 => "hexadecimal", _ => unreachable!(), }; let msg = format!("Invalid {base} digit ‘{c}’"); ctx.err_at_position(msg.as_str()); }, None => ctx.err_at_position("Expected number after base specifier"), }; tok.view = Span(i, ctx.pos_a); return tok; } fn tokenize_number<'a>( ctx: &mut LexerContext<'a>, alphabet: &'static str, ) -> Token { let i = ctx.pos_b; span_raw_number(ctx, alphabet, true); /* Fractional part */ if ctx.peek().is_some_and(|c| c == '.') { ctx.next(); if ctx.peek().is_some_and(|c| alphabet.contains(c)) { span_raw_number(ctx, alphabet, false); } } /* Exponential part */ if ctx.peek().is_some_and(|c| c == 'e') { ctx.next(); span_raw_number(ctx, alphabet, false); } return Token { kind: TokenType::Number, view: Span(i, ctx.pos_a), }; } fn span_raw_number<'a>( ctx: &mut LexerContext<'a>, alphabet: &'static str, first_digit_lexed_p: bool, ) { if !first_digit_lexed_p { match ctx.next() { Some(c) if alphabet.contains(c) => c, Some(c) => { let base = match alphabet.len() { 2 => "binary", 8 => "octal", 10 => "decimal", 16 => "hexadecimal", _ => unreachable!(), }; let msg = format!("Invalid {base} digit ‘{c}’"); ctx.err_at_position(msg.as_str()); }, None => { let base = match alphabet.len() { 2 => "binary", 8 => "octal", 10 => "decimal", 16 => "hexadecimal", _ => unreachable!(), }; let msg = format!( "Expected {base} digit but reached end-of-file instead" ); ctx.err_at_position(msg.as_str()); }, }; } let mut last_was_apos_p = false; while let Some(c) = ctx.peek() { match c { '\'' if last_was_apos_p => ctx.err_at_position( "Multiple concurrent digit separators in numeric literal", ), '\'' => { last_was_apos_p = true; ctx.next(); }, _ if alphabet.contains(c) => { last_was_apos_p = false; ctx.next(); }, _ => break, }; } if last_was_apos_p { ctx.err_at_position( "Numeric literals may not end with a digit separator", ); } } fn tokenize_string<'a>(ctx: &mut LexerContext<'a>) -> Token { let i = ctx.pos_b; loop { if let Some(c) = ctx.next() { if c == '"' { break; } } else { ctx.err_at_position("Unterminated string"); } } return Token { kind: TokenType::String, view: Span(i, ctx.pos_a), }; } fn tokenize_identifier<'a>(ctx: &mut LexerContext<'a>) -> Token { let i = ctx.pos_b; while ctx.peek().is_some_and(unicode::xid_continue_p) { ctx.next(); } let view = Span(i, ctx.pos_a); let kind = match KEYWORDS.get(&ctx.string[view.0..view.1]) { Some(kind) => kind.clone(), None => TokenType::Identifier, }; return Token { kind, view }; }