use std::iter::Peekable; use std::mem; use std::str::{ self, Chars, }; use phf; use soa_rs::{ Soa, Soars, }; use crate::errors::OryxError; use crate::unicode; #[allow(dead_code)] #[repr(u8)] #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum TokenType { Eof = 0, Ampersand = '&' as u8, AngleL = '<' as u8, AngleR = '>' as u8, Asterisk = '*' as u8, Bang = '!' as u8, Bar = '|' as u8, BraceL = '{' as u8, BraceR = '}' as u8, BracketL = '[' as u8, BracketR = ']' as u8, Caret = '^' as u8, Comma = ',' as u8, Equals = '=' as u8, Minus = '-' as u8, ParenL = '(' as u8, ParenR = ')' as u8, Percent = '%' as u8, Plus = '+' as u8, Semicolon = ';' as u8, Slash = '/' as u8, Tilde = '~' as u8, _AsciiEnd = 0x7F as u8, Ampersand2, AmpersandTilde, AngleL2, AngleL3, AngleLEquals, AngleR2, AngleR3, AngleREquals, BangEquals, Bar2, Ellipsis, Equals2, Identifier, KeywordDef, KeywordFunc, KeywordReturn, Number, Percent2, String, } impl TokenType { /* Tokens that start an expression */ pub fn exprp(&self) -> bool { return match self { Self::Ampersand | Self::Bang | Self::Caret | Self::Identifier | Self::KeywordFunc | Self::Minus | Self::Number | Self::ParenL | Self::Plus | Self::String | Self::Tilde => true, _ => false, }; } } #[derive(Soars)] #[soa_derive(Debug)] pub struct Token { pub kind: TokenType, pub view: (usize, usize), } struct LexerContext<'a> { pos_a: usize, /* Pos [a]fter char */ pos_b: usize, /* Pos [b]efore char */ chars: Peekable>, string: &'a str, } impl<'a> LexerContext<'a> { fn new(string: &'a str) -> Self { return Self { pos_a: 0, pos_b: 0, chars: string.chars().peekable(), string, }; } #[inline(always)] fn next(&mut self) -> Option { let c = self.chars.next()?; self.pos_b = self.pos_a; self.pos_a += c.len_utf8(); return if c == '\0' { None } else { Some(c) }; } #[inline(always)] fn peek(&mut self) -> Option { return self.chars.peek().copied(); } } static KEYWORDS: phf::Map<&'static str, TokenType> = phf::phf_map! { "def" => TokenType::KeywordDef, "func" => TokenType::KeywordFunc, "return" => TokenType::KeywordReturn, }; pub fn tokenize(s: &str) -> Result, OryxError> { let mut toks = Soa::::with_capacity(s.len() / 2); let mut ctx = LexerContext::new(s); while let Some(c) = ctx.next() { let (i, j) = (ctx.pos_b, ctx.pos_a); if let Some(tok) = match c { '/' if ctx.peek().is_some_and(|c| c == '*') => { skip_comment(&mut ctx)?; None }, '<' if ctx.peek().is_some_and(|c| c == '<') => { ctx.next(); /* Consume ‘<’ */ let kind = if ctx.peek().is_some_and(|c| c == '<') { ctx.next(); /* Consume ‘<’ */ TokenType::AngleL3 } else { TokenType::AngleL2 }; Some(Token { kind, view: (i, ctx.pos_a), }) }, '>' if ctx.peek().is_some_and(|c| c == '>') => { ctx.next(); /* Consume ‘>’ */ let kind = if ctx.peek().is_some_and(|c| c == '>') { ctx.next(); /* Consume ‘>’ */ TokenType::AngleR3 } else { TokenType::AngleR2 }; Some(Token { kind, view: (i, ctx.pos_a), }) }, '<' if ctx.peek().is_some_and(|c| c == '=') => { ctx.next(); /* Consume ‘=’ */ Some(Token { kind: TokenType::AngleLEquals, view: (i, ctx.pos_a), }) }, '>' if ctx.peek().is_some_and(|c| c == '=') => { ctx.next(); /* Consume ‘=’ */ Some(Token { kind: TokenType::AngleREquals, view: (i, ctx.pos_a), }) }, '|' if ctx.peek().is_some_and(|c| c == '|') => { ctx.next(); /* Consume ‘|’ */ Some(Token { kind: TokenType::Bar2, view: (i, ctx.pos_a), }) }, '&' if ctx.peek().is_some_and(|c| c == '&') => { ctx.next(); /* Consume ‘&’ */ Some(Token { kind: TokenType::Ampersand2, view: (i, ctx.pos_a), }) }, '%' if ctx.peek().is_some_and(|c| c == '%') => { ctx.next(); /* Consume ‘%’ */ Some(Token { kind: TokenType::Percent2, view: (i, ctx.pos_a), }) }, '=' if ctx.peek().is_some_and(|c| c == '=') => { ctx.next(); /* Consume ‘=’ */ Some(Token { kind: TokenType::Equals2, view: (i, ctx.pos_a), }) }, '!' if ctx.peek().is_some_and(|c| c == '=') => { ctx.next(); /* Consume ‘=’ */ Some(Token { kind: TokenType::BangEquals, view: (i, ctx.pos_a), }) }, '&' if ctx.peek().is_some_and(|c| c == '~') => { ctx.next(); /* Consume ‘~’ */ Some(Token { kind: TokenType::AmpersandTilde, view: (i, j + 1), }) }, '!' | '&' | '(' | ')' | '*' | '+' | ',' | '-' | '/' | ';' | '<' | '=' | '>' | '[' | ']' | '^' | '{' | '|' | '}' | '~' | '…' | '%' => Some(Token { kind: unsafe { mem::transmute(c as u8) }, view: (i, j), }), '#' => Some(tokenize_number_based(&mut ctx)?), '0'..='9' => Some(tokenize_number(&mut ctx, "0123456789")?), '"' => Some(tokenize_string(&mut ctx)?), _ if unicode::xid_start_p(c) => Some(tokenize_identifier(&mut ctx)), _ if unicode::pattern_white_space_p(c) => None, c => { return Err(OryxError::new( (i, j), format!("invalid character ‘{c}’"), )); }, } { toks.push(tok); } } toks.push(Token { kind: TokenType::Eof, view: (s.len() - 1, s.len()), }); return Ok(toks); } fn skip_comment<'a>(ctx: &mut LexerContext<'a>) -> Result<(), OryxError> { let beg = ctx.pos_b; ctx.next(); /* Consume ‘*’ */ let mut depth = 1; while let Some(c) = ctx.next() { match c { '/' if ctx.peek().is_some_and(|c| c == '*') => { depth += 1; ctx.next(); /* Consume ‘*’ */ }, '*' if ctx.peek().is_some_and(|c| c == '/') => { depth -= 1; ctx.next(); /* Consume ‘/’ */ if depth == 0 { return Ok(()); } }, _ => {}, }; } return Err(OryxError::new((beg, ctx.pos_a), "unterminated comment")); } fn tokenize_number_based<'a>( ctx: &mut LexerContext<'a>, ) -> Result { let i = ctx.pos_b; let alphabet = match ctx.next() { Some('b') => "01", Some('o') => "01234567", Some('d') => "0123456789", Some('x') => "0123456789ABCDEF", Some(c @ 'B') | Some(c @ 'O') | Some(c @ 'D') | Some(c @ 'X') => { return Err(OryxError::new( (ctx.pos_b, ctx.pos_a), format!( "invalid number base specifier ‘{c}’, did you mean ‘{}’?", c.to_ascii_lowercase() ), )); }, Some(c) if c.is_alphanumeric() => { return Err(OryxError::new( (ctx.pos_b, ctx.pos_a), format!("invalid number base specifier ‘{c}’"), )); }, _ => { return Err(OryxError::new( (i, i + 1), "expected number base specifier after ‘#’", )); }, }; let (beg, end) = (ctx.pos_b, ctx.pos_a); let mut tok = match ctx.next() { Some(c) if alphabet.contains(c) => tokenize_number(ctx, alphabet)?, Some(c) if alphabet.len() == 16 && c.is_ascii_hexdigit() => { return Err(OryxError::new( (ctx.pos_b, ctx.pos_a), format!("hexadecimal digits must be uppercase"), )); }, Some(c) if c.is_alphanumeric() => { let base = base2str(alphabet.len()); return Err(OryxError::new( (ctx.pos_b, ctx.pos_a), format!("invalid {base} digit ‘{c}’"), )); }, Some('\'') => { return Err(OryxError::new( (ctx.pos_b, ctx.pos_a), format!( "numeric literals may not begin with a digit separator" ), )); }, _ => { let base = base2str(alphabet.len()); return Err(OryxError::new( (beg, end), format!("expected {base} digit after base specifier"), )); }, }; tok.view = (i, ctx.pos_a); return Ok(tok); } fn tokenize_number<'a>( ctx: &mut LexerContext<'a>, alphabet: &'static str, ) -> Result { let i = ctx.pos_b; span_raw_number(ctx, alphabet, true)?; /* Fractional part */ if ctx.peek().is_some_and(|c| c == '.') { ctx.next(); if ctx.peek().is_some_and(|c| alphabet.contains(c)) { span_raw_number(ctx, alphabet, false)?; } } /* Exponential part */ if ctx.peek().is_some_and(|c| c == 'e') { ctx.next(); if ctx.peek().is_some_and(|c| c == '+' || c == '-') { ctx.next(); } span_raw_number(ctx, alphabet, false)?; } return Ok(Token { kind: TokenType::Number, view: (i, ctx.pos_a), }); } fn span_raw_number<'a>( ctx: &mut LexerContext<'a>, alphabet: &'static str, first_digit_lexed_p: bool, ) -> Result<(), OryxError> { if !first_digit_lexed_p { match ctx.next() { Some(c) if alphabet.contains(c) => c, Some(c) if alphabet.len() == 16 && c.is_ascii_hexdigit() => { return Err(OryxError::new( (ctx.pos_b, ctx.pos_a), format!("hexadecimal digits must be uppercase"), )); }, Some(c) if c.is_alphanumeric() => { let base = base2str(alphabet.len()); return Err(OryxError::new( (ctx.pos_b, ctx.pos_a), format!("invalid {base} digit ‘{c}’"), )); }, _ => { let base = base2str(alphabet.len()); return Err(OryxError::new( (ctx.pos_b, ctx.pos_a), format!("expected {base} digit"), )); }, }; } let (mut beg, mut end) = (0, 0); let mut last_was_apos_p = false; while let Some(c) = ctx.peek() { match c { '\'' if last_was_apos_p => { return Err(OryxError::new( (ctx.pos_b, ctx.pos_a + 1), "numeric literals may not have adjecent digit separators", )); }, '\'' => { last_was_apos_p = true; ctx.next(); (beg, end) = (ctx.pos_b, ctx.pos_a); }, _ if alphabet.contains(c) => { last_was_apos_p = false; ctx.next(); }, _ => break, }; } if last_was_apos_p { return Err(OryxError::new( (beg, end), "numeric literals may not end with a digit separator", )); } return Ok(()); } fn tokenize_string<'a>(ctx: &mut LexerContext<'a>) -> Result { let i = ctx.pos_b; loop { let Some(c) = ctx.next() else { return Err(OryxError::new( (i, ctx.pos_a), "unterminated string literal", )); }; if c == '"' { break; } } return Ok(Token { kind: TokenType::String, view: (i, ctx.pos_a), }); } fn tokenize_identifier<'a>(ctx: &mut LexerContext<'a>) -> Token { let i = ctx.pos_b; while ctx.peek().is_some_and(unicode::xid_continue_p) { ctx.next(); } let view = (i, ctx.pos_a); let kind = match KEYWORDS.get(&ctx.string[view.0..view.1]) { Some(kind) => kind.clone(), None => TokenType::Identifier, }; return Token { kind, view }; } fn base2str(n: usize) -> &'static str { return match n { 2 => "binary", 8 => "octal", 10 => "decimal", 16 => "hexadecimal", _ => unreachable!(), }; }