diff options
Diffstat (limited to 'oryxc/src/lexer.rs')
| -rw-r--r-- | oryxc/src/lexer.rs | 427 |
1 files changed, 427 insertions, 0 deletions
diff --git a/oryxc/src/lexer.rs b/oryxc/src/lexer.rs new file mode 100644 index 0000000..531593d --- /dev/null +++ b/oryxc/src/lexer.rs @@ -0,0 +1,427 @@ +use std::ffi::OsStr; +use std::fmt::Display; +use std::{ + iter, + mem, + str, +}; + +use phf; +use soa_rs::{ + self, + Soars, +}; + +use crate::{ + errors, + size, + unicode, +}; + +#[repr(u8)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum TokenType { + Eof = 0, + Ampersand = '&' as u8, + AngleL = '<' as u8, + AngleR = '>' as u8, + Asterisk = '*' as u8, + Bar = '|' as u8, + BraceL = '{' as u8, + BraceR = '}' as u8, + BracketL = '[' as u8, + BracketR = ']' as u8, + Caret = '^' as u8, + Comma = ',' as u8, + Equals = '=' as u8, + Exclamation = '!' as u8, + Minus = '-' as u8, + ParenL = '(' as u8, + ParenR = ')' as u8, + Plus = '+' as u8, + Semicolon = ';' as u8, + Slash = '/' as u8, + Tilde = '~' as u8, + AmpersandTilde, + AngleL2, + AngleL3, + AngleR2, + AngleR3, + Ellipsis, + Identifier, + KeywordDef, + KeywordFunc, + KeywordReturn, + Number, + String, +} + +impl TokenType { + pub fn literalp(&self) -> bool { + return match self { + Self::Identifier + | Self::KeywordDef + | Self::KeywordFunc + | Self::Number + | Self::String => true, + _ => false, + }; + } + + /* Tokens that start an expression */ + pub fn exprp(&self) -> bool { + return match self { + Self::Ampersand + | Self::Caret + | Self::Exclamation + | Self::Identifier + | Self::KeywordFunc + | Self::Minus + | Self::Number + | Self::ParenL + | Self::Plus + | Self::String + | Self::Tilde => true, + _ => false, + }; + } +} + +#[derive(Soars)] +#[soa_derive(Debug)] +pub struct Token<'a> { + pub kind: TokenType, + pub view: &'a str, +} + +pub struct TokenizedBuffer<'a> { + pub tokens: soa_rs::Soa<Token<'a>>, + pub buffer: &'a str, + pub filename: Option<&'a OsStr>, +} + +struct LexerContext<'a> { + pos_a: usize, /* Pos [a]fter char */ + pos_b: usize, /* Pos [b]efore char */ + chars: iter::Peekable<str::Chars<'a>>, + string: &'a str, + filename: Option<&'a OsStr>, + expect_punct_p: bool, +} + +impl<'a> LexerContext<'a> { + fn new(filename: Option<&'a OsStr>, string: &'a str) -> Self { + return Self { + pos_a: 0, + pos_b: 0, + chars: string.chars().peekable(), + string, + filename, + expect_punct_p: false, + }; + } + + #[inline(always)] + fn next(&mut self) -> Option<char> { + let c = self.chars.next()?; + self.pos_b = self.pos_a; + self.pos_a += c.len_utf8(); + return Some(c); + } + + #[inline(always)] + fn peek(&mut self) -> Option<char> { + return self.chars.peek().copied(); + } + + fn err_at_position<S>(&self, s: S) -> ! + where + S: Display, + { + errors::err_at_position(self.filename.unwrap_or(OsStr::new("-")), s); + } + + #[inline(always)] + fn literal_spacing_guard(&self) { + if self.expect_punct_p { + self.err_at_position( + "Two literals may not be directly adjacent to each other", + ); + } + } +} + +static KEYWORDS: phf::Map<&'static str, TokenType> = phf::phf_map! { + "def" => TokenType::KeywordDef, + "func" => TokenType::KeywordFunc, + "return" => TokenType::KeywordReturn, +}; + +pub fn tokenize<'a>( + filename: Option<&'a OsStr>, + s: &'a str, +) -> TokenizedBuffer<'a> { + let mut toks = soa_rs::Soa::<Token>::with_capacity(size::kibibytes(10)); + let mut ctx = LexerContext::new(filename, s); + + while let Some(c) = ctx.next() { + let (i, j) = (ctx.pos_b, ctx.pos_a); + if let Some(tok) = match c { + '/' if ctx.peek().is_some_and(|c| c == '*') => { + skip_comment(&mut ctx); + ctx.expect_punct_p = false; + None + }, + '<' if ctx.peek().is_some_and(|c| c == '<') => { + ctx.next(); /* Consume ‘<’ */ + let kind = if ctx.peek().is_some_and(|c| c == '<') { + ctx.next(); /* Consume ‘<’ */ + TokenType::AngleL3 + } else { + TokenType::AngleL2 + }; + Some(Token { + kind, + view: &s[i..ctx.pos_a], + }) + }, + '>' if ctx.peek().is_some_and(|c| c == '>') => { + ctx.next(); /* Consume ‘>’ */ + let kind = if ctx.peek().is_some_and(|c| c == '>') { + ctx.next(); /* Consume ‘>’ */ + TokenType::AngleR3 + } else { + TokenType::AngleR2 + }; + Some(Token { + kind, + view: &s[i..ctx.pos_a], + }) + }, + '&' if ctx.peek().is_some_and(|c| c == '~') => { + ctx.next(); /* Consume ‘~’ */ + Some(Token { + kind: TokenType::AmpersandTilde, + view: &s[i..j + 1], + }) + }, + '!' | '&' | '(' | ')' | '*' | '+' | ',' | '-' | '/' | ';' | '<' + | '=' | '>' | '[' | ']' | '^' | '{' | '|' | '}' | '~' | '…' => { + Some(Token { + kind: unsafe { mem::transmute(c as u8) }, + view: &s[i..j], + }) + }, + '#' => { + ctx.literal_spacing_guard(); + Some(tokenize_number_based(&mut ctx)) + }, + '0'..='9' => { + ctx.literal_spacing_guard(); + Some(tokenize_number(&mut ctx, "0123456789")) + }, + '"' => { + ctx.literal_spacing_guard(); + Some(tokenize_string(&mut ctx)) + }, + _ if unicode::xid_start_p(c) => { + ctx.literal_spacing_guard(); + Some(tokenize_identifier(&mut ctx)) + }, + _ if unicode::pattern_white_space_p(c) => { + if !unicode::default_ignorable_code_point_p(c) { + ctx.expect_punct_p = false; + } + None + }, + c => { + let msg = format!("Invalid character ‘{c}’"); + ctx.err_at_position(msg.as_str()); + }, + } { + ctx.expect_punct_p = tok.kind.literalp(); + toks.push(tok); + } + } + + toks.push(Token { + kind: TokenType::Eof, + view: &s[s.len() - 1..], + }); + return TokenizedBuffer { + tokens: toks, + buffer: s, + filename, + }; +} + +fn skip_comment<'a>(ctx: &mut LexerContext<'a>) { + ctx.next(); /* Consume ‘*’ */ + let mut depth = 1; + while let Some(c) = ctx.next() { + match c { + '/' if ctx.peek().is_some_and(|c| c == '*') => { + depth += 1; + ctx.next(); /* Consume ‘*’ */ + }, + '*' if ctx.peek().is_some_and(|c| c == '/') => { + depth -= 1; + ctx.next(); /* Consume ‘/’ */ + if depth == 0 { + return; + } + }, + _ => {}, + }; + } + ctx.err_at_position("Unterminated comment"); +} + +fn tokenize_number_based<'a>(ctx: &mut LexerContext<'a>) -> Token<'a> { + let i = ctx.pos_b; + let alphabet = match ctx.next() { + Some('b') => "01", + Some('o') => "01234567", + Some('d') => "0123456789", + Some('x') => "0123456789ABCDEF", + Some(c) => { + let msg = format!("Invalid number base specifier ‘{c}’"); + ctx.err_at_position(msg.as_str()); + }, + None => ctx.err_at_position("Expected number base specifier after ‘#’"), + }; + let mut tok = match ctx.next() { + Some(c) if alphabet.contains(c) => tokenize_number(ctx, alphabet), + Some(c) => { + let base = match alphabet.len() { + 2 => "binary", + 8 => "octal", + 10 => "decimal", + 16 => "hexadecimal", + _ => unreachable!(), + }; + let msg = format!("Invalid {base} digit ‘{c}’"); + ctx.err_at_position(msg.as_str()); + }, + None => ctx.err_at_position("Expected number after base specifier"), + }; + tok.view = &ctx.string[i..ctx.pos_a]; + return tok; +} + +fn tokenize_number<'a>( + ctx: &mut LexerContext<'a>, + alphabet: &'static str, +) -> Token<'a> { + let i = ctx.pos_b; + span_raw_number(ctx, alphabet, true); + + /* Fractional part */ + if ctx.peek().is_some_and(|c| c == '.') { + ctx.next(); + if ctx.peek().is_some_and(|c| alphabet.contains(c)) { + span_raw_number(ctx, alphabet, false); + } + } + + /* Exponential part */ + if ctx.peek().is_some_and(|c| c == 'e') { + ctx.next(); + span_raw_number(ctx, alphabet, false); + } + + return Token { + kind: TokenType::Number, + view: &ctx.string[i..ctx.pos_a], + }; +} + +fn span_raw_number<'a>( + ctx: &mut LexerContext<'a>, + alphabet: &'static str, + first_digit_lexed_p: bool, +) { + if !first_digit_lexed_p { + match ctx.next() { + Some(c) if alphabet.contains(c) => c, + Some(c) => { + let base = match alphabet.len() { + 2 => "binary", + 8 => "octal", + 10 => "decimal", + 16 => "hexadecimal", + _ => unreachable!(), + }; + let msg = format!("Invalid {base} digit ‘{c}’"); + ctx.err_at_position(msg.as_str()); + }, + None => { + let base = match alphabet.len() { + 2 => "binary", + 8 => "octal", + 10 => "decimal", + 16 => "hexadecimal", + _ => unreachable!(), + }; + let msg = format!( + "Expected {base} digit but reached end-of-file instead" + ); + ctx.err_at_position(msg.as_str()); + }, + }; + } + + let mut last_was_apos_p = false; + while let Some(c) = ctx.peek() { + match c { + '\'' if last_was_apos_p => ctx.err_at_position( + "Multiple concurrent digit separators in numeric literal", + ), + '\'' => { + last_was_apos_p = true; + ctx.next(); + }, + _ if alphabet.contains(c) => { + last_was_apos_p = false; + ctx.next(); + }, + _ => break, + }; + } + + if last_was_apos_p { + ctx.err_at_position( + "Numeric literals may not end with a digit separator", + ); + } +} + +fn tokenize_string<'a>(ctx: &mut LexerContext<'a>) -> Token<'a> { + let i = ctx.pos_b; + loop { + if let Some(c) = ctx.next() { + if c == '"' { + break; + } + } else { + ctx.err_at_position("Unterminated string"); + } + } + return Token { + kind: TokenType::String, + view: &ctx.string[i..ctx.pos_a], + }; +} + +fn tokenize_identifier<'a>(ctx: &mut LexerContext<'a>) -> Token<'a> { + let i = ctx.pos_b; + while ctx.peek().is_some_and(unicode::xid_continue_p) { + ctx.next(); + } + let view = &ctx.string[i..ctx.pos_a]; + let kind = match KEYWORDS.get(view) { + Some(kind) => kind.clone(), + None => TokenType::Identifier, + }; + return Token { kind, view }; +} |