summary refs log tree commit diff
path: root/oryxc/src/lexer.rs
diff options
context:
space:
mode:
author Thomas Voss <mail@thomasvoss.com> 2026-02-24 11:08:42 +0100
committer Thomas Voss <mail@thomasvoss.com> 2026-02-24 11:08:42 +0100
commit 7d42170c8625de0fe44b98f47e8b9a603a9de794 (patch)
tree 86623e0e0809d23ee3dd861ad7006ff21672e455 /oryxc/src/lexer.rs
Genesis commit
Diffstat (limited to 'oryxc/src/lexer.rs')
-rw-r--r-- oryxc/src/lexer.rs 427
1 files changed, 427 insertions, 0 deletions
diff --git a/oryxc/src/lexer.rs b/oryxc/src/lexer.rs
new file mode 100644
index 0000000..531593d
--- /dev/null
+++ b/oryxc/src/lexer.rs
@@ -0,0 +1,427 @@
+use std::ffi::OsStr;
+use std::fmt::Display;
+use std::{
+ iter,
+ mem,
+ str,
+};
+
+use phf;
+use soa_rs::{
+ self,
+ Soars,
+};
+
+use crate::{
+ errors,
+ size,
+ unicode,
+};
+
+/* The kind of a lexed token.
+
+   Single-byte punctuators use their own ASCII code as discriminant, so
+   that the lexer can turn such a byte directly into its token kind.
+   The remaining kinds are numbered consecutively after ‘~’ (126), which
+   is why Tilde must stay the last variant with an explicit value. */
+#[repr(u8)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum TokenType {
+    Eof = 0,
+
+    /* Single-byte punctuators */
+    Ampersand = b'&',
+    AngleL = b'<',
+    AngleR = b'>',
+    Asterisk = b'*',
+    Bar = b'|',
+    BraceL = b'{',
+    BraceR = b'}',
+    BracketL = b'[',
+    BracketR = b']',
+    Caret = b'^',
+    Comma = b',',
+    Equals = b'=',
+    Exclamation = b'!',
+    Minus = b'-',
+    ParenL = b'(',
+    ParenR = b')',
+    Plus = b'+',
+    Semicolon = b';',
+    Slash = b'/',
+    Tilde = b'~',
+
+    /* Everything else; implicitly numbered from 127 upwards */
+    AmpersandTilde,
+    AngleL2,
+    AngleL3,
+    AngleR2,
+    AngleR3,
+    Ellipsis,
+    Identifier,
+    KeywordDef,
+    KeywordFunc,
+    KeywordReturn,
+    Number,
+    String,
+}
+
+impl TokenType {
+    /* True for identifiers, the ‘def’ and ‘func’ keywords, numbers and
+       strings.  The lexer uses this to reject two such tokens that
+       directly follow one another. */
+    pub fn literalp(&self) -> bool {
+        matches!(
+            self,
+            Self::Identifier
+                | Self::KeywordDef
+                | Self::KeywordFunc
+                | Self::Number
+                | Self::String
+        )
+    }
+
+    /* True for the tokens that can begin an expression */
+    pub fn exprp(&self) -> bool {
+        matches!(
+            self,
+            Self::Ampersand
+                | Self::Caret
+                | Self::Exclamation
+                | Self::Identifier
+                | Self::KeywordFunc
+                | Self::Minus
+                | Self::Number
+                | Self::ParenL
+                | Self::Plus
+                | Self::String
+                | Self::Tilde
+        )
+    }
+}
+
+/* A single token: its kind together with the slice of source text it
+   was lexed from.  tokenize() collects these into a soa_rs::Soa. */
+#[derive(Soars)]
+#[soa_derive(Debug)]
+pub struct Token<'a> {
+ /* What sort of token this is */
+ pub kind: TokenType,
+ /* The token's text, borrowed from the lexed input */
+ pub view: &'a str,
+}
+
+/* The result of tokenize(): the token stream together with the input it
+   was produced from */
+pub struct TokenizedBuffer<'a> {
+ /* The tokens in source order; the last one is always an Eof token */
+ pub tokens: soa_rs::Soa<Token<'a>>,
+ /* The complete source text that was lexed */
+ pub buffer: &'a str,
+ /* The file name that was passed to tokenize(), if any */
+ pub filename: Option<&'a OsStr>,
+}
+
+/* Cursor state shared by the lexing functions.  Both positions are byte
+   offsets into ‘string’ and are updated by LexerContext::next(). */
+struct LexerContext<'a> {
+ pos_a: usize, /* Pos [a]fter char */
+ pos_b: usize, /* Pos [b]efore char */
+ /* Iterator over the chars of ‘string’ with one char of lookahead */
+ chars: iter::Peekable<str::Chars<'a>>,
+ /* The full input; token views are slices of it */
+ string: &'a str,
+ /* File name used when reporting errors; "-" is used when absent */
+ filename: Option<&'a OsStr>,
+ /* Set when the previously lexed token was a literal (as defined by
+    TokenType::literalp), in which case lexing another literal right
+    away is an error.  Cleared by comments and white space. */
+ expect_punct_p: bool,
+}
+
+impl<'a> LexerContext<'a> {
+    /* Creates a context positioned at the very start of ‘string’ */
+    fn new(filename: Option<&'a OsStr>, string: &'a str) -> Self {
+        let chars = string.chars().peekable();
+        Self {
+            pos_a: 0,
+            pos_b: 0,
+            chars,
+            string,
+            filename,
+            expect_punct_p: false,
+        }
+    }
+
+    /* Consumes and returns the next char, or None at end of input.
+       Afterwards pos_b is the byte offset at which the char starts and
+       pos_a the byte offset just past it. */
+    #[inline(always)]
+    fn next(&mut self) -> Option<char> {
+        let ch = self.chars.next()?;
+        let start = self.pos_a;
+        self.pos_b = start;
+        self.pos_a = start + ch.len_utf8();
+        Some(ch)
+    }
+
+    /* Returns the next char without consuming it; the positions are
+       left untouched */
+    #[inline(always)]
+    fn peek(&mut self) -> Option<char> {
+        self.chars.peek().copied()
+    }
+
+    /* Reports the error message ‘s’ for the current file (or "-" when
+       no file name is known) through errors::err_at_position().  Never
+       returns. */
+    fn err_at_position<S: Display>(&self, s: S) -> ! {
+        let name = self.filename.unwrap_or(OsStr::new("-"));
+        errors::err_at_position(name, s)
+    }
+
+    /* To be called before lexing a literal: reports an error if the
+       previous token was a literal as well */
+    #[inline(always)]
+    fn literal_spacing_guard(&self) {
+        if !self.expect_punct_p {
+            return;
+        }
+        self.err_at_position(
+            "Two literals may not be directly adjacent to each other",
+        );
+    }
+}
+
+/* Static table (built by the phf_map! macro) that maps the spelling of
+   each reserved word to its token kind.  tokenize_identifier() looks up
+   every identifier-shaped word here; words that are not found are
+   ordinary identifiers. */
+static KEYWORDS: phf::Map<&'static str, TokenType> = phf::phf_map! {
+ "def" => TokenType::KeywordDef,
+ "func" => TokenType::KeywordFunc,
+ "return" => TokenType::KeywordReturn,
+};
+
+/* Lexes the source text ‘s’ into a TokenizedBuffer.
+
+   ‘filename’ is used when reporting errors and is stored unchanged in
+   the result.  The view of every token is a slice of ‘s’.  Block
+   comments and white space produce no tokens, and the token stream is
+   always terminated by exactly one Eof token whose view is the final
+   character of ‘s’ (or the empty string when ‘s’ is empty).
+
+   Lexical errors are reported through LexerContext::err_at_position(),
+   which does not return. */
+pub fn tokenize<'a>(
+    filename: Option<&'a OsStr>,
+    s: &'a str,
+) -> TokenizedBuffer<'a> {
+    let mut toks = soa_rs::Soa::<Token>::with_capacity(size::kibibytes(10));
+    let mut ctx = LexerContext::new(filename, s);
+
+    while let Some(c) = ctx.next() {
+        /* Byte offsets of the start and end of ‘c’ within ‘s’ */
+        let (i, j) = (ctx.pos_b, ctx.pos_a);
+        if let Some(tok) = match c {
+            /* Block comment; it also separates adjacent literals */
+            '/' if ctx.peek() == Some('*') => {
+                skip_comment(&mut ctx);
+                ctx.expect_punct_p = false;
+                None
+            },
+            '<' if ctx.peek() == Some('<') => {
+                ctx.next(); /* Consume ‘<’ */
+                let kind = if ctx.peek() == Some('<') {
+                    ctx.next(); /* Consume ‘<’ */
+                    TokenType::AngleL3
+                } else {
+                    TokenType::AngleL2
+                };
+                Some(Token {
+                    kind,
+                    view: &s[i..ctx.pos_a],
+                })
+            },
+            '>' if ctx.peek() == Some('>') => {
+                ctx.next(); /* Consume ‘>’ */
+                let kind = if ctx.peek() == Some('>') {
+                    ctx.next(); /* Consume ‘>’ */
+                    TokenType::AngleR3
+                } else {
+                    TokenType::AngleR2
+                };
+                Some(Token {
+                    kind,
+                    view: &s[i..ctx.pos_a],
+                })
+            },
+            '&' if ctx.peek() == Some('~') => {
+                ctx.next(); /* Consume ‘~’ */
+                Some(Token {
+                    kind: TokenType::AmpersandTilde,
+                    view: &s[i..ctx.pos_a],
+                })
+            },
+            /* ‘…’ is U+2026 and must not go through the single-byte arm
+               below: truncated to a u8 it becomes 0x26, which is the
+               code of ‘&’, so it would be lexed as an Ampersand */
+            '…' => Some(Token {
+                kind: TokenType::Ellipsis,
+                view: &s[i..j],
+            }),
+            '!' | '&' | '(' | ')' | '*' | '+' | ',' | '-' | '/' | ';' | '<'
+            | '=' | '>' | '[' | ']' | '^' | '{' | '|' | '}' | '~' => {
+                Some(Token {
+                    /* SAFETY: TokenType is repr(u8), and every character
+                       matched by this arm is ASCII and has a TokenType
+                       variant whose discriminant is its ASCII code */
+                    kind: unsafe {
+                        mem::transmute::<u8, TokenType>(c as u8)
+                    },
+                    view: &s[i..j],
+                })
+            },
+            '#' => {
+                ctx.literal_spacing_guard();
+                Some(tokenize_number_based(&mut ctx))
+            },
+            '0'..='9' => {
+                ctx.literal_spacing_guard();
+                Some(tokenize_number(&mut ctx, "0123456789"))
+            },
+            '"' => {
+                ctx.literal_spacing_guard();
+                Some(tokenize_string(&mut ctx))
+            },
+            _ if unicode::xid_start_p(c) => {
+                ctx.literal_spacing_guard();
+                Some(tokenize_identifier(&mut ctx))
+            },
+            _ if unicode::pattern_white_space_p(c) => {
+                /* Default-ignorable code points do not count as a
+                   separator between two literals */
+                if !unicode::default_ignorable_code_point_p(c) {
+                    ctx.expect_punct_p = false;
+                }
+                None
+            },
+            c => {
+                let msg = format!("Invalid character ‘{c}’");
+                ctx.err_at_position(msg.as_str());
+            },
+        } {
+            ctx.expect_punct_p = tok.kind.literalp();
+            toks.push(tok);
+        }
+    }
+
+    /* The Eof token views the final character of the input.  Its start
+       is found by stepping back one whole char rather than one byte, so
+       that the slice begins on a UTF-8 boundary even when the input
+       ends in a multi-byte character, and so that empty input yields an
+       empty view instead of underflowing. */
+    let eof_start = s.char_indices().next_back().map_or(0, |(idx, _)| idx);
+    toks.push(Token {
+        kind: TokenType::Eof,
+        view: &s[eof_start..],
+    });
+
+    TokenizedBuffer {
+        tokens: toks,
+        buffer: s,
+        filename,
+    }
+}
+
+/* Skips a block comment, honouring nested comments.  Must be called
+   after the opening ‘/’ has been consumed and while the ‘*’ that follows
+   it is the next char.  Reports an error if the input ends before the
+   comment has been closed. */
+fn skip_comment<'a>(ctx: &mut LexerContext<'a>) {
+    ctx.next(); /* Consume ‘*’ */
+
+    /* Number of comments that are currently open */
+    let mut depth = 1;
+    loop {
+        let Some(c) = ctx.next() else {
+            break;
+        };
+        match (c, ctx.peek()) {
+            /* A nested comment is opened */
+            ('/', Some('*')) => {
+                ctx.next(); /* Consume ‘*’ */
+                depth += 1;
+            },
+            /* The innermost open comment is closed */
+            ('*', Some('/')) => {
+                ctx.next(); /* Consume ‘/’ */
+                depth -= 1;
+                if depth == 0 {
+                    return;
+                }
+            },
+            _ => {},
+        }
+    }
+
+    ctx.err_at_position("Unterminated comment");
+}
+
+/* Lexes a number with an explicit base: ‘#’, then one of the specifiers
+   ‘b’, ‘o’, ‘d’ or ‘x’, then the number itself.  Must be called right
+   after the ‘#’ has been consumed.  The view of the returned token
+   starts at the ‘#’. */
+fn tokenize_number_based<'a>(ctx: &mut LexerContext<'a>) -> Token<'a> {
+    let start = ctx.pos_b;
+
+    let Some(spec) = ctx.next() else {
+        ctx.err_at_position("Expected number base specifier after ‘#’")
+    };
+    /* The permitted digits, and the name of the base for diagnostics */
+    let (alphabet, base) = match spec {
+        'b' => ("01", "binary"),
+        'o' => ("01234567", "octal"),
+        'd' => ("0123456789", "decimal"),
+        'x' => ("0123456789ABCDEF", "hexadecimal"),
+        c => {
+            let msg = format!("Invalid number base specifier ‘{c}’");
+            ctx.err_at_position(msg.as_str())
+        },
+    };
+
+    /* tokenize_number() expects the first digit to be consumed already */
+    let Some(digit) = ctx.next() else {
+        ctx.err_at_position("Expected number after base specifier")
+    };
+    if !alphabet.contains(digit) {
+        let c = digit;
+        let msg = format!("Invalid {base} digit ‘{c}’");
+        ctx.err_at_position(msg.as_str());
+    }
+
+    let number = tokenize_number(ctx, alphabet);
+    /* Widen the view so that it covers the ‘#’ and the base specifier */
+    Token {
+        view: &ctx.string[start..ctx.pos_a],
+        ..number
+    }
+}
+
+/* Lexes a number whose digits are drawn from ‘alphabet’: an integer
+   part, optionally followed by ‘.’ and a fractional part (which may be
+   empty), optionally followed by ‘e’ and an exponent.  Must be called
+   right after the first digit has been consumed. */
+fn tokenize_number<'a>(
+    ctx: &mut LexerContext<'a>,
+    alphabet: &'static str,
+) -> Token<'a> {
+    let start = ctx.pos_b;
+
+    /* Integer part */
+    span_raw_number(ctx, alphabet, true);
+
+    /* Fractional part; the digits after the ‘.’ may be omitted */
+    if ctx.peek() == Some('.') {
+        ctx.next();
+        if matches!(ctx.peek(), Some(d) if alphabet.contains(d)) {
+            span_raw_number(ctx, alphabet, false);
+        }
+    }
+
+    /* Exponential part; at least one digit is required after the ‘e’ */
+    if ctx.peek() == Some('e') {
+        ctx.next();
+        span_raw_number(ctx, alphabet, false);
+    }
+
+    Token {
+        kind: TokenType::Number,
+        view: &ctx.string[start..ctx.pos_a],
+    }
+}
+
+/* Consumes a run of digits from ‘alphabet’ in which single apostrophes
+   may be used as digit separators.
+
+   If ‘first_digit_lexed_p’ is false the first digit has not yet been
+   consumed, and it is an error for the next char not to be a digit.
+   It is also an error for two separators to be adjacent, or for the run
+   to end with a separator. */
+fn span_raw_number<'a>(
+    ctx: &mut LexerContext<'a>,
+    alphabet: &'static str,
+    first_digit_lexed_p: bool,
+) {
+    /* The name of the base whose digits make up ‘alphabet’ */
+    fn base_name(alphabet: &str) -> &'static str {
+        match alphabet.len() {
+            2 => "binary",
+            8 => "octal",
+            10 => "decimal",
+            16 => "hexadecimal",
+            _ => unreachable!(),
+        }
+    }
+
+    if !first_digit_lexed_p {
+        match ctx.next() {
+            Some(c) if alphabet.contains(c) => {},
+            Some(c) => {
+                let base = base_name(alphabet);
+                let msg = format!("Invalid {base} digit ‘{c}’");
+                ctx.err_at_position(msg.as_str());
+            },
+            None => {
+                let base = base_name(alphabet);
+                let msg = format!(
+                    "Expected {base} digit but reached end-of-file instead"
+                );
+                ctx.err_at_position(msg.as_str());
+            },
+        }
+    }
+
+    /* Whether the most recently consumed char was a separator */
+    let mut prev_sep_p = false;
+    while let Some(c) = ctx.peek() {
+        let sep_p = c == '\'';
+        if sep_p && prev_sep_p {
+            ctx.err_at_position(
+                "Multiple concurrent digit separators in numeric literal",
+            );
+        }
+        if !sep_p && !alphabet.contains(c) {
+            break;
+        }
+        prev_sep_p = sep_p;
+        ctx.next();
+    }
+
+    if prev_sep_p {
+        ctx.err_at_position(
+            "Numeric literals may not end with a digit separator",
+        );
+    }
+}
+
+/* Lexes a string literal.  Must be called right after the opening ‘"’
+   has been consumed.  Everything up to and including the next ‘"’ is
+   part of the string, and the view of the returned token includes both
+   quotes.  Reports an error if the input ends first. */
+fn tokenize_string<'a>(ctx: &mut LexerContext<'a>) -> Token<'a> {
+    let start = ctx.pos_b;
+    while let Some(c) = ctx.next() {
+        if c == '"' {
+            return Token {
+                kind: TokenType::String,
+                view: &ctx.string[start..ctx.pos_a],
+            };
+        }
+    }
+    ctx.err_at_position("Unterminated string")
+}
+
+/* Lexes an identifier or keyword.  Must be called right after the first
+   char of the word has been consumed.  The word extends for as long as
+   unicode::xid_continue_p() accepts the next char; it is a keyword if
+   it is listed in KEYWORDS and an identifier otherwise. */
+fn tokenize_identifier<'a>(ctx: &mut LexerContext<'a>) -> Token<'a> {
+    let start = ctx.pos_b;
+    while let Some(c) = ctx.peek() {
+        if !unicode::xid_continue_p(c) {
+            break;
+        }
+        ctx.next();
+    }
+
+    let view = &ctx.string[start..ctx.pos_a];
+    let kind = KEYWORDS
+        .get(view)
+        .copied()
+        .unwrap_or(TokenType::Identifier);
+    Token { kind, view }
+}