summaryrefslogtreecommitdiff
path: root/oryxc/src/lexer.rs
diff options
context:
space:
mode:
authorThomas Voss <mail@thomasvoss.com> 2026-03-04 00:13:12 +0100
committerThomas Voss <mail@thomasvoss.com> 2026-03-04 00:13:12 +0100
commit4be26b845b8cfddfafa738f620a21894f65c4de6 (patch)
tree61bd43ff93f8d6e6892bf2c3b617c7cd67bc485b /oryxc/src/lexer.rs
parentbb66e26e4fd766397bd0a32863f9d1ac3135744e (diff)
Big parser work + error recovery
Diffstat (limited to 'oryxc/src/lexer.rs')
-rw-r--r--oryxc/src/lexer.rs143
1 files changed, 100 insertions, 43 deletions
diff --git a/oryxc/src/lexer.rs b/oryxc/src/lexer.rs
index a76c643..c82cd2c 100644
--- a/oryxc/src/lexer.rs
+++ b/oryxc/src/lexer.rs
@@ -18,38 +18,47 @@ use crate::unicode;
#[repr(u8)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TokenType {
- Eof = 0,
- Ampersand = '&' as u8,
- AngleL = '<' as u8,
- AngleR = '>' as u8,
- Asterisk = '*' as u8,
- Bar = '|' as u8,
- BraceL = '{' as u8,
- BraceR = '}' as u8,
- BracketL = '[' as u8,
- BracketR = ']' as u8,
- Caret = '^' as u8,
- Comma = ',' as u8,
- Equals = '=' as u8,
- Exclamation = '!' as u8,
- Minus = '-' as u8,
- ParenL = '(' as u8,
- ParenR = ')' as u8,
- Plus = '+' as u8,
- Semicolon = ';' as u8,
- Slash = '/' as u8,
- Tilde = '~' as u8,
+ Eof = 0,
+ Ampersand = '&' as u8,
+ AngleL = '<' as u8,
+ AngleR = '>' as u8,
+ Asterisk = '*' as u8,
+ Bang = '!' as u8,
+ Bar = '|' as u8,
+ BraceL = '{' as u8,
+ BraceR = '}' as u8,
+ BracketL = '[' as u8,
+ BracketR = ']' as u8,
+ Caret = '^' as u8,
+ Comma = ',' as u8,
+ Equals = '=' as u8,
+ Minus = '-' as u8,
+ ParenL = '(' as u8,
+ ParenR = ')' as u8,
+ Percent = '%' as u8,
+ Plus = '+' as u8,
+ Semicolon = ';' as u8,
+ Slash = '/' as u8,
+ Tilde = '~' as u8,
+ _AsciiEnd = 0x7F as u8,
+ Ampersand2,
AmpersandTilde,
AngleL2,
AngleL3,
+ AngleLEquals,
AngleR2,
AngleR3,
+ AngleREquals,
+ BangEquals,
+ Bar2,
Ellipsis,
+ Equals2,
Identifier,
KeywordDef,
KeywordFunc,
KeywordReturn,
Number,
+ Percent2,
String,
}
@@ -58,8 +67,8 @@ impl TokenType {
pub fn exprp(&self) -> bool {
return match self {
Self::Ampersand
+ | Self::Bang
| Self::Caret
- | Self::Exclamation
| Self::Identifier
| Self::KeywordFunc
| Self::Minus
@@ -154,6 +163,55 @@ pub fn tokenize(s: &str) -> Result<Soa<Token>, OryxError> {
view: (i, ctx.pos_a),
})
},
+ '<' if ctx.peek().is_some_and(|c| c == '=') => {
+ ctx.next(); /* Consume ‘=’ */
+ Some(Token {
+ kind: TokenType::AngleLEquals,
+ view: (i, ctx.pos_a),
+ })
+ },
+ '>' if ctx.peek().is_some_and(|c| c == '=') => {
+ ctx.next(); /* Consume ‘=’ */
+ Some(Token {
+ kind: TokenType::AngleREquals,
+ view: (i, ctx.pos_a),
+ })
+ },
+ '|' if ctx.peek().is_some_and(|c| c == '|') => {
+ ctx.next(); /* Consume ‘|’ */
+ Some(Token {
+ kind: TokenType::Bar2,
+ view: (i, ctx.pos_a),
+ })
+ },
+ '&' if ctx.peek().is_some_and(|c| c == '&') => {
+ ctx.next(); /* Consume ‘&’ */
+ Some(Token {
+ kind: TokenType::Ampersand2,
+ view: (i, ctx.pos_a),
+ })
+ },
+ '%' if ctx.peek().is_some_and(|c| c == '%') => {
+ ctx.next(); /* Consume ‘%’ */
+ Some(Token {
+ kind: TokenType::Percent2,
+ view: (i, ctx.pos_a),
+ })
+ },
+ '=' if ctx.peek().is_some_and(|c| c == '=') => {
+ ctx.next(); /* Consume ‘=’ */
+ Some(Token {
+ kind: TokenType::Equals2,
+ view: (i, ctx.pos_a),
+ })
+ },
+ '!' if ctx.peek().is_some_and(|c| c == '=') => {
+ ctx.next(); /* Consume ‘=’ */
+ Some(Token {
+ kind: TokenType::BangEquals,
+ view: (i, ctx.pos_a),
+ })
+ },
'&' if ctx.peek().is_some_and(|c| c == '~') => {
ctx.next(); /* Consume ‘~’ */
Some(Token {
@@ -162,12 +220,11 @@ pub fn tokenize(s: &str) -> Result<Soa<Token>, OryxError> {
})
},
'!' | '&' | '(' | ')' | '*' | '+' | ',' | '-' | '/' | ';' | '<'
- | '=' | '>' | '[' | ']' | '^' | '{' | '|' | '}' | '~' | '…' => {
- Some(Token {
- kind: unsafe { mem::transmute(c as u8) },
- view: (i, j),
- })
- },
+ | '=' | '>' | '[' | ']' | '^' | '{' | '|' | '}' | '~' | '…'
+ | '%' => Some(Token {
+ kind: unsafe { mem::transmute(c as u8) },
+ view: (i, j),
+ }),
'#' => Some(tokenize_number_based(&mut ctx)?),
'0'..='9' => Some(tokenize_number(&mut ctx, "0123456789")?),
'"' => Some(tokenize_string(&mut ctx)?),
@@ -176,7 +233,7 @@ pub fn tokenize(s: &str) -> Result<Soa<Token>, OryxError> {
c => {
return Err(OryxError::new(
(i, j),
- format!("Invalid character ‘{c}’"),
+ format!("invalid character ‘{c}’"),
));
},
} {
@@ -211,7 +268,7 @@ fn skip_comment<'a>(ctx: &mut LexerContext<'a>) -> Result<(), OryxError> {
_ => {},
};
}
- return Err(OryxError::new((beg, ctx.pos_a), "Unterminated comment"));
+ return Err(OryxError::new((beg, ctx.pos_a), "unterminated comment"));
}
fn tokenize_number_based<'a>(
@@ -227,7 +284,7 @@ fn tokenize_number_based<'a>(
return Err(OryxError::new(
(ctx.pos_b, ctx.pos_a),
format!(
- "Invalid number base specifier ‘{c}’, did you mean ‘{}’?",
+ "invalid number base specifier ‘{c}’, did you mean ‘{}’?",
c.to_ascii_lowercase()
),
));
@@ -235,13 +292,13 @@ fn tokenize_number_based<'a>(
Some(c) if c.is_alphanumeric() => {
return Err(OryxError::new(
(ctx.pos_b, ctx.pos_a),
- format!("Invalid number base specifier ‘{c}’"),
+ format!("invalid number base specifier ‘{c}’"),
));
},
_ => {
return Err(OryxError::new(
(i, i + 1),
- "Expected number base specifier after ‘#’",
+ "expected number base specifier after ‘#’",
));
},
};
@@ -252,21 +309,21 @@ fn tokenize_number_based<'a>(
Some(c) if alphabet.len() == 16 && c.is_ascii_hexdigit() => {
return Err(OryxError::new(
(ctx.pos_b, ctx.pos_a),
- format!("Hexadecimal digits must be uppercase"),
+ format!("hexadecimal digits must be uppercase"),
));
},
Some(c) if c.is_alphanumeric() => {
let base = base2str(alphabet.len());
return Err(OryxError::new(
(ctx.pos_b, ctx.pos_a),
- format!("Invalid {base} digit ‘{c}’"),
+ format!("invalid {base} digit ‘{c}’"),
));
},
Some('\'') => {
return Err(OryxError::new(
(ctx.pos_b, ctx.pos_a),
format!(
- "Numeric literals may not begin with a digit separator"
+ "numeric literals may not begin with a digit separator"
),
));
},
@@ -274,7 +331,7 @@ fn tokenize_number_based<'a>(
let base = base2str(alphabet.len());
return Err(OryxError::new(
(beg, end),
- format!("Expected {base} digit after base specifier"),
+ format!("expected {base} digit after base specifier"),
));
},
};
@@ -323,21 +380,21 @@ fn span_raw_number<'a>(
Some(c) if alphabet.len() == 16 && c.is_ascii_hexdigit() => {
return Err(OryxError::new(
(ctx.pos_b, ctx.pos_a),
- format!("Hexadecimal digits must be uppercase"),
+ format!("hexadecimal digits must be uppercase"),
));
},
Some(c) if c.is_alphanumeric() => {
let base = base2str(alphabet.len());
return Err(OryxError::new(
(ctx.pos_b, ctx.pos_a),
- format!("Invalid {base} digit ‘{c}’"),
+ format!("invalid {base} digit ‘{c}’"),
));
},
_ => {
let base = base2str(alphabet.len());
return Err(OryxError::new(
(ctx.pos_b, ctx.pos_a),
- format!("Expected {base} digit"),
+ format!("expected {base} digit"),
));
},
};
@@ -350,7 +407,7 @@ fn span_raw_number<'a>(
'\'' if last_was_apos_p => {
return Err(OryxError::new(
(ctx.pos_b, ctx.pos_a + 1),
- "Numeric literals may not have adjecent digit separators",
+ "numeric literals may not have adjecent digit separators",
));
},
'\'' => {
@@ -369,7 +426,7 @@ fn span_raw_number<'a>(
if last_was_apos_p {
return Err(OryxError::new(
(beg, end),
- "Numeric literals may not end with a digit separator",
+ "numeric literals may not end with a digit separator",
));
}
@@ -386,7 +443,7 @@ fn tokenize_string<'a>(ctx: &mut LexerContext<'a>) -> Result<Token, OryxError> {
None => {
return Err(OryxError::new(
(i, ctx.pos_a),
- "Unterminated string literal",
+ "unterminated string literal",
));
},
}