diff options
| author | Thomas Voss <mail@thomasvoss.com> | 2026-03-02 16:27:17 +0100 |
|---|---|---|
| committer | Thomas Voss <mail@thomasvoss.com> | 2026-03-02 18:37:57 +0100 |
| commit | f09e816aee0513031656734cc3cded9827e0b22b (patch) | |
| tree | 06edab862eceb829dd235c1f39265f26b15ed386 /oryxc | |
| parent | 048a4e588c01f65c94d7a6d93c555ca11e0230ae (diff) | |
Significantly improve lexer error reporting
Diffstat (limited to 'oryxc')
| -rw-r--r-- | oryxc/Cargo.lock | 7 | ||||
| -rw-r--r-- | oryxc/Cargo.toml | 1 | ||||
| -rw-r--r-- | oryxc/src/compiler.rs | 26 | ||||
| -rw-r--r-- | oryxc/src/errors.rs | 122 | ||||
| -rw-r--r-- | oryxc/src/lexer.rs | 309 | ||||
| -rw-r--r-- | oryxc/src/unicode/default_ignorable_code_point.rs | 363 | ||||
| -rw-r--r-- | oryxc/src/unicode/mod.rs | 2 |
7 files changed, 306 insertions, 524 deletions
diff --git a/oryxc/Cargo.lock b/oryxc/Cargo.lock index 5514afa..eeaf054 100644 --- a/oryxc/Cargo.lock +++ b/oryxc/Cargo.lock @@ -101,6 +101,7 @@ dependencies = [ "lexopt", "phf", "soa-rs", + "unicode-width", ] [[package]] @@ -271,6 +272,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "537dd038a89878be9b64dd4bd1b260315c1bb94f4d784956b81e27a088d9a09e" [[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + +[[package]] name = "windows-link" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" diff --git a/oryxc/Cargo.toml b/oryxc/Cargo.toml index 88464ca..984d910 100644 --- a/oryxc/Cargo.toml +++ b/oryxc/Cargo.toml @@ -11,3 +11,4 @@ lexopt = "0.1.0" # num-rational = "0.4.2" phf = { version = "0.13.1", features = ["macros"] } soa-rs = "0.9.1" +unicode-width = "0.2.2" diff --git a/oryxc/src/compiler.rs b/oryxc/src/compiler.rs index f3de028..1e539ce 100644 --- a/oryxc/src/compiler.rs +++ b/oryxc/src/compiler.rs @@ -14,6 +14,7 @@ use std::vec::Vec; use std::{ fs, panic, + process, thread, }; @@ -26,6 +27,7 @@ use crossbeam_deque::{ use dashmap::DashMap; use soa_rs::Soa; +use crate::errors::OryxError; use crate::lexer::Token; use crate::parser::AstNode; use crate::{ @@ -111,7 +113,7 @@ where let stealer_view = Arc::clone(&stealer_view); let state = Arc::clone(&state); threads.push(thread::spawn(move || { - worker_loop(id, w, stealer_view, state); + worker_loop(id, state, w, stealer_view); })); } @@ -120,11 +122,24 @@ where } } +fn emit_errors<T>(state: Arc<CompilerState>, file: FileId, errors: T) +where + T: IntoIterator<Item = OryxError>, +{ + let (name, buffer) = { + let fdata = state.files.get(&file).unwrap(); + (fdata.name.clone(), fdata.buffer.clone()) + }; + for e in errors.into_iter() { + e.report(name.as_ref(), buffer.as_ref()); + } +} + fn worker_loop( id: usize, + state: Arc<CompilerState>, queue: Worker<Job>, stealers: Arc<[Stealer<Job>]>, - state: Arc<CompilerState>, ) { loop { if state.njobs.load(Ordering::SeqCst) == 0 { @@ -140,9 +155,12 @@ fn worker_loop( (fdata.name.clone(), fdata.buffer.clone()) }; let (name, buffer) = (name.as_ref(), buffer.as_ref()); - let tokens = match lexer::tokenize(name, buffer) { + let tokens = match lexer::tokenize(buffer) { Ok(xs) => xs, - Err(errs) => todo!(), + Err(e) => { + emit_errors(state.clone(), file, vec![e]); + process::exit(1); + }, }; if state.flags.debug_lexer { diff --git a/oryxc/src/errors.rs b/oryxc/src/errors.rs index b3e6013..b9b5955 100644 --- a/oryxc/src/errors.rs +++ b/oryxc/src/errors.rs @@ -1,8 +1,15 @@ +use std::borrow::Cow; +use std::convert::AsRef; +use std::error::Error; use std::ffi::{ OsStr, OsString, }; -use std::fmt::Display; +use std::fmt::{ + self, + Display, + Formatter, +}; use std::ops::Deref; use std::path::Path; use std::sync::OnceLock; @@ -11,6 +18,13 @@ use std::{ process, }; +use unicode_width::UnicodeWidthStr; + +use crate::unicode; + +const TAB_AS_SPACES: &'static str = " "; +const TABSIZE: usize = TAB_AS_SPACES.len(); + pub fn progname() -> &'static OsString { static ARGV0: OnceLock<OsString> = OnceLock::new(); return ARGV0.get_or_init(|| { @@ -65,3 +79,109 @@ where eprintln!("{}: \x1b[31;1mError:\x1b[0m {}", filename.display(), s); process::exit(1); } + +#[derive(Debug)] +pub struct OryxError { + pub span: (usize, usize), + pub msg: Cow<'static, str>, +} + +impl OryxError { + pub fn new<T>(beg: usize, end: usize, msg: T) -> Self + where + T: Into<Cow<'static, str>>, + { + return Self { + span: (beg, end), + msg: msg.into(), + }; + } + + pub fn report<Tf, Tb>(&self, filename: &Tf, buffer: &Tb) + where + Tf: AsRef<OsStr>, + Tb: AsRef<str>, + { + fn nspaces(n: i32) -> i32 { + return match () { + () if n < 10000 => 6, + () if n < 100000 => 7, + () if n < 1000000 => 8, + () if n < 10000000 => 9, + () if n < 100000000 => 10, + () if n < 1000000000 => 11, + () => 12, + }; + } + + let buffer = buffer.as_ref(); + let (mut line, mut linebeg, mut lineend) = (1, 0, buffer.len()); + for (i, c) in buffer.char_indices() { + if unicode::line_terminator_p(c) { + if i >= self.span.0 { + lineend = i; + break; + } + line += 1; + linebeg = i + c.len_utf8(); + } + } + + let (spanbeg, spanend) = (self.span.0, self.span.1.min(lineend)); + + let errbeg = new_string_with_spaces(&buffer[linebeg..spanbeg]); + let errmid = new_string_with_spaces(&buffer[spanbeg..spanend]); + let errend = new_string_with_spaces(&buffer[spanend..lineend]); + + let errmid = match errmid.len() { + 0 => "_".to_string(), + _ => errmid, + }; + + /* TODO: Do tab math */ + let col = errbeg.width() + 1; + + const FNAMEBEG: &str = "\x1b[37;1m"; + const ERRORBEG: &str = "\x1b[31;1m"; + const FMTEND: &str = "\x1b[0m"; + + eprintln!( + "{FNAMEBEG}{}:{line}:{col}:{FMTEND} {ERRORBEG}error:{FMTEND} {self}", + filename.as_ref().display() + ); + eprintln!(" {line:>4} │ {errbeg}{ERRORBEG}{errmid}{FMTEND}{errend}"); + for _ in 0..nspaces(line) { + eprint!(" "); + } + eprint!("│ "); + for _ in 1..col { + eprint!(" "); + } + eprint!("{ERRORBEG}"); + for _ in 0..errmid.width().max(1) { + eprint!("^"); + } + eprint!("{FMTEND}"); + eprintln!(); + } +} + +fn new_string_with_spaces(s: &str) -> String { + let ntabs = s.bytes().filter(|b| *b == b'\t').count(); + let mut buf = String::with_capacity(s.len() + ntabs * (TABSIZE - 1)); + for c in s.chars() { + match c { + '\t' => buf.push_str(TAB_AS_SPACES), + _ => buf.push(c), + } + } + return buf; +} + +impl Display for OryxError { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + return write!(f, "{}", self.msg); + } +} + +impl Error for OryxError {} diff --git a/oryxc/src/lexer.rs b/oryxc/src/lexer.rs index 2e1a21c..6f05a9c 100644 --- a/oryxc/src/lexer.rs +++ b/oryxc/src/lexer.rs @@ -1,13 +1,9 @@ -use std::borrow::Cow; -use std::ffi::OsStr; -use std::fmt::Display; use std::iter::Peekable; use std::mem; use std::str::{ self, Chars, }; -use std::vec::Vec; use phf; use soa_rs::{ @@ -15,10 +11,8 @@ use soa_rs::{ Soars, }; -use crate::{ - errors, - unicode, -}; +use crate::errors::OryxError; +use crate::unicode; #[allow(dead_code)] #[repr(u8)] @@ -60,17 +54,6 @@ pub enum TokenType { } impl TokenType { - pub fn literalp(&self) -> bool { - return match self { - Self::Identifier - | Self::KeywordDef - | Self::KeywordFunc - | Self::Number - | Self::String => true, - _ => false, - }; - } - /* Tokens that start an expression */ pub fn exprp(&self) -> bool { return match self { @@ -100,41 +83,20 @@ pub struct Token { pub view: Span, } -pub struct Error { - pub pos: usize, - pub msg: Cow<'static, str>, -} - -impl Error { - fn new<T>(pos: usize, msg: T) -> Self - where - T: Into<Cow<'static, str>>, - { - return Self { - pos, - msg: msg.into(), - }; - } -} - struct LexerContext<'a> { - pos_a: usize, /* Pos [a]fter char */ - pos_b: usize, /* Pos [b]efore char */ - chars: Peekable<Chars<'a>>, - string: &'a str, - filename: &'a OsStr, - expect_punct_p: bool, + pos_a: usize, /* Pos [a]fter char */ + pos_b: usize, /* Pos [b]efore char */ + chars: Peekable<Chars<'a>>, + string: &'a str, } impl<'a> LexerContext<'a> { - fn new(filename: &'a OsStr, string: &'a str) -> Self { + fn new(string: &'a str) -> Self { return Self { pos_a: 0, pos_b: 0, chars: string.chars().peekable(), string, - filename, - expect_punct_p: false, }; } @@ -150,22 +112,6 @@ impl<'a> LexerContext<'a> { fn peek(&mut self) -> Option<char> { return self.chars.peek().copied(); } - - fn err_at_position<S>(&self, s: S) -> ! - where - S: Display, - { - errors::err_at_position(self.filename, s); - } - - #[inline(always)] - fn literal_spacing_guard(&self) { - if self.expect_punct_p { - self.err_at_position( - "Two literals may not be directly adjacent to each other", - ); - } - } } static KEYWORDS: phf::Map<&'static str, TokenType> = phf::phf_map! { @@ -174,16 +120,15 @@ static KEYWORDS: phf::Map<&'static str, TokenType> = phf::phf_map! { "return" => TokenType::KeywordReturn, }; -pub fn tokenize(filename: &OsStr, s: &str) -> Result<Soa<Token>, Vec<Error>> { +pub fn tokenize(s: &str) -> Result<Soa<Token>, OryxError> { let mut toks = Soa::<Token>::with_capacity(s.len() / 2); - let mut ctx = LexerContext::new(filename, s); + let mut ctx = LexerContext::new(s); while let Some(c) = ctx.next() { let (i, j) = (ctx.pos_b, ctx.pos_a); if let Some(tok) = match c { '/' if ctx.peek().is_some_and(|c| c == '*') => { - skip_comment(&mut ctx); - ctx.expect_punct_p = false; + skip_comment(&mut ctx)?; None }, '<' if ctx.peek().is_some_and(|c| c == '<') => { @@ -226,34 +171,19 @@ pub fn tokenize(filename: &OsStr, s: &str) -> Result<Soa<Token>, Vec<Error>> { view: Span(i, j), }) }, - '#' => { - ctx.literal_spacing_guard(); - Some(tokenize_number_based(&mut ctx)) - }, - '0'..='9' => { - ctx.literal_spacing_guard(); - Some(tokenize_number(&mut ctx, "0123456789")) - }, - '"' => { - ctx.literal_spacing_guard(); - Some(tokenize_string(&mut ctx)) - }, - _ if unicode::xid_start_p(c) => { - ctx.literal_spacing_guard(); - Some(tokenize_identifier(&mut ctx)) - }, - _ if unicode::pattern_white_space_p(c) => { - if !unicode::default_ignorable_code_point_p(c) { - ctx.expect_punct_p = false; - } - None - }, + '#' => Some(tokenize_number_based(&mut ctx)?), + '0'..='9' => Some(tokenize_number(&mut ctx, "0123456789")?), + '"' => Some(tokenize_string(&mut ctx)?), + _ if unicode::xid_start_p(c) => Some(tokenize_identifier(&mut ctx)), + _ if unicode::pattern_white_space_p(c) => None, c => { - let msg = format!("Invalid character ‘{c}’"); - ctx.err_at_position(msg.as_str()); + return Err(OryxError::new( + i, + j, + format!("Invalid character ‘{c}’"), + )); }, } { - ctx.expect_punct_p = tok.kind.literalp(); toks.push(tok); } } @@ -265,7 +195,8 @@ pub fn tokenize(filename: &OsStr, s: &str) -> Result<Soa<Token>, Vec<Error>> { return Ok(toks); } -fn skip_comment<'a>(ctx: &mut LexerContext<'a>) { +fn skip_comment<'a>(ctx: &mut LexerContext<'a>) -> Result<(), OryxError> { + let beg = ctx.pos_b; ctx.next(); /* Consume ‘*’ */ let mut depth = 1; while let Some(c) = ctx.next() { @@ -278,118 +209,169 @@ fn skip_comment<'a>(ctx: &mut LexerContext<'a>) { depth -= 1; ctx.next(); /* Consume ‘/’ */ if depth == 0 { - return; + return Ok(()); } }, _ => {}, }; } - ctx.err_at_position("Unterminated comment"); + return Err(OryxError::new(beg, ctx.pos_a, "Unterminated comment")); } -fn tokenize_number_based<'a>(ctx: &mut LexerContext<'a>) -> Token { +fn tokenize_number_based<'a>( + ctx: &mut LexerContext<'a>, +) -> Result<Token, OryxError> { let i = ctx.pos_b; let alphabet = match ctx.next() { Some('b') => "01", Some('o') => "01234567", Some('d') => "0123456789", Some('x') => "0123456789ABCDEF", - Some(c) => { - let msg = format!("Invalid number base specifier ‘{c}’"); - ctx.err_at_position(msg.as_str()); + Some(c @ 'B') | Some(c @ 'O') | Some(c @ 'D') | Some(c @ 'X') => { + return Err(OryxError::new( + ctx.pos_b, + ctx.pos_a, + format!( + "Invalid number base specifier ‘{c}’, did you mean ‘{}’?", + c.to_ascii_lowercase() + ), + )); + }, + Some(c) if c.is_alphanumeric() => { + return Err(OryxError::new( + ctx.pos_b, + ctx.pos_a, + format!("Invalid number base specifier ‘{c}’"), + )); + }, + _ => { + return Err(OryxError::new( + i, + i + 1, + "Expected number base specifier after ‘#’", + )); }, - None => ctx.err_at_position("Expected number base specifier after ‘#’"), }; + + let (beg, end) = (ctx.pos_b, ctx.pos_a); let mut tok = match ctx.next() { - Some(c) if alphabet.contains(c) => tokenize_number(ctx, alphabet), - Some(c) => { - let base = match alphabet.len() { - 2 => "binary", - 8 => "octal", - 10 => "decimal", - 16 => "hexadecimal", - _ => unreachable!(), - }; - let msg = format!("Invalid {base} digit ‘{c}’"); - ctx.err_at_position(msg.as_str()); + Some(c) if alphabet.contains(c) => tokenize_number(ctx, alphabet)?, + Some(c) if alphabet.len() == 16 && c.is_ascii_hexdigit() => { + return Err(OryxError::new( + ctx.pos_b, + ctx.pos_a, + format!("Hexadecimal digits must be uppercase"), + )); + }, + Some(c) if c.is_alphanumeric() => { + let base = base2str(alphabet.len()); + return Err(OryxError::new( + ctx.pos_b, + ctx.pos_a, + format!("Invalid {base} digit ‘{c}’"), + )); + }, + Some('\'') => { + return Err(OryxError::new( + ctx.pos_b, + ctx.pos_a, + format!( + "Numeric literals may not begin with a digit separator" + ), + )); + }, + _ => { + let base = base2str(alphabet.len()); + return Err(OryxError::new( + beg, + end, + format!("Expected {base} digit after base specifier"), + )); }, - None => ctx.err_at_position("Expected number after base specifier"), }; tok.view = Span(i, ctx.pos_a); - return tok; + return Ok(tok); } fn tokenize_number<'a>( ctx: &mut LexerContext<'a>, alphabet: &'static str, -) -> Token { +) -> Result<Token, OryxError> { let i = ctx.pos_b; - span_raw_number(ctx, alphabet, true); + span_raw_number(ctx, alphabet, true)?; /* Fractional part */ if ctx.peek().is_some_and(|c| c == '.') { ctx.next(); if ctx.peek().is_some_and(|c| alphabet.contains(c)) { - span_raw_number(ctx, alphabet, false); + span_raw_number(ctx, alphabet, false)?; } } /* Exponential part */ if ctx.peek().is_some_and(|c| c == 'e') { ctx.next(); - span_raw_number(ctx, alphabet, false); + if ctx.peek().is_some_and(|c| c == '+' || c == '-') { + ctx.next(); + } + span_raw_number(ctx, alphabet, false)?; } - return Token { + return Ok(Token { kind: TokenType::Number, view: Span(i, ctx.pos_a), - }; + }); } fn span_raw_number<'a>( ctx: &mut LexerContext<'a>, alphabet: &'static str, first_digit_lexed_p: bool, -) { +) -> Result<(), OryxError> { if !first_digit_lexed_p { match ctx.next() { Some(c) if alphabet.contains(c) => c, - Some(c) => { - let base = match alphabet.len() { - 2 => "binary", - 8 => "octal", - 10 => "decimal", - 16 => "hexadecimal", - _ => unreachable!(), - }; - let msg = format!("Invalid {base} digit ‘{c}’"); - ctx.err_at_position(msg.as_str()); + Some(c) if alphabet.len() == 16 && c.is_ascii_hexdigit() => { + return Err(OryxError::new( + ctx.pos_b, + ctx.pos_a, + format!("Hexadecimal digits must be uppercase"), + )); }, - None => { - let base = match alphabet.len() { - 2 => "binary", - 8 => "octal", - 10 => "decimal", - 16 => "hexadecimal", - _ => unreachable!(), - }; - let msg = format!( - "Expected {base} digit but reached end-of-file instead" - ); - ctx.err_at_position(msg.as_str()); + Some(c) if c.is_alphanumeric() => { + let base = base2str(alphabet.len()); + return Err(OryxError::new( + ctx.pos_b, + ctx.pos_a, + format!("Invalid {base} digit ‘{c}’"), + )); + }, + _ => { + let base = base2str(alphabet.len()); + return Err(OryxError::new( + ctx.pos_b, + ctx.pos_a, + format!("Expected {base} digit"), + )); }, }; } + let (mut beg, mut end) = (0, 0); let mut last_was_apos_p = false; while let Some(c) = ctx.peek() { match c { - '\'' if last_was_apos_p => ctx.err_at_position( - "Multiple concurrent digit separators in numeric literal", - ), + '\'' if last_was_apos_p => { + return Err(OryxError::new( + ctx.pos_b, + ctx.pos_a + 1, + "Numeric literals may not have adjecent digit separators", + )); + }, '\'' => { last_was_apos_p = true; ctx.next(); + (beg, end) = (ctx.pos_b, ctx.pos_a); }, _ if alphabet.contains(c) => { last_was_apos_p = false; @@ -400,27 +382,36 @@ fn span_raw_number<'a>( } if last_was_apos_p { - ctx.err_at_position( + return Err(OryxError::new( + beg, + end, "Numeric literals may not end with a digit separator", - ); + )); } + + return Ok(()); } -fn tokenize_string<'a>(ctx: &mut LexerContext<'a>) -> Token { +fn tokenize_string<'a>(ctx: &mut LexerContext<'a>) -> Result<Token, OryxError> { let i = ctx.pos_b; + loop { - if let Some(c) = ctx.next() { - if c == '"' { - break; - } - } else { - ctx.err_at_position("Unterminated string"); + match ctx.next() { + Some(c) if c == '"' => break, + Some(_) => {}, + None => { + return Err(OryxError::new( + i, + ctx.pos_a, + "Unterminated string literal", + )); + }, } } - return Token { + return Ok(Token { kind: TokenType::String, view: Span(i, ctx.pos_a), - }; + }); } fn tokenize_identifier<'a>(ctx: &mut LexerContext<'a>) -> Token { @@ -435,3 +426,13 @@ fn tokenize_identifier<'a>(ctx: &mut LexerContext<'a>) -> Token { }; return Token { kind, view }; } + +fn base2str(n: usize) -> &'static str { + return match n { + 2 => "binary", + 8 => "octal", + 10 => "decimal", + 16 => "hexadecimal", + _ => unreachable!(), + }; +} diff --git a/oryxc/src/unicode/default_ignorable_code_point.rs b/oryxc/src/unicode/default_ignorable_code_point.rs deleted file mode 100644 index b900a3b..0000000 --- a/oryxc/src/unicode/default_ignorable_code_point.rs +++ /dev/null @@ -1,363 +0,0 @@ -/* Autogenerated – DO NOT EDIT */ - -static DEFAULT_IGNORABLE_CODE_POINT_L1: [u16; 544] = [ - 0, 1, 2, 3, 4, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 7, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -]; -static DEFAULT_IGNORABLE_CODE_POINT_L2: [u64; 320] = [ - 0, - 0, - 35184372088832, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 32768, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 268435456, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 6442450944, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 13510798882111488, - 0, - 63488, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 136339441907712, - 281470681743360, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 68719476736, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 65535, - 0, - 0, - 9223372036854775808, - 0, - 0, - 4294967296, - 143833713099145216, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 64424509440, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 574208952489738240, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, - 18446744073709551615, -]; -#[inline] -pub fn default_ignorable_code_point_p(c: char) -> bool { - let cp = c as usize; - let blki = - unsafe { *DEFAULT_IGNORABLE_CODE_POINT_L1.get_unchecked(cp >> 11) } - as usize; - let in_blk_offset_p = cp & 0x7FF; - let wordi = (blki * 32) + (in_blk_offset_p >> 6); - let biti = in_blk_offset_p & 0x3F; - unsafe { - return (*DEFAULT_IGNORABLE_CODE_POINT_L2.get_unchecked(wordi) - & (1 << biti)) - != 0; - } -} diff --git a/oryxc/src/unicode/mod.rs b/oryxc/src/unicode/mod.rs index 2fbdcb3..e4a719e 100644 --- a/oryxc/src/unicode/mod.rs +++ b/oryxc/src/unicode/mod.rs @@ -1,10 +1,8 @@ -pub mod default_ignorable_code_point; pub mod line_terminator; pub mod pattern_white_space; pub mod xid_continue; pub mod xid_start; -pub use default_ignorable_code_point::default_ignorable_code_point_p; pub use line_terminator::line_terminator_p; pub use pattern_white_space::pattern_white_space_p; pub use xid_continue::xid_continue_p; |