From 7d42170c8625de0fe44b98f47e8b9a603a9de794 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Tue, 24 Feb 2026 11:08:42 +0100 Subject: Genesis commit --- .gitignore | 3 + oryxc/.gitignore | 1 + oryxc/Cargo.lock | 277 ++++++ oryxc/Cargo.toml | 13 + oryxc/rustfmt.toml | 1 + oryxc/src/compiler.rs | 136 +++ oryxc/src/errors.rs | 67 ++ oryxc/src/intern.rs | 68 ++ oryxc/src/lexer.rs | 427 +++++++++ oryxc/src/main.rs | 95 ++ oryxc/src/parser.rs | 544 +++++++++++ oryxc/src/size.rs | 3 + oryxc/src/unicode/default_ignorable_code_point.rs | 363 ++++++++ oryxc/src/unicode/line_terminator.rs | 135 +++ oryxc/src/unicode/mod.rs | 11 + oryxc/src/unicode/pattern_white_space.rs | 137 +++ oryxc/src/unicode/xid_continue.rs | 1007 +++++++++++++++++++++ oryxc/src/unicode/xid_start.rs | 927 +++++++++++++++++++ rustfmt.toml | 14 + test.x | 33 + unigen/Cargo.lock | 16 + unigen/Cargo.toml | 7 + unigen/fetch | 19 + unigen/rustfmt.toml | 1 + unigen/src/main.rs | 278 ++++++ 25 files changed, 4583 insertions(+) create mode 100644 .gitignore create mode 100644 oryxc/.gitignore create mode 100644 oryxc/Cargo.lock create mode 100644 oryxc/Cargo.toml create mode 120000 oryxc/rustfmt.toml create mode 100644 oryxc/src/compiler.rs create mode 100644 oryxc/src/errors.rs create mode 100644 oryxc/src/intern.rs create mode 100644 oryxc/src/lexer.rs create mode 100644 oryxc/src/main.rs create mode 100644 oryxc/src/parser.rs create mode 100644 oryxc/src/size.rs create mode 100644 oryxc/src/unicode/default_ignorable_code_point.rs create mode 100644 oryxc/src/unicode/line_terminator.rs create mode 100644 oryxc/src/unicode/mod.rs create mode 100644 oryxc/src/unicode/pattern_white_space.rs create mode 100644 oryxc/src/unicode/xid_continue.rs create mode 100644 oryxc/src/unicode/xid_start.rs create mode 100644 rustfmt.toml create mode 100644 test.x create mode 100644 unigen/Cargo.lock create mode 100644 unigen/Cargo.toml create mode 100755 unigen/fetch create mode 120000 unigen/rustfmt.toml create mode 100644 unigen/src/main.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..987ff3e --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +oryxc/target/ +unigen/data/ +unigen/target/ diff --git a/oryxc/.gitignore b/oryxc/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/oryxc/.gitignore @@ -0,0 +1 @@ +/target diff --git a/oryxc/Cargo.lock b/oryxc/Cargo.lock new file mode 100644 index 0000000..5514afa --- /dev/null +++ b/oryxc/Cargo.lock @@ -0,0 +1,277 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "bitflags" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "lexopt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c5d9b5843e8c9311ff602e6bd50855015e99e75159c2c54fe104cfac241f552" + +[[package]] +name = "libc" +version = "0.2.181" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "459427e2af2b9c839b132acb702a1c654d95e10f8c326bfc2ad11310e458b1c5" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "oryx" +version = "0.1.0" +dependencies = [ + "crossbeam-deque", + "dashmap", + "lexopt", + "phf", + "soa-rs", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + +[[package]] +name = "phf" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" +dependencies = [ + "phf_macros", + "phf_shared", + "serde", +] + +[[package]] +name = "phf_generator" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" +dependencies = [ + "fastrand", + "phf_shared", +] + +[[package]] +name = "phf_macros" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "phf_shared" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" +dependencies = [ + "siphasher", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "siphasher" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "soa-rs" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf20e367c4676b712916633bc735e97d18cf4d7c2a88b0e29a43446790d029d6" +dependencies = [ + "soa-rs-derive", +] + +[[package]] +name = "soa-rs-derive" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3de734b144ae71c89c91cdc45de3bd22e99e4efb1098e2635797fc0ee0566172" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "syn" +version = "2.0.115" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e614ed320ac28113fa64972c4262d5dbc89deacdfd00c34a3e4cea073243c12" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "537dd038a89878be9b64dd4bd1b260315c1bb94f4d784956b81e27a088d9a09e" + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" diff --git a/oryxc/Cargo.toml b/oryxc/Cargo.toml new file mode 100644 index 0000000..88464ca --- /dev/null +++ b/oryxc/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "oryx" +version = "0.1.0" +edition = "2024" + +[dependencies] +crossbeam-deque = "0.8.6" +dashmap = "6.1.0" +# icu = { version = "2.1.1", features = ["compiled_data"] } +lexopt = "0.1.0" +# num-rational = "0.4.2" +phf = { version = "0.13.1", features = ["macros"] } +soa-rs = "0.9.1" diff --git a/oryxc/rustfmt.toml b/oryxc/rustfmt.toml new file mode 120000 index 0000000..39f97b0 --- /dev/null +++ b/oryxc/rustfmt.toml @@ -0,0 +1 @@ +../rustfmt.toml \ No newline at end of file diff --git a/oryxc/src/compiler.rs b/oryxc/src/compiler.rs new file mode 100644 index 0000000..05e275f --- /dev/null +++ b/oryxc/src/compiler.rs @@ -0,0 +1,136 @@ +use std::ffi::OsString; +use std::iter::IntoIterator; +use std::sync::Arc; +use std::sync::atomic::{ + AtomicUsize, + Ordering, +}; +use std::vec::Vec; +use std::{ + panic, + thread, +}; + +use crossbeam_deque::{ + Injector, + Steal, + Stealer, + Worker, +}; +use dashmap::DashMap; + +use crate::Flags; + +#[derive(Clone, Copy, Eq, Hash, PartialEq)] +pub struct FileId(u32); + +pub struct FileData { + name: OsString, +} + +pub enum Job { + LexAndParse { file: FileId }, + TypeCheck { file: FileId }, +} + +pub struct CompilerState { + pub files: DashMap, + pub globalq: Injector, + pub njobs: AtomicUsize, + pub flags: Flags, +} + +pub fn start(paths: T, flags: Flags) +where + T: IntoIterator, +{ + let state = Arc::new(CompilerState { + files: DashMap::new(), + globalq: Injector::new(), + njobs: AtomicUsize::new(0), + flags, + }); + for (i, path) in paths.into_iter().enumerate() { + let id = FileId(i as u32); + state.files.insert(id, FileData { name: path.clone() }); + state.njobs.fetch_add(1, Ordering::SeqCst); + state.globalq.push(Job::LexAndParse { file: id }); + } + + let mut workers = Vec::with_capacity(flags.threads); + let mut stealers = Vec::with_capacity(flags.threads); + for _ in 0..flags.threads { + let w = Worker::new_fifo(); + stealers.push(w.stealer()); + workers.push(w); + } + + let mut threads = Vec::with_capacity(flags.threads); + let stealer_view: Arc<[_]> = Arc::from(stealers); + + for (id, w) in workers.into_iter().enumerate() { + let stealer_view = Arc::clone(&stealer_view); + let state = Arc::clone(&state); + threads.push(thread::spawn(move || { + worker_loop(id, w, stealer_view, state); + })); + } + + for t in threads { + t.join().unwrap_or_else(|e| panic::resume_unwind(e)); + } +} + +fn worker_loop( + id: usize, + queue: Worker, + stealers: Arc<[Stealer]>, + state: Arc, +) { + loop { + if state.njobs.load(Ordering::SeqCst) == 0 { + break; + } + + let job = find_task(&queue, &state.globalq, &stealers); + if let Some(job) = job { + match job { + LexAndParse { file } => {}, + } + + state.njobs.fetch_sub(1, Ordering::SeqCst); + } else { + thread::yield_now(); + } + } +} + +fn find_task( + localq: &Worker, + globalq: &Injector, + stealers: &Arc<[Stealer]>, +) -> Option { + if let Some(job) = localq.pop() { + return Some(job); + } + + loop { + match globalq.steal_batch_and_pop(localq) { + Steal::Success(job) => return Some(job), + Steal::Empty => break, + Steal::Retry => continue, + } + } + + for s in stealers.iter() { + loop { + match s.steal_batch_and_pop(localq) { + Steal::Success(job) => return Some(job), + Steal::Empty => break, + Steal::Retry => continue, + } + } + } + + return None; +} diff --git a/oryxc/src/errors.rs b/oryxc/src/errors.rs new file mode 100644 index 0000000..b3e6013 --- /dev/null +++ b/oryxc/src/errors.rs @@ -0,0 +1,67 @@ +use std::ffi::{ + OsStr, + OsString, +}; +use std::fmt::Display; +use std::ops::Deref; +use std::path::Path; +use std::sync::OnceLock; +use std::{ + env, + process, +}; + +pub fn progname() -> &'static OsString { + static ARGV0: OnceLock = OnceLock::new(); + return ARGV0.get_or_init(|| { + let default = OsStr::new("oryxc"); + let s = env::args_os().next().unwrap_or(default.into()); + return Path::new(&s).file_name().unwrap_or(default).to_os_string(); + }); +} + +#[macro_export] +macro_rules! warn { + ($err:expr, $fmt:literal, $($arg:tt)*) => {{ + use crate::errors::progname; + let _ = eprintln!("{}: {}: {}", progname().display(), + format_args!($fmt, $($arg)*), $err); + }}; + + ($err:expr, $fmt:literal) => {{ + warn!($err, $fmt,); + }}; + + ($err:expr) => {{ + use crate::errors::progname; + let _ = eprintln!("{}: {}", progname().display(), $err); + }}; +} + +#[macro_export] +macro_rules! err { + ($err:expr, $fmt:literal, $($arg:tt)*) => {{ + use crate::warn; + warn!($err, $fmt, $($arg)*); + std::process::exit(1); + }}; + + ($err:expr, $fmt:literal) => {{ + err!($err, $fmt,); + }}; + + ($err:expr) => {{ + use crate::warn; + warn!($err); + std::process::exit(1); + }}; +} + +pub fn err_at_position(filename: T, s: S) -> ! +where + T: Deref, + S: Display, +{ + eprintln!("{}: \x1b[31;1mError:\x1b[0m {}", filename.display(), s); + process::exit(1); +} diff --git a/oryxc/src/intern.rs b/oryxc/src/intern.rs new file mode 100644 index 0000000..3ab91cf --- /dev/null +++ b/oryxc/src/intern.rs @@ -0,0 +1,68 @@ +use std::hash; + +use dashmap; +use icu::normalizer; + +#[repr(transparent)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Key(u32); + +pub struct Interner<'a> { + map: dashmap::DashMap, Key>, + store: Vec<&'a str>, +} + +#[derive(Eq)] +pub struct UniStr<'a>(pub &'a str); + +impl hash::Hash for UniStr<'_> { + fn hash(&self, state: &mut H) { + if self.0.is_ascii() { + self.0.chars().for_each(|c| c.hash(state)); + } else { + let nfkd = normalizer::DecomposingNormalizer::new_nfkd(); + nfkd.normalize_iter(self.0.chars()).for_each(|c| c.hash(state)); + } + } +} + +impl PartialEq for UniStr<'_> { + fn eq(&self, other: &Self) -> bool { + let nfkd = normalizer::DecomposingNormalizer::new_nfkd(); + return match (self.0.is_ascii(), other.0.is_ascii()) { + (true, true) => self.0 == other.0, + (true, false) => { + self.0.chars().eq(nfkd.normalize_iter(other.0.chars())) + }, + (false, true) => { + other.0.chars().eq(nfkd.normalize_iter(self.0.chars())) + }, + (false, false) => nfkd + .normalize_iter(self.0.chars()) + .eq(nfkd.normalize_iter(other.0.chars())), + }; + } +} + +impl<'a> Interner<'a> { + pub fn new() -> Self { + return Interner { + map: dashmap::DashMap::new(), + store: Vec::new(), + }; + } + + pub fn get(&self, key: Key) -> &str { + return self.store[key.0 as usize]; + } + + pub fn intern(&mut self, value: &'a str) -> Key { + if let Some(key) = self.map.get(&UniStr(value)) { + return *key; + } + let key = Key(self.store.len() as u32); + self.map.insert(UniStr(value), key); + self.store.push(value); + return key; + } +} diff --git a/oryxc/src/lexer.rs b/oryxc/src/lexer.rs new file mode 100644 index 0000000..531593d --- /dev/null +++ b/oryxc/src/lexer.rs @@ -0,0 +1,427 @@ +use std::ffi::OsStr; +use std::fmt::Display; +use std::{ + iter, + mem, + str, +}; + +use phf; +use soa_rs::{ + self, + Soars, +}; + +use crate::{ + errors, + size, + unicode, +}; + +#[repr(u8)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum TokenType { + Eof = 0, + Ampersand = '&' as u8, + AngleL = '<' as u8, + AngleR = '>' as u8, + Asterisk = '*' as u8, + Bar = '|' as u8, + BraceL = '{' as u8, + BraceR = '}' as u8, + BracketL = '[' as u8, + BracketR = ']' as u8, + Caret = '^' as u8, + Comma = ',' as u8, + Equals = '=' as u8, + Exclamation = '!' as u8, + Minus = '-' as u8, + ParenL = '(' as u8, + ParenR = ')' as u8, + Plus = '+' as u8, + Semicolon = ';' as u8, + Slash = '/' as u8, + Tilde = '~' as u8, + AmpersandTilde, + AngleL2, + AngleL3, + AngleR2, + AngleR3, + Ellipsis, + Identifier, + KeywordDef, + KeywordFunc, + KeywordReturn, + Number, + String, +} + +impl TokenType { + pub fn literalp(&self) -> bool { + return match self { + Self::Identifier + | Self::KeywordDef + | Self::KeywordFunc + | Self::Number + | Self::String => true, + _ => false, + }; + } + + /* Tokens that start an expression */ + pub fn exprp(&self) -> bool { + return match self { + Self::Ampersand + | Self::Caret + | Self::Exclamation + | Self::Identifier + | Self::KeywordFunc + | Self::Minus + | Self::Number + | Self::ParenL + | Self::Plus + | Self::String + | Self::Tilde => true, + _ => false, + }; + } +} + +#[derive(Soars)] +#[soa_derive(Debug)] +pub struct Token<'a> { + pub kind: TokenType, + pub view: &'a str, +} + +pub struct TokenizedBuffer<'a> { + pub tokens: soa_rs::Soa>, + pub buffer: &'a str, + pub filename: Option<&'a OsStr>, +} + +struct LexerContext<'a> { + pos_a: usize, /* Pos [a]fter char */ + pos_b: usize, /* Pos [b]efore char */ + chars: iter::Peekable>, + string: &'a str, + filename: Option<&'a OsStr>, + expect_punct_p: bool, +} + +impl<'a> LexerContext<'a> { + fn new(filename: Option<&'a OsStr>, string: &'a str) -> Self { + return Self { + pos_a: 0, + pos_b: 0, + chars: string.chars().peekable(), + string, + filename, + expect_punct_p: false, + }; + } + + #[inline(always)] + fn next(&mut self) -> Option { + let c = self.chars.next()?; + self.pos_b = self.pos_a; + self.pos_a += c.len_utf8(); + return Some(c); + } + + #[inline(always)] + fn peek(&mut self) -> Option { + return self.chars.peek().copied(); + } + + fn err_at_position(&self, s: S) -> ! + where + S: Display, + { + errors::err_at_position(self.filename.unwrap_or(OsStr::new("-")), s); + } + + #[inline(always)] + fn literal_spacing_guard(&self) { + if self.expect_punct_p { + self.err_at_position( + "Two literals may not be directly adjacent to each other", + ); + } + } +} + +static KEYWORDS: phf::Map<&'static str, TokenType> = phf::phf_map! { + "def" => TokenType::KeywordDef, + "func" => TokenType::KeywordFunc, + "return" => TokenType::KeywordReturn, +}; + +pub fn tokenize<'a>( + filename: Option<&'a OsStr>, + s: &'a str, +) -> TokenizedBuffer<'a> { + let mut toks = soa_rs::Soa::::with_capacity(size::kibibytes(10)); + let mut ctx = LexerContext::new(filename, s); + + while let Some(c) = ctx.next() { + let (i, j) = (ctx.pos_b, ctx.pos_a); + if let Some(tok) = match c { + '/' if ctx.peek().is_some_and(|c| c == '*') => { + skip_comment(&mut ctx); + ctx.expect_punct_p = false; + None + }, + '<' if ctx.peek().is_some_and(|c| c == '<') => { + ctx.next(); /* Consume ‘<’ */ + let kind = if ctx.peek().is_some_and(|c| c == '<') { + ctx.next(); /* Consume ‘<’ */ + TokenType::AngleL3 + } else { + TokenType::AngleL2 + }; + Some(Token { + kind, + view: &s[i..ctx.pos_a], + }) + }, + '>' if ctx.peek().is_some_and(|c| c == '>') => { + ctx.next(); /* Consume ‘>’ */ + let kind = if ctx.peek().is_some_and(|c| c == '>') { + ctx.next(); /* Consume ‘>’ */ + TokenType::AngleR3 + } else { + TokenType::AngleR2 + }; + Some(Token { + kind, + view: &s[i..ctx.pos_a], + }) + }, + '&' if ctx.peek().is_some_and(|c| c == '~') => { + ctx.next(); /* Consume ‘~’ */ + Some(Token { + kind: TokenType::AmpersandTilde, + view: &s[i..j + 1], + }) + }, + '!' | '&' | '(' | ')' | '*' | '+' | ',' | '-' | '/' | ';' | '<' + | '=' | '>' | '[' | ']' | '^' | '{' | '|' | '}' | '~' | '…' => { + Some(Token { + kind: unsafe { mem::transmute(c as u8) }, + view: &s[i..j], + }) + }, + '#' => { + ctx.literal_spacing_guard(); + Some(tokenize_number_based(&mut ctx)) + }, + '0'..='9' => { + ctx.literal_spacing_guard(); + Some(tokenize_number(&mut ctx, "0123456789")) + }, + '"' => { + ctx.literal_spacing_guard(); + Some(tokenize_string(&mut ctx)) + }, + _ if unicode::xid_start_p(c) => { + ctx.literal_spacing_guard(); + Some(tokenize_identifier(&mut ctx)) + }, + _ if unicode::pattern_white_space_p(c) => { + if !unicode::default_ignorable_code_point_p(c) { + ctx.expect_punct_p = false; + } + None + }, + c => { + let msg = format!("Invalid character ‘{c}’"); + ctx.err_at_position(msg.as_str()); + }, + } { + ctx.expect_punct_p = tok.kind.literalp(); + toks.push(tok); + } + } + + toks.push(Token { + kind: TokenType::Eof, + view: &s[s.len() - 1..], + }); + return TokenizedBuffer { + tokens: toks, + buffer: s, + filename, + }; +} + +fn skip_comment<'a>(ctx: &mut LexerContext<'a>) { + ctx.next(); /* Consume ‘*’ */ + let mut depth = 1; + while let Some(c) = ctx.next() { + match c { + '/' if ctx.peek().is_some_and(|c| c == '*') => { + depth += 1; + ctx.next(); /* Consume ‘*’ */ + }, + '*' if ctx.peek().is_some_and(|c| c == '/') => { + depth -= 1; + ctx.next(); /* Consume ‘/’ */ + if depth == 0 { + return; + } + }, + _ => {}, + }; + } + ctx.err_at_position("Unterminated comment"); +} + +fn tokenize_number_based<'a>(ctx: &mut LexerContext<'a>) -> Token<'a> { + let i = ctx.pos_b; + let alphabet = match ctx.next() { + Some('b') => "01", + Some('o') => "01234567", + Some('d') => "0123456789", + Some('x') => "0123456789ABCDEF", + Some(c) => { + let msg = format!("Invalid number base specifier ‘{c}’"); + ctx.err_at_position(msg.as_str()); + }, + None => ctx.err_at_position("Expected number base specifier after ‘#’"), + }; + let mut tok = match ctx.next() { + Some(c) if alphabet.contains(c) => tokenize_number(ctx, alphabet), + Some(c) => { + let base = match alphabet.len() { + 2 => "binary", + 8 => "octal", + 10 => "decimal", + 16 => "hexadecimal", + _ => unreachable!(), + }; + let msg = format!("Invalid {base} digit ‘{c}’"); + ctx.err_at_position(msg.as_str()); + }, + None => ctx.err_at_position("Expected number after base specifier"), + }; + tok.view = &ctx.string[i..ctx.pos_a]; + return tok; +} + +fn tokenize_number<'a>( + ctx: &mut LexerContext<'a>, + alphabet: &'static str, +) -> Token<'a> { + let i = ctx.pos_b; + span_raw_number(ctx, alphabet, true); + + /* Fractional part */ + if ctx.peek().is_some_and(|c| c == '.') { + ctx.next(); + if ctx.peek().is_some_and(|c| alphabet.contains(c)) { + span_raw_number(ctx, alphabet, false); + } + } + + /* Exponential part */ + if ctx.peek().is_some_and(|c| c == 'e') { + ctx.next(); + span_raw_number(ctx, alphabet, false); + } + + return Token { + kind: TokenType::Number, + view: &ctx.string[i..ctx.pos_a], + }; +} + +fn span_raw_number<'a>( + ctx: &mut LexerContext<'a>, + alphabet: &'static str, + first_digit_lexed_p: bool, +) { + if !first_digit_lexed_p { + match ctx.next() { + Some(c) if alphabet.contains(c) => c, + Some(c) => { + let base = match alphabet.len() { + 2 => "binary", + 8 => "octal", + 10 => "decimal", + 16 => "hexadecimal", + _ => unreachable!(), + }; + let msg = format!("Invalid {base} digit ‘{c}’"); + ctx.err_at_position(msg.as_str()); + }, + None => { + let base = match alphabet.len() { + 2 => "binary", + 8 => "octal", + 10 => "decimal", + 16 => "hexadecimal", + _ => unreachable!(), + }; + let msg = format!( + "Expected {base} digit but reached end-of-file instead" + ); + ctx.err_at_position(msg.as_str()); + }, + }; + } + + let mut last_was_apos_p = false; + while let Some(c) = ctx.peek() { + match c { + '\'' if last_was_apos_p => ctx.err_at_position( + "Multiple concurrent digit separators in numeric literal", + ), + '\'' => { + last_was_apos_p = true; + ctx.next(); + }, + _ if alphabet.contains(c) => { + last_was_apos_p = false; + ctx.next(); + }, + _ => break, + }; + } + + if last_was_apos_p { + ctx.err_at_position( + "Numeric literals may not end with a digit separator", + ); + } +} + +fn tokenize_string<'a>(ctx: &mut LexerContext<'a>) -> Token<'a> { + let i = ctx.pos_b; + loop { + if let Some(c) = ctx.next() { + if c == '"' { + break; + } + } else { + ctx.err_at_position("Unterminated string"); + } + } + return Token { + kind: TokenType::String, + view: &ctx.string[i..ctx.pos_a], + }; +} + +fn tokenize_identifier<'a>(ctx: &mut LexerContext<'a>) -> Token<'a> { + let i = ctx.pos_b; + while ctx.peek().is_some_and(unicode::xid_continue_p) { + ctx.next(); + } + let view = &ctx.string[i..ctx.pos_a]; + let kind = match KEYWORDS.get(view) { + Some(kind) => kind.clone(), + None => TokenType::Identifier, + }; + return Token { kind, view }; +} diff --git a/oryxc/src/main.rs b/oryxc/src/main.rs new file mode 100644 index 0000000..298093d --- /dev/null +++ b/oryxc/src/main.rs @@ -0,0 +1,95 @@ +#![allow(unsafe_op_in_unsafe_fn)] + +mod compiler; +mod errors; +mod lexer; +mod parser; +mod size; +mod unicode; + +use std::ffi::OsString; +use std::{ + env, + fs, + process, + thread, +}; + +use lexopt; + +#[derive(Clone, Copy, Default)] +pub struct Flags { + pub debug_lexer: bool, + pub debug_parser: bool, + pub help: bool, + pub threads: usize, +} + +impl Flags { + fn parse() -> Result<(Flags, Vec), lexopt::Error> { + use lexopt::prelude::*; + + let mut rest = Vec::with_capacity(env::args().len()); + let mut flags = Flags::default(); + let mut parser = lexopt::Parser::from_env(); + + while let Some(arg) = parser.next()? { + match arg { + Short('h') | Long("help") => flags.help = true, + Short('l') | Long("debug-lexer") => flags.debug_lexer = true, + Short('p') | Long("debug-parser") => flags.debug_parser = true, + Short('t') | Long("threads") => { + flags.threads = parser.value()?.parse()?; + if flags.threads == 0 { + err!("thread count must be greater than 0"); + } + }, + Value(v) => rest.push(v), + _ => return Err(arg.unexpected()), + } + } + + if flags.threads == 0 { + flags.threads = thread::available_parallelism().map_or_else( + |e| { + warn!(e, "failed to get thread count"); + 1 + }, + |x| x.get(), + ); + } + + return Ok((flags, rest)); + } +} + +fn usage() { + eprintln!( + concat!("Usage: {0} [-lp] [-t threads]\n", " {0} -h"), + errors::progname().display() + ); +} + +fn main() { + let (flags, rest) = match Flags::parse() { + Ok(v) => v, + Err(e) => { + warn!(e); + usage(); + process::exit(1); + }, + }; + + if flags.help { + usage(); + process::exit(0); + } + + compiler::start(rest, flags); + // let tokbuf = lexer::tokenize(Some(file), s.as_str()); + // let (ast, extra_data) = parser::parse(&tokbuf); + + // if flags.debug_lexer { + // tokbuf.tokens.iter().for_each(|t| println!("{t:?}")); + // } +} diff --git a/oryxc/src/parser.rs b/oryxc/src/parser.rs new file mode 100644 index 0000000..212d0db --- /dev/null +++ b/oryxc/src/parser.rs @@ -0,0 +1,544 @@ +use std::ffi::OsStr; +use std::fmt::Display; +use std::mem::ManuallyDrop; +use std::vec::Vec; + +use soa_rs::{ + Soa, + Soars, +}; + +use crate::lexer::{ + TokenType, + TokenizedBuffer, +}; +use crate::{ + errors, + size, +}; + +const MIN_PREC: i64 = 0; +const MAX_PREC: i64 = 6; + +#[repr(u8)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum AstType { + Assign, /* (ident-token, expression) */ + Block, /* (extra-data, _) */ + Dereference, /* (lhs, _) */ + FunCall, /* (expression, extra-data) */ + FunProto, /* (extra-data, _) */ + Function, /* (prototype, body) */ + Identifier, /* (_, _) */ + MultiDefBind, /* (extra-data, _) */ + Number, /* (token, _) */ + Pointer, /* (rhs, _) */ + Return, /* (extra-data, _) */ + String, /* (token, _) */ + Type, /* (type-data, _) */ + UnaryOperator, /* (rhs, _) */ + BinaryOperator, /* (lhs, rhs) */ +} + +#[derive(Clone, Copy, Debug)] +pub struct SubNodes(u32, u32); + +impl Default for SubNodes { + fn default() -> Self { + return Self(u32::MAX, u32::MAX); + } +} + +#[derive(Soars)] +#[soa_derive(Debug)] +pub struct AstNode { + pub kind: AstType, + pub tok: u32, + pub sub: SubNodes, +} + +pub struct DeclData { + lhs: Vec<(u32, u32)>, /* (ident, type) tuple */ + rhs: Vec, +} + +pub struct FunCallData { + args: Vec, +} + +pub struct FunProtoData { + args: Vec<(u32, u32)>, /* (ident, type) tuple */ + ret: Vec, +} + +pub struct BlockData { + stmts: Vec, +} + +pub struct ReturnData { + exprs: Vec, +} + +pub union ExtraData { + block: ManuallyDrop, + decl: ManuallyDrop, + funcall: ManuallyDrop, + funproto: ManuallyDrop, + r#return: ManuallyDrop, +} + +struct Parser<'a, 'b> { + ast: Soa, + extra_data: Vec, + tokbuf: &'a TokenizedBuffer<'b>, + cursor: u32, + scratch: Vec, +} + +impl<'a, 'b> Parser<'a, 'b> { + fn new(tokbuf: &'a TokenizedBuffer<'b>) -> Self { + return Self { + ast: Soa::with_capacity(size::kibibytes(10)), + extra_data: Vec::with_capacity(size::kibibytes(1)), + tokbuf, + cursor: 0, + scratch: Vec::with_capacity(64), + }; + } + + #[inline(always)] + fn get(&self) -> TokenType { + return unsafe { + *self + .tokbuf + .tokens + .kind() + .get_unchecked(self.cursor as usize) + }; + } + + #[inline(always)] + fn next(&mut self) -> TokenType { + self.cursor += 1; + return self.get(); + } + + #[inline(always)] + fn get_n_move(&mut self) -> TokenType { + let t = self.get(); + self.cursor += 1; + return t; + } + + #[inline(always)] + fn new_node(&mut self, n: AstNode) -> u32 { + self.ast.push(n); + return (self.ast.len() - 1) as u32; + } + + #[inline(always)] + fn new_extra_data(&mut self, d: ExtraData) -> u32 { + self.extra_data.push(d); + return (self.extra_data.len() - 1) as u32; + } + + fn err_at_position(&self, i: u32, s: T) -> ! + where + T: Display, + { + errors::err_at_position( + self.tokbuf.filename.unwrap_or(OsStr::new("-")), + s, + ); + } + + fn parse_toplevel(&mut self) { + match self.get() { + TokenType::KeywordDef => self.parse_def(), + TokenType::Eof => return, + _ => { + let msg = format!( + "Expected top-level statement but got {:?}", + self.get() + ); + self.err_at_position(self.cursor, msg.as_str()); + }, + }; + } + + fn parse_stmt(&mut self) -> u32 { + return match self.get() { + TokenType::KeywordDef => self.parse_def(), + TokenType::KeywordReturn => { + let main_tok = self.cursor; + self.next(); /* Consume ‘return’ */ + let exprs = self.parse_expr_list(); + if self.get_n_move() != TokenType::Semicolon { + self.err_at_position( + self.cursor - 1, + "Expected semicolon after return statement", + ); + } + let i = self.new_extra_data(ExtraData { + r#return: ManuallyDrop::new(ReturnData { exprs }), + }); + self.new_node(AstNode { + kind: AstType::Return, + tok: main_tok, + sub: SubNodes(i, u32::MAX), + }) + }, + t if t.exprp() => { + let k = self.parse_expr(MIN_PREC); + if self.get_n_move() != TokenType::Semicolon { + self.err_at_position( + self.cursor - 1, + "Expected semicolon after expression", + ); + } + k + }, + _ => { + let msg = + format!("Expected statement but got {:?}", self.get()); + self.err_at_position(self.cursor, msg.as_str()); + }, + }; + } + + fn parse_def(&mut self) -> u32 { + let main_tok = self.cursor; + if self.get_n_move() != TokenType::KeywordDef { + self.err_at_position(self.cursor - 1, "Expected ‘def’"); + } + let lhs = self.parse_decl_list(); + if lhs.len() == 0 { + self.err_at_position(main_tok, "Expected an identifier"); + } + + if self.get_n_move() != TokenType::Equals { + self.err_at_position(self.cursor - 1, "Expected ‘=’"); + } + + let rhs = self.parse_expr_list(); + if rhs.len() == 0 { + self.err_at_position( + self.cursor - 1, + "Expected expression after ‘=’", + ); + } + if self.get_n_move() != TokenType::Semicolon { + self.err_at_position(self.cursor - 1, "Expected semicolon"); + } + + let i = self.new_extra_data(ExtraData { + decl: ManuallyDrop::new(DeclData { lhs, rhs }), + }); + return self.new_node(AstNode { + kind: AstType::MultiDefBind, + tok: main_tok, + sub: SubNodes(i as u32, u32::MAX), + }); + } + + fn parse_func_proto(&mut self) -> u32 { + let main_tok = self.cursor; + + /* No params or return */ + if self.next() != TokenType::ParenL { + return self.new_node(AstNode { + kind: AstType::FunProto, + tok: main_tok, + sub: SubNodes(u32::MAX, u32::MAX), + }); + } + + self.next(); /* Consume ‘(’ */ + let args = self.parse_decl_list(); + + if self.get_n_move() != TokenType::ParenR { + self.err_at_position( + self.cursor - 1, + "Expected closing parenthesis", + ); + } + + let t = self.get(); + let ret = match t { + TokenType::ParenL => { + self.next(); /* Consume ‘(’ */ + let xs = self.parse_expr_list(); + if self.get_n_move() != TokenType::ParenR { + self.err_at_position( + self.cursor - 1, + "Expected closing parenthesis", + ); + } + xs + }, + _ if t.exprp() => { + // TODO: This is really bad. We should probably optimize + // for the small cases (or use an arena?) + vec![self.parse_expr(MIN_PREC)] + }, + _ => Vec::new(), /* Doesn’t allocate */ + }; + + let i = self.new_extra_data(ExtraData { + funproto: ManuallyDrop::new(FunProtoData { args, ret }), + }); + return self.new_node(AstNode { + kind: AstType::FunProto, + tok: main_tok, + sub: SubNodes(i, u32::MAX), + }); + } + + fn parse_block(&mut self) -> u32 { + let main_tok = self.cursor; + if self.get_n_move() != TokenType::BraceL { + self.err_at_position(self.cursor - 1, "Expected opening brace"); + } + + let mut stmts = Vec::::with_capacity(64); + while self.get() != TokenType::BraceR { + stmts.push(self.parse_stmt()); + } + self.next(); /* Consume ‘}’ */ + let i = self.new_extra_data(ExtraData { + block: ManuallyDrop::new(BlockData { stmts }), + }); + return self.new_node(AstNode { + kind: AstType::Block, + tok: main_tok, + sub: SubNodes(i, u32::MAX), + }); + } + + fn parse_decl_list(&mut self) -> Vec<(u32, u32)> { + let scratch_beg = self.scratch.len(); + let (mut nidents, mut nuntyped) = (0, 0); + loop { + if self.get() != TokenType::Identifier { + break; + } + self.scratch.push(self.cursor); + self.scratch.push(u32::MAX); + nidents += 1; + nuntyped += 1; + + match self.next() { + TokenType::Comma => { + self.next(); + }, + t if t.exprp() => { + let k = self.parse_expr(MIN_PREC); + let len = self.scratch.len(); + for i in 0..nuntyped { + self.scratch[len - 1 - 2 * i] = k; + } + nuntyped = 0; + }, + _ => break, + }; + } + + let mut iter = self.scratch.drain(scratch_beg..); + let mut pairs = Vec::with_capacity(nidents); + while let (Some(a), Some(b)) = (iter.next(), iter.next()) { + pairs.push((a, b)); + } + return pairs; + } + + fn parse_expr_list(&mut self) -> Vec { + let scratch_beg = self.scratch.len(); + + while self.get().exprp() { + let k = self.parse_expr(MIN_PREC); + self.scratch.push(k); + if self.get() == TokenType::Comma { + self.next(); + } else { + break; + } + } + + return self.scratch.drain(scratch_beg..).collect(); + } + + fn parse_expr(&mut self, minprec: i64) -> u32 { + fn getprec(t: TokenType) -> i64 { + match t { + TokenType::ParenL => 6, + TokenType::Ampersand + | TokenType::AmpersandTilde + | TokenType::AngleL2 + | TokenType::AngleL3 + | TokenType::AngleR2 + | TokenType::AngleR3 + | TokenType::Asterisk + | TokenType::Slash => 5, + TokenType::Bar + | TokenType::Minus + | TokenType::Plus + | TokenType::Tilde => 4, + TokenType::AngleL | TokenType::AngleR => 3, + _ => -1, + } + } + + let mut lhs = match self.get() { + TokenType::Identifier => { + self.next(); + self.new_node(AstNode { + kind: AstType::Identifier, + tok: self.cursor - 1, + sub: SubNodes::default(), + }) + }, + TokenType::Number => { + self.next(); + self.new_node(AstNode { + kind: AstType::Number, + tok: self.cursor - 1, + sub: SubNodes::default(), + }) + }, + TokenType::String => { + self.next(); + self.new_node(AstNode { + kind: AstType::String, + tok: self.cursor - 1, + sub: SubNodes::default(), + }) + }, + TokenType::Ampersand + | TokenType::Exclamation + | TokenType::Minus + | TokenType::Plus + | TokenType::Tilde => { + let i = self.cursor; + self.next(); + let lhs = self.parse_expr(MAX_PREC); + self.new_node(AstNode { + kind: AstType::UnaryOperator, + tok: i, + sub: SubNodes(lhs, u32::MAX), + }) + }, + TokenType::ParenL => { + self.next(); + let k = self.parse_expr(MIN_PREC); + if self.get() != TokenType::ParenR { + self.err_at_position( + self.cursor, + "Expected closing parenthesis", + ); + } + self.next(); /* Consume ‘)’ */ + k + }, + TokenType::Caret => { + let tok = self.cursor; + self.next(); + let k = self.parse_expr(MAX_PREC); + self.new_node(AstNode { + kind: AstType::Pointer, + tok, + sub: SubNodes(k, u32::MAX), + }) + }, + TokenType::KeywordFunc => { + let tok = self.cursor; + let proto = self.parse_func_proto(); + if self.get() == TokenType::BraceL { + let body = self.parse_block(); + self.new_node(AstNode { + kind: AstType::Function, + tok, + sub: SubNodes(proto, body), + }) + } else { + proto + } + }, + _ => self.err_at_position(self.cursor, "Expected expression"), + }; + + loop { + let tok = self.get(); + let prec = getprec(tok); + if prec < minprec { + break; + } + + lhs = match tok { + /* Binop */ + TokenType::Ampersand + | TokenType::AmpersandTilde + | TokenType::AngleL2 + | TokenType::AngleL3 + | TokenType::AngleR2 + | TokenType::AngleR3 + | TokenType::Asterisk + | TokenType::Slash + | TokenType::Bar + | TokenType::Minus + | TokenType::Plus + | TokenType::Tilde + | TokenType::AngleL + | TokenType::AngleR => { + let i = self.cursor; + self.next(); + let rhs = self.parse_expr(prec); + self.new_node(AstNode { + kind: AstType::BinaryOperator, + tok: i, + sub: SubNodes(lhs, rhs), + }) + }, + + /* Dereference */ + TokenType::Caret => { + self.next(); + self.new_node(AstNode { + kind: AstType::Dereference, + tok: self.cursor - 1, + sub: SubNodes(lhs, u32::MAX), + }) + }, + + /* Funcall */ + TokenType::ParenL => { + let tok = self.cursor; + self.next(); + let args = self.parse_expr_list(); + if self.get_n_move() != TokenType::ParenR { + self.err_at_position(self.cursor - 1, "Expected ‘)’"); + } + let i = self.new_extra_data(ExtraData { + funcall: ManuallyDrop::new(FunCallData { args }), + }); + self.new_node(AstNode { + kind: AstType::FunCall, + tok, + sub: SubNodes(lhs, i), + }) + }, + + _ => break, + } + } + + return lhs; + } +} + +pub fn parse(tokbuf: &TokenizedBuffer) -> (Soa, Vec) { + let mut p = Parser::new(tokbuf); + while p.get() != TokenType::Eof { + p.parse_toplevel(); + } + return (p.ast, p.extra_data); +} diff --git a/oryxc/src/size.rs b/oryxc/src/size.rs new file mode 100644 index 0000000..4f639c2 --- /dev/null +++ b/oryxc/src/size.rs @@ -0,0 +1,3 @@ +pub const fn kibibytes(n: usize) -> usize { + return n * 1024; +} diff --git a/oryxc/src/unicode/default_ignorable_code_point.rs b/oryxc/src/unicode/default_ignorable_code_point.rs new file mode 100644 index 0000000..b900a3b --- /dev/null +++ b/oryxc/src/unicode/default_ignorable_code_point.rs @@ -0,0 +1,363 @@ +/* Autogenerated – DO NOT EDIT */ + +static DEFAULT_IGNORABLE_CODE_POINT_L1: [u16; 544] = [ + 0, 1, 2, 3, 4, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 7, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +]; +static DEFAULT_IGNORABLE_CODE_POINT_L2: [u64; 320] = [ + 0, + 0, + 35184372088832, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 32768, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 268435456, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 6442450944, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 13510798882111488, + 0, + 63488, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 136339441907712, + 281470681743360, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 68719476736, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 65535, + 0, + 0, + 9223372036854775808, + 0, + 0, + 4294967296, + 143833713099145216, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 64424509440, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 574208952489738240, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, +]; +#[inline] +pub fn default_ignorable_code_point_p(c: char) -> bool { + let cp = c as usize; + let blki = + unsafe { *DEFAULT_IGNORABLE_CODE_POINT_L1.get_unchecked(cp >> 11) } + as usize; + let in_blk_offset_p = cp & 0x7FF; + let wordi = (blki * 32) + (in_blk_offset_p >> 6); + let biti = in_blk_offset_p & 0x3F; + unsafe { + return (*DEFAULT_IGNORABLE_CODE_POINT_L2.get_unchecked(wordi) + & (1 << biti)) + != 0; + } +} diff --git a/oryxc/src/unicode/line_terminator.rs b/oryxc/src/unicode/line_terminator.rs new file mode 100644 index 0000000..e30e031 --- /dev/null +++ b/oryxc/src/unicode/line_terminator.rs @@ -0,0 +1,135 @@ +/* Autogenerated – DO NOT EDIT */ + +static LINE_TERMINATOR_L1: [u16; 544] = [ + 0, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +]; +static LINE_TERMINATOR_L2: [u64; 96] = [ + 15360, + 0, + 32, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 3298534883328, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, +]; +#[inline] +pub fn line_terminator_p(c: char) -> bool { + let cp = c as usize; + let blki = unsafe { *LINE_TERMINATOR_L1.get_unchecked(cp >> 11) } as usize; + let in_blk_offset_p = cp & 0x7FF; + let wordi = (blki * 32) + (in_blk_offset_p >> 6); + let biti = in_blk_offset_p & 0x3F; + unsafe { + return (*LINE_TERMINATOR_L2.get_unchecked(wordi) & (1 << biti)) != 0; + } +} diff --git a/oryxc/src/unicode/mod.rs b/oryxc/src/unicode/mod.rs new file mode 100644 index 0000000..2fbdcb3 --- /dev/null +++ b/oryxc/src/unicode/mod.rs @@ -0,0 +1,11 @@ +pub mod default_ignorable_code_point; +pub mod line_terminator; +pub mod pattern_white_space; +pub mod xid_continue; +pub mod xid_start; + +pub use default_ignorable_code_point::default_ignorable_code_point_p; +pub use line_terminator::line_terminator_p; +pub use pattern_white_space::pattern_white_space_p; +pub use xid_continue::xid_continue_p; +pub use xid_start::xid_start_p; diff --git a/oryxc/src/unicode/pattern_white_space.rs b/oryxc/src/unicode/pattern_white_space.rs new file mode 100644 index 0000000..b051e3a --- /dev/null +++ b/oryxc/src/unicode/pattern_white_space.rs @@ -0,0 +1,137 @@ +/* Autogenerated – DO NOT EDIT */ + +static PATTERN_WHITE_SPACE_L1: [u16; 544] = [ + 0, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +]; +static PATTERN_WHITE_SPACE_L2: [u64; 96] = [ + 4294983168, + 0, + 32, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 3298534932480, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, +]; +#[inline] +pub fn pattern_white_space_p(c: char) -> bool { + let cp = c as usize; + let blki = + unsafe { *PATTERN_WHITE_SPACE_L1.get_unchecked(cp >> 11) } as usize; + let in_blk_offset_p = cp & 0x7FF; + let wordi = (blki * 32) + (in_blk_offset_p >> 6); + let biti = in_blk_offset_p & 0x3F; + unsafe { + return (*PATTERN_WHITE_SPACE_L2.get_unchecked(wordi) & (1 << biti)) + != 0; + } +} diff --git a/oryxc/src/unicode/xid_continue.rs b/oryxc/src/unicode/xid_continue.rs new file mode 100644 index 0000000..8fbbce4 --- /dev/null +++ b/oryxc/src/unicode/xid_continue.rs @@ -0,0 +1,1007 @@ +/* Autogenerated – DO NOT EDIT */ + +static XID_CONTINUE_L1: [u16; 1088] = [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 12, 12, 12, 12, 12, 13, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 14, 15, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 9, 9, 9, 9, 9, 9, + 9, 9, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 9, 29, 12, 30, 12, + 12, 31, 32, 9, 9, 9, 9, 9, 9, 33, 9, 34, 35, 12, 12, 12, 12, 12, 12, 12, + 36, 9, 9, 9, 9, 9, 9, 9, 37, 38, 9, 9, 39, 9, 9, 9, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 9, 9, 49, 9, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 50, 12, 12, 12, 12, 51, 12, 12, 12, + 12, 52, 12, 12, 12, 12, 12, 12, 53, 54, 9, 9, 55, 9, 12, 12, 12, 12, 56, + 12, 12, 12, 12, 12, 12, 12, 12, 57, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 58, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, +]; +static XID_CONTINUE_L2: [u64; 944] = [ + 287948901175001088, + 576460745995190270, + 333270770471927808, + 18410715276682199039, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 88094074470339, + 18446744073709551615, + 13321647697761927167, + 18446744056529672128, + 18428729675200069631, + 18446744073709551615, + 18446744073709551615, + 18446744073709550843, + 18446744073709551615, + 18446462598732840959, + 18446744069456527359, + 13835058055282033151, + 2119858418286774, + 18446744069548736512, + 18446678103011885055, + 18446744073709551615, + 11529212845433552895, + 18446744073709486080, + 18446744073709545471, + 1125899906842623, + 2612087783874887679, + 70368744177663, + 18446471390799331327, + 18446744073701228287, + 18446744056529682431, + 18446744073709551615, + 18446462392574410751, + 17565725197581524975, + 5765733215448889759, + 15235112390417287150, + 18014125208779143, + 17576984196650090478, + 18302910150157089727, + 17576984196649951214, + 844217444219295, + 14123225865944680428, + 281200107273671, + 17582050746231021567, + 281265452367327, + 17577547146603651055, + 4221916082617823, + 18446744073709412351, + 18158794964244397535, + 3457638613854978030, + 3658904103781503, + 576460752303423486, + 67076095, + 4611685674830002134, + 4093607775, + 14024213633433600001, + 18446216308128218879, + 2305843009196916703, + 64, + 18446744073709551615, + 18446744073709487103, + 18446744070488326143, + 17870283321406070975, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744070446333439, + 9168765891372858879, + 18446744073701162813, + 18446744073696837631, + 1123704775901183, + 18446744069414649855, + 4557642822898941951, + 18446744073709551614, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446638520593285119, + 18446744069548802046, + 144053615424700415, + 9007197111451647, + 3905461007941631, + 18446744073709551615, + 4394566287359, + 18446744069481674752, + 144115188075855871, + 18446471394825863167, + 18014398509481983, + 1152657619668697087, + 8796093022207936, + 18446480190918885375, + 134153215, + 18446744069683019775, + 11529215043920986111, + 13834777130128311295, + 17588964818943, + 18446744073709551615, + 4494803601399807, + 18446744073709551615, + 4503599627370495, + 72057594037927935, + 4611686018427380735, + 16717361816799217663, + 576460752302833664, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744070475743231, + 4611686017001275199, + 6908521828386340863, + 2295745090394464220, + 9231253336202686464, + 9223934986817634305, + 536805376, + 562821641207808, + 17582049991377026180, + 18446744069414601696, + 511, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 4494940973301759, + 18446498607738650623, + 9223513873854758911, + 9187201948305063935, + 18446744071553646463, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 2251518330118602976, + 18446744073709551614, + 18446744068986765311, + 18446744073709551615, + 18446462598732840928, + 18446744073709551615, + 18446744069414617087, + 18446462598732840960, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 8191, + 4611686018427322368, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 17592185987071, + 13830835930631503871, + 18446744073709551615, + 1125899906842623, + 18446744060816261120, + 18446744073709551615, + 18446744073709550079, + 18446181124293001215, + 18691697672191, + 4503599627370495, + 18446744073709551615, + 16789419406609285183, + 18446532967477018623, + 2305843004919775231, + 18446744073709551615, + 9223372032626884609, + 36028797018963967, + 18194542490348896255, + 18446744073709551615, + 35184368733388807, + 18446602782178705022, + 18446466996645134335, + 18446744073709551615, + 288010473826156543, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446462667452317695, + 1152921504606845055, + 0, + 0, + 0, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446532967477018623, + 18446744073709551615, + 67108863, + 6881498031078244479, + 18446744073709551579, + 1125899906842623, + 18446744073709027328, + 18446744073709551615, + 18446744006063816703, + 18446744073709551615, + 18446744073709551615, + 4611686018427387903, + 18446744073709486080, + 18446744073709355007, + 287948901175001343, + 7036870122864639, + 12288634533233876992, + 18446744073709551615, + 2305843009213693951, + 9799832780635308032, + 18446743936404815870, + 9223372036854775807, + 486341884, + 13258596753222922239, + 1073692671, + 18446744073709551615, + 576460752303423487, + 0, + 9007199254740991, + 0, + 2305843009213693952, + 0, + 0, + 18446744069951455231, + 4295098367, + 18446708893632430079, + 576460752303359999, + 18446744070488326143, + 4128527, + 18446744073709551615, + 18446744073709551615, + 18446466993558126591, + 1152921504591118335, + 18446463698244468735, + 17870001915148894207, + 2016486715970549759, + 4503599627370495, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 36028797018963967, + 1095220854783, + 575897802350002111, + 0, + 10502394331027995967, + 36028792728190975, + 2147483647, + 15762594400829440, + 288230371860938751, + 67108863, + 13907115649320091647, + 0, + 9745789593611923567, + 2305843004918726656, + 536870911, + 549755813631, + 18014398509481983, + 2251795522912255, + 262143, + 0, + 18446744073709551615, + 511, + 2251799813685247, + 2251799813685247, + 287950000686628863, + 18446671780820025343, + 63, + 0, + 0, + 0, + 875211255709695, + 18158513697557840124, + 18446463149025525759, + 18446462598732972031, + 18446462598732841023, + 36028792723996703, + 18446744073709551615, + 9241386160486350975, + 576460752303423487, + 287951100198191108, + 18437736874454810623, + 22517998136787184, + 18446744073709551615, + 402644511, + 13907115649319829503, + 3, + 18446464796682337663, + 287957697268023295, + 18153444948953374703, + 8760701963286943, + 18428729675200023551, + 25770850213, + 18446744073709551615, + 16173172735, + 18446744073709551615, + 67043519, + 0, + 0, + 18392700878181105663, + 1056964609, + 18446744073709551615, + 67043345, + 144115188075855871, + 68719412223, + 287966492958392319, + 127, + 0, + 0, + 576460752303423487, + 0, + 18446744069414584320, + 9223376434901286911, + 17996384110963061375, + 67043343, + 18446740770879700992, + 120208752639, + 9223372036854775807, + 18446744073709486208, + 18446462599336820735, + 144115188075855871, + 0, + 1095216660480, + 0, + 287948909764935679, + 18410715276690587135, + 18445618173869752321, + 36027697507139583, + 0, + 13006395723845991295, + 18446741595580465407, + 18446466992517644287, + 4394019979263, + 0, + 0, + 0, + 36028792723996672, + 14411518807585456127, + 134152199, + 281474976710656, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 67108863, + 0, + 18446744073709551615, + 140737488355327, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 15, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 18446744073709486080, + 562949953421311, + 281474976710655, + 18446744069418778623, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 576460752303423487, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 127, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 288230376151711743, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 144115188075855871, + 18446466994631868415, + 9223372036854775807, + 8796093022143487, + 36028797018963967, + 16212958624241090575, + 65535, + 0, + 0, + 0, + 0, + 0, + 0, + 287984085547089919, + 0, + 0, + 0, + 18446744073709551615, + 18014398505187016704, + 1048575, + 18446744073709551615, + 18446744073709520895, + 4294934783, + 35747438006370304, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 9223372036858970111, + 2147483647, + 0, + 18446744073709551615, + 2251799813685247, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 8065665457643847680, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 1125934266580991, + 18446463629527547904, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 1152921504606846975, + 0, + 0, + 0, + 0, + 18446744073709551615, + 2305570330330005503, + 1677656575, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 287948901175001088, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 18446532967477018623, + 127, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 17872504197455282176, + 65970697670631, + 0, + 0, + 28, + 0, + 0, + 0, + 0, + 0, + 0, + 18446744073709551615, + 18446744073707454463, + 17005555242810474495, + 18446744073709551599, + 8935141660164089791, + 18446744073709419615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446743249075830783, + 17870283321271910397, + 18437736874452713471, + 18446603336221163519, + 18446741874686295551, + 18446744073709539319, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 17906312118425092095, + 9042383626829823, + 281470547525648, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 8660801552383, + 0, + 0, + 0, + 18446471240106377087, + 70368744177663, + 32768, + 0, + 4611439727822766079, + 17407, + 0, + 0, + 0, + 0, + 140737488289792, + 288230376151711743, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 288230376151646208, + 0, + 0, + 0, + 576460752303357952, + 0, + 0, + 0, + 13853072451644162047, + 0, + 0, + 0, + 9223213153129594880, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 8323103, + 18446744073709551615, + 67047423, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 790380184120328175, + 6843210385291930244, + 1152917029519358975, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 287948901175001088, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 4294967295, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744070488326143, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446532967477018623, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446462607322775551, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 1073741823, + 0, + 0, + 0, + 0, + 0, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 1073741823, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709488127, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 288230376151711743, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 281474976710655, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, +]; +#[inline] +pub fn xid_continue_p(c: char) -> bool { + let cp = c as usize; + let blki = unsafe { *XID_CONTINUE_L1.get_unchecked(cp >> 10) } as usize; + let in_blk_offset_p = cp & 0x3FF; + let wordi = (blki * 16) + (in_blk_offset_p >> 6); + let biti = in_blk_offset_p & 0x3F; + unsafe { + return (*XID_CONTINUE_L2.get_unchecked(wordi) & (1 << biti)) != 0; + } +} diff --git a/oryxc/src/unicode/xid_start.rs b/oryxc/src/unicode/xid_start.rs new file mode 100644 index 0000000..1c9d9ae --- /dev/null +++ b/oryxc/src/unicode/xid_start.rs @@ -0,0 +1,927 @@ +/* Autogenerated – DO NOT EDIT */ + +static XID_START_L1: [u16; 1088] = [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 12, 12, 12, 12, 12, 13, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 14, 15, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 9, 9, 9, 9, 9, 9, + 9, 9, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 9, 29, 12, 30, 12, + 12, 31, 32, 9, 9, 9, 9, 9, 9, 33, 9, 34, 35, 12, 12, 12, 12, 12, 12, 12, + 36, 9, 9, 9, 9, 9, 9, 9, 37, 38, 9, 9, 39, 9, 9, 9, 9, 9, 40, 9, 41, 42, + 43, 44, 45, 9, 9, 9, 9, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 46, 12, 12, 12, 12, 47, 12, 12, 12, 12, + 48, 12, 12, 12, 12, 12, 12, 49, 50, 9, 9, 51, 9, 12, 12, 12, 12, 52, 12, + 12, 12, 12, 12, 12, 12, 12, 53, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, +]; +static XID_START_L2: [u64; 864] = [ + 68719476736, + 576460745995190270, + 297241973452963840, + 18410715276682199039, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 88094074470339, + 0, + 13321366222785216512, + 18446744056529672000, + 18428729675200069631, + 18446744073709551615, + 18446744073709551615, + 18446744073709550595, + 18446744073709551615, + 18446462598732840959, + 18446744069456527359, + 511, + 2119858418286592, + 18446744069414584320, + 18446392229988665343, + 18446744073709551615, + 11241196188469297151, + 281474976514048, + 18446744073709543424, + 563224831328255, + 301749971126844416, + 1168302407679, + 18446471390564450303, + 18446744069414649599, + 1023, + 2594073385365405680, + 18446181140919287808, + 2577745637692514273, + 1153765945374687232, + 247132830528276448, + 7881300924956672, + 2589004636761079776, + 144115200960823296, + 2589004636760940512, + 562965791113216, + 288167810662516712, + 65536, + 2594071186342010848, + 13807648768, + 2589567586714640353, + 1688864624214016, + 2882303761516978160, + 18158513712597581824, + 3457638613854978016, + 127, + 1688849860263934, + 127, + 2307531515476572118, + 4026531935, + 1, + 35184372088575, + 7936, + 0, + 9223380832947798015, + 18438229877581611008, + 18446744069414600707, + 17870283321406070975, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744070446333439, + 9168765891372858879, + 18446744073701162813, + 18446744073696837631, + 134217727, + 18446744069414649855, + 4557642822898941951, + 18446744073709551614, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446638520593285119, + 18446744069548802046, + 144053615424700415, + 1125897759621119, + 527761286627327, + 4503599627370495, + 276824064, + 18446744069414584320, + 144115188075855871, + 18446469195802607615, + 18014398509481983, + 2147483647, + 8796093022142464, + 18446480190918885375, + 1023, + 18446744069422972927, + 2097151, + 549755813888, + 0, + 4503599627370464, + 8160, + 18158724812380307448, + 274877906943, + 68719476735, + 4611686018360336384, + 16717361816799217663, + 319718190147960832, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744070475743231, + 4611686017001275199, + 6908521828386340863, + 2295745090394464220, + 0, + 9223934986808197120, + 536805376, + 0, + 17582049991377026180, + 18446744069414601696, + 511, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 3509778554814463, + 18446498607738650623, + 141836999983103, + 9187201948305063935, + 2139062143, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 2251241253188403424, + 18446744073709551614, + 18446744068886102015, + 17870283321406128127, + 18446462598732840928, + 18446744073709551615, + 18446744069414617087, + 18446462598732840960, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 8191, + 4611686018427322368, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 13198434443263, + 9223512774343131135, + 18446744070488326143, + 281474976710655, + 18446744060816261120, + 18446744073709551615, + 18446744073709550079, + 18446181124293001215, + 34359736251, + 4503599627370495, + 4503599627370492, + 7564921474075590656, + 18446462873610746880, + 2305843004918726783, + 2251799813685232, + 8935422993945886720, + 2199023255551, + 14159317224157876215, + 4495436853045886975, + 7890092085477381, + 18446602782178705022, + 18446466996645134335, + 18446744073709551615, + 34359738367, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446462667452317695, + 1152921504606845055, + 0, + 0, + 0, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446532967477018623, + 18446744073709551615, + 67108863, + 6881498030004502655, + 18446744073709551579, + 1125899906842623, + 18446744073709027328, + 18446744073709551615, + 18446744006063816703, + 18446744073709551615, + 18446744073709551615, + 4611686018427387903, + 18446744073709486080, + 18446744073709355007, + 287948901175001343, + 0, + 12288634533233819648, + 18446744073709551615, + 2305843009213693951, + 576460743713488896, + 18446743798965862398, + 9223372033633550335, + 486341884, + 13258596753222922239, + 1073692671, + 18446744073709551615, + 576460752303423487, + 0, + 9007199254740991, + 0, + 0, + 0, + 0, + 18446744069951455231, + 131071, + 18446708893632430079, + 18014398509418495, + 18446744070488326143, + 4128527, + 18446744073709551615, + 18446744073709551615, + 18446462599806582783, + 1152921504591118335, + 18446463698244468735, + 17870001915148894207, + 2016486715970549759, + 4503599627370495, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 36028797018963967, + 1095220854783, + 575897802350002111, + 0, + 10502394331027995967, + 36028792728190975, + 2147483647, + 15762594400829440, + 288230371860938751, + 67108863, + 13907115649320091647, + 0, + 18014398491590657, + 2305843004918726656, + 536870911, + 137438953215, + 18014398509481983, + 2251795522912255, + 262143, + 0, + 18446744073709551615, + 511, + 2251799813685247, + 2251799813685247, + 68719476735, + 18446603611099102208, + 63, + 0, + 0, + 0, + 848822976643071, + 252, + 18446463149025525759, + 18446462598732841023, + 18446462598732840963, + 36028792723996703, + 72057594037927928, + 10696049115004928, + 281474976710648, + 2199023190016, + 549755813880, + 20266198323101840, + 2251799813685240, + 335544350, + 9223389629040558079, + 1, + 18446464796682337663, + 2147483647, + 2589004636760940512, + 16643063808, + 54043195528399871, + 655360, + 9007199254740991, + 15032387456, + 281474976710655, + 176, + 0, + 0, + 140737488355327, + 251658240, + 281474976710655, + 16, + 72066390130950143, + 0, + 134217727, + 127, + 0, + 0, + 17592186044415, + 0, + 18446744069414584320, + 9223372041149743103, + 9223653511822045823, + 2, + 18446740770879700992, + 42949804031, + 290482175965394945, + 18446744073441181696, + 18446462599269712895, + 144115188075855871, + 0, + 0, + 0, + 8589934591, + 140737488354815, + 18445618173802708993, + 65535, + 0, + 562949953420159, + 18446741595513421888, + 18446462598749619199, + 268435455, + 0, + 0, + 0, + 2251795518717952, + 4503599627239412, + 0, + 281474976710656, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 67108863, + 0, + 18446744073709551615, + 140737488355327, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 15, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 18446744073709486080, + 562949953421311, + 281474976710655, + 18446744069414584446, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 576460752303423487, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 127, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1073741823, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 144115188075855871, + 18446462600880324607, + 9223372036854775807, + 70368744112128, + 281474976710655, + 16212958624174047247, + 65535, + 0, + 0, + 0, + 0, + 0, + 0, + 35184372088831, + 0, + 0, + 0, + 18446744073709551615, + 18014398505187016704, + 1048575, + 18446744073709551615, + 67583, + 4294443008, + 34902944356761600, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 9223372036858970111, + 2147483647, + 0, + 18446744073709551615, + 2251799813685247, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 8065665457643847680, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 1125934266580991, + 18446463629527547904, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 1152921504606846975, + 0, + 0, + 0, + 0, + 18446744073709551615, + 2305570330330005503, + 67043839, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 18446744073709551615, + 18446744073707454463, + 17005555242810474495, + 18446744073709551599, + 8935141660164089791, + 18446744073709419615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446743249075830783, + 17870283321271910397, + 18437736874452713471, + 18446603336221163519, + 18446741874686295551, + 4087, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 8660801552383, + 0, + 0, + 0, + 18446462598732840960, + 70368744177663, + 0, + 0, + 4575692405780512767, + 16384, + 0, + 0, + 0, + 0, + 70368744112128, + 17592186044415, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 17592185978880, + 0, + 0, + 0, + 351843720822784, + 0, + 0, + 0, + 13843853836919242751, + 0, + 0, + 0, + 9223213153129594880, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 31, + 18446744073709551615, + 2063, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 790380184120328175, + 6843210385291930244, + 1152917029519358975, + 0, + 0, + 0, + 0, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 4294967295, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744070488326143, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446532967477018623, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446462607322775551, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 1073741823, + 0, + 0, + 0, + 0, + 0, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 1073741823, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 18446744073709488127, + 18446744073709551615, + 18446744073709551615, + 18446744073709551615, + 288230376151711743, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, +]; +#[inline] +pub fn xid_start_p(c: char) -> bool { + let cp = c as usize; + let blki = unsafe { *XID_START_L1.get_unchecked(cp >> 10) } as usize; + let in_blk_offset_p = cp & 0x3FF; + let wordi = (blki * 16) + (in_blk_offset_p >> 6); + let biti = in_blk_offset_p & 0x3F; + unsafe { + return (*XID_START_L2.get_unchecked(wordi) & (1 << biti)) != 0; + } +} diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..8632490 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,14 @@ +comment_width = 73 +wrap_comments = true +edition = "2024" +enum_discrim_align_threshold = 20 +hard_tabs = true +imports_layout = "Vertical" +match_block_trailing_comma = true +max_width = 80 +imports_granularity = "Module" +newline_style = "Unix" +group_imports = "StdExternalCrate" +struct_field_align_threshold = 20 +style_edition = "2024" +unstable_features = true diff --git a/test.x b/test.x new file mode 100644 index 0000000..f5d52f0 --- /dev/null +++ b/test.x @@ -0,0 +1,33 @@ +def puts = $foreign("puts", func(s ^u8)); + +/* +def foo = func { + let my_string = + \ This is my line + \ this is a second line + \ etc. + ; + puts(my_string); +} +*/ + +/* def add = func(dst *vec($N), v, u vec($N)) + * $poke(operator.addeq) + * { + * loop (i: 0...N) + * dst[i] = v[i] + u[i]; + * }; */ + +def main′ = func { + puts("Hello, sailor!"); + some_func(#b10.1100'1001e11); + slices_sort(my_slice, func(x, y int) int { + return x - y; + }); +}; + +def some_func = func(n u32) u32 { return n * 2; }; + +/* def MY_FLOAT = union { f f64; n u64; } { n = 0x482DEF }.f */ + +def main = func { main′(); }; diff --git a/unigen/Cargo.lock b/unigen/Cargo.lock new file mode 100644 index 0000000..5a36f70 --- /dev/null +++ b/unigen/Cargo.lock @@ -0,0 +1,16 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "lexopt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c5d9b5843e8c9311ff602e6bd50855015e99e75159c2c54fe104cfac241f552" + +[[package]] +name = "unigen" +version = "0.1.0" +dependencies = [ + "lexopt", +] diff --git a/unigen/Cargo.toml b/unigen/Cargo.toml new file mode 100644 index 0000000..e0c6b4d --- /dev/null +++ b/unigen/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "unigen" +version = "0.1.0" +edition = "2024" + +[dependencies] +lexopt = "0.1.0" diff --git a/unigen/fetch b/unigen/fetch new file mode 100755 index 0000000..46c02f9 --- /dev/null +++ b/unigen/fetch @@ -0,0 +1,19 @@ +#!/bin/sh + +set -e + +cd "${0%/*}" +trap 'rm -f UCD.zip' EXIT + +set -x +mkdir -p data +curl -LO https://www.unicode.org/Public/zipped/latest/UCD.zip +unzip -od data UCD.zip + +# XID_Start and XID_Continue additions +cat <<-EOF >>data/DerivedCoreProperties.txt +0024 ; XID_Start # Pc DOLLAR SIGN +005F ; XID_Start # Pc LOW LINE +2032..2034 ; XID_Continue # Po [3] PRIME..TRIPLE PRIME +2057 ; XID_Continue # Po QUADRUPLE PRIME +EOF diff --git a/unigen/rustfmt.toml b/unigen/rustfmt.toml new file mode 120000 index 0000000..39f97b0 --- /dev/null +++ b/unigen/rustfmt.toml @@ -0,0 +1 @@ +../rustfmt.toml \ No newline at end of file diff --git a/unigen/src/main.rs b/unigen/src/main.rs new file mode 100644 index 0000000..4851fa5 --- /dev/null +++ b/unigen/src/main.rs @@ -0,0 +1,278 @@ +use std::collections::HashMap; +use std::ffi::{ + OsStr, + OsString, +}; +use std::fs::File; +use std::io::{ + self, + BufRead, + BufReader, +}; +use std::path::Path; +use std::sync::OnceLock; +use std::vec::Vec; +use std::{ + env, + process, +}; + +const MIN_SHIFT: usize = 1; +const MAX_SHIFT: usize = 22; + +#[derive(Default)] +struct Flags { + codepoints: Option>, + help: bool, +} + +impl Flags { + fn parse() -> Result<(Flags, Vec), lexopt::Error> { + use lexopt::prelude::*; + + let mut rest = Vec::with_capacity(env::args().len() - 1); + let mut flags = Flags::default(); + let mut parser = lexopt::Parser::from_env(); + + while let Some(arg) = parser.next()? { + match arg { + Short('c') | Long("codepoints") => { + fn hex_to_char(s: &str) -> char { + return u32::from_str_radix(s, 16).map_or_else( + |e| { + eprintln!("{}: {s}: {e}", progname().display()); + process::exit(1); + }, + |n| { + char::from_u32(n).unwrap_or_else(|| { + eprintln!( + "{}: {s}: invalid codepoint", + progname().display() + ); + process::exit(1); + }) + }, + ); + } + + flags.codepoints = Some( + parser + .value()? + .to_str() + .unwrap_or_else(|| { + eprintln!( + "{}: unable to parse argument to -c/--codepoints", + progname().display() + ); + process::exit(1); + }) + .split(',') + .map(hex_to_char) + .collect(), + ); + }, + Short('h') | Long("help") => flags.help = true, + Value(v) => rest.push(v.into_string()?), + _ => return Err(arg.unexpected()), + } + } + + return Ok((flags, rest)); + } +} + +fn progname() -> &'static OsString { + static ARGV0: OnceLock = OnceLock::new(); + return ARGV0.get_or_init(|| { + let default = OsStr::new("oryxc"); + let s = env::args_os().next().unwrap_or(default.into()); + return Path::new(&s).file_name().unwrap_or(default).to_os_string(); + }); +} + +fn usage() { + eprintln!( + concat!( + "Usage: {0} data-file property-name\n", + " {0} -c codepoints name\n", + " {0} -h", + ), + progname().display() + ); +} + +fn main() -> io::Result<()> { + let (flags, rest) = match Flags::parse() { + Ok(v) => v, + Err(e) => { + eprintln!("{}: {e}", progname().display()); + usage(); + process::exit(1); + }, + }; + + if flags.help { + usage(); + process::exit(0); + } + + if (flags.codepoints.is_none() && rest.len() != 2) + || (flags.codepoints.is_some() && rest.len() != 1) + { + usage(); + process::exit(1); + } + + let mut bitmap = vec![false; 0x110000]; + let name = match flags.codepoints { + Some(vec) => { + vec.iter().for_each(|c| bitmap[*c as usize] = true); + &rest[0] + }, + None => { + parse_file(&rest[0], &rest[1], &mut bitmap)?; + &rest[1] + }, + }; + let (shift, lvl1, lvl2) = optimize_tables(&bitmap); + write_tables(name, shift, &lvl1, &lvl2); + return Ok(()); +} + +fn optimize_tables(bitmap: &[bool]) -> (usize, Vec, Vec) { + let mut minsz = usize::MAX; + let mut config = (0, Vec::new(), Vec::new()); + + for i in MIN_SHIFT..=MAX_SHIFT { + let (l1, l2) = build_tables(bitmap, i); + let sz = l1.len() * 2 + l2.len() * 8; + if sz < minsz { + minsz = sz; + config = (i, l1, l2); + } + } + + return config; +} + +fn parse_file>( + path: P, + prop: &str, + bitmap: &mut [bool], +) -> io::Result<()> { + let file = File::open(path)?; + let reader = BufReader::new(file); + + for line in reader.lines() { + let line = line?; + let line = line.split('#').next().unwrap_or("").trim(); + if line.is_empty() { + continue; + } + + let parts: Vec<&str> = line.split(';').map(|s| s.trim()).collect(); + if parts.len() < 2 || parts[1] != prop { + continue; + } + + let (beg, end) = if parts[0].contains("..") { + let mut range = parts[0].split(".."); + ( + u32::from_str_radix(range.next().unwrap(), 16).unwrap(), + u32::from_str_radix(range.next().unwrap(), 16).unwrap(), + ) + } else { + let val = u32::from_str_radix(parts[0], 16).unwrap(); + (val, val) + }; + + for cp in beg..=end { + if (cp as usize) < bitmap.len() { + bitmap[cp as usize] = true; + } + } + } + return Ok(()); +} + +fn build_tables(bitmap: &[bool], shift: usize) -> (Vec, Vec) { + let blksz = 1 << shift; + let u64s_per_block = (blksz + 63) / 64; + + let mut lvl2: Vec = Vec::new(); + let mut lvl1: Vec = Vec::new(); + let mut blkmap: HashMap, u16> = HashMap::new(); + + for chunk in bitmap.chunks(blksz) { + let mut blkdata = vec![0u64; u64s_per_block]; + + for (i, &bit) in chunk.iter().enumerate() { + if bit { + let word_idx = i / 64; + let bit_idx = i % 64; + blkdata[word_idx] |= 1 << bit_idx; + } + } + + if let Some(&i) = blkmap.get(&blkdata) { + lvl1.push(i); + } else { + let i = (lvl2.len() / u64s_per_block) as u16; + lvl2.extend_from_slice(&blkdata); + blkmap.insert(blkdata, i); + lvl1.push(i); + } + } + + return (lvl1, lvl2); +} + +fn write_tables(prop_name: &str, shift: usize, level1: &[u16], level2: &[u64]) { + let upper_name = prop_name.to_uppercase(); + let lower_name = prop_name.to_lowercase(); + let block_size = 1 << shift; + let mask = block_size - 1; + let u64s_per_block = (block_size + 63) / 64; + + println!("/* Autogenerated – DO NOT EDIT */\n"); + print!( + "static {upper_name}_L1: [u16; {}] = {level1:?};", + level1.len() + ); + print!( + "static {upper_name}_L2: [u64; {}] = {level2:?};", + level2.len() + ); + + let pred_name = if lower_name.contains('_') { + format!("{lower_name}_p") + } else { + format!("{lower_name}p") + }; + + print!( + "#[inline] + pub fn {pred_name}(c: char) -> bool {{ + let cp = c as usize; + let blki = unsafe {{ *{upper_name}_L1.get_unchecked(cp >> {shift}) }} as usize; + let in_blk_offset_p = cp & 0x{mask:X};" + ); + + if u64s_per_block == 1 { + print!( + " unsafe {{ + return ({upper_name}_L2.get_unchecked(blki) & (1 << in_blk_offset_p)) != 0; + }}" + ); + } else { + print!( + "let wordi = (blki * {u64s_per_block}) + (in_blk_offset_p >> 6); + let biti = in_blk_offset_p & 0x3F; + unsafe {{ + return (*{upper_name}_L2.get_unchecked(wordi) & (1 << biti)) != 0; + }}" + ); + } + + print!("}}"); +} -- cgit v1.2.3