From 82c14f030b36938cb10c1c8f8e880d0e0acaadc2 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Wed, 4 Mar 2026 23:21:07 +0100 Subject: Begin working on symbol resolution --- Cargo.lock | 115 ++++++++++++++++++++++++++++++++++++++++++++++++- oryxc/Cargo.toml | 5 ++- oryxc/src/compiler.rs | 32 ++++++++++---- oryxc/src/intern.rs | 116 +++++++++++++++++++++++++++++++++++++++----------- oryxc/src/main.rs | 1 + oryxc/src/prelude.rs | 12 +++++- 6 files changed, 245 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8bb0332..ee11eb5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -52,6 +52,18 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "bitflags" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + [[package]] name = "clap" version = "4.5.60" @@ -123,12 +135,32 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "fastrand" version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + [[package]] name = "heck" version = "0.5.0" @@ -147,6 +179,27 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "803ec87c9cfb29b9d2633f20cba1f488db3fd53f2158b1024cbefb47ba05d413" +[[package]] +name = "libc" +version = "0.2.182" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + [[package]] name = "once_cell_polyfill" version = "1.70.2" @@ -154,16 +207,31 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] -name = "oryx" +name = "oryxc" version = "0.1.0" dependencies = [ "clap", "crossbeam-deque", + "dashmap", "phf", "soa-rs", + "unicode-normalization", "unicode-width", ] +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + [[package]] name = "phf" version = "0.13.1" @@ -225,6 +293,21 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "serde" version = "1.0.228" @@ -260,6 +343,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + [[package]] name = "soa-rs" version = "0.9.1" @@ -297,12 +386,36 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tinyvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "unicode-ident" version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + [[package]] name = "unicode-width" version = "0.2.2" diff --git a/oryxc/Cargo.toml b/oryxc/Cargo.toml index 3cd08fb..cd5dedf 100644 --- a/oryxc/Cargo.toml +++ b/oryxc/Cargo.toml @@ -4,10 +4,11 @@ version = "0.1.0" edition = "2024" [dependencies] -crossbeam-deque = "0.8.6" -# icu = { version = "2.1.1", features = ["compiled_data"] } clap = { version = "4", features = ["derive"] } +crossbeam-deque = "0.8.6" +dashmap = "6.1.0" # num-rational = "0.4.2" phf = { version = "0.13.1", features = ["macros"] } soa-rs = "0.9.1" +unicode-normalization = "0.1.25" unicode-width = "0.2.2" diff --git a/oryxc/src/compiler.rs b/oryxc/src/compiler.rs index f630166..d8bdfa2 100644 --- a/oryxc/src/compiler.rs +++ b/oryxc/src/compiler.rs @@ -25,6 +25,7 @@ use crossbeam_deque::{ Stealer, Worker, }; +use dashmap::DashMap; use soa_rs::Soa; use crate::errors::OryxError; @@ -44,6 +45,7 @@ pub struct FileData { pub tokens: OnceLock>, pub ast: OnceLock>, pub extra_data: OnceLock>, + pub scopes: Vec>, } impl FileData { @@ -65,6 +67,7 @@ impl FileData { tokens: OnceLock::new(), ast: OnceLock::new(), extra_data: OnceLock::new(), + scopes: Vec::new(), }) } } @@ -82,7 +85,8 @@ pub enum Job { ResolveDef { file: FileId, fdata: Arc, - node: NodeId, + scope: ScopeId, + node: u32, }, } @@ -171,8 +175,8 @@ where handles.iter().map(|h| h.thread().clone()).collect(); let _ = state.worker_threads.set(worker_threads); - // if work completes before we get here, wake them so they can observe the - // termination condition and exit. + // if work completes before we get here, wake them so they can observe + // the termination condition and exit. state.wake_all(); for h in handles { @@ -195,7 +199,8 @@ fn worker_loop( } let Some(job) = find_task(&queue, &state.globalq, &stealers) else { - // no work available; check termination condition before parking to avoid missed wakeups + // no work available; check termination condition before parking to + // avoid missed wakeups if state.njobs.load(Ordering::Acquire) == 0 { break; } @@ -245,15 +250,25 @@ fn worker_loop( let SubNodes(i, nstmts) = ast.sub()[ast.len() - 1]; for j in 0..nstmts { - let node = NodeId(extra_data[(i + j) as usize]); + let node = extra_data[(i + j) as usize]; let fdata = fdata.clone(); state.push_job( &queue, - Job::ResolveDef { file, fdata, node }, + Job::ResolveDef { + file, + fdata, + node, + scope: ScopeId::GLOBAL, + }, ); } }, - Job::ResolveDef { file, fdata, node } => { + Job::ResolveDef { + file, + fdata, + scope, + node, + } => { eprintln!("Resolving def at node index {node:?}"); }, } @@ -263,7 +278,8 @@ fn worker_loop( // condition and exit. state.wake_all(); - // break here to avoid unnecessary steal attempts after work is done. + // break here to avoid unnecessary steal attempts after work is + // done. break; } } diff --git a/oryxc/src/intern.rs b/oryxc/src/intern.rs index 3ab91cf..b0d1a00 100644 --- a/oryxc/src/intern.rs +++ b/oryxc/src/intern.rs @@ -1,45 +1,61 @@ -use std::hash; +use std::hash::{ + Hash, + Hasher, +}; -use dashmap; -use icu::normalizer; +use dashmap::DashMap; +use unicode_normalization::{ + self, + IsNormalized, + UnicodeNormalization, +}; -#[repr(transparent)] -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub struct Key(u32); +// use icu::normalizer::DecomposingNormalizer; +use crate::prelude::*; pub struct Interner<'a> { - map: dashmap::DashMap, Key>, + map: DashMap, SymbolId>, store: Vec<&'a str>, } -#[derive(Eq)] +#[derive(Debug, Eq)] pub struct UniStr<'a>(pub &'a str); -impl hash::Hash for UniStr<'_> { - fn hash(&self, state: &mut H) { +impl Hash for UniStr<'_> { + fn hash(&self, state: &mut H) { + /* In the ASCII common case we use .bytes() to avoid decoding + * every codepoint (a no-op in ASCII) */ if self.0.is_ascii() { - self.0.chars().for_each(|c| c.hash(state)); + self.0.bytes().for_each(|c| (c as char).hash(state)); + } else if unicode_normalization::is_nfkd_quick(self.0.chars()) + == IsNormalized::Yes + { + self.0.chars().for_each(|c| c.hash(state)); } else { - let nfkd = normalizer::DecomposingNormalizer::new_nfkd(); - nfkd.normalize_iter(self.0.chars()).for_each(|c| c.hash(state)); + self.0.nfkd().for_each(|c| c.hash(state)); } } } impl PartialEq for UniStr<'_> { fn eq(&self, other: &Self) -> bool { - let nfkd = normalizer::DecomposingNormalizer::new_nfkd(); - return match (self.0.is_ascii(), other.0.is_ascii()) { + /* Most code is ASCII, and normalization is obviously a lot + * slower than not normalizing, so we try to only normalize when + * we have to */ + return match ( + unicode_normalization::is_nfkd_quick(self.0.chars()) + == IsNormalized::Yes, + unicode_normalization::is_nfkd_quick(other.0.chars()) + == IsNormalized::Yes, + ) { (true, true) => self.0 == other.0, (true, false) => { - self.0.chars().eq(nfkd.normalize_iter(other.0.chars())) + self.0.bytes().map(|b| b as char).eq(other.0.nfkd()) }, (false, true) => { - other.0.chars().eq(nfkd.normalize_iter(self.0.chars())) + self.0.nfkd().eq(other.0.bytes().map(|b| b as char)) }, - (false, false) => nfkd - .normalize_iter(self.0.chars()) - .eq(nfkd.normalize_iter(other.0.chars())), + (false, false) => self.0.nfkd().eq(other.0.nfkd()), }; } } @@ -47,22 +63,74 @@ impl PartialEq for UniStr<'_> { impl<'a> Interner<'a> { pub fn new() -> Self { return Interner { - map: dashmap::DashMap::new(), + map: DashMap::new(), store: Vec::new(), }; } - pub fn get(&self, key: Key) -> &str { + pub fn get(&self, key: SymbolId) -> &str { return self.store[key.0 as usize]; } - pub fn intern(&mut self, value: &'a str) -> Key { + pub fn intern(&mut self, value: &'a str) -> SymbolId { if let Some(key) = self.map.get(&UniStr(value)) { return *key; } - let key = Key(self.store.len() as u32); + let key = SymbolId(self.store.len() as u32); self.map.insert(UniStr(value), key); self.store.push(value); return key; } } + +#[test] +fn test_unistr_eq() { + assert_eq!(UniStr("fishi"), UniStr("fishᵢ")); + assert_eq!(UniStr("fishi"), UniStr("fishi")); + assert_eq!(UniStr("fishi"), UniStr("fishᵢ")); + assert_eq!(UniStr("fishᵢ"), UniStr("fishᵢ")); +} + +#[test] +fn test_unistr_hash() { + use std::hash::DefaultHasher; + for (lhs, rhs) in &[ + (UniStr("fishi"), UniStr("fishᵢ")), + (UniStr("fishi"), UniStr("fishi")), + (UniStr("fishi"), UniStr("fishᵢ")), + (UniStr("fishᵢ"), UniStr("fishᵢ")), + ] { + let mut hashl = DefaultHasher::new(); + let mut hashr = DefaultHasher::new(); + lhs.hash(&mut hashl); + rhs.hash(&mut hashr); + assert_eq!(hashl.finish(), hashr.finish()); + } +} + +#[test] +fn test_interner_intern() { + let xs = ["fishi", "fishi", "fishᵢ"]; + let y = "andy"; + + let mut interner = Interner::new(); + for i in 0..xs.len() { + for j in i..xs.len() { + assert_eq!(interner.intern(xs[i]), interner.intern(xs[j])); + } + } + for i in 0..xs.len() { + assert_ne!(interner.intern(y), interner.intern(xs[i])); + } +} + +#[test] +fn test_interner_gets_first_inserted() { + let mut interner = Interner::new(); + let xs = ["fishi", "fishi", "fishᵢ"]; + let ys = xs.iter().map(|x| interner.intern(x)).collect::>(); + + for i in 0..ys.len() { + assert_eq!(interner.get(ys[i]), xs[0]); + } +} diff --git a/oryxc/src/main.rs b/oryxc/src/main.rs index e8c552f..109aed3 100644 --- a/oryxc/src/main.rs +++ b/oryxc/src/main.rs @@ -2,6 +2,7 @@ mod compiler; mod errors; +mod intern; mod lexer; mod parser; mod prelude; diff --git a/oryxc/src/prelude.rs b/oryxc/src/prelude.rs index 78e7597..b7e80c2 100644 --- a/oryxc/src/prelude.rs +++ b/oryxc/src/prelude.rs @@ -8,7 +8,17 @@ use std::fmt::{ pub struct FileId(pub usize); #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] -pub struct NodeId(pub u32); +pub struct ScopeId(pub usize); + +impl ScopeId { + pub const GLOBAL: Self = Self(0); +} + +#[repr(transparent)] +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub struct SymbolId(pub u32); + +pub struct SymbolVal {} #[derive(Clone, Copy)] pub struct SubNodes(pub u32, pub u32); -- cgit v1.2.3