diff options
| author | Thomas Voss <mail@thomasvoss.com> | 2026-03-04 23:21:07 +0100 |
|---|---|---|
| committer | Thomas Voss <mail@thomasvoss.com> | 2026-03-04 23:21:07 +0100 |
| commit | 82c14f030b36938cb10c1c8f8e880d0e0acaadc2 (patch) | |
| tree | 50d399aebb9d374c867d86482da129fce1b1c4e7 /oryxc/src/intern.rs | |
| parent | f1a862a334efb1aa1f1cc2c3f30dcbffeaa9b4e3 (diff) | |
Begin working on symbol resolution
Diffstat (limited to 'oryxc/src/intern.rs')
| -rw-r--r-- | oryxc/src/intern.rs | 116 |
1 file changed, 92 insertions, 24 deletions
diff --git a/oryxc/src/intern.rs b/oryxc/src/intern.rs index 3ab91cf..b0d1a00 100644 --- a/oryxc/src/intern.rs +++ b/oryxc/src/intern.rs @@ -1,45 +1,61 @@ -use std::hash; +use std::hash::{ + Hash, + Hasher, +}; -use dashmap; -use icu::normalizer; +use dashmap::DashMap; +use unicode_normalization::{ + self, + IsNormalized, + UnicodeNormalization, +}; -#[repr(transparent)] -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub struct Key(u32); +// use icu::normalizer::DecomposingNormalizer; +use crate::prelude::*; pub struct Interner<'a> { - map: dashmap::DashMap<UniStr<'a>, Key>, + map: DashMap<UniStr<'a>, SymbolId>, store: Vec<&'a str>, } -#[derive(Eq)] +#[derive(Debug, Eq)] pub struct UniStr<'a>(pub &'a str); -impl hash::Hash for UniStr<'_> { - fn hash<H: hash::Hasher>(&self, state: &mut H) { +impl Hash for UniStr<'_> { + fn hash<H: Hasher>(&self, state: &mut H) { + /* In the ASCII common case we use .bytes() to avoid decoding + * every codepoint (a no-op in ASCII) */ if self.0.is_ascii() { - self.0.chars().for_each(|c| c.hash(state)); + self.0.bytes().for_each(|c| (c as char).hash(state)); + } else if unicode_normalization::is_nfkd_quick(self.0.chars()) + == IsNormalized::Yes + { + self.0.chars().for_each(|c| c.hash(state)); } else { - let nfkd = normalizer::DecomposingNormalizer::new_nfkd(); - nfkd.normalize_iter(self.0.chars()).for_each(|c| c.hash(state)); + self.0.nfkd().for_each(|c| c.hash(state)); } } } impl PartialEq for UniStr<'_> { fn eq(&self, other: &Self) -> bool { - let nfkd = normalizer::DecomposingNormalizer::new_nfkd(); - return match (self.0.is_ascii(), other.0.is_ascii()) { + /* Most code is ASCII, and normalization is obviously a lot + * slower than not normalizing, so we try to only normalize when + * we have to */ + return match ( + unicode_normalization::is_nfkd_quick(self.0.chars()) + == IsNormalized::Yes, + unicode_normalization::is_nfkd_quick(other.0.chars()) + == IsNormalized::Yes, + ) { (true, true) => self.0 == other.0, (true, false) => { - 
self.0.chars().eq(nfkd.normalize_iter(other.0.chars())) + self.0.bytes().map(|b| b as char).eq(other.0.nfkd()) }, (false, true) => { - other.0.chars().eq(nfkd.normalize_iter(self.0.chars())) + self.0.nfkd().eq(other.0.bytes().map(|b| b as char)) }, - (false, false) => nfkd - .normalize_iter(self.0.chars()) - .eq(nfkd.normalize_iter(other.0.chars())), + (false, false) => self.0.nfkd().eq(other.0.nfkd()), }; } } @@ -47,22 +63,74 @@ impl PartialEq for UniStr<'_> { impl<'a> Interner<'a> { pub fn new() -> Self { return Interner { - map: dashmap::DashMap::new(), + map: DashMap::new(), store: Vec::new(), }; } - pub fn get(&self, key: Key) -> &str { + pub fn get(&self, key: SymbolId) -> &str { return self.store[key.0 as usize]; } - pub fn intern(&mut self, value: &'a str) -> Key { + pub fn intern(&mut self, value: &'a str) -> SymbolId { if let Some(key) = self.map.get(&UniStr(value)) { return *key; } - let key = Key(self.store.len() as u32); + let key = SymbolId(self.store.len() as u32); self.map.insert(UniStr(value), key); self.store.push(value); return key; } } + +#[test] +fn test_unistr_eq() { + assert_eq!(UniStr("fishi"), UniStr("fishᵢ")); + assert_eq!(UniStr("fishi"), UniStr("fishi")); + assert_eq!(UniStr("fishi"), UniStr("fishᵢ")); + assert_eq!(UniStr("fishᵢ"), UniStr("fishᵢ")); +} + +#[test] +fn test_unistr_hash() { + use std::hash::DefaultHasher; + for (lhs, rhs) in &[ + (UniStr("fishi"), UniStr("fishᵢ")), + (UniStr("fishi"), UniStr("fishi")), + (UniStr("fishi"), UniStr("fishᵢ")), + (UniStr("fishᵢ"), UniStr("fishᵢ")), + ] { + let mut hashl = DefaultHasher::new(); + let mut hashr = DefaultHasher::new(); + lhs.hash(&mut hashl); + rhs.hash(&mut hashr); + assert_eq!(hashl.finish(), hashr.finish()); + } +} + +#[test] +fn test_interner_intern() { + let xs = ["fishi", "fishi", "fishᵢ"]; + let y = "andy"; + + let mut interner = Interner::new(); + for i in 0..xs.len() { + for j in i..xs.len() { + assert_eq!(interner.intern(xs[i]), interner.intern(xs[j])); + } + } + for i in 
0..xs.len() { + assert_ne!(interner.intern(y), interner.intern(xs[i])); + } +} + +#[test] +fn test_interner_gets_first_inserted() { + let mut interner = Interner::new(); + let xs = ["fishi", "fishi", "fishᵢ"]; + let ys = xs.iter().map(|x| interner.intern(x)).collect::<Vec<_>>(); + + for i in 0..ys.len() { + assert_eq!(interner.get(ys[i]), xs[0]); + } +} |