summaryrefslogtreecommitdiff
path: root/oryxc/src/intern.rs
diff options
context:
space:
mode:
author Thomas Voss <mail@thomasvoss.com> 2026-03-04 23:21:07 +0100
committer Thomas Voss <mail@thomasvoss.com> 2026-03-04 23:21:07 +0100
commit 82c14f030b36938cb10c1c8f8e880d0e0acaadc2 (patch)
tree 50d399aebb9d374c867d86482da129fce1b1c4e7 /oryxc/src/intern.rs
parent f1a862a334efb1aa1f1cc2c3f30dcbffeaa9b4e3 (diff)
Begin working on symbol resolution
Diffstat (limited to 'oryxc/src/intern.rs')
-rw-r--r-- oryxc/src/intern.rs 116
1 files changed, 92 insertions, 24 deletions
diff --git a/oryxc/src/intern.rs b/oryxc/src/intern.rs
index 3ab91cf..b0d1a00 100644
--- a/oryxc/src/intern.rs
+++ b/oryxc/src/intern.rs
@@ -1,45 +1,61 @@
-use std::hash;
+use std::hash::{
+ Hash,
+ Hasher,
+};
-use dashmap;
-use icu::normalizer;
+use dashmap::DashMap;
+use unicode_normalization::{
+ self,
+ IsNormalized,
+ UnicodeNormalization,
+};
-#[repr(transparent)]
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
-pub struct Key(u32);
+// use icu::normalizer::DecomposingNormalizer;
+use crate::prelude::*;
pub struct Interner<'a> {
- map: dashmap::DashMap<UniStr<'a>, Key>,
+ map: DashMap<UniStr<'a>, SymbolId>,
store: Vec<&'a str>,
}
-#[derive(Eq)]
+#[derive(Debug, Eq)]
pub struct UniStr<'a>(pub &'a str);
-impl hash::Hash for UniStr<'_> {
- fn hash<H: hash::Hasher>(&self, state: &mut H) {
+impl Hash for UniStr<'_> {
+ fn hash<H: Hasher>(&self, state: &mut H) {
+ /* ASCII needs no normalization; hash the raw bytes widened to
+ * char so the result matches the .chars()-based branches below */
if self.0.is_ascii() {
- self.0.chars().for_each(|c| c.hash(state));
+ self.0.bytes().for_each(|c| (c as char).hash(state));
+ } else if unicode_normalization::is_nfkd_quick(self.0.chars())
+ == IsNormalized::Yes
+ {
+ self.0.chars().for_each(|c| c.hash(state));
} else {
- let nfkd = normalizer::DecomposingNormalizer::new_nfkd();
- nfkd.normalize_iter(self.0.chars()).for_each(|c| c.hash(state));
+ self.0.nfkd().for_each(|c| c.hash(state));
}
}
}
impl PartialEq for UniStr<'_> {
fn eq(&self, other: &Self) -> bool {
- let nfkd = normalizer::DecomposingNormalizer::new_nfkd();
- return match (self.0.is_ascii(), other.0.is_ascii()) {
+ /* Compare by NFKD form, normalizing only the side(s) that fail
+ * the quick check.  An already-normalized side may still be
+ * non-ASCII, so it is read with .chars() and never .bytes() */
+ return match (
+ unicode_normalization::is_nfkd_quick(self.0.chars())
+ == IsNormalized::Yes,
+ unicode_normalization::is_nfkd_quick(other.0.chars())
+ == IsNormalized::Yes,
+ ) {
(true, true) => self.0 == other.0,
(true, false) => {
- self.0.chars().eq(nfkd.normalize_iter(other.0.chars()))
+ self.0.chars().eq(other.0.nfkd())
},
(false, true) => {
- other.0.chars().eq(nfkd.normalize_iter(self.0.chars()))
+ self.0.nfkd().eq(other.0.chars())
},
- (false, false) => nfkd
- .normalize_iter(self.0.chars())
- .eq(nfkd.normalize_iter(other.0.chars())),
+ (false, false) => self.0.nfkd().eq(other.0.nfkd()),
};
}
}
@@ -47,22 +63,74 @@ impl PartialEq for UniStr<'_> {
impl<'a> Interner<'a> {
pub fn new() -> Self {
return Interner {
- map: dashmap::DashMap::new(),
+ map: DashMap::new(),
store: Vec::new(),
};
}
- pub fn get(&self, key: Key) -> &str {
+ pub fn get(&self, key: SymbolId) -> &str {
return self.store[key.0 as usize];
}
- pub fn intern(&mut self, value: &'a str) -> Key {
+ pub fn intern(&mut self, value: &'a str) -> SymbolId {
if let Some(key) = self.map.get(&UniStr(value)) {
return *key;
}
- let key = Key(self.store.len() as u32);
+ let key = SymbolId(self.store.len() as u32);
self.map.insert(UniStr(value), key);
self.store.push(value);
return key;
}
}
+
+#[test]
+fn test_unistr_eq() {
+ assert_eq!(UniStr("fishi"), UniStr("fishᵢ"));
+ assert_eq!(UniStr("fishi"), UniStr("fishi"));
+ assert_eq!(UniStr("fishi"), UniStr("fishᵢ"));
+ assert_eq!(UniStr("fishᵢ"), UniStr("fishᵢ"));
+}
+
+#[test]
+fn test_unistr_hash() {
+ use std::hash::DefaultHasher;
+ for (lhs, rhs) in &[
+ (UniStr("fishi"), UniStr("fishᵢ")),
+ (UniStr("fishi"), UniStr("fishi")),
+ (UniStr("fishi"), UniStr("fishᵢ")),
+ (UniStr("fishᵢ"), UniStr("fishᵢ")),
+ ] {
+ let mut hashl = DefaultHasher::new();
+ let mut hashr = DefaultHasher::new();
+ lhs.hash(&mut hashl);
+ rhs.hash(&mut hashr);
+ assert_eq!(hashl.finish(), hashr.finish());
+ }
+}
+
+#[test]
+fn test_interner_intern() {
+ let xs = ["fishi", "fishi", "fishᵢ"];
+ let y = "andy";
+
+ let mut interner = Interner::new();
+ for i in 0..xs.len() {
+ for j in i..xs.len() {
+ assert_eq!(interner.intern(xs[i]), interner.intern(xs[j]));
+ }
+ }
+ for i in 0..xs.len() {
+ assert_ne!(interner.intern(y), interner.intern(xs[i]));
+ }
+}
+
+#[test]
+fn test_interner_gets_first_inserted() {
+ let mut interner = Interner::new();
+ let xs = ["fishi", "fishi", "fishᵢ"];
+ let ys = xs.iter().map(|x| interner.intern(x)).collect::<Vec<_>>();
+
+ for i in 0..ys.len() {
+ assert_eq!(interner.get(ys[i]), xs[0]);
+ }
+}