summaryrefslogtreecommitdiff
path: root/oryxc
diff options
context:
space:
mode:
authorThomas Voss <mail@thomasvoss.com> 2026-03-15 23:04:29 +0100
committerThomas Voss <mail@thomasvoss.com> 2026-03-15 23:04:29 +0100
commit5d570a58f9cb64478d2a2704c740e05f15a71235 (patch)
tree5eeef8612e3cfa61a57e77c1cca0e088527a196e /oryxc
parent58c294d0a5fedca250eb591ade22ab8b1a768cb5 (diff)
Make UniStr its own thing
Diffstat (limited to 'oryxc')
-rw-r--r--oryxc/src/intern.rs166
-rw-r--r--oryxc/src/unistr.rs84
2 files changed, 133 insertions, 117 deletions
diff --git a/oryxc/src/intern.rs b/oryxc/src/intern.rs
index ea131a2..684a693 100644
--- a/oryxc/src/intern.rs
+++ b/oryxc/src/intern.rs
@@ -1,71 +1,24 @@
use std::hash::{
Hash,
- Hasher,
};
use boxcar;
use dashmap::DashMap;
-use unicode_normalization::{
- self,
- IsNormalized,
- UnicodeNormalization,
-};
-
-use crate::prelude::*;
-
-pub struct Interner<'a> {
- map: DashMap<UniStr<'a>, SymbolId>,
- store: boxcar::Vec<&'a str>,
-}
-
-#[derive(Debug, Eq)]
-pub struct UniStr<'a>(pub &'a str);
-
-impl Hash for UniStr<'_> {
- fn hash<H: Hasher>(&self, state: &mut H) {
- /* In the ASCII common case we use .bytes() to avoid decoding
- * every codepoint (a no-op in ASCII) */
- if self.0.is_ascii() {
- self.0.bytes().for_each(|c| (c as char).hash(state));
- } else if unicode_normalization::is_nfkd_quick(self.0.chars())
- == IsNormalized::Yes
- {
- self.0.chars().for_each(|c| c.hash(state));
- } else {
- self.0.nfkd().for_each(|c| c.hash(state));
- }
- }
-}
-impl PartialEq for UniStr<'_> {
- fn eq(&self, other: &Self) -> bool {
- /* Most code is ASCII, and normalization is obviously a lot
- * slower than not normalizing, so we try to only normalize when
- * we have to */
-
- if self.0.is_ascii() && other.0.is_ascii() {
- return self.0 == other.0;
- }
-
- return match (
- unicode_normalization::is_nfkd_quick(self.0.chars())
- == IsNormalized::Yes,
- unicode_normalization::is_nfkd_quick(other.0.chars())
- == IsNormalized::Yes,
- ) {
- (true, true) => self.0 == other.0,
- (true, false) => {
- self.0.chars().map(|b| b as char).eq(other.0.nfkd())
- },
- (false, true) => {
- self.0.nfkd().eq(other.0.chars().map(|b| b as char))
- },
- (false, false) => self.0.nfkd().eq(other.0.nfkd()),
- };
- }
+pub struct Interner<V, I>
+where
+ V: Copy + Eq + Hash,
+ I: Copy + Into<usize>,
+{
+ map: DashMap<V, I>,
+ store: boxcar::Vec<V>,
}
-impl<'a> Interner<'a> {
+impl<V, I> Interner<V, I>
+where
+ V: Copy + Eq + Hash,
+ I: Copy + From<usize> + Into<usize>,
+{
pub fn new() -> Self {
return Interner {
map: DashMap::new(),
@@ -73,70 +26,49 @@ impl<'a> Interner<'a> {
};
}
- pub fn get(&self, key: SymbolId) -> &str {
- return self.store[key.0 as usize];
+ pub fn get(&self, key: I) -> V {
+ return self.store[key.into()];
}
- pub fn intern(&self, value: &'a str) -> SymbolId {
- if let Some(key) = self.map.get(&UniStr(value)) {
+ pub fn intern(&self, value: V) -> I {
+ if let Some(key) = self.map.get(&value) {
return *key;
}
- let key = SymbolId(self.store.push(value) as u32);
- self.map.insert(UniStr(value), key);
+ let key = self.store.push(value).into();
+ self.map.insert(value, key);
return key;
}
}
-#[test]
-fn test_unistr_eq() {
- assert_eq!(UniStr("fishi"), UniStr("fishᵢ"));
- assert_eq!(UniStr("fishi"), UniStr("fishi"));
- assert_eq!(UniStr("fishi"), UniStr("fishᵢ"));
- assert_eq!(UniStr("fishᵢ"), UniStr("fishᵢ"));
- assert_eq!(UniStr("corné"), UniStr("corné"));
-}
-
-#[test]
-fn test_unistr_hash() {
- use std::hash::DefaultHasher;
- for (lhs, rhs) in &[
- (UniStr("fishi"), UniStr("fishᵢ")),
- (UniStr("fishi"), UniStr("fishi")),
- (UniStr("fishi"), UniStr("fishᵢ")),
- (UniStr("fishᵢ"), UniStr("fishᵢ")),
- (UniStr("corné"), UniStr("corné")),
- ] {
- let mut hashl = DefaultHasher::new();
- let mut hashr = DefaultHasher::new();
- lhs.hash(&mut hashl);
- rhs.hash(&mut hashr);
- assert_eq!(hashl.finish(), hashr.finish());
- }
-}
-
-#[test]
-fn test_interner_intern() {
- let xs = ["fishi", "fishi", "fishᵢ"];
- let y = "andy";
-
- let mut interner = Interner::new();
- for i in 0..xs.len() {
- for j in i..xs.len() {
- assert_eq!(interner.intern(xs[i]), interner.intern(xs[j]));
- }
- }
- for i in 0..xs.len() {
- assert_ne!(interner.intern(y), interner.intern(xs[i]));
- }
-}
-
-#[test]
-fn test_interner_gets_first_inserted() {
- let mut interner = Interner::new();
- let xs = ["fishi", "fishi", "fishᵢ"];
- let ys = xs.iter().map(|x| interner.intern(x)).collect::<Vec<_>>();
-
- for i in 0..ys.len() {
- assert_eq!(interner.get(ys[i]), xs[0]);
- }
+#[cfg(test)]
+mod tests {
+ use crate::unistr::UniStr;
+ use super::Interner;
+
+ #[test]
+ fn test_interner_intern() {
+ let xs = [UniStr("fishi"), UniStr("fishi"), UniStr("fishᵢ")];
+ let y = UniStr("andy");
+
+ let mut interner = Interner::<_, usize>::new();
+ for i in 0..xs.len() {
+ for j in i..xs.len() {
+ assert_eq!(interner.intern(xs[i]), interner.intern(xs[j]));
+ }
+ }
+ for i in 0..xs.len() {
+ assert_ne!(interner.intern(y), interner.intern(xs[i]));
+ }
+ }
+
+ #[test]
+ fn test_interner_gets_first_inserted() {
+ let mut interner = Interner::<_, usize>::new();
+ let xs = [UniStr("fishi"), UniStr("fishi"), UniStr("fishᵢ")];
+ let ys = xs.iter().map(|&x| interner.intern(x)).collect::<Vec<_>>();
+
+ for i in 0..ys.len() {
+ assert_eq!(interner.get(ys[i]), xs[0]);
+ }
+ }
}
diff --git a/oryxc/src/unistr.rs b/oryxc/src/unistr.rs
new file mode 100644
index 0000000..5dc8160
--- /dev/null
+++ b/oryxc/src/unistr.rs
@@ -0,0 +1,84 @@
+use std::hash::{Hash, Hasher};
+
+use unicode_normalization::{self, IsNormalized, UnicodeNormalization};
+
+#[repr(transparent)]
+#[derive(Copy, Clone, Debug, Eq)]
+pub struct UniStr<'a>(pub &'a str);
+
+impl Hash for UniStr<'_> {
+ fn hash<H: Hasher>(&self, state: &mut H) {
+ /* In the ASCII common case we use .bytes() to avoid decoding
+ * every codepoint (a no-op in ASCII) */
+ if self.0.is_ascii() {
+ self.0.bytes().for_each(|c| (c as char).hash(state));
+ } else if unicode_normalization::is_nfkd_quick(self.0.chars())
+ == IsNormalized::Yes
+ {
+ self.0.chars().for_each(|c| c.hash(state));
+ } else {
+ self.0.nfkd().for_each(|c| c.hash(state));
+ }
+ }
+}
+
+impl PartialEq for UniStr<'_> {
+ fn eq(&self, other: &Self) -> bool {
+ /* Most code is ASCII, and normalization is obviously a lot
+ * slower than not normalizing, so we try to only normalize when
+ * we have to */
+
+ if self.0.is_ascii() && other.0.is_ascii() {
+ return self.0 == other.0;
+ }
+
+ return match (
+ unicode_normalization::is_nfkd_quick(self.0.chars())
+ == IsNormalized::Yes,
+ unicode_normalization::is_nfkd_quick(other.0.chars())
+ == IsNormalized::Yes,
+ ) {
+ (true, true) => self.0 == other.0,
+ (true, false) => {
+ self.0.chars().map(|b| b as char).eq(other.0.nfkd())
+ },
+ (false, true) => {
+ self.0.nfkd().eq(other.0.chars().map(|b| b as char))
+ },
+ (false, false) => self.0.nfkd().eq(other.0.nfkd()),
+ };
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use std::hash::{DefaultHasher, Hash, Hasher};
+
+ use super::UniStr;
+
+ #[test]
+ fn test_unistr_eq() {
+ assert_eq!(UniStr("fishi"), UniStr("fishᵢ"));
+ assert_eq!(UniStr("fishi"), UniStr("fishi"));
+ assert_eq!(UniStr("fishi"), UniStr("fishᵢ"));
+ assert_eq!(UniStr("fishᵢ"), UniStr("fishᵢ"));
+ assert_eq!(UniStr("corné"), UniStr("corné"));
+ }
+
+ #[test]
+ fn test_unistr_hash() {
+ for (lhs, rhs) in &[
+ (UniStr("fishi"), UniStr("fishᵢ")),
+ (UniStr("fishi"), UniStr("fishi")),
+ (UniStr("fishi"), UniStr("fishᵢ")),
+ (UniStr("fishᵢ"), UniStr("fishᵢ")),
+ (UniStr("corné"), UniStr("corné")),
+ ] {
+ let mut hashl = DefaultHasher::new();
+ let mut hashr = DefaultHasher::new();
+ lhs.hash(&mut hashl);
+ rhs.hash(&mut hashr);
+ assert_eq!(hashl.finish(), hashr.finish());
+ }
+ }
+}