summaryrefslogtreecommitdiff
path: root/oryxc
diff options
context:
space:
mode:
authorThomas Voss <mail@thomasvoss.com> 2026-03-15 23:04:29 +0100
committerThomas Voss <mail@thomasvoss.com> 2026-03-15 23:04:29 +0100
commit5d570a58f9cb64478d2a2704c740e05f15a71235 (patch)
tree5eeef8612e3cfa61a57e77c1cca0e088527a196e /oryxc
parent58c294d0a5fedca250eb591ade22ab8b1a768cb5 (diff)
Make UniStr its own thing
Diffstat (limited to 'oryxc')
-rw-r--r--oryxc/src/intern.rs166
-rw-r--r--oryxc/src/unistr.rs84
2 files changed, 133 insertions, 117 deletions
diff --git a/oryxc/src/intern.rs b/oryxc/src/intern.rs
index ea131a2..684a693 100644
--- a/oryxc/src/intern.rs
+++ b/oryxc/src/intern.rs
@@ -1,71 +1,24 @@
use std::hash::{
Hash,
- Hasher,
};
use boxcar;
use dashmap::DashMap;
-use unicode_normalization::{
- self,
- IsNormalized,
- UnicodeNormalization,
-};
-
-use crate::prelude::*;
-
-pub struct Interner<'a> {
- map: DashMap<UniStr<'a>, SymbolId>,
- store: boxcar::Vec<&'a str>,
-}
-
-#[derive(Debug, Eq)]
-pub struct UniStr<'a>(pub &'a str);
-
-impl Hash for UniStr<'_> {
- fn hash<H: Hasher>(&self, state: &mut H) {
- /* In the ASCII common case we use .bytes() to avoid decoding
- * every codepoint (a no-op in ASCII) */
- if self.0.is_ascii() {
- self.0.bytes().for_each(|c| (c as char).hash(state));
- } else if unicode_normalization::is_nfkd_quick(self.0.chars())
- == IsNormalized::Yes
- {
- self.0.chars().for_each(|c| c.hash(state));
- } else {
- self.0.nfkd().for_each(|c| c.hash(state));
- }
- }
-}
-impl PartialEq for UniStr<'_> {
- fn eq(&self, other: &Self) -> bool {
- /* Most code is ASCII, and normalization is obviously a lot
- * slower than not normalizing, so we try to only normalize when
- * we have to */
-
- if self.0.is_ascii() && other.0.is_ascii() {
- return self.0 == other.0;
- }
-
- return match (
- unicode_normalization::is_nfkd_quick(self.0.chars())
- == IsNormalized::Yes,
- unicode_normalization::is_nfkd_quick(other.0.chars())
- == IsNormalized::Yes,
- ) {
- (true, true) => self.0 == other.0,
- (true, false) => {
- self.0.chars().map(|b| b as char).eq(other.0.nfkd())
- },
- (false, true) => {
- self.0.nfkd().eq(other.0.chars().map(|b| b as char))
- },
- (false, false) => self.0.nfkd().eq(other.0.nfkd()),
- };
- }
+pub struct Interner<V, I>
+where
+ V: Copy + Eq + Hash,
+ I: Copy + Into<usize>,
+{
+ map: DashMap<V, I>,
+ store: boxcar::Vec<V>,
}
-impl<'a> Interner<'a> {
+impl<V, I> Interner<V, I>
+where
+ V: Copy + Eq + Hash,
+ I: Copy + From<usize> + Into<usize>,
+{
pub fn new() -> Self {
return Interner {
map: DashMap::new(),
@@ -73,70 +26,49 @@ impl<'a> Interner<'a> {
};
}
- pub fn get(&self, key: SymbolId) -> &str {
- return self.store[key.0 as usize];
+ pub fn get(&self, key: I) -> V {
+ return self.store[key.into()];
}
- pub fn intern(&self, value: &'a str) -> SymbolId {
- if let Some(key) = self.map.get(&UniStr(value)) {
+ pub fn intern(&self, value: V) -> I {
+ if let Some(key) = self.map.get(&value) {
return *key;
}
- let key = SymbolId(self.store.push(value) as u32);
- self.map.insert(UniStr(value), key);
+ let key = self.store.push(value).into();
+ self.map.insert(value, key);
return key;
}
}
-#[test]
-fn test_unistr_eq() {
- assert_eq!(UniStr("fishi"), UniStr("fishᵢ"));
- assert_eq!(UniStr("fishi"), UniStr("fishi"));
- assert_eq!(UniStr("fishi"), UniStr("fishᵢ"));
- assert_eq!(UniStr("fishᵢ"), UniStr("fishᵢ"));
- assert_eq!(UniStr("corné"), UniStr("corné"));
-}
-
-#[test]
-fn test_unistr_hash() {
- use std::hash::DefaultHasher;
- for (lhs, rhs) in &[
- (UniStr("fishi"), UniStr("fishᵢ")),
- (UniStr("fishi"), UniStr("fishi")),
- (UniStr("fishi"), UniStr("fishᵢ")),
- (UniStr("fishᵢ"), UniStr("fishᵢ")),
- (UniStr("corné"), UniStr("corné")),
- ] {
- let mut hashl = DefaultHasher::new();
- let mut hashr = DefaultHasher::new();
- lhs.hash(&mut hashl);
- rhs.hash(&mut hashr);
- assert_eq!(hashl.finish(), hashr.finish());
- }
-}
-
-#[test]
-fn test_interner_intern() {
- let xs = ["fishi", "fishi", "fishᵢ"];
- let y = "andy";
-
- let mut interner = Interner::new();
- for i in 0..xs.len() {
- for j in i..xs.len() {
- assert_eq!(interner.intern(xs[i]), interner.intern(xs[j]));
- }
- }
- for i in 0..xs.len() {
- assert_ne!(interner.intern(y), interner.intern(xs[i]));
- }
-}
-
-#[test]
-fn test_interner_gets_first_inserted() {
- let mut interner = Interner::new();
- let xs = ["fishi", "fishi", "fishᵢ"];
- let ys = xs.iter().map(|x| interner.intern(x)).collect::<Vec<_>>();
-
- for i in 0..ys.len() {
- assert_eq!(interner.get(ys[i]), xs[0]);
- }
+#[cfg(test)]
+mod tests {
+ use crate::unistr::UniStr;
+ use super::Interner;
+
+ #[test]
+ fn test_interner_intern() {
+ let xs = [UniStr("fishi"), UniStr("fishi"), UniStr("fishᵢ")];
+ let y = UniStr("andy");
+
+ let mut interner = Interner::<_, usize>::new();
+ for i in 0..xs.len() {
+ for j in i..xs.len() {
+ assert_eq!(interner.intern(xs[i]), interner.intern(xs[j]));
+ }
+ }
+ for i in 0..xs.len() {
+ assert_ne!(interner.intern(y), interner.intern(xs[i]));
+ }
+ }
+
+ #[test]
+ fn test_interner_gets_first_inserted() {
+ let mut interner = Interner::<_, usize>::new();
+ let xs = [UniStr("fishi"), UniStr("fishi"), UniStr("fishᵢ")];
+ let ys = xs.iter().map(|&x| interner.intern(x)).collect::<Vec<_>>();
+
+ for i in 0..ys.len() {
+ assert_eq!(interner.get(ys[i]), xs[0]);
+ }
+ }
}
diff --git a/oryxc/src/unistr.rs b/oryxc/src/unistr.rs
new file mode 100644
index 0000000..5dc8160
--- /dev/null
+++ b/oryxc/src/unistr.rs
@@ -0,0 +1,84 @@
+use std::hash::{Hash, Hasher};
+
+use unicode_normalization::{self, IsNormalized, UnicodeNormalization};
+
+#[repr(transparent)]
+#[derive(Copy, Clone, Debug, Eq)]
+pub struct UniStr<'a>(pub &'a str);
+
+impl Hash for UniStr<'_> {
+ fn hash<H: Hasher>(&self, state: &mut H) {
+ /* In the ASCII common case we use .bytes() to avoid decoding
+ * every codepoint (a no-op in ASCII) */
+ if self.0.is_ascii() {
+ self.0.bytes().for_each(|c| (c as char).hash(state));
+ } else if unicode_normalization::is_nfkd_quick(self.0.chars())
+ == IsNormalized::Yes
+ {
+ self.0.chars().for_each(|c| c.hash(state));
+ } else {
+ self.0.nfkd().for_each(|c| c.hash(state));
+ }
+ }
+}
+
+impl PartialEq for UniStr<'_> {
+ fn eq(&self, other: &Self) -> bool {
+ /* Most code is ASCII, and normalization is obviously a lot
+ * slower than not normalizing, so we try to only normalize when
+ * we have to */
+
+ if self.0.is_ascii() && other.0.is_ascii() {
+ return self.0 == other.0;
+ }
+
+ return match (
+ unicode_normalization::is_nfkd_quick(self.0.chars())
+ == IsNormalized::Yes,
+ unicode_normalization::is_nfkd_quick(other.0.chars())
+ == IsNormalized::Yes,
+ ) {
+ (true, true) => self.0 == other.0,
+ (true, false) => {
+ self.0.chars().map(|b| b as char).eq(other.0.nfkd())
+ },
+ (false, true) => {
+ self.0.nfkd().eq(other.0.chars().map(|b| b as char))
+ },
+ (false, false) => self.0.nfkd().eq(other.0.nfkd()),
+ };
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use std::hash::{DefaultHasher, Hash, Hasher};
+
+ use super::UniStr;
+
+ #[test]
+ fn test_unistr_eq() {
+ assert_eq!(UniStr("fishi"), UniStr("fishᵢ"));
+ assert_eq!(UniStr("fishi"), UniStr("fishi"));
+ assert_eq!(UniStr("fishi"), UniStr("fishᵢ"));
+ assert_eq!(UniStr("fishᵢ"), UniStr("fishᵢ"));
+ assert_eq!(UniStr("corné"), UniStr("corné"));
+ }
+
+ #[test]
+ fn test_unistr_hash() {
+ for (lhs, rhs) in &[
+ (UniStr("fishi"), UniStr("fishᵢ")),
+ (UniStr("fishi"), UniStr("fishi")),
+ (UniStr("fishi"), UniStr("fishᵢ")),
+ (UniStr("fishᵢ"), UniStr("fishᵢ")),
+ (UniStr("corné"), UniStr("corné")),
+ ] {
+ let mut hashl = DefaultHasher::new();
+ let mut hashr = DefaultHasher::new();
+ lhs.hash(&mut hashl);
+ rhs.hash(&mut hashr);
+ assert_eq!(hashl.finish(), hashr.finish());
+ }
+ }
+}