summaryrefslogtreecommitdiff
path: root/unigen
diff options
context:
space:
mode:
author Thomas Voss <mail@thomasvoss.com> 2026-02-24 11:08:42 +0100
committer Thomas Voss <mail@thomasvoss.com> 2026-02-24 11:08:42 +0100
commit 7d42170c8625de0fe44b98f47e8b9a603a9de794 (patch)
tree 86623e0e0809d23ee3dd861ad7006ff21672e455 /unigen
Genesis commit
Diffstat (limited to 'unigen')
-rw-r--r-- unigen/Cargo.lock 16
-rw-r--r-- unigen/Cargo.toml 7
-rwxr-xr-x unigen/fetch 19
l--------- unigen/rustfmt.toml 1
-rw-r--r-- unigen/src/main.rs 278
5 files changed, 321 insertions, 0 deletions
diff --git a/unigen/Cargo.lock b/unigen/Cargo.lock
new file mode 100644
index 0000000..5a36f70
--- /dev/null
+++ b/unigen/Cargo.lock
@@ -0,0 +1,16 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "lexopt"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c5d9b5843e8c9311ff602e6bd50855015e99e75159c2c54fe104cfac241f552"
+
+[[package]]
+name = "unigen"
+version = "0.1.0"
+dependencies = [
+ "lexopt",
+]
diff --git a/unigen/Cargo.toml b/unigen/Cargo.toml
new file mode 100644
index 0000000..e0c6b4d
--- /dev/null
+++ b/unigen/Cargo.toml
@@ -0,0 +1,7 @@
+[package]
+name = "unigen"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+lexopt = "0.1.0"
diff --git a/unigen/fetch b/unigen/fetch
new file mode 100755
index 0000000..46c02f9
--- /dev/null
+++ b/unigen/fetch
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+# Download the latest Unicode Character Database archive, unpack it into
+# ./data next to this script, and append extra XID_Start/XID_Continue
+# entries to data/DerivedCoreProperties.txt.
+
+set -e
+
+# Work from the script's own directory.  When the script is started as
+# `sh fetch`, $0 contains no slash and "${0%/*}" would expand to the script
+# name itself, so only change directory when there is a directory part.
+case $0 in
+*/*) cd "${0%/*}" ;;
+esac
+# The archive is only needed for extraction; remove it however we exit.
+trap 'rm -f UCD.zip' EXIT
+
+set -x
+mkdir -p data
+# -f: fail on an HTTP error instead of saving the error page as UCD.zip.
+curl -fLO https://www.unicode.org/Public/zipped/latest/UCD.zip
+unzip -od data UCD.zip
+
+# XID_Start and XID_Continue additions
+cat <<-EOF >>data/DerivedCoreProperties.txt
+0024 ; XID_Start # Sc DOLLAR SIGN
+005F ; XID_Start # Pc LOW LINE
+2032..2034 ; XID_Continue # Po [3] PRIME..TRIPLE PRIME
+2057 ; XID_Continue # Po QUADRUPLE PRIME
+EOF
diff --git a/unigen/rustfmt.toml b/unigen/rustfmt.toml
new file mode 120000
index 0000000..39f97b0
--- /dev/null
+++ b/unigen/rustfmt.toml
@@ -0,0 +1 @@
+../rustfmt.toml \ No newline at end of file
diff --git a/unigen/src/main.rs b/unigen/src/main.rs
new file mode 100644
index 0000000..4851fa5
--- /dev/null
+++ b/unigen/src/main.rs
@@ -0,0 +1,278 @@
+use std::collections::HashMap;
+use std::ffi::{
+ OsStr,
+ OsString,
+};
+use std::fs::File;
+use std::io::{
+ self,
+ BufRead,
+ BufReader,
+};
+use std::path::Path;
+use std::sync::OnceLock;
+use std::vec::Vec;
+use std::{
+ env,
+ process,
+};
+
+/// Smallest block-size exponent tried by `optimize_tables`; a block covers
+/// `1 << shift` codepoints.
+const MIN_SHIFT: usize = 1;
+/// Largest block-size exponent tried by `optimize_tables`.
+const MAX_SHIFT: usize = 22;
+
+/// Command-line options recognised by `Flags::parse`.
+#[derive(Default)]
+struct Flags {
+ /// Codepoints given with `-c`/`--codepoints` as a comma-separated list of
+ /// hexadecimal values; `None` when the option was not passed.
+ codepoints: Option<Vec<char>>,
+ /// Set when `-h`/`--help` was passed.
+ help: bool,
+}
+
+impl Flags {
+    /// Parses the process arguments into the recognised flags and the
+    /// remaining positional arguments, in the order they were given.
+    ///
+    /// A malformed `-c`/`--codepoints` value (not UTF-8, not hexadecimal, or
+    /// not a valid codepoint) is reported on standard error and terminates
+    /// the process with status 1.
+    ///
+    /// # Errors
+    ///
+    /// Returns the `lexopt` error for an unknown option, a missing option
+    /// value, or a positional argument that is not valid Unicode.
+    fn parse() -> Result<(Flags, Vec<String>), lexopt::Error> {
+        use lexopt::prelude::*;
+
+        /// Converts one hexadecimal token into a `char`, exiting with
+        /// status 1 when it is not hexadecimal or not a valid codepoint.
+        fn hex_to_char(s: &str) -> char {
+            let n = match u32::from_str_radix(s, 16) {
+                Ok(n) => n,
+                Err(e) => {
+                    eprintln!("{}: {s}: {e}", progname().display());
+                    process::exit(1);
+                },
+            };
+            return char::from_u32(n).unwrap_or_else(|| {
+                eprintln!("{}: {s}: invalid codepoint", progname().display());
+                process::exit(1);
+            });
+        }
+
+        // The argument vector may be empty (no argv[0] at all), so the
+        // subtraction must not underflow; this is only a capacity hint.
+        let mut rest =
+            Vec::with_capacity(env::args_os().len().saturating_sub(1));
+        let mut flags = Flags::default();
+        let mut parser = lexopt::Parser::from_env();
+
+        while let Some(arg) = parser.next()? {
+            match arg {
+                Short('c') | Long("codepoints") => {
+                    let value = parser.value()?;
+                    let Some(list) = value.to_str() else {
+                        eprintln!(
+                            "{}: unable to parse argument to -c/--codepoints",
+                            progname().display()
+                        );
+                        process::exit(1);
+                    };
+                    flags.codepoints =
+                        Some(list.split(',').map(hex_to_char).collect());
+                },
+                Short('h') | Long("help") => flags.help = true,
+                Value(v) => rest.push(v.into_string()?),
+                _ => return Err(arg.unexpected()),
+            }
+        }
+
+        return Ok((flags, rest));
+    }
+}
+
+/// Returns the name used to prefix diagnostics: the final path component of
+/// `argv[0]`, computed once and cached for the lifetime of the process.
+///
+/// Falls back to `unigen` when `argv[0]` is absent or has no file-name
+/// component.
+fn progname() -> &'static OsString {
+    static ARGV0: OnceLock<OsString> = OnceLock::new();
+    return ARGV0.get_or_init(|| {
+        let default = OsStr::new("unigen");
+        return match env::args_os().next() {
+            Some(argv0) => {
+                Path::new(&argv0).file_name().unwrap_or(default).to_os_string()
+            },
+            None => default.to_os_string(),
+        };
+    });
+}
+
+/// Writes the three-line usage synopsis to standard error, using the
+/// program name reported by `progname`.
+fn usage() {
+    let prog = progname().display();
+    eprintln!(
+        "Usage: {prog} data-file property-name\n {prog} -c codepoints name\n {prog} -h"
+    );
+}
+
+/// Entry point: parses the command line, fills a per-codepoint bitmap either
+/// from the `-c` list or from a data file, then prints the smallest lookup
+/// tables found for it.
+///
+/// Exits with status 1 (after printing usage) on a bad command line and
+/// with status 0 after `-h`; I/O errors from reading the data file are
+/// returned.
+fn main() -> io::Result<()> {
+    let (flags, rest) = Flags::parse().unwrap_or_else(|e| {
+        eprintln!("{}: {e}", progname().display());
+        usage();
+        process::exit(1);
+    });
+
+    if flags.help {
+        usage();
+        process::exit(0);
+    }
+
+    // With -c only the output name is positional; otherwise both the data
+    // file and the property name are required.
+    let wanted = if flags.codepoints.is_some() { 1 } else { 2 };
+    if rest.len() != wanted {
+        usage();
+        process::exit(1);
+    }
+
+    // One entry per Unicode codepoint (U+0000..=U+10FFFF).
+    let mut bitmap = vec![false; 0x110000];
+    let name = if let Some(codepoints) = flags.codepoints {
+        for c in codepoints {
+            bitmap[c as usize] = true;
+        }
+        &rest[0]
+    } else {
+        parse_file(&rest[0], &rest[1], &mut bitmap)?;
+        &rest[1]
+    };
+
+    let (shift, lvl1, lvl2) = optimize_tables(&bitmap);
+    write_tables(name, shift, &lvl1, &lvl2);
+    Ok(())
+}
+
+/// Builds the two-level tables for every shift in `MIN_SHIFT..=MAX_SHIFT`
+/// and returns `(shift, level1, level2)` for the smallest encoding, counting
+/// 2 bytes per level-1 entry and 8 bytes per level-2 word.
+///
+/// When several shifts tie, the smallest shift is kept.
+fn optimize_tables(bitmap: &[bool]) -> (usize, Vec<u16>, Vec<u64>) {
+    (MIN_SHIFT..=MAX_SHIFT)
+        .map(|shift| {
+            let (lvl1, lvl2) = build_tables(bitmap, shift);
+            (shift, lvl1, lvl2)
+        })
+        // `min_by_key` keeps the first of equally small candidates.
+        .min_by_key(|(_, lvl1, lvl2)| lvl1.len() * 2 + lvl2.len() * 8)
+        .unwrap_or_default()
+}
+
+/// Marks in `bitmap` every codepoint that the data file at `path` assigns to
+/// the property `prop`.
+///
+/// Each relevant line has the form `CP ; Property` or `LO..HI ; Property`
+/// with hexadecimal codepoints; everything from a `#` onwards is ignored.
+/// Blank lines, lines with fewer than two `;`-separated fields and lines
+/// naming another property are skipped.  Codepoints at or beyond
+/// `bitmap.len()` are ignored.
+///
+/// # Errors
+///
+/// Returns the underlying I/O error when the file cannot be opened or read,
+/// and an `InvalidData` error naming the file and line when the codepoint
+/// field of a matching line is not a hexadecimal `u32`.
+fn parse_file<P: AsRef<Path>>(
+    path: P,
+    prop: &str,
+    bitmap: &mut [bool],
+) -> io::Result<()> {
+    let path = path.as_ref();
+    // Parses one codepoint field, turning a malformed value into an error
+    // that points at the offending line instead of panicking.
+    let parse_hex = |field: &str, lineno: usize| -> io::Result<usize> {
+        match u32::from_str_radix(field, 16) {
+            Ok(cp) => Ok(cp as usize),
+            Err(e) => Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                format!("{}:{lineno}: {field}: {e}", path.display()),
+            )),
+        }
+    };
+
+    let reader = BufReader::new(File::open(path)?);
+
+    for (idx, line) in reader.lines().enumerate() {
+        let line = line?;
+        let lineno = idx + 1;
+        let line = line.split('#').next().unwrap_or("").trim();
+        if line.is_empty() {
+            continue;
+        }
+
+        let mut fields = line.split(';').map(str::trim);
+        let (range, name) = match (fields.next(), fields.next()) {
+            (Some(range), Some(name)) => (range, name),
+            _ => continue,
+        };
+        if name != prop {
+            continue;
+        }
+
+        let (beg, end) = match range.split_once("..") {
+            Some((lo, hi)) => (parse_hex(lo, lineno)?, parse_hex(hi, lineno)?),
+            None => {
+                let cp = parse_hex(range, lineno)?;
+                (cp, cp)
+            },
+        };
+
+        // Clamp the inclusive range to the bitmap so an oversized upper
+        // bound costs nothing, then set the whole run at once.
+        let lo = beg.min(bitmap.len());
+        let hi = end.saturating_add(1).min(bitmap.len());
+        if lo < hi {
+            bitmap[lo..hi].fill(true);
+        }
+    }
+    return Ok(());
+}
+
+/// Splits `bitmap` into blocks of `1 << shift` entries and returns the
+/// two-level encoding `(level1, level2)`.
+///
+/// Each block is packed into `u64` words (bit `i % 64` of word `i / 64` is
+/// set for every `true` entry; a short final block is zero-padded).
+/// `level2` stores every distinct block once, in order of first appearance,
+/// and `level1` holds, per block of the input, the index of its packed
+/// block within `level2` (counted in blocks, not words).
+fn build_tables(bitmap: &[bool], shift: usize) -> (Vec<u16>, Vec<u64>) {
+    let block_len = 1 << shift;
+    let words_per_block = (block_len + 63) / 64;
+
+    let mut level1: Vec<u16> = Vec::new();
+    let mut level2: Vec<u64> = Vec::new();
+    // Packed block -> index of its first occurrence in `level2`.
+    let mut seen: HashMap<Vec<u64>, u16> = HashMap::new();
+
+    for block in bitmap.chunks(block_len) {
+        let mut words = vec![0u64; words_per_block];
+        block
+            .iter()
+            .enumerate()
+            .filter(|&(_, &set)| set)
+            .for_each(|(pos, _)| words[pos / 64] |= 1u64 << (pos % 64));
+
+        let index = match seen.get(&words) {
+            Some(&known) => known,
+            None => {
+                let fresh = (level2.len() / words_per_block) as u16;
+                level2.extend_from_slice(&words);
+                seen.insert(words, fresh);
+                fresh
+            },
+        };
+        level1.push(index);
+    }
+
+    (level1, level2)
+}
+
+/// Prints generated Rust source for the lookup tables to standard output.
+///
+/// The output consists of a header comment, the statics `<NAME>_L1`
+/// (`level1`) and `<NAME>_L2` (`level2`), where `<NAME>` is `prop_name`
+/// upper-cased, and a public predicate taking a `char`.  The predicate is
+/// named after the lower-cased `prop_name` with `_p` appended when the name
+/// contains an underscore and `p` otherwise.
+///
+/// `shift` is the block-size exponent the tables were built with; it is
+/// baked into the generated lookup code.
+fn write_tables(prop_name: &str, shift: usize, level1: &[u16], level2: &[u64]) {
+ let upper_name = prop_name.to_uppercase();
+ let lower_name = prop_name.to_lowercase();
+ let block_size = 1 << shift;
+ // Low `shift` bits of a codepoint: its offset inside its block.
+ let mask = block_size - 1;
+ let u64s_per_block = (block_size + 63) / 64;
+
+ println!("/* Autogenerated – DO NOT EDIT */\n");
+ print!(
+ "static {upper_name}_L1: [u16; {}] = {level1:?};",
+ level1.len()
+ );
+ print!(
+ "static {upper_name}_L2: [u64; {}] = {level2:?};",
+ level2.len()
+ );
+
+ let pred_name = if lower_name.contains('_') {
+ format!("{lower_name}_p")
+ } else {
+ format!("{lower_name}p")
+ };
+
+ // Common prologue of the predicate: fetch the block index from the
+ // level-1 table and compute the offset within the block.  The emitted
+ // code indexes the tables with `get_unchecked`.
+ print!(
+ "#[inline]
+ pub fn {pred_name}(c: char) -> bool {{
+ let cp = c as usize;
+ let blki = unsafe {{ *{upper_name}_L1.get_unchecked(cp >> {shift}) }} as usize;
+ let in_blk_offset_p = cp & 0x{mask:X};"
+ );
+
+ // A block that fits in one word is addressed by the block index alone;
+ // larger blocks additionally need the word and bit position within the
+ // block.
+ if u64s_per_block == 1 {
+ print!(
+ " unsafe {{
+ return ({upper_name}_L2.get_unchecked(blki) & (1 << in_blk_offset_p)) != 0;
+ }}"
+ );
+ } else {
+ print!(
+ "let wordi = (blki * {u64s_per_block}) + (in_blk_offset_p >> 6);
+ let biti = in_blk_offset_p & 0x3F;
+ unsafe {{
+ return (*{upper_name}_L2.get_unchecked(wordi) & (1 << biti)) != 0;
+ }}"
+ );
+ }
+
+ // Closes the generated function body.
+ print!("}}");
+}