From 7d42170c8625de0fe44b98f47e8b9a603a9de794 Mon Sep 17 00:00:00 2001
From: Thomas Voss
Date: Tue, 24 Feb 2026 11:08:42 +0100
Subject: Genesis commit

---
Reviewer notes (free text between "---" and the first "diff --git" line is
not part of the commit message or of the patch):

* Line structure follows the "+" markers of the diff and agrees with the
  hunk line counts (16, 7, 19, 1 and 278).  The generic argument lists in
  unigen/src/main.rs -- Option<Vec<char>>, Vec<String>, OnceLock<OsString>,
  Vec<u16>, Vec<u64>, parse_file<P: AsRef<Path>>, HashMap<Vec<u64>, u16> --
  are written out as implied by how each value is produced and consumed.
* To confirm against the repository: indentation style (four spaces are
  used here), the run of spaces inside the usage() strings and inside the
  multi-line print!() literals of write_tables(), any column alignment in
  the here-document of unigen/fetch, and the address on the From: line.
* parse_file() calls unwrap() on every hexadecimal field, so a malformed
  line in the data file aborts with a panic instead of an io::Error.
* Flags::parse() evaluates env::args().len() - 1, which underflows if the
  process is started with an empty argument vector.
* progname() falls back to "oryxc" although this crate is named "unigen";
  presumably inherited from the parent project -- confirm it is intended.
* build_tables() narrows the block index with "as u16".  For the shifts
  tried (1..=22) over 0x110000 bits the number of distinct blocks cannot
  exceed 65536, so the cast does not truncate today; it would if MIN_SHIFT,
  MAX_SHIFT or the bitmap size changed without revisiting it.

 unigen/Cargo.lock   |  16 +++
 unigen/Cargo.toml   |   7 ++
 unigen/fetch        |  19 ++++
 unigen/rustfmt.toml |   1 +
 unigen/src/main.rs  | 278 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 321 insertions(+)
 create mode 100644 unigen/Cargo.lock
 create mode 100644 unigen/Cargo.toml
 create mode 100755 unigen/fetch
 create mode 120000 unigen/rustfmt.toml
 create mode 100644 unigen/src/main.rs

(limited to 'unigen')

diff --git a/unigen/Cargo.lock b/unigen/Cargo.lock
new file mode 100644
index 0000000..5a36f70
--- /dev/null
+++ b/unigen/Cargo.lock
@@ -0,0 +1,16 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "lexopt"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c5d9b5843e8c9311ff602e6bd50855015e99e75159c2c54fe104cfac241f552"
+
+[[package]]
+name = "unigen"
+version = "0.1.0"
+dependencies = [
+ "lexopt",
+]
diff --git a/unigen/Cargo.toml b/unigen/Cargo.toml
new file mode 100644
index 0000000..e0c6b4d
--- /dev/null
+++ b/unigen/Cargo.toml
@@ -0,0 +1,7 @@
+[package]
+name = "unigen"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+lexopt = "0.1.0"
diff --git a/unigen/fetch b/unigen/fetch
new file mode 100755
index 0000000..46c02f9
--- /dev/null
+++ b/unigen/fetch
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+set -e
+
+cd "${0%/*}"
+trap 'rm -f UCD.zip' EXIT
+
+set -x
+mkdir -p data
+curl -LO https://www.unicode.org/Public/zipped/latest/UCD.zip
+unzip -od data UCD.zip
+
+# XID_Start and XID_Continue additions
+cat <<-EOF >>data/DerivedCoreProperties.txt
+0024 ; XID_Start # Pc DOLLAR SIGN
+005F ; XID_Start # Pc LOW LINE
+2032..2034 ; XID_Continue # Po [3] PRIME..TRIPLE PRIME
+2057 ; XID_Continue # Po QUADRUPLE PRIME
+EOF
diff --git a/unigen/rustfmt.toml b/unigen/rustfmt.toml
new file mode 120000
index 0000000..39f97b0
--- /dev/null
+++ b/unigen/rustfmt.toml
@@ -0,0 +1 @@
+../rustfmt.toml
\ No newline at end of file
diff --git a/unigen/src/main.rs b/unigen/src/main.rs
new file mode 100644
index 0000000..4851fa5
--- /dev/null
+++ b/unigen/src/main.rs
@@ -0,0 +1,278 @@
+use std::collections::HashMap;
+use std::ffi::{
+    OsStr,
+    OsString,
+};
+use std::fs::File;
+use std::io::{
+    self,
+    BufRead,
+    BufReader,
+};
+use std::path::Path;
+use std::sync::OnceLock;
+use std::vec::Vec;
+use std::{
+    env,
+    process,
+};
+
+const MIN_SHIFT: usize = 1;
+const MAX_SHIFT: usize = 22;
+
+#[derive(Default)]
+struct Flags {
+    codepoints: Option<Vec<char>>,
+    help: bool,
+}
+
+impl Flags {
+    fn parse() -> Result<(Flags, Vec<String>), lexopt::Error> {
+        use lexopt::prelude::*;
+
+        let mut rest = Vec::with_capacity(env::args().len() - 1);
+        let mut flags = Flags::default();
+        let mut parser = lexopt::Parser::from_env();
+
+        while let Some(arg) = parser.next()? {
+            match arg {
+                Short('c') | Long("codepoints") => {
+                    fn hex_to_char(s: &str) -> char {
+                        return u32::from_str_radix(s, 16).map_or_else(
+                            |e| {
+                                eprintln!("{}: {s}: {e}", progname().display());
+                                process::exit(1);
+                            },
+                            |n| {
+                                char::from_u32(n).unwrap_or_else(|| {
+                                    eprintln!(
+                                        "{}: {s}: invalid codepoint",
+                                        progname().display()
+                                    );
+                                    process::exit(1);
+                                })
+                            },
+                        );
+                    }
+
+                    flags.codepoints = Some(
+                        parser
+                            .value()?
+                            .to_str()
+                            .unwrap_or_else(|| {
+                                eprintln!(
+                                    "{}: unable to parse argument to -c/--codepoints",
+                                    progname().display()
+                                );
+                                process::exit(1);
+                            })
+                            .split(',')
+                            .map(hex_to_char)
+                            .collect(),
+                    );
+                },
+                Short('h') | Long("help") => flags.help = true,
+                Value(v) => rest.push(v.into_string()?),
+                _ => return Err(arg.unexpected()),
+            }
+        }
+
+        return Ok((flags, rest));
+    }
+}
+
+fn progname() -> &'static OsString {
+    static ARGV0: OnceLock<OsString> = OnceLock::new();
+    return ARGV0.get_or_init(|| {
+        let default = OsStr::new("oryxc");
+        let s = env::args_os().next().unwrap_or(default.into());
+        return Path::new(&s).file_name().unwrap_or(default).to_os_string();
+    });
+}
+
+fn usage() {
+    eprintln!(
+        concat!(
+            "Usage: {0} data-file property-name\n",
+            "       {0} -c codepoints name\n",
+            "       {0} -h",
+        ),
+        progname().display()
+    );
+}
+
+fn main() -> io::Result<()> {
+    let (flags, rest) = match Flags::parse() {
+        Ok(v) => v,
+        Err(e) => {
+            eprintln!("{}: {e}", progname().display());
+            usage();
+            process::exit(1);
+        },
+    };
+
+    if flags.help {
+        usage();
+        process::exit(0);
+    }
+
+    if (flags.codepoints.is_none() && rest.len() != 2)
+        || (flags.codepoints.is_some() && rest.len() != 1)
+    {
+        usage();
+        process::exit(1);
+    }
+
+    let mut bitmap = vec![false; 0x110000];
+    let name = match flags.codepoints {
+        Some(vec) => {
+            vec.iter().for_each(|c| bitmap[*c as usize] = true);
+            &rest[0]
+        },
+        None => {
+            parse_file(&rest[0], &rest[1], &mut bitmap)?;
+            &rest[1]
+        },
+    };
+    let (shift, lvl1, lvl2) = optimize_tables(&bitmap);
+    write_tables(name, shift, &lvl1, &lvl2);
+    return Ok(());
+}
+
+fn optimize_tables(bitmap: &[bool]) -> (usize, Vec<u16>, Vec<u64>) {
+    let mut minsz = usize::MAX;
+    let mut config = (0, Vec::new(), Vec::new());
+
+    for i in MIN_SHIFT..=MAX_SHIFT {
+        let (l1, l2) = build_tables(bitmap, i);
+        let sz = l1.len() * 2 + l2.len() * 8;
+        if sz < minsz {
+            minsz = sz;
+            config = (i, l1, l2);
+        }
+    }
+
+    return config;
+}
+
+fn parse_file<P: AsRef<Path>>(
+    path: P,
+    prop: &str,
+    bitmap: &mut [bool],
+) -> io::Result<()> {
+    let file = File::open(path)?;
+    let reader = BufReader::new(file);
+
+    for line in reader.lines() {
+        let line = line?;
+        let line = line.split('#').next().unwrap_or("").trim();
+        if line.is_empty() {
+            continue;
+        }
+
+        let parts: Vec<&str> = line.split(';').map(|s| s.trim()).collect();
+        if parts.len() < 2 || parts[1] != prop {
+            continue;
+        }
+
+        let (beg, end) = if parts[0].contains("..") {
+            let mut range = parts[0].split("..");
+            (
+                u32::from_str_radix(range.next().unwrap(), 16).unwrap(),
+                u32::from_str_radix(range.next().unwrap(), 16).unwrap(),
+            )
+        } else {
+            let val = u32::from_str_radix(parts[0], 16).unwrap();
+            (val, val)
+        };
+
+        for cp in beg..=end {
+            if (cp as usize) < bitmap.len() {
+                bitmap[cp as usize] = true;
+            }
+        }
+    }
+    return Ok(());
+}
+
+fn build_tables(bitmap: &[bool], shift: usize) -> (Vec<u16>, Vec<u64>) {
+    let blksz = 1 << shift;
+    let u64s_per_block = (blksz + 63) / 64;
+
+    let mut lvl2: Vec<u64> = Vec::new();
+    let mut lvl1: Vec<u16> = Vec::new();
+    let mut blkmap: HashMap<Vec<u64>, u16> = HashMap::new();
+
+    for chunk in bitmap.chunks(blksz) {
+        let mut blkdata = vec![0u64; u64s_per_block];
+
+        for (i, &bit) in chunk.iter().enumerate() {
+            if bit {
+                let word_idx = i / 64;
+                let bit_idx = i % 64;
+                blkdata[word_idx] |= 1 << bit_idx;
+            }
+        }
+
+        if let Some(&i) = blkmap.get(&blkdata) {
+            lvl1.push(i);
+        } else {
+            let i = (lvl2.len() / u64s_per_block) as u16;
+            lvl2.extend_from_slice(&blkdata);
+            blkmap.insert(blkdata, i);
+            lvl1.push(i);
+        }
+    }
+
+    return (lvl1, lvl2);
+}
+
+fn write_tables(prop_name: &str, shift: usize, level1: &[u16], level2: &[u64]) {
+    let upper_name = prop_name.to_uppercase();
+    let lower_name = prop_name.to_lowercase();
+    let block_size = 1 << shift;
+    let mask = block_size - 1;
+    let u64s_per_block = (block_size + 63) / 64;
+
+    println!("/* Autogenerated – DO NOT EDIT */\n");
+    print!(
+        "static {upper_name}_L1: [u16; {}] = {level1:?};",
+        level1.len()
+    );
+    print!(
+        "static {upper_name}_L2: [u64; {}] = {level2:?};",
+        level2.len()
+    );
+
+    let pred_name = if lower_name.contains('_') {
+        format!("{lower_name}_p")
+    } else {
+        format!("{lower_name}p")
+    };
+
+    print!(
+        "#[inline]
+        pub fn {pred_name}(c: char) -> bool {{
+            let cp = c as usize;
+            let blki = unsafe {{ *{upper_name}_L1.get_unchecked(cp >> {shift}) }} as usize;
+            let in_blk_offset_p = cp & 0x{mask:X};"
+    );
+
+    if u64s_per_block == 1 {
+        print!(
+            " unsafe {{
+                return ({upper_name}_L2.get_unchecked(blki) & (1 << in_blk_offset_p)) != 0;
+            }}"
+        );
+    } else {
+        print!(
+            "let wordi = (blki * {u64s_per_block}) + (in_blk_offset_p >> 6);
+            let biti = in_blk_offset_p & 0x3F;
+            unsafe {{
+                return (*{upper_name}_L2.get_unchecked(wordi) & (1 << biti)) != 0;
+            }}"
+        );
+    }
+
+    print!("}}");
+}
-- 
cgit v1.2.3