From 6826cf0be1816a8320835ab315d46063dc746aac Mon Sep 17 00:00:00 2001 From: romir kulshrestha Date: Wed, 4 Mar 2026 21:01:09 +0100 Subject: buildscript --- unigen/src/main.rs | 279 ----------------------------------------------------- 1 file changed, 279 deletions(-) delete mode 100644 unigen/src/main.rs (limited to 'unigen/src') diff --git a/unigen/src/main.rs b/unigen/src/main.rs deleted file mode 100644 index bedabc6..0000000 --- a/unigen/src/main.rs +++ /dev/null @@ -1,279 +0,0 @@ -use std::collections::HashMap; -use std::ffi::{ - OsStr, - OsString, -}; -use std::fs::File; -use std::io::{ - self, - BufRead, - BufReader, -}; -use std::path::Path; -use std::sync::OnceLock; -use std::vec::Vec; -use std::{ - env, - process, -}; - -const MIN_SHIFT: usize = 1; -const MAX_SHIFT: usize = 22; - -#[derive(Default)] -struct Flags { - codepoints: Option>, - help: bool, -} - -impl Flags { - fn parse() -> Result<(Flags, Vec), lexopt::Error> { - use lexopt::prelude::*; - - let mut rest = Vec::with_capacity(env::args().len() - 1); - let mut flags = Flags::default(); - let mut parser = lexopt::Parser::from_env(); - parser.set_short_equals(false); - - while let Some(arg) = parser.next()? { - match arg { - Short('c') | Long("codepoints") => { - fn hex_to_char(s: &str) -> char { - return u32::from_str_radix(s, 16).map_or_else( - |e| { - eprintln!("{}: {s}: {e}", progname().display()); - process::exit(1); - }, - |n| { - char::from_u32(n).unwrap_or_else(|| { - eprintln!( - "{}: {s}: invalid codepoint", - progname().display() - ); - process::exit(1); - }) - }, - ); - } - - flags.codepoints = Some( - parser - .value()? - .to_str() - .unwrap_or_else(|| { - eprintln!( - "{}: unable to parse argument to -c/--codepoints", - progname().display() - ); - process::exit(1); - }) - .split(',') - .map(hex_to_char) - .collect(), - ); - }, - Short('h') | Long("help") => flags.help = true, - Value(v) => rest.push(v.into_string()?), - _ => return Err(arg.unexpected()), - } - } - - return Ok((flags, rest)); - } -} - -fn progname() -> &'static OsString { - static ARGV0: OnceLock = OnceLock::new(); - return ARGV0.get_or_init(|| { - let default = OsStr::new("oryxc"); - let s = env::args_os().next().unwrap_or(default.into()); - return Path::new(&s).file_name().unwrap_or(default).to_os_string(); - }); -} - -fn usage() { - eprintln!( - concat!( - "Usage: {0} data-file property-name\n", - " {0} -c codepoints name\n", - " {0} -h", - ), - progname().display() - ); -} - -fn main() -> io::Result<()> { - let (flags, rest) = match Flags::parse() { - Ok(v) => v, - Err(e) => { - eprintln!("{}: {e}", progname().display()); - usage(); - process::exit(1); - }, - }; - - if flags.help { - usage(); - process::exit(0); - } - - if (flags.codepoints.is_none() && rest.len() != 2) - || (flags.codepoints.is_some() && rest.len() != 1) - { - usage(); - process::exit(1); - } - - let mut bitmap = vec![false; 0x110000]; - let name = match flags.codepoints { - Some(vec) => { - vec.iter().for_each(|c| bitmap[*c as usize] = true); - &rest[0] - }, - None => { - parse_file(&rest[0], &rest[1], &mut bitmap)?; - &rest[1] - }, - }; - let (shift, lvl1, lvl2) = optimize_tables(&bitmap); - write_tables(name, shift, &lvl1, &lvl2); - return Ok(()); -} - -fn optimize_tables(bitmap: &[bool]) -> (usize, Vec, Vec) { - let mut minsz = usize::MAX; - let mut config = (0, Vec::new(), Vec::new()); - - for i in MIN_SHIFT..=MAX_SHIFT { - let (l1, l2) = build_tables(bitmap, i); - let sz = l1.len() * 2 + l2.len() * 8; - if sz < minsz { - minsz = sz; - config = (i, l1, l2); - } - } - - return config; -} - -fn parse_file>( - path: P, - prop: &str, - bitmap: &mut [bool], -) -> io::Result<()> { - let file = File::open(path)?; - let reader = BufReader::new(file); - - for line in reader.lines() { - let line = line?; - let line = line.split('#').next().unwrap_or("").trim(); - if line.is_empty() { - continue; - } - - let parts: Vec<&str> = line.split(';').map(|s| s.trim()).collect(); - if parts.len() < 2 || parts[1] != prop { - continue; - } - - let (beg, end) = if parts[0].contains("..") { - let mut range = parts[0].split(".."); - ( - u32::from_str_radix(range.next().unwrap(), 16).unwrap(), - u32::from_str_radix(range.next().unwrap(), 16).unwrap(), - ) - } else { - let val = u32::from_str_radix(parts[0], 16).unwrap(); - (val, val) - }; - - for cp in beg..=end { - if (cp as usize) < bitmap.len() { - bitmap[cp as usize] = true; - } - } - } - return Ok(()); -} - -fn build_tables(bitmap: &[bool], shift: usize) -> (Vec, Vec) { - let blksz = 1 << shift; - let u64s_per_block = (blksz + 63) / 64; - - let mut lvl2: Vec = Vec::new(); - let mut lvl1: Vec = Vec::new(); - let mut blkmap: HashMap, u16> = HashMap::new(); - - for chunk in bitmap.chunks(blksz) { - let mut blkdata = vec![0u64; u64s_per_block]; - - for (i, &bit) in chunk.iter().enumerate() { - if bit { - let word_idx = i / 64; - let bit_idx = i % 64; - blkdata[word_idx] |= 1 << bit_idx; - } - } - - if let Some(&i) = blkmap.get(&blkdata) { - lvl1.push(i); - } else { - let i = (lvl2.len() / u64s_per_block) as u16; - lvl2.extend_from_slice(&blkdata); - blkmap.insert(blkdata, i); - lvl1.push(i); - } - } - - return (lvl1, lvl2); -} - -fn write_tables(prop_name: &str, shift: usize, level1: &[u16], level2: &[u64]) { - let upper_name = prop_name.to_uppercase(); - let lower_name = prop_name.to_lowercase(); - let block_size = 1 << shift; - let mask = block_size - 1; - let u64s_per_block = (block_size + 63) / 64; - - println!("/* Autogenerated – DO NOT EDIT */\n"); - print!( - "static {upper_name}_L1: [u16; {}] = {level1:?};", - level1.len() - ); - print!( - "static {upper_name}_L2: [u64; {}] = {level2:?};", - level2.len() - ); - - let pred_name = if lower_name.contains('_') { - format!("{lower_name}_p") - } else { - format!("{lower_name}p") - }; - - print!( - "#[inline] - pub fn {pred_name}(c: char) -> bool {{ - let cp = c as usize; - let blki = unsafe {{ *{upper_name}_L1.get_unchecked(cp >> {shift}) }} as usize; - let in_blk_offset_p = cp & 0x{mask:X};" - ); - - if u64s_per_block == 1 { - print!( - " unsafe {{ - return ({upper_name}_L2.get_unchecked(blki) & (1 << in_blk_offset_p)) != 0; - }}" - ); - } else { - print!( - "let wordi = (blki * {u64s_per_block}) + (in_blk_offset_p >> 6); - let biti = in_blk_offset_p & 0x3F; - unsafe {{ - return (*{upper_name}_L2.get_unchecked(wordi) & (1 << biti)) != 0; - }}" - ); - } - - print!("}}"); -} -- cgit v1.2.3