From 8e110e40de9e4e5014512fa587904fabff73a303 Mon Sep 17 00:00:00 2001 From: romir kulshrestha Date: Thu, 5 Mar 2026 19:57:14 +0100 Subject: Fetch feature to control if/when generation occurs --- oryxc/build.rs | 470 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 249 insertions(+), 221 deletions(-) (limited to 'oryxc/build.rs') diff --git a/oryxc/build.rs b/oryxc/build.rs index 06a35ee..8de0327 100644 --- a/oryxc/build.rs +++ b/oryxc/build.rs @@ -1,261 +1,289 @@ -use std::collections::HashMap; -use std::env; -use std::fs::{ - self, - File, -}; -use std::io::{ - self, - BufRead, - BufReader, - Write, -}; -use std::path::Path; -use std::process::Command; - -const MIN_SHIFT: usize = 1; -const MAX_SHIFT: usize = 22; +const NAMES: &[&str] = &[ + "xid_start", + "xid_continue", + "pattern_white_space", + "line_terminator", +]; fn main() { + use std::env; + let out_dir = env::var("OUT_DIR").unwrap(); - let data = format!("{out_dir}/data"); - - println!("cargo:rerun-if-changed={data}/DerivedCoreProperties.txt"); - println!("cargo:rerun-if-changed={data}/PropList.txt"); - - fetch_data_if_missing(&out_dir, &data); - - generate_from_file( - &out_dir, - &format!("{data}/DerivedCoreProperties.txt"), - "XID_Start", - "xid_start", - ); - generate_from_file( - &out_dir, - &format!("{data}/DerivedCoreProperties.txt"), - "XID_Continue", - "xid_continue", - ); - generate_from_file( - &out_dir, - &format!("{data}/PropList.txt"), - "Pattern_White_Space", - "pattern_white_space", - ); - generate_from_codepoints( - &out_dir, - &[ - '\u{A}', '\u{B}', '\u{C}', '\u{D}', '\u{85}', '\u{2028}', - '\u{2029}', - ], - "line_terminator", - ); -} + let root = env::var("CARGO_MANIFEST_DIR").unwrap(); + let generated = format!("{root}/generated"); -/// Fetches unicode data files if missing. Replaces the fetch script. -fn fetch_data_if_missing(out_dir: &str, data: &str) { - let derived = format!("{data}/DerivedCoreProperties.txt"); - let proplist = format!("{data}/PropList.txt"); - if Path::new(&derived).exists() && Path::new(&proplist).exists() { - // Data exists - return; - } + #[cfg(feature = "fetch")] + fetch::run(&out_dir, &generated); - let zip = format!("{out_dir}/UCD.zip"); - - // curl -LO https://www.unicode.org/Public/zipped/latest/UCD.zip - let status = Command::new("curl") - .args([ - "-Lo", - &zip, - "https://www.unicode.org/Public/zipped/latest/UCD.zip", - ]) - .status() - .expect("failed to run curl"); - assert!(status.success(), "curl failed to download UCD.zip"); - - // mkdir -p data - fs::create_dir_all(data).unwrap(); - - // unzip -od data UCD.zip - let status = Command::new("unzip") - .args(["-od", data, &zip]) - .status() - .expect("failed to run unzip"); - assert!(status.success(), "unzip failed"); - - fs::remove_file(&zip).ok(); - - // XID_Start and XID_Continue additions - let mut f = fs::OpenOptions::new() - .append(true) - .open(&derived) - .expect("failed to open DerivedCoreProperties.txt"); - writeln!( - f, - "0024 ; XID_Start # Pc DOLLAR SIGN\n\ - 005F ; XID_Start # Pc LOW LINE\n\ - 2032..2034 ; XID_Continue # Po [3] PRIME..TRIPLE PRIME\n\ - 2057 ; XID_Continue # Po QUADRUPLE PRIME" - ) - .unwrap(); + #[cfg(not(feature = "fetch"))] + fallback::run(&out_dir, &generated); } -fn generate_from_file(out_dir: &str, path: &str, prop: &str, name: &str) { - let mut bitmap = vec![false; 0x110000]; - parse_file(path, prop, &mut bitmap) - .unwrap_or_else(|e| panic!("failed to read {path}: {e}")); - write_output(out_dir, name, &bitmap); -} +#[cfg(feature = "fetch")] +mod fetch { + use std::collections::HashMap; + use std::fs; + use std::io::{ + self, + BufRead, + BufReader, + Cursor, + Read, + Write, + }; -fn generate_from_codepoints(out_dir: &str, codepoints: &[char], name: &str) { - let mut bitmap = vec![false; 0x110000]; - for &c in codepoints { - bitmap[c as usize] = true; - } - write_output(out_dir, name, &bitmap); -} + const MIN_SHIFT: usize = 1; + const MAX_SHIFT: usize = 22; + const UCD_URL: &str = + "https://www.unicode.org/Public/zipped/latest/UCD.zip"; + + pub fn run(out_dir: &str, generated: &str) { + let data = format!("{out_dir}/data"); + let derived = format!("{data}/DerivedCoreProperties.txt"); + let proplist = format!("{data}/PropList.txt"); + + println!("cargo:rerun-if-changed={derived}"); + println!("cargo:rerun-if-changed={proplist}"); + + if !fs::exists(&derived).unwrap_or(false) + || !fs::exists(&proplist).unwrap_or(false) + { + let mut bytes = Vec::new(); + ureq::get(UCD_URL) + .call() + .expect("failed to download UCD.zip") + .into_reader() + .read_to_end(&mut bytes) + .expect("failed to read UCD.zip"); + + fs::create_dir_all(&data).unwrap(); + zip::ZipArchive::new(Cursor::new(bytes)) + .expect("failed to open UCD.zip") + .extract(&data) + .expect("failed to extract UCD.zip"); + + // XID_Start and XID_Continue additions + let mut f = fs::OpenOptions::new() + .append(true) + .open(&derived) + .expect("failed to open DerivedCoreProperties.txt"); + writeln!( + f, + "0024 ; XID_Start # Pc DOLLAR SIGN\n\ + 005F ; XID_Start # Pc LOW LINE\n\ + 2032..2034 ; XID_Continue # Po [3] PRIME..TRIPLE PRIME\n\ + 2057 ; XID_Continue # Po QUADRUPLE PRIME" + ) + .unwrap(); + } -fn write_output(out_dir: &str, name: &str, bitmap: &[bool]) { - let (shift, lvl1, lvl2) = optimize_tables(bitmap); - let mut f = File::create(format!("{out_dir}/{name}.rs")).unwrap(); - generate_code(&mut f, name, shift, &lvl1, &lvl2); -} + generate_from_file(out_dir, &derived, "XID_Start", "xid_start"); + generate_from_file(out_dir, &derived, "XID_Continue", "xid_continue"); + generate_from_file( + out_dir, + &proplist, + "Pattern_White_Space", + "pattern_white_space", + ); + generate_from_codepoints( + out_dir, + &[ + '\u{A}', '\u{B}', '\u{C}', '\u{D}', '\u{85}', '\u{2028}', + '\u{2029}', + ], + "line_terminator", + ); + + // Keep generated/ in sync so it can be committed as a fallback + fs::create_dir_all(generated).unwrap(); + for name in super::NAMES { + fs::copy( + format!("{out_dir}/{name}.rs"), + format!("{generated}/{name}.rs"), + ) + .unwrap_or_else(|e| { + panic!("failed to copy {name}.rs to generated/: {e}") + }); + } + } -fn optimize_tables(bitmap: &[bool]) -> (usize, Vec, Vec) { - let mut minsz = usize::MAX; - let mut config = (0, Vec::new(), Vec::new()); + fn generate_from_file(out_dir: &str, path: &str, prop: &str, name: &str) { + let mut bitmap = vec![false; 0x110000]; + parse_file(path, prop, &mut bitmap) + .unwrap_or_else(|e| panic!("failed to read {path}: {e}")); + write_output(out_dir, name, &bitmap); + } - for i in MIN_SHIFT..=MAX_SHIFT { - let (l1, l2) = build_tables(bitmap, i); - let sz = l1.len() * 2 + l2.len() * 8; - if sz < minsz { - minsz = sz; - config = (i, l1, l2); + fn generate_from_codepoints( + out_dir: &str, + codepoints: &[char], + name: &str, + ) { + let mut bitmap = vec![false; 0x110000]; + for &c in codepoints { + bitmap[c as usize] = true; } + write_output(out_dir, name, &bitmap); } - config -} + fn write_output(out_dir: &str, name: &str, bitmap: &[bool]) { + let (shift, lvl1, lvl2) = optimize_tables(bitmap); + let mut f = fs::File::create(format!("{out_dir}/{name}.rs")).unwrap(); + generate_code(&mut f, name, shift, &lvl1, &lvl2); + } -fn parse_file(path: &str, prop: &str, bitmap: &mut [bool]) -> io::Result<()> { - let file = File::open(path)?; - let reader = BufReader::new(file); + fn optimize_tables(bitmap: &[bool]) -> (usize, Vec, Vec) { + let mut minsz = usize::MAX; + let mut config = (0, Vec::new(), Vec::new()); - for line in reader.lines() { - let line = line?; - let line = line.split('#').next().unwrap_or("").trim(); - if line.is_empty() { - continue; + for i in MIN_SHIFT..=MAX_SHIFT { + let (l1, l2) = build_tables(bitmap, i); + let sz = l1.len() * 2 + l2.len() * 8; + if sz < minsz { + minsz = sz; + config = (i, l1, l2); + } } - let parts: Vec<&str> = line.split(';').map(|s| s.trim()).collect(); - if parts.len() < 2 || parts[1] != prop { - continue; - } + config + } - let (beg, end) = if parts[0].contains("..") { - let mut range = parts[0].split(".."); - ( - u32::from_str_radix(range.next().unwrap(), 16).unwrap(), - u32::from_str_radix(range.next().unwrap(), 16).unwrap(), - ) - } else { - let val = u32::from_str_radix(parts[0], 16).unwrap(); - (val, val) - }; + fn parse_file( + path: &str, + prop: &str, + bitmap: &mut [bool], + ) -> io::Result<()> { + let file = fs::File::open(path)?; + let reader = BufReader::new(file); + + for line in reader.lines() { + let line = line?; + let line = line.split('#').next().unwrap_or("").trim(); + if line.is_empty() { + continue; + } - for cp in beg..=end { - if (cp as usize) < bitmap.len() { - bitmap[cp as usize] = true; + let parts: Vec<&str> = line.split(';').map(|s| s.trim()).collect(); + if parts.len() < 2 || parts[1] != prop { + continue; + } + + let (beg, end) = if parts[0].contains("..") { + let mut range = parts[0].split(".."); + ( + u32::from_str_radix(range.next().unwrap(), 16).unwrap(), + u32::from_str_radix(range.next().unwrap(), 16).unwrap(), + ) + } else { + let val = u32::from_str_radix(parts[0], 16).unwrap(); + (val, val) + }; + + for cp in beg..=end { + if (cp as usize) < bitmap.len() { + bitmap[cp as usize] = true; + } } } + + Ok(()) } - Ok(()) -} + fn build_tables(bitmap: &[bool], shift: usize) -> (Vec, Vec) { + let blksz = 1 << shift; + let u64s_per_block = (blksz + 63) / 64; -fn build_tables(bitmap: &[bool], shift: usize) -> (Vec, Vec) { - let blksz = 1 << shift; - let u64s_per_block = (blksz + 63) / 64; + let mut lvl2: Vec = Vec::new(); + let mut lvl1: Vec = Vec::new(); + let mut blkmap: HashMap, u16> = HashMap::new(); - let mut lvl2: Vec = Vec::new(); - let mut lvl1: Vec = Vec::new(); - let mut blkmap: HashMap, u16> = HashMap::new(); + for chunk in bitmap.chunks(blksz) { + let mut blkdata = vec![0u64; u64s_per_block]; - for chunk in bitmap.chunks(blksz) { - let mut blkdata = vec![0u64; u64s_per_block]; + for (i, &bit) in chunk.iter().enumerate() { + if bit { + let word_idx = i / 64; + let bit_idx = i % 64; + blkdata[word_idx] |= 1 << bit_idx; + } + } - for (i, &bit) in chunk.iter().enumerate() { - if bit { - let word_idx = i / 64; - let bit_idx = i % 64; - blkdata[word_idx] |= 1 << bit_idx; + if let Some(&i) = blkmap.get(&blkdata) { + lvl1.push(i); + } else { + let i = (lvl2.len() / u64s_per_block) as u16; + lvl2.extend_from_slice(&blkdata); + blkmap.insert(blkdata, i); + lvl1.push(i); } } - if let Some(&i) = blkmap.get(&blkdata) { - lvl1.push(i); - } else { - let i = (lvl2.len() / u64s_per_block) as u16; - lvl2.extend_from_slice(&blkdata); - blkmap.insert(blkdata, i); - lvl1.push(i); - } + (lvl1, lvl2) } - (lvl1, lvl2) -} - -fn generate_code( - f: &mut impl Write, - prop_name: &str, - shift: usize, - level1: &[u16], - level2: &[u64], -) { - let upper_name = prop_name.to_uppercase(); - let lower_name = prop_name.to_lowercase(); - let block_size = 1 << shift; - let mask = block_size - 1; - let u64s_per_block = (block_size + 63) / 64; - - let pred_name = if lower_name.contains('_') { - format!("{lower_name}_p") - } else { - format!("{lower_name}p") - }; + fn generate_code( + f: &mut impl Write, + prop_name: &str, + shift: usize, + level1: &[u16], + level2: &[u64], + ) { + let upper_name = prop_name.to_uppercase(); + let lower_name = prop_name.to_lowercase(); + let block_size = 1 << shift; + let mask = block_size - 1; + let u64s_per_block = (block_size + 63) / 64; + + let pred_name = if lower_name.contains('_') { + format!("{lower_name}_p") + } else { + format!("{lower_name}p") + }; - writeln!(f, "/* Autogenerated – DO NOT EDIT */").unwrap(); - writeln!(f).unwrap(); - writeln!( - f, - "static {upper_name}_L1: [u16; {}] = {level1:?};", - level1.len() - ) - .unwrap(); - writeln!( - f, - "static {upper_name}_L2: [u64; {}] = {level2:?};", - level2.len() - ) - .unwrap(); - writeln!(f, "#[inline]").unwrap(); - writeln!(f, "pub fn {pred_name}(c: char) -> bool {{").unwrap(); - writeln!(f, "\tlet cp = c as usize;").unwrap(); - writeln!(f, "\tlet blki = unsafe {{ *{upper_name}_L1.get_unchecked(cp >> {shift}) }} as usize;").unwrap(); - writeln!(f, "\tlet in_blk_offset_p = cp & 0x{mask:X};").unwrap(); - if u64s_per_block == 1 { - writeln!(f, "\tunsafe {{ return ({upper_name}_L2.get_unchecked(blki) & (1 << in_blk_offset_p)) != 0; }}").unwrap(); - } else { + writeln!(f, "/* Autogenerated – DO NOT EDIT */").unwrap(); + writeln!(f).unwrap(); writeln!( f, - "\tlet wordi = (blki * {u64s_per_block}) + (in_blk_offset_p >> 6);" + "static {upper_name}_L1: [u16; {}] = {level1:?};", + level1.len() ) .unwrap(); - writeln!(f, "\tlet biti = in_blk_offset_p & 0x3F;").unwrap(); - writeln!(f, "\tunsafe {{ return (*{upper_name}_L2.get_unchecked(wordi) & (1 << biti)) != 0; }}").unwrap(); + writeln!( + f, + "static {upper_name}_L2: [u64; {}] = {level2:?};", + level2.len() + ) + .unwrap(); + writeln!(f, "#[inline]").unwrap(); + writeln!(f, "pub fn {pred_name}(c: char) -> bool {{").unwrap(); + writeln!(f, "\tlet cp = c as usize;").unwrap(); + writeln!(f, "\tlet blki = unsafe {{ *{upper_name}_L1.get_unchecked(cp >> {shift}) }} as usize;").unwrap(); + writeln!(f, "\tlet in_blk_offset_p = cp & 0x{mask:X};").unwrap(); + if u64s_per_block == 1 { + writeln!(f, "\tunsafe {{ return ({upper_name}_L2.get_unchecked(blki) & (1 << in_blk_offset_p)) != 0; }}").unwrap(); + } else { + writeln!( + f, + "\tlet wordi = (blki * {u64s_per_block}) + (in_blk_offset_p >> 6);" + ) + .unwrap(); + writeln!(f, "\tlet biti = in_blk_offset_p & 0x3F;").unwrap(); + writeln!(f, "\tunsafe {{ return (*{upper_name}_L2.get_unchecked(wordi) & (1 << biti)) != 0; }}").unwrap(); + } + writeln!(f, "}}").unwrap(); + } +} + +#[cfg(not(feature = "fetch"))] +mod fallback { + use std::fs; + + pub fn run(out_dir: &str, generated: &str) { + for name in super::NAMES { + let src = format!("{generated}/{name}.rs"); + println!("cargo:rerun-if-changed={src}"); + fs::copy(&src, format!("{out_dir}/{name}.rs")) + .unwrap_or_else(|e| panic!("failed to copy {src}: {e}")); + } } - writeln!(f, "}}").unwrap(); } -- cgit v1.2.3