// Entry point of what is presumably a Cargo build script (it relies on
// `CARGO_MANIFEST_DIR` and prints `cargo:rerun-if-changed` directives).
//
// With the `fetch` feature enabled it regenerates the Unicode lookup tables in
// `<manifest dir>/src/unicode`; without the feature it does nothing.
fn main() {
    #[cfg(feature = "fetch")]
    {
        let root = std::env::var("CARGO_MANIFEST_DIR")
            .expect("CARGO_MANIFEST_DIR must be set (it is provided by Cargo)");
        let out_dir = format!("{root}/src/unicode");
        fetch::run(&out_dir);
    }
}

/// Generator for two-level bitmap lookup tables of Unicode properties.
///
/// Each property becomes a `<name>.rs` file holding a level-1 table of block
/// indices, a level-2 table of deduplicated `u64` bitmap blocks and a
/// predicate function that combines the two.
#[cfg(feature = "fetch")]
mod fetch {
    use std::collections::hash_map::Entry;
    use std::collections::HashMap;
    use std::env;
    use std::fs::{self, File, OpenOptions};
    use std::io::{self, BufRead, BufReader, BufWriter, Cursor, Read, Write};

    use zip::ZipArchive;

    /// Smallest block size tried by [`optimize_tables`], as log2 of the number
    /// of code points per block.
    const MIN_SHIFT: usize = 1;
    /// Largest block size tried by [`optimize_tables`], as log2 of the number
    /// of code points per block.
    const MAX_SHIFT: usize = 22;
    /// Archive that is downloaded when the UCD text files are missing.
    const UCD_URL: &str = "https://www.unicode.org/Public/zipped/latest/UCD.zip";
    /// Module names listed in the generated `mod.rs`; each must match the file
    /// stem of a table written by [`run`].
    const NAMES: &[&str] = &[
        "line_terminator",
        "pattern_white_space",
        "xid_continue",
        "xid_start",
    ];
    /// Number of entries in every property bitmap: one per value in
    /// `0..=0x10FFFF`, which includes every `char`.
    const CODEPOINT_COUNT: usize = 0x110000;

    /// Regenerates every table module plus `mod.rs` inside `out_dir`.
    ///
    /// The UCD inputs are read from `<manifest dir>/data`. If either
    /// `DerivedCoreProperties.txt` or `PropList.txt` is missing there, the UCD
    /// archive is downloaded and extracted first.
    ///
    /// # Panics
    ///
    /// Panics on any download, file-system or parse failure.
    pub fn run(out_dir: &str) {
        let manifest_dir = env::var("CARGO_MANIFEST_DIR")
            .expect("CARGO_MANIFEST_DIR must be set (it is provided by Cargo)");
        let data = format!("{manifest_dir}/data");
        let derived = format!("{data}/DerivedCoreProperties.txt");
        let proplist = format!("{data}/PropList.txt");

        println!("cargo:rerun-if-changed={derived}");
        println!("cargo:rerun-if-changed={proplist}");

        // An `Err` from `exists` (e.g. permission denied) is treated like a
        // missing file, which triggers a fresh download.
        let have_inputs =
            fs::exists(&derived).unwrap_or(false) && fs::exists(&proplist).unwrap_or(false);
        if !have_inputs {
            download_ucd(&data, &derived);
        }

        fs::create_dir_all(out_dir)
            .unwrap_or_else(|e| panic!("failed to create {out_dir}: {e}"));

        generate_from_file(out_dir, &derived, "XID_Start");
        generate_from_file(out_dir, &derived, "XID_Continue");
        generate_from_file(out_dir, &proplist, "Pattern_White_Space");
        generate_from_codepoints(
            out_dir,
            &[
                '\u{A}', '\u{B}', '\u{C}', '\u{D}', '\u{85}', '\u{2028}', '\u{2029}',
            ],
            "line_terminator",
        );
        generate_mod_rs(out_dir);
    }

    /// Downloads the UCD archive, extracts it into `data` and appends the
    /// project-specific `XID_Start` / `XID_Continue` additions to `derived`.
    fn download_ucd(data: &str, derived: &str) {
        let mut bytes = Vec::new();
        ureq::get(UCD_URL)
            .call()
            .expect("failed to download UCD.zip")
            .into_body()
            .into_reader()
            .read_to_end(&mut bytes)
            .expect("failed to read UCD.zip");

        fs::create_dir_all(data).unwrap_or_else(|e| panic!("failed to create {data}: {e}"));
        ZipArchive::new(Cursor::new(bytes))
            .expect("failed to open UCD.zip")
            .extract(data)
            .expect("failed to extract UCD.zip");

        // XID_Start and XID_Continue additions.
        // NOTE(review): U+0024 is only added to XID_Start here; whether it also
        // needs to be XID_Continue depends on how the consumer combines the
        // two predicates -- confirm against the caller.
        let mut f = OpenOptions::new()
            .append(true)
            .open(derived)
            .expect("failed to open DerivedCoreProperties.txt");
        writeln!(
            f,
            "0024 ; XID_Start # Pc DOLLAR SIGN\n\
             005F ; XID_Start # Pc LOW LINE\n\
             2032..2034 ; XID_Continue # Po [3] PRIME..TRIPLE PRIME\n\
             2057 ; XID_Continue # Po QUADRUPLE PRIME"
        )
        .expect("failed to append to DerivedCoreProperties.txt");
    }

    /// Writes `mod.rs`, declaring and re-exporting one predicate per entry of
    /// [`NAMES`].
    fn generate_mod_rs(out_dir: &str) {
        let path = format!("{out_dir}/mod.rs");
        let emit = || -> io::Result<()> {
            let mut f = BufWriter::new(File::create(&path)?);
            writeln!(f, "/* Autogenerated – DO NOT EDIT */\n")?;
            for &name in NAMES {
                writeln!(f, "pub mod {name};")?;
                writeln!(f, "pub use {name}::{};", mkpredname(name))?;
            }
            // Flush explicitly: an error during the implicit flush on drop
            // would be silently discarded.
            f.flush()
        };
        emit().unwrap_or_else(|e| panic!("failed to write {path}: {e}"));
    }

    /// Generates the table module for the UCD property `prop` found in the
    /// UCD text file at `path`. The module is named after the lowercased
    /// property name.
    fn generate_from_file(out_dir: &str, path: &str, prop: &str) {
        let mut bitmap = vec![false; CODEPOINT_COUNT];
        parse_file(path, prop, &mut bitmap)
            .unwrap_or_else(|e| panic!("failed to read {path}: {e}"));
        write_output(out_dir, &prop.to_lowercase(), &bitmap);
    }

    /// Generates the table module `name` for an explicit list of code points.
    fn generate_from_codepoints(out_dir: &str, codepoints: &[char], name: &str) {
        let mut bitmap = vec![false; CODEPOINT_COUNT];
        for &c in codepoints {
            bitmap[c as usize] = true;
        }
        write_output(out_dir, name, &bitmap);
    }

    /// Picks the most compact table layout for `bitmap` and writes it to
    /// `<out_dir>/<name>.rs`.
    ///
    /// # Panics
    ///
    /// Panics if `bitmap` does not have exactly [`CODEPOINT_COUNT`] entries or
    /// if the file cannot be written.
    fn write_output(out_dir: &str, name: &str, bitmap: &[bool]) {
        // The emitted predicate indexes the tables without bounds checks, so
        // the bitmap must cover every possible `char`.
        assert_eq!(
            bitmap.len(),
            CODEPOINT_COUNT,
            "bitmap for {name} must cover every code point"
        );

        let (shift, lvl1, lvl2) = optimize_tables(bitmap);
        let path = format!("{out_dir}/{name}.rs");
        let emit = || -> io::Result<()> {
            let mut f = BufWriter::new(File::create(&path)?);
            generate_code(&mut f, name, shift, &lvl1, &lvl2)?;
            f.flush()
        };
        emit().unwrap_or_else(|e| panic!("failed to write {path}: {e}"));
    }

    /// Returns the Rust type name and byte width of the narrowest unsigned
    /// integer that holds every level-1 entry.
    ///
    /// Shared by [`optimize_tables`] and [`generate_code`] so the size estimate
    /// always matches the type that is actually emitted.
    fn l1_entry_type(level1: &[u16]) -> (&'static str, usize) {
        if level1.iter().all(|&i| i <= u16::from(u8::MAX)) {
            ("u8", 1)
        } else {
            ("u16", 2)
        }
    }

    /// Tries every shift in `MIN_SHIFT..=MAX_SHIFT` and returns
    /// `(shift, level1, level2)` for the layout with the smallest combined
    /// table size in bytes. Ties are resolved in favour of the smaller shift.
    fn optimize_tables(bitmap: &[bool]) -> (usize, Vec<u16>, Vec<u64>) {
        let mut min_size = usize::MAX;
        let mut best = (0, Vec::new(), Vec::new());

        for shift in MIN_SHIFT..=MAX_SHIFT {
            let (l1, l2) = build_tables(bitmap, shift);
            let (_, l1_bytes) = l1_entry_type(&l1);
            let size = l1.len() * l1_bytes + l2.len() * std::mem::size_of::<u64>();
            if size < min_size {
                min_size = size;
                best = (shift, l1, l2);
            }
        }

        best
    }

    /// Parses one hexadecimal code point, mapping failures to `InvalidData`.
    fn parse_hex(text: &str) -> io::Result<u32> {
        u32::from_str_radix(text, 16).map_err(|e| {
            io::Error::new(
                io::ErrorKind::InvalidData,
                format!("invalid code point {text:?}: {e}"),
            )
        })
    }

    /// Sets `bitmap[cp]` for every code point that the file at `path` assigns
    /// to property `prop`.
    ///
    /// Lines have the shape `XXXX ; Prop # comment` or
    /// `XXXX..YYYY ; Prop # comment`; text after `#` is ignored, as are lines
    /// with fewer than two fields or a different property. Code points beyond
    /// the end of `bitmap` are ignored.
    ///
    /// # Errors
    ///
    /// Returns the underlying I/O error, or `InvalidData` if a matching line
    /// contains a malformed hexadecimal code point.
    fn parse_file(path: &str, prop: &str, bitmap: &mut [bool]) -> io::Result<()> {
        let reader = BufReader::new(File::open(path)?);

        for line in reader.lines() {
            let line = line?;
            let line = line.split('#').next().unwrap_or("").trim();
            if line.is_empty() {
                continue;
            }

            let mut fields = line.split(';').map(str::trim);
            let (Some(range), Some(name)) = (fields.next(), fields.next()) else {
                continue;
            };
            if name != prop {
                continue;
            }

            let (beg, end) = match range.split_once("..") {
                Some((lo, hi)) => (parse_hex(lo)?, parse_hex(hi)?),
                None => {
                    let cp = parse_hex(range)?;
                    (cp, cp)
                }
            };

            // Clamp the inclusive range `beg..=end` to the bitmap so that an
            // oversized range costs nothing beyond the slice fill.
            let len = bitmap.len();
            let lo = usize::try_from(beg).unwrap_or(usize::MAX).min(len);
            let hi = usize::try_from(end)
                .unwrap_or(usize::MAX)
                .saturating_add(1)
                .min(len);
            if lo < hi {
                bitmap[lo..hi].fill(true);
            }
        }

        Ok(())
    }

    /// Splits `bitmap` into blocks of `1 << shift` entries and deduplicates
    /// them.
    ///
    /// Returns `(level1, level2)`: `level2` is the concatenation of the
    /// distinct blocks, each packed into `ceil(block_size / 64)` words with
    /// entry `i` of a block stored in bit `i % 64` of word `i / 64`, and
    /// `level1[k]` is the index of the block that covers chunk `k`.
    ///
    /// # Panics
    ///
    /// Panics if there are more distinct blocks than a `u16` can index. For a
    /// bitmap of [`CODEPOINT_COUNT`] entries that cannot happen: shifts up to 4
    /// allow at most 2^16 distinct blocks, and larger shifts produce fewer
    /// than 2^16 chunks.
    fn build_tables(bitmap: &[bool], shift: usize) -> (Vec<u16>, Vec<u64>) {
        let blksz = 1usize << shift;
        let u64s_per_block = blksz.div_ceil(64);

        let mut lvl1: Vec<u16> = Vec::with_capacity(bitmap.len().div_ceil(blksz));
        let mut lvl2: Vec<u64> = Vec::new();
        let mut blkmap: HashMap<Vec<u64>, u16> = HashMap::new();

        for chunk in bitmap.chunks(blksz) {
            // The last chunk may be short; its missing entries stay zero.
            let mut blkdata = vec![0u64; u64s_per_block];
            for (i, _) in chunk.iter().enumerate().filter(|&(_, &bit)| bit) {
                blkdata[i / 64] |= 1u64 << (i % 64);
            }

            let index = match blkmap.entry(blkdata) {
                Entry::Occupied(seen) => *seen.get(),
                Entry::Vacant(slot) => {
                    let index = u16::try_from(lvl2.len() / u64s_per_block)
                        .expect("more distinct blocks than a u16 index can address");
                    lvl2.extend_from_slice(slot.key());
                    slot.insert(index);
                    index
                }
            };
            lvl1.push(index);
        }

        (lvl1, lvl2)
    }

    /// Writes the Rust source of one table module to `f`: the two statics
    /// `<NAME>_L1` / `<NAME>_L2` and the predicate named by [`mkpredname`].
    ///
    /// `shift`, `level1` and `level2` must be a layout produced by
    /// [`build_tables`].
    ///
    /// # Errors
    ///
    /// Returns `InvalidInput` if the tables do not cover every code point
    /// (the emitted predicate uses unchecked indexing and would be unsound),
    /// or any error reported by `f`.
    fn generate_code(
        f: &mut impl Write,
        prop_name: &str,
        shift: usize,
        level1: &[u16],
        level2: &[u64],
    ) -> io::Result<()> {
        let upper_name = prop_name.to_uppercase();
        let lower_name = prop_name.to_lowercase();
        let block_size = 1usize << shift;
        let mask = block_size - 1;
        let u64s_per_block = block_size.div_ceil(64);
        let pred_name = mkpredname(&lower_name);
        let (l1type, _) = l1_entry_type(level1);

        // Refuse to emit a predicate whose `get_unchecked` calls could go out
        // of bounds for some `char`.
        let max_index = level1.iter().copied().max().map_or(0, usize::from);
        let l1_covers = level1.len().saturating_mul(block_size) >= CODEPOINT_COUNT;
        let l2_covers = level2.len() >= (max_index + 1).saturating_mul(u64s_per_block);
        if !(l1_covers && l2_covers) {
            return Err(io::Error::new(
                io::ErrorKind::InvalidInput,
                format!("tables for {prop_name} do not cover the full code point range"),
            ));
        }

        writeln!(f, "/* Autogenerated – DO NOT EDIT */")?;
        writeln!(f)?;
        writeln!(
            f,
            "static {upper_name}_L1: [{l1type}; {}] = {level1:?};",
            level1.len()
        )?;
        writeln!(
            f,
            "static {upper_name}_L2: [u64; {}] = {level2:?};",
            level2.len()
        )?;
        writeln!(f, "#[inline]")?;
        writeln!(f, "pub fn {pred_name}(c: char) -> bool {{")?;
        writeln!(f, "\tlet cp = c as usize;")?;
        writeln!(
            f,
            "\t// SAFETY: the generator verified that the level-1 table covers every `char`."
        )?;
        writeln!(
            f,
            "\tlet blki = unsafe {{ *{upper_name}_L1.get_unchecked(cp >> {shift}) }} as usize;"
        )?;
        writeln!(f, "\tlet in_blk_offset_p = cp & 0x{mask:X};")?;

        if u64s_per_block == 1 {
            // A block fits in one word, so the block index is the word index.
            writeln!(
                f,
                "\t// SAFETY: the generator verified that every level-1 entry indexes a level-2 word."
            )?;
            writeln!(
                f,
                "\tunsafe {{ return (*{upper_name}_L2.get_unchecked(blki) & (1 << in_blk_offset_p)) != 0; }}"
            )?;
        } else {
            writeln!(
                f,
                "\tlet wordi = (blki * {u64s_per_block}) + (in_blk_offset_p >> 6);"
            )?;
            writeln!(f, "\tlet biti = in_blk_offset_p & 0x3F;")?;
            writeln!(
                f,
                "\t// SAFETY: the generator verified that every level-1 entry indexes a full level-2 block."
            )?;
            writeln!(
                f,
                "\tunsafe {{ return (*{upper_name}_L2.get_unchecked(wordi) & (1 << biti)) != 0; }}"
            )?;
        }

        writeln!(f, "}}")
    }

    /// Derives the predicate function name for a table: `_p` is appended when
    /// the name already contains an underscore, a bare `p` otherwise.
    fn mkpredname<S: AsRef<str>>(s: S) -> String {
        let s = s.as_ref();
        if s.contains('_') {
            format!("{s}_p")
        } else {
            format!("{s}p")
        }
    }
}