#!/usr/bin/python3
"""Generate the two-stage lookup table behind ``uprop_get_sc()``.

The script reads ``data/Scripts``, assigns every code point in
U+0000..U+10FFFF an ``SC_XXXX`` enumerator name, picks the block size that
minimises the estimated table size, and writes the resulting C source to
``lib/unicode/prop/uprop_get_sc.c``.
"""

import contextlib
import math

from lib import *


# Script names as they appear in the second field of the data file, mapped to
# the four-letter code that is appended to 'SC_' to form the C enumerator.
# (The codes look like upper-cased ISO 15924 script codes -- confirm against
# the enum declared in unicode/prop.h.)
MAP = {
    'Adlam': 'ADLM',
    'Caucasian_Albanian': 'AGHB',
    'Ahom': 'AHOM',
    'Arabic': 'ARAB',
    'Imperial_Aramaic': 'ARMI',
    'Armenian': 'ARMN',
    'Avestan': 'AVST',
    'Balinese': 'BALI',
    'Bamum': 'BAMU',
    'Bassa_Vah': 'BASS',
    'Batak': 'BATK',
    'Bengali': 'BENG',
    'Bhaiksuki': 'BHKS',
    'Bopomofo': 'BOPO',
    'Brahmi': 'BRAH',
    'Braille': 'BRAI',
    'Buginese': 'BUGI',
    'Buhid': 'BUHD',
    'Chakma': 'CAKM',
    'Canadian_Aboriginal': 'CANS',
    'Carian': 'CARI',
    'Cham': 'CHAM',
    'Cherokee': 'CHER',
    'Chorasmian': 'CHRS',
    'Coptic': 'COPT',
    'Cypro_Minoan': 'CPMN',
    'Cypriot': 'CPRT',
    'Cyrillic': 'CYRL',
    'Devanagari': 'DEVA',
    'Dives_Akuru': 'DIAK',
    'Dogra': 'DOGR',
    'Deseret': 'DSRT',
    'Duployan': 'DUPL',
    'Egyptian_Hieroglyphs': 'EGYP',
    'Elbasan': 'ELBA',
    'Elymaic': 'ELYM',
    'Ethiopic': 'ETHI',
    'Garay': 'GARA',
    'Georgian': 'GEOR',
    'Glagolitic': 'GLAG',
    'Gunjala_Gondi': 'GONG',
    'Masaram_Gondi': 'GONM',
    'Gothic': 'GOTH',
    'Grantha': 'GRAN',
    'Greek': 'GREK',
    'Gujarati': 'GUJR',
    'Gurung_Khema': 'GUKH',
    'Gurmukhi': 'GURU',
    'Hangul': 'HANG',
    'Han': 'HANI',
    'Hanunoo': 'HANO',
    'Hatran': 'HATR',
    'Hebrew': 'HEBR',
    'Hiragana': 'HIRA',
    'Anatolian_Hieroglyphs': 'HLUW',
    'Pahawh_Hmong': 'HMNG',
    'Nyiakeng_Puachue_Hmong': 'HMNP',
    'Katakana_Or_Hiragana': 'HRKT',
    'Old_Hungarian': 'HUNG',
    'Old_Italic': 'ITAL',
    'Javanese': 'JAVA',
    'Kayah_Li': 'KALI',
    'Katakana': 'KANA',
    'Kawi': 'KAWI',
    'Kharoshthi': 'KHAR',
    'Khmer': 'KHMR',
    'Khojki': 'KHOJ',
    'Khitan_Small_Script': 'KITS',
    'Kannada': 'KNDA',
    'Kirat_Rai': 'KRAI',
    'Kaithi': 'KTHI',
    'Tai_Tham': 'LANA',
    'Lao': 'LAOO',
    'Latin': 'LATN',
    'Lepcha': 'LEPC',
    'Limbu': 'LIMB',
    'Linear_A': 'LINA',
    'Linear_B': 'LINB',
    'Lisu': 'LISU',
    'Lycian': 'LYCI',
    'Lydian': 'LYDI',
    'Mahajani': 'MAHJ',
    'Makasar': 'MAKA',
    'Mandaic': 'MAND',
    'Manichaean': 'MANI',
    'Marchen': 'MARC',
    'Medefaidrin': 'MEDF',
    'Mende_Kikakui': 'MEND',
    'Meroitic_Cursive': 'MERC',
    'Meroitic_Hieroglyphs': 'MERO',
    'Malayalam': 'MLYM',
    'Modi': 'MODI',
    'Mongolian': 'MONG',
    'Mro': 'MROO',
    'Meetei_Mayek': 'MTEI',
    'Multani': 'MULT',
    'Myanmar': 'MYMR',
    'Nag_Mundari': 'NAGM',
    'Nandinagari': 'NAND',
    'Old_North_Arabian': 'NARB',
    'Nabataean': 'NBAT',
    'Newa': 'NEWA',
    'Nko': 'NKOO',
    'Nushu': 'NSHU',
    'Ogham': 'OGAM',
    'Ol_Chiki': 'OLCK',
    'Ol_Onal': 'ONAO',
    'Old_Turkic': 'ORKH',
    'Oriya': 'ORYA',
    'Osage': 'OSGE',
    'Osmanya': 'OSMA',
    'Old_Uyghur': 'OUGR',
    'Palmyrene': 'PALM',
    'Pau_Cin_Hau': 'PAUC',
    'Old_Permic': 'PERM',
    'Phags_Pa': 'PHAG',
    'Inscriptional_Pahlavi': 'PHLI',
    'Psalter_Pahlavi': 'PHLP',
    'Phoenician': 'PHNX',
    'Miao': 'PLRD',
    'Inscriptional_Parthian': 'PRTI',
    'Rejang': 'RJNG',
    'Hanifi_Rohingya': 'ROHG',
    'Runic': 'RUNR',
    'Samaritan': 'SAMR',
    'Old_South_Arabian': 'SARB',
    'Saurashtra': 'SAUR',
    'SignWriting': 'SGNW',
    'Shavian': 'SHAW',
    'Sharada': 'SHRD',
    'Siddham': 'SIDD',
    'Khudawadi': 'SIND',
    'Sinhala': 'SINH',
    'Sogdian': 'SOGD',
    'Old_Sogdian': 'SOGO',
    'Sora_Sompeng': 'SORA',
    'Soyombo': 'SOYO',
    'Sundanese': 'SUND',
    'Sunuwar': 'SUNU',
    'Syloti_Nagri': 'SYLO',
    'Syriac': 'SYRC',
    'Tagbanwa': 'TAGB',
    'Takri': 'TAKR',
    'Tai_Le': 'TALE',
    'New_Tai_Lue': 'TALU',
    'Tamil': 'TAML',
    'Tangut': 'TANG',
    'Tai_Viet': 'TAVT',
    'Telugu': 'TELU',
    'Tifinagh': 'TFNG',
    'Tagalog': 'TGLG',
    'Thaana': 'THAA',
    'Thai': 'THAI',
    'Tibetan': 'TIBT',
    'Tirhuta': 'TIRH',
    'Tangsa': 'TNSA',
    'Todhri': 'TODR',
    'Toto': 'TOTO',
    'Tulu_Tigalari': 'TUTG',
    'Ugaritic': 'UGAR',
    'Vai': 'VAII',
    'Vithkuqi': 'VITH',
    'Warang_Citi': 'WARA',
    'Wancho': 'WCHO',
    'Old_Persian': 'XPEO',
    'Cuneiform': 'XSUX',
    'Yezidi': 'YEZI',
    'Yi': 'YIII',
    'Zanabazar_Square': 'ZANB',
    'Inherited': 'ZINH',
    'Common': 'ZYYY',
}

# Length of the longest enumerator name seen by parse(); genfile() reads it
# to align the columns of the stage-2 table.
longest = 0


def parse(file: str) -> list[str]:
    """Return the ``SC_XXXX`` enumerator name for every code point.

    Each non-blank line of *file* that does not start with ``#`` is expected
    to look like ``XXXX[..YYYY] ; Script_Name # comment``.  Code points that
    no line covers keep the default ``'SC_ZZZZ'``.

    Side effect: updates the module-level ``longest``.

    Raises:
        KeyError: a script name in the file has no entry in ``MAP``.
        ValueError: a code point field is not valid hexadecimal.
    """
    global longest

    xs = ['SC_ZZZZ'] * 0x110000
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Only a '#' in column 0 marks a comment line.
            if not line.strip() or line[0] == '#':
                continue

            fields = line.split(';')
            # A single code point yields one bound, a range yields two;
            # bounds[0] and bounds[-1] cover both shapes.
            bounds = [int(x, 16) for x in fields[0].strip().split('..')]
            prop = 'SC_' + MAP[fields[1].split('#')[0].strip()]
            longest = max(longest, len(prop))
            for cp in range(bounds[0], bounds[-1] + 1):
                xs[cp] = prop
    return xs


def genfile(cs: list[tuple[str, ...]], blksize: int) -> None:
    """Print the C source of the two-stage table to standard output.

    *cs* holds one tuple of enumerator names per block of *blksize*
    consecutive code points.  ``stage1`` maps a block number to the index of
    that block's contents in ``stage2``, which stores each distinct block
    once, in order of first appearance.

    Reads the module-level ``longest`` (set by ``parse``) for column layout.
    """
    blocks = cs
    # dict.fromkeys() deduplicates while keeping first-appearance order.
    unique = list(dict.fromkeys(blocks))
    # Precomputed positions avoid a linear unique.index() search per block.
    position = {block: n for n, block in enumerate(unique)}

    # NOTE(review): the first #include below has no operand in the source
    # this file was recovered from; it looks like a '<...>' header name was
    # lost.  Restore the header that declares the type chosen by typename().
    print('''\
/* This file is autogenerated by gen/prop/sc; DO NOT EDIT. */

#include

#include "unicode/prop.h"
''')

    width = len(str(len(unique) - 1))
    print(f'static constexpr {typename(len(unique) - 1)} stage1[] = {{')
    for i, block in enumerate(blocks):
        # Sixteen right-aligned indices per output line.
        lead = '\t' if i % 16 == 0 else ' '
        print(f'{lead}{position[block]:{width}d},', end='')
        if i % 16 == 15:
            print()
    print('};')
    print()

    per_row = columns(blksize, longest + 1)
    rows = blksize // per_row
    print(f'static constexpr enum uprop_sc stage2[][{blksize}] = {{')
    for block in unique:
        for row in range(rows):
            entries = block[row * per_row:(row + 1) * per_row]
            text = '\t{' if row == 0 else '\t '
            # Every entry but the last is padded so the columns line up.
            text += ''.join((e + ',').ljust(longest + 2) for e in entries[:-1])
            text += entries[-1]
            if row < rows - 1:
                print(text + ',')
            else:
                # Leave the final row open for the closing brace.
                print(text, end='')
        print('},')
    print('};')
    print()
    print(f'''\
enum uprop_sc
uprop_get_sc(rune ch)
{{
\treturn stage2[stage1[ch / {blksize}]][ch % {blksize}];
}}''')


def main() -> None:
    """Pick the cheapest block size and write the generated C file."""
    cwd_init()
    xs = parse('data/Scripts')

    blksize = -1
    smallest = math.inf

    for bs in powers_of_2():
        if bs > len(xs):
            break
        blocks = [tuple(x) for x in chunks(xs, bs)]
        distinct = len(set(blocks))

        # Estimated size: one stage-1 index per block (isize() gives the
        # width of an index) plus one unit per stage-2 entry.
        size = len(blocks) * isize(distinct - 1) + distinct * bs
        if size < smallest:
            smallest = size
            blksize = bs

    blocks = [tuple(x) for x in chunks(xs, blksize)]
    # redirect_stdout() restores sys.stdout on exit, so nothing that runs
    # afterwards prints to the closed file.
    with open('lib/unicode/prop/uprop_get_sc.c', 'w') as f, \
            contextlib.redirect_stdout(f):
        genfile(blocks, blksize)

    report_size(len(xs), smallest)


if __name__ == '__main__':
    main()