diff options
author | Thomas Voss <mail@thomasvoss.com> | 2024-04-30 20:08:37 +0200 |
---|---|---|
committer | Thomas Voss <mail@thomasvoss.com> | 2024-04-30 20:08:37 +0200 |
commit | 34c55c4d07af131c9da06c367ac2958a6090f2a3 (patch) | |
tree | 2d0fe61b618928feb3a0fffa031e9285a816f0cd /gen/prop | |
parent | 04e8ee70d94a579f1d24aaa80e9341c9000d2dec (diff) |
Add more 2-stage lookup tables
Diffstat (limited to 'gen/prop')
-rwxr-xr-x | gen/prop/blk | 812 | ||||
-rwxr-xr-x | gen/prop/bpt | 154 | ||||
-rwxr-xr-x | gen/prop/ccc | 269 | ||||
-rwxr-xr-x | gen/prop/dt | 197 | ||||
-rwxr-xr-x | gen/prop/gc | 4 |
5 files changed, 807 insertions, 629 deletions
diff --git a/gen/prop/blk b/gen/prop/blk index a3bf56d..4883d1c 100755 --- a/gen/prop/blk +++ b/gen/prop/blk @@ -1,395 +1,439 @@ -#!/bin/sh +#!/usr/bin/python3 -set -e -cd "${0%/*}/../.." -exec >lib/unicode/prop/uprop_get_blk.c +import math -gawk ' -BEGIN { - FS = " *(; *|#.*)" +from lib import * - map["adlam"] = "ADLAM" - map["aegean_numbers"] = "AEGEAN_NUMBERS" - map["ahom"] = "AHOM" - map["alchemical_symbols"] = "ALCHEMICAL" - map["alphabetic_presentation_forms"] = "ALPHABETIC_PF" - map["anatolian_hieroglyphs"] = "ANATOLIAN_HIEROGLYPHS" - map["ancient_greek_musical_notation"] = "ANCIENT_GREEK_MUSIC" - map["ancient_greek_numbers"] = "ANCIENT_GREEK_NUMBERS" - map["ancient_symbols"] = "ANCIENT_SYMBOLS" - map["arabic"] = "ARABIC" - map["arabic_extended_a"] = "ARABIC_EXT_A" - map["arabic_extended_b"] = "ARABIC_EXT_B" - map["arabic_extended_c"] = "ARABIC_EXT_C" - map["arabic_mathematical_alphabetic_symbols"] = "ARABIC_MATH" - map["arabic_presentation_forms_a"] = "ARABIC_PF_A" - map["arabic_presentation_forms_b"] = "ARABIC_PF_B" - map["arabic_supplement"] = "ARABIC_SUP" - map["armenian"] = "ARMENIAN" - map["arrows"] = "ARROWS" - map["avestan"] = "AVESTAN" - map["balinese"] = "BALINESE" - map["bamum"] = "BAMUM" - map["bamum_supplement"] = "BAMUM_SUP" - map["basic_latin"] = "ASCII" - map["bassa_vah"] = "BASSA_VAH" - map["batak"] = "BATAK" - map["bengali"] = "BENGALI" - map["bhaiksuki"] = "BHAIKSUKI" - map["block_elements"] = "BLOCK_ELEMENTS" - map["bopomofo"] = "BOPOMOFO" - map["bopomofo_extended"] = "BOPOMOFO_EXT" - map["box_drawing"] = "BOX_DRAWING" - map["brahmi"] = "BRAHMI" - map["braille_patterns"] = "BRAILLE" - map["buginese"] = "BUGINESE" - map["buhid"] = "BUHID" - map["byzantine_musical_symbols"] = "BYZANTINE_MUSIC" - map["carian"] = "CARIAN" - map["caucasian_albanian"] = "CAUCASIAN_ALBANIAN" - map["chakma"] = "CHAKMA" - map["cham"] = "CHAM" - map["cherokee"] = "CHEROKEE" - map["cherokee_supplement"] = "CHEROKEE_SUP" - map["chess_symbols"] = "CHESS_SYMBOLS" - map["chorasmian"] = "CHORASMIAN" - map["cjk_compatibility"] = "CJK_COMPAT" - map["cjk_compatibility_forms"] = "CJK_COMPAT_FORMS" - map["cjk_compatibility_ideographs"] = "CJK_COMPAT_IDEOGRAPHS" - map["cjk_compatibility_ideographs_supplement"] = "CJK_COMPAT_IDEOGRAPHS_SUP" - map["cjk_radicals_supplement"] = "CJK_RADICALS_SUP" - map["cjk_strokes"] = "CJK_STROKES" - map["cjk_symbols_and_punctuation"] = "CJK_SYMBOLS" - map["cjk_unified_ideographs"] = "CJK" - map["cjk_unified_ideographs_extension_a"] = "CJK_EXT_A" - map["cjk_unified_ideographs_extension_b"] = "CJK_EXT_B" - map["cjk_unified_ideographs_extension_c"] = "CJK_EXT_C" - map["cjk_unified_ideographs_extension_d"] = "CJK_EXT_D" - map["cjk_unified_ideographs_extension_e"] = "CJK_EXT_E" - map["cjk_unified_ideographs_extension_f"] = "CJK_EXT_F" - map["cjk_unified_ideographs_extension_g"] = "CJK_EXT_G" - map["cjk_unified_ideographs_extension_h"] = "CJK_EXT_H" - map["cjk_unified_ideographs_extension_i"] = "CJK_EXT_I" - map["combining_diacritical_marks"] = "DIACRITICALS" - map["combining_diacritical_marks_extended"] = "DIACRITICALS_EXT" - map["combining_diacritical_marks_for_symbols"] = "DIACRITICALS_FOR_SYMBOLS" - map["combining_diacritical_marks_supplement"] = "DIACRITICALS_SUP" - map["combining_half_marks"] = "HALF_MARKS" - map["common_indic_number_forms"] = "INDIC_NUMBER_FORMS" - map["control_pictures"] = "CONTROL_PICTURES" - map["coptic"] = "COPTIC" - map["coptic_epact_numbers"] = "COPTIC_EPACT_NUMBERS" - map["counting_rod_numerals"] = "COUNTING_ROD" - map["cuneiform"] = "CUNEIFORM" - map["cuneiform_numbers_and_punctuation"] = "CUNEIFORM_NUMBERS" - map["currency_symbols"] = "CURRENCY_SYMBOLS" - map["cypriot_syllabary"] = "CYPRIOT_SYLLABARY" - map["cypro_minoan"] = "CYPRO_MINOAN" - map["cyrillic"] = "CYRILLIC" - map["cyrillic_extended_a"] = "CYRILLIC_EXT_A" - map["cyrillic_extended_b"] = "CYRILLIC_EXT_B" - map["cyrillic_extended_c"] = "CYRILLIC_EXT_C" - map["cyrillic_extended_d"] = "CYRILLIC_EXT_D" - map["cyrillic_supplement"] = "CYRILLIC_SUP" - map["deseret"] = "DESERET" - map["devanagari"] = "DEVANAGARI" - map["devanagari_extended_a"] = "DEVANAGARI_EXT_A" - map["devanagari_extended"] = "DEVANAGARI_EXT" - map["dingbats"] = "DINGBATS" - map["dives_akuru"] = "DIVES_AKURU" - map["dogra"] = "DOGRA" - map["domino_tiles"] = "DOMINO" - map["duployan"] = "DUPLOYAN" - map["early_dynastic_cuneiform"] = "EARLY_DYNASTIC_CUNEIFORM" - map["egyptian_hieroglyph_format_controls"] = "EGYPTIAN_HIEROGLYPH_FORMAT_CONTROLS" - map["egyptian_hieroglyphs"] = "EGYPTIAN_HIEROGLYPHS" - map["elbasan"] = "ELBASAN" - map["elymaic"] = "ELYMAIC" - map["emoticons"] = "EMOTICONS" - map["enclosed_alphanumerics"] = "ENCLOSED_ALPHANUM" - map["enclosed_alphanumeric_supplement"] = "ENCLOSED_ALPHANUM_SUP" - map["enclosed_cjk_letters_and_months"] = "ENCLOSED_CJK" - map["enclosed_ideographic_supplement"] = "ENCLOSED_IDEOGRAPHIC_SUP" - map["ethiopic"] = "ETHIOPIC" - map["ethiopic_extended_a"] = "ETHIOPIC_EXT_A" - map["ethiopic_extended_b"] = "ETHIOPIC_EXT_B" - map["ethiopic_extended"] = "ETHIOPIC_EXT" - map["ethiopic_supplement"] = "ETHIOPIC_SUP" - map["general_punctuation"] = "PUNCTUATION" - map["geometric_shapes_extended"] = "GEOMETRIC_SHAPES_EXT" - map["geometric_shapes"] = "GEOMETRIC_SHAPES" - map["georgian_extended"] = "GEORGIAN_EXT" - map["georgian"] = "GEORGIAN" - map["georgian_supplement"] = "GEORGIAN_SUP" - map["glagolitic"] = "GLAGOLITIC" - map["glagolitic_supplement"] = "GLAGOLITIC_SUP" - map["gothic"] = "GOTHIC" - map["grantha"] = "GRANTHA" - map["greek_and_coptic"] = "GREEK" - map["greek_extended"] = "GREEK_EXT" - map["gujarati"] = "GUJARATI" - map["gunjala_gondi"] = "GUNJALA_GONDI" - map["gurmukhi"] = "GURMUKHI" - map["halfwidth_and_fullwidth_forms"] = "HALF_AND_FULL_FORMS" - map["hangul_compatibility_jamo"] = "COMPAT_JAMO" - map["hangul_jamo_extended_a"] = "JAMO_EXT_A" - map["hangul_jamo_extended_b"] = "JAMO_EXT_B" - map["hangul_jamo"] = "JAMO" - map["hangul_syllables"] = "HANGUL" - map["hanifi_rohingya"] = "HANIFI_ROHINGYA" - map["hanunoo"] = "HANUNOO" - map["hatran"] = "HATRAN" - map["hebrew"] = "HEBREW" - map["high_private_use_surrogates"] = "HIGH_PU_SURROGATES" - map["high_surrogates"] = "HIGH_SURROGATES" - map["hiragana"] = "HIRAGANA" - map["ideographic_description_characters"] = "IDC" - map["ideographic_symbols_and_punctuation"] = "IDEOGRAPHIC_SYMBOLS" - map["imperial_aramaic"] = "IMPERIAL_ARAMAIC" - map["indic_siyaq_numbers"] = "INDIC_SIYAQ_NUMBERS" - map["inscriptional_pahlavi"] = "INSCRIPTIONAL_PAHLAVI" - map["inscriptional_parthian"] = "INSCRIPTIONAL_PARTHIAN" - map["ipa_extensions"] = "IPA_EXT" - map["javanese"] = "JAVANESE" - map["kaithi"] = "KAITHI" - map["kaktovik_numerals"] = "KAKTOVIK_NUMERALS" - map["kana_extended_a"] = "KANA_EXT_A" - map["kana_extended_b"] = "KANA_EXT_B" - map["kana_supplement"] = "KANA_SUP" - map["kanbun"] = "KANBUN" - map["kangxi_radicals"] = "KANGXI" - map["kannada"] = "KANNADA" - map["katakana"] = "KATAKANA" - map["katakana_phonetic_extensions"] = "KATAKANA_EXT" - map["kawi"] = "KAWI" - map["kayah_li"] = "KAYAH_LI" - map["kharoshthi"] = "KHAROSHTHI" - map["khitan_small_script"] = "KHITAN_SMALL_SCRIPT" - map["khmer"] = "KHMER" - map["khmer_symbols"] = "KHMER_SYMBOLS" - map["khojki"] = "KHOJKI" - map["khudawadi"] = "KHUDAWADI" - map["lao"] = "LAO" - map["latin_1_supplement"] = "LATIN_1_SUP" - map["latin_extended_additional"] = "LATIN_EXT_ADDITIONAL" - map["latin_extended_a"] = "LATIN_EXT_A" - map["latin_extended_b"] = "LATIN_EXT_B" - map["latin_extended_c"] = "LATIN_EXT_C" - map["latin_extended_d"] = "LATIN_EXT_D" - map["latin_extended_e"] = "LATIN_EXT_E" - map["latin_extended_f"] = "LATIN_EXT_F" - map["latin_extended_g"] = "LATIN_EXT_G" - map["lepcha"] = "LEPCHA" - map["letterlike_symbols"] = "LETTERLIKE_SYMBOLS" - map["limbu"] = "LIMBU" - map["linear_a"] = "LINEAR_A" - map["linear_b_ideograms"] = "LINEAR_B_IDEOGRAMS" - map["linear_b_syllabary"] = "LINEAR_B_SYLLABARY" - map["lisu"] = "LISU" - map["lisu_supplement"] = "LISU_SUP" - map["low_surrogates"] = "LOW_SURROGATES" - map["lycian"] = "LYCIAN" - map["lydian"] = "LYDIAN" - map["mahajani"] = "MAHAJANI" - map["mahjong_tiles"] = "MAHJONG" - map["makasar"] = "MAKASAR" - map["malayalam"] = "MALAYALAM" - map["mandaic"] = "MANDAIC" - map["manichaean"] = "MANICHAEAN" - map["marchen"] = "MARCHEN" - map["masaram_gondi"] = "MASARAM_GONDI" - map["mathematical_alphanumeric_symbols"] = "MATH_ALPHANUM" - map["mathematical_operators"] = "MATH_OPERATORS" - map["mayan_numerals"] = "MAYAN_NUMERALS" - map["medefaidrin"] = "MEDEFAIDRIN" - map["meetei_mayek_extensions"] = "MEETEI_MAYEK_EXT" - map["meetei_mayek"] = "MEETEI_MAYEK" - map["mende_kikakui"] = "MENDE_KIKAKUI" - map["meroitic_cursive"] = "MEROITIC_CURSIVE" - map["meroitic_hieroglyphs"] = "MEROITIC_HIEROGLYPHS" - map["miao"] = "MIAO" - map["miscellaneous_mathematical_symbols_a"] = "MISC_MATH_SYMBOLS_A" - map["miscellaneous_mathematical_symbols_b"] = "MISC_MATH_SYMBOLS_B" - map["miscellaneous_symbols_and_arrows"] = "MISC_ARROWS" - map["miscellaneous_symbols_and_pictographs"] = "MISC_PICTOGRAPHS" - map["miscellaneous_symbols"] = "MISC_SYMBOLS" - map["miscellaneous_technical"] = "MISC_TECHNICAL" - map["modifier_tone_letters"] = "MODIFIER_TONE_LETTERS" - map["modi"] = "MODI" - map["mongolian"] = "MONGOLIAN" - map["mongolian_supplement"] = "MONGOLIAN_SUP" - map["mro"] = "MRO" - map["multani"] = "MULTANI" - map["musical_symbols"] = "MUSIC" - map["myanmar_extended_a"] = "MYANMAR_EXT_A" - map["myanmar_extended_b"] = "MYANMAR_EXT_B" - map["myanmar"] = "MYANMAR" - map["nabataean"] = "NABATAEAN" - map["nag_mundari"] = "NAG_MUNDARI" - map["nandinagari"] = "NANDINAGARI" - map["newa"] = "NEWA" - map["new_tai_lue"] = "NEW_TAI_LUE" - map["nko"] = "NKO" - map["number_forms"] = "NUMBER_FORMS" - map["nushu"] = "NUSHU" - map["nyiakeng_puachue_hmong"] = "NYIAKENG_PUACHUE_HMONG" - map["ogham"] = "OGHAM" - map["ol_chiki"] = "OL_CHIKI" - map["old_hungarian"] = "OLD_HUNGARIAN" - map["old_italic"] = "OLD_ITALIC" - map["old_north_arabian"] = "OLD_NORTH_ARABIAN" - map["old_permic"] = "OLD_PERMIC" - map["old_persian"] = "OLD_PERSIAN" - map["old_sogdian"] = "OLD_SOGDIAN" - map["old_south_arabian"] = "OLD_SOUTH_ARABIAN" - map["old_turkic"] = "OLD_TURKIC" - map["old_uyghur"] = "OLD_UYGHUR" - map["optical_character_recognition"] = "OCR" - map["oriya"] = "ORIYA" - map["ornamental_dingbats"] = "ORNAMENTAL_DINGBATS" - map["osage"] = "OSAGE" - map["osmanya"] = "OSMANYA" - map["ottoman_siyaq_numbers"] = "OTTOMAN_SIYAQ_NUMBERS" - map["pahawh_hmong"] = "PAHAWH_HMONG" - map["palmyrene"] = "PALMYRENE" - map["pau_cin_hau"] = "PAU_CIN_HAU" - map["phags_pa"] = "PHAGS_PA" - map["phaistos_disc"] = "PHAISTOS" - map["phoenician"] = "PHOENICIAN" - map["phonetic_extensions"] = "PHONETIC_EXT" - map["phonetic_extensions_supplement"] = "PHONETIC_EXT_SUP" - map["playing_cards"] = "PLAYING_CARDS" - map["private_use_area"] = "PUA" - map["psalter_pahlavi"] = "PSALTER_PAHLAVI" - map["rejang"] = "REJANG" - map["rumi_numeral_symbols"] = "RUMI" - map["runic"] = "RUNIC" - map["samaritan"] = "SAMARITAN" - map["saurashtra"] = "SAURASHTRA" - map["sharada"] = "SHARADA" - map["shavian"] = "SHAVIAN" - map["shorthand_format_controls"] = "SHORTHAND_FORMAT_CONTROLS" - map["siddham"] = "SIDDHAM" - map["sinhala_archaic_numbers"] = "SINHALA_ARCHAIC_NUMBERS" - map["sinhala"] = "SINHALA" - map["small_form_variants"] = "SMALL_FORMS" - map["small_kana_extension"] = "SMALL_KANA_EXT" - map["sogdian"] = "SOGDIAN" - map["sora_sompeng"] = "SORA_SOMPENG" - map["soyombo"] = "SOYOMBO" - map["spacing_modifier_letters"] = "MODIFIER_LETTERS" - map["specials"] = "SPECIALS" - map["sundanese"] = "SUNDANESE" - map["sundanese_supplement"] = "SUNDANESE_SUP" - map["superscripts_and_subscripts"] = "SUPER_AND_SUB" - map["supplemental_arrows_a"] = "SUP_ARROWS_A" - map["supplemental_arrows_b"] = "SUP_ARROWS_B" - map["supplemental_arrows_c"] = "SUP_ARROWS_C" - map["supplemental_mathematical_operators"] = "SUP_MATH_OPERATORS" - map["supplemental_punctuation"] = "SUP_PUNCTUATION" - map["supplemental_symbols_and_pictographs"] = "SUP_SYMBOLS_AND_PICTOGRAPHS" - map["supplementary_private_use_area_a"] = "SUP_PUA_A" - map["supplementary_private_use_area_b"] = "SUP_PUA_B" - map["sutton_signwriting"] = "SUTTON_SIGNWRITING" - map["syloti_nagri"] = "SYLOTI_NAGRI" - map["symbols_and_pictographs_extended_a"] = "SYMBOLS_AND_PICTOGRAPHS_EXT_A" - map["symbols_for_legacy_computing"] = "SYMBOLS_FOR_LEGACY_COMPUTING" - map["syriac_supplement"] = "SYRIAC_SUP" - map["syriac"] = "SYRIAC" - map["tagalog"] = "TAGALOG" - map["tagbanwa"] = "TAGBANWA" - map["tags"] = "TAGS" - map["tai_le"] = "TAI_LE" - map["tai_tham"] = "TAI_THAM" - map["tai_viet"] = "TAI_VIET" - map["tai_xuan_jing_symbols"] = "TAI_XUAN_JING" - map["takri"] = "TAKRI" - map["tamil_supplement"] = "TAMIL_SUP" - map["tamil"] = "TAMIL" - map["tangsa"] = "TANGSA" - map["tangut_components"] = "TANGUT_COMPONENTS" - map["tangut_supplement"] = "TANGUT_SUP" - map["tangut"] = "TANGUT" - map["telugu"] = "TELUGU" - map["thaana"] = "THAANA" - map["thai"] = "THAI" - map["tibetan"] = "TIBETAN" - map["tifinagh"] = "TIFINAGH" - map["tirhuta"] = "TIRHUTA" - map["toto"] = "TOTO" - map["transport_and_map_symbols"] = "TRANSPORT_AND_MAP" - map["ugaritic"] = "UGARITIC" - map["unified_canadian_aboriginal_syllabics_extended_a"] = "UCAS_EXT_A" - map["unified_canadian_aboriginal_syllabics_extended"] = "UCAS_EXT" - map["unified_canadian_aboriginal_syllabics"] = "UCAS" - map["vai"] = "VAI" - map["variation_selectors_supplement"] = "VS_SUP" - map["variation_selectors"] = "VS" - map["vedic_extensions"] = "VEDIC_EXT" - map["vertical_forms"] = "VERTICAL_FORMS" - map["vithkuqi"] = "VITHKUQI" - map["wancho"] = "WANCHO" - map["warang_citi"] = "WARANG_CITI" - map["yezidi"] = "YEZIDI" - map["yijing_hexagram_symbols"] = "YIJING" - map["yi_radicals"] = "YI_RADICALS" - map["yi_syllables"] = "YI_SYLLABLES" - map["zanabazar_square"] = "ZANABAZAR_SQUARE" - map["znamenny_musical_notation"] = "ZNAMENNY_MUSIC" - print "/* This file is autogenerated by gen/prop/blk; DO NOT EDIT. */" - print "" - print "#include \"_bsearch.h\"" - print "#include \"macros.h\"" - print "#include \"rune.h\"" - print "#include \"unicode/prop.h\"" - print "" +MAP = { + 'adlam' : 'ADLAM', + 'aegean_numbers' : 'AEGEAN_NUMBERS', + 'ahom' : 'AHOM', + 'alchemical_symbols' : 'ALCHEMICAL', + 'alphabetic_presentation_forms' : 'ALPHABETIC_PF', + 'anatolian_hieroglyphs' : 'ANATOLIAN_HIEROGLYPHS', + 'ancient_greek_musical_notation' : 'ANCIENT_GREEK_MUSIC', + 'ancient_greek_numbers' : 'ANCIENT_GREEK_NUMBERS', + 'ancient_symbols' : 'ANCIENT_SYMBOLS', + 'arabic' : 'ARABIC', + 'arabic_extended_a' : 'ARABIC_EXT_A', + 'arabic_extended_b' : 'ARABIC_EXT_B', + 'arabic_extended_c' : 'ARABIC_EXT_C', + 'arabic_mathematical_alphabetic_symbols' : 'ARABIC_MATH', + 'arabic_presentation_forms_a' : 'ARABIC_PF_A', + 'arabic_presentation_forms_b' : 'ARABIC_PF_B', + 'arabic_supplement' : 'ARABIC_SUP', + 'armenian' : 'ARMENIAN', + 'arrows' : 'ARROWS', + 'avestan' : 'AVESTAN', + 'balinese' : 'BALINESE', + 'bamum' : 'BAMUM', + 'bamum_supplement' : 'BAMUM_SUP', + 'basic_latin' : 'ASCII', + 'bassa_vah' : 'BASSA_VAH', + 'batak' : 'BATAK', + 'bengali' : 'BENGALI', + 'bhaiksuki' : 'BHAIKSUKI', + 'block_elements' : 'BLOCK_ELEMENTS', + 'bopomofo' : 'BOPOMOFO', + 'bopomofo_extended' : 'BOPOMOFO_EXT', + 'box_drawing' : 'BOX_DRAWING', + 'brahmi' : 'BRAHMI', + 'braille_patterns' : 'BRAILLE', + 'buginese' : 'BUGINESE', + 'buhid' : 'BUHID', + 'byzantine_musical_symbols' : 'BYZANTINE_MUSIC', + 'carian' : 'CARIAN', + 'caucasian_albanian' : 'CAUCASIAN_ALBANIAN', + 'chakma' : 'CHAKMA', + 'cham' : 'CHAM', + 'cherokee' : 'CHEROKEE', + 'cherokee_supplement' : 'CHEROKEE_SUP', + 'chess_symbols' : 'CHESS_SYMBOLS', + 'chorasmian' : 'CHORASMIAN', + 'cjk_compatibility' : 'CJK_COMPAT', + 'cjk_compatibility_forms' : 'CJK_COMPAT_FORMS', + 'cjk_compatibility_ideographs' : 'CJK_COMPAT_IDEOGRAPHS', + 'cjk_compatibility_ideographs_supplement' : 'CJK_COMPAT_IDEOGRAPHS_SUP', + 'cjk_radicals_supplement' : 'CJK_RADICALS_SUP', + 'cjk_strokes' : 'CJK_STROKES', + 'cjk_symbols_and_punctuation' : 'CJK_SYMBOLS', + 'cjk_unified_ideographs' : 'CJK', + 'cjk_unified_ideographs_extension_a' : 'CJK_EXT_A', + 'cjk_unified_ideographs_extension_b' : 'CJK_EXT_B', + 'cjk_unified_ideographs_extension_c' : 'CJK_EXT_C', + 'cjk_unified_ideographs_extension_d' : 'CJK_EXT_D', + 'cjk_unified_ideographs_extension_e' : 'CJK_EXT_E', + 'cjk_unified_ideographs_extension_f' : 'CJK_EXT_F', + 'cjk_unified_ideographs_extension_g' : 'CJK_EXT_G', + 'cjk_unified_ideographs_extension_h' : 'CJK_EXT_H', + 'cjk_unified_ideographs_extension_i' : 'CJK_EXT_I', + 'combining_diacritical_marks' : 'DIACRITICALS', + 'combining_diacritical_marks_extended' : 'DIACRITICALS_EXT', + 'combining_diacritical_marks_for_symbols' : 'DIACRITICALS_FOR_SYMBOLS', + 'combining_diacritical_marks_supplement' : 'DIACRITICALS_SUP', + 'combining_half_marks' : 'HALF_MARKS', + 'common_indic_number_forms' : 'INDIC_NUMBER_FORMS', + 'control_pictures' : 'CONTROL_PICTURES', + 'coptic' : 'COPTIC', + 'coptic_epact_numbers' : 'COPTIC_EPACT_NUMBERS', + 'counting_rod_numerals' : 'COUNTING_ROD', + 'cuneiform' : 'CUNEIFORM', + 'cuneiform_numbers_and_punctuation' : 'CUNEIFORM_NUMBERS', + 'currency_symbols' : 'CURRENCY_SYMBOLS', + 'cypriot_syllabary' : 'CYPRIOT_SYLLABARY', + 'cypro_minoan' : 'CYPRO_MINOAN', + 'cyrillic' : 'CYRILLIC', + 'cyrillic_extended_a' : 'CYRILLIC_EXT_A', + 'cyrillic_extended_b' : 'CYRILLIC_EXT_B', + 'cyrillic_extended_c' : 'CYRILLIC_EXT_C', + 'cyrillic_extended_d' : 'CYRILLIC_EXT_D', + 'cyrillic_supplement' : 'CYRILLIC_SUP', + 'deseret' : 'DESERET', + 'devanagari' : 'DEVANAGARI', + 'devanagari_extended_a' : 'DEVANAGARI_EXT_A', + 'devanagari_extended' : 'DEVANAGARI_EXT', + 'dingbats' : 'DINGBATS', + 'dives_akuru' : 'DIVES_AKURU', + 'dogra' : 'DOGRA', + 'domino_tiles' : 'DOMINO', + 'duployan' : 'DUPLOYAN', + 'early_dynastic_cuneiform' : 'EARLY_DYNASTIC_CUNEIFORM', + 'egyptian_hieroglyph_format_controls' : 'EGYPTIAN_HIEROGLYPH_FORMAT_CONTROLS', + 'egyptian_hieroglyphs' : 'EGYPTIAN_HIEROGLYPHS', + 'elbasan' : 'ELBASAN', + 'elymaic' : 'ELYMAIC', + 'emoticons' : 'EMOTICONS', + 'enclosed_alphanumerics' : 'ENCLOSED_ALPHANUM', + 'enclosed_alphanumeric_supplement' : 'ENCLOSED_ALPHANUM_SUP', + 'enclosed_cjk_letters_and_months' : 'ENCLOSED_CJK', + 'enclosed_ideographic_supplement' : 'ENCLOSED_IDEOGRAPHIC_SUP', + 'ethiopic' : 'ETHIOPIC', + 'ethiopic_extended_a' : 'ETHIOPIC_EXT_A', + 'ethiopic_extended_b' : 'ETHIOPIC_EXT_B', + 'ethiopic_extended' : 'ETHIOPIC_EXT', + 'ethiopic_supplement' : 'ETHIOPIC_SUP', + 'general_punctuation' : 'PUNCTUATION', + 'geometric_shapes_extended' : 'GEOMETRIC_SHAPES_EXT', + 'geometric_shapes' : 'GEOMETRIC_SHAPES', + 'georgian_extended' : 'GEORGIAN_EXT', + 'georgian' : 'GEORGIAN', + 'georgian_supplement' : 'GEORGIAN_SUP', + 'glagolitic' : 'GLAGOLITIC', + 'glagolitic_supplement' : 'GLAGOLITIC_SUP', + 'gothic' : 'GOTHIC', + 'grantha' : 'GRANTHA', + 'greek_and_coptic' : 'GREEK', + 'greek_extended' : 'GREEK_EXT', + 'gujarati' : 'GUJARATI', + 'gunjala_gondi' : 'GUNJALA_GONDI', + 'gurmukhi' : 'GURMUKHI', + 'halfwidth_and_fullwidth_forms' : 'HALF_AND_FULL_FORMS', + 'hangul_compatibility_jamo' : 'COMPAT_JAMO', + 'hangul_jamo_extended_a' : 'JAMO_EXT_A', + 'hangul_jamo_extended_b' : 'JAMO_EXT_B', + 'hangul_jamo' : 'JAMO', + 'hangul_syllables' : 'HANGUL', + 'hanifi_rohingya' : 'HANIFI_ROHINGYA', + 'hanunoo' : 'HANUNOO', + 'hatran' : 'HATRAN', + 'hebrew' : 'HEBREW', + 'high_private_use_surrogates' : 'HIGH_PU_SURROGATES', + 'high_surrogates' : 'HIGH_SURROGATES', + 'hiragana' : 'HIRAGANA', + 'ideographic_description_characters' : 'IDC', + 'ideographic_symbols_and_punctuation' : 'IDEOGRAPHIC_SYMBOLS', + 'imperial_aramaic' : 'IMPERIAL_ARAMAIC', + 'indic_siyaq_numbers' : 'INDIC_SIYAQ_NUMBERS', + 'inscriptional_pahlavi' : 'INSCRIPTIONAL_PAHLAVI', + 'inscriptional_parthian' : 'INSCRIPTIONAL_PARTHIAN', + 'ipa_extensions' : 'IPA_EXT', + 'javanese' : 'JAVANESE', + 'kaithi' : 'KAITHI', + 'kaktovik_numerals' : 'KAKTOVIK_NUMERALS', + 'kana_extended_a' : 'KANA_EXT_A', + 'kana_extended_b' : 'KANA_EXT_B', + 'kana_supplement' : 'KANA_SUP', + 'kanbun' : 'KANBUN', + 'kangxi_radicals' : 'KANGXI', + 'kannada' : 'KANNADA', + 'katakana' : 'KATAKANA', + 'katakana_phonetic_extensions' : 'KATAKANA_EXT', + 'kawi' : 'KAWI', + 'kayah_li' : 'KAYAH_LI', + 'kharoshthi' : 'KHAROSHTHI', + 'khitan_small_script' : 'KHITAN_SMALL_SCRIPT', + 'khmer' : 'KHMER', + 'khmer_symbols' : 'KHMER_SYMBOLS', + 'khojki' : 'KHOJKI', + 'khudawadi' : 'KHUDAWADI', + 'lao' : 'LAO', + 'latin_1_supplement' : 'LATIN_1_SUP', + 'latin_extended_additional' : 'LATIN_EXT_ADDITIONAL', + 'latin_extended_a' : 'LATIN_EXT_A', + 'latin_extended_b' : 'LATIN_EXT_B', + 'latin_extended_c' : 'LATIN_EXT_C', + 'latin_extended_d' : 'LATIN_EXT_D', + 'latin_extended_e' : 'LATIN_EXT_E', + 'latin_extended_f' : 'LATIN_EXT_F', + 'latin_extended_g' : 'LATIN_EXT_G', + 'lepcha' : 'LEPCHA', + 'letterlike_symbols' : 'LETTERLIKE_SYMBOLS', + 'limbu' : 'LIMBU', + 'linear_a' : 'LINEAR_A', + 'linear_b_ideograms' : 'LINEAR_B_IDEOGRAMS', + 'linear_b_syllabary' : 'LINEAR_B_SYLLABARY', + 'lisu' : 'LISU', + 'lisu_supplement' : 'LISU_SUP', + 'low_surrogates' : 'LOW_SURROGATES', + 'lycian' : 'LYCIAN', + 'lydian' : 'LYDIAN', + 'mahajani' : 'MAHAJANI', + 'mahjong_tiles' : 'MAHJONG', + 'makasar' : 'MAKASAR', + 'malayalam' : 'MALAYALAM', + 'mandaic' : 'MANDAIC', + 'manichaean' : 'MANICHAEAN', + 'marchen' : 'MARCHEN', + 'masaram_gondi' : 'MASARAM_GONDI', + 'mathematical_alphanumeric_symbols' : 'MATH_ALPHANUM', + 'mathematical_operators' : 'MATH_OPERATORS', + 'mayan_numerals' : 'MAYAN_NUMERALS', + 'medefaidrin' : 'MEDEFAIDRIN', + 'meetei_mayek_extensions' : 'MEETEI_MAYEK_EXT', + 'meetei_mayek' : 'MEETEI_MAYEK', + 'mende_kikakui' : 'MENDE_KIKAKUI', + 'meroitic_cursive' : 'MEROITIC_CURSIVE', + 'meroitic_hieroglyphs' : 'MEROITIC_HIEROGLYPHS', + 'miao' : 'MIAO', + 'miscellaneous_mathematical_symbols_a' : 'MISC_MATH_SYMBOLS_A', + 'miscellaneous_mathematical_symbols_b' : 'MISC_MATH_SYMBOLS_B', + 'miscellaneous_symbols_and_arrows' : 'MISC_ARROWS', + 'miscellaneous_symbols_and_pictographs' : 'MISC_PICTOGRAPHS', + 'miscellaneous_symbols' : 'MISC_SYMBOLS', + 'miscellaneous_technical' : 'MISC_TECHNICAL', + 'modifier_tone_letters' : 'MODIFIER_TONE_LETTERS', + 'modi' : 'MODI', + 'mongolian' : 'MONGOLIAN', + 'mongolian_supplement' : 'MONGOLIAN_SUP', + 'mro' : 'MRO', + 'multani' : 'MULTANI', + 'musical_symbols' : 'MUSIC', + 'myanmar_extended_a' : 'MYANMAR_EXT_A', + 'myanmar_extended_b' : 'MYANMAR_EXT_B', + 'myanmar' : 'MYANMAR', + 'nabataean' : 'NABATAEAN', + 'nag_mundari' : 'NAG_MUNDARI', + 'nandinagari' : 'NANDINAGARI', + 'newa' : 'NEWA', + 'new_tai_lue' : 'NEW_TAI_LUE', + 'nko' : 'NKO', + 'number_forms' : 'NUMBER_FORMS', + 'nushu' : 'NUSHU', + 'nyiakeng_puachue_hmong' : 'NYIAKENG_PUACHUE_HMONG', + 'ogham' : 'OGHAM', + 'ol_chiki' : 'OL_CHIKI', + 'old_hungarian' : 'OLD_HUNGARIAN', + 'old_italic' : 'OLD_ITALIC', + 'old_north_arabian' : 'OLD_NORTH_ARABIAN', + 'old_permic' : 'OLD_PERMIC', + 'old_persian' : 'OLD_PERSIAN', + 'old_sogdian' : 'OLD_SOGDIAN', + 'old_south_arabian' : 'OLD_SOUTH_ARABIAN', + 'old_turkic' : 'OLD_TURKIC', + 'old_uyghur' : 'OLD_UYGHUR', + 'optical_character_recognition' : 'OCR', + 'oriya' : 'ORIYA', + 'ornamental_dingbats' : 'ORNAMENTAL_DINGBATS', + 'osage' : 'OSAGE', + 'osmanya' : 'OSMANYA', + 'ottoman_siyaq_numbers' : 'OTTOMAN_SIYAQ_NUMBERS', + 'pahawh_hmong' : 'PAHAWH_HMONG', + 'palmyrene' : 'PALMYRENE', + 'pau_cin_hau' : 'PAU_CIN_HAU', + 'phags_pa' : 'PHAGS_PA', + 'phaistos_disc' : 'PHAISTOS', + 'phoenician' : 'PHOENICIAN', + 'phonetic_extensions' : 'PHONETIC_EXT', + 'phonetic_extensions_supplement' : 'PHONETIC_EXT_SUP', + 'playing_cards' : 'PLAYING_CARDS', + 'private_use_area' : 'PUA', + 'psalter_pahlavi' : 'PSALTER_PAHLAVI', + 'rejang' : 'REJANG', + 'rumi_numeral_symbols' : 'RUMI', + 'runic' : 'RUNIC', + 'samaritan' : 'SAMARITAN', + 'saurashtra' : 'SAURASHTRA', + 'sharada' : 'SHARADA', + 'shavian' : 'SHAVIAN', + 'shorthand_format_controls' : 'SHORTHAND_FORMAT_CONTROLS', + 'siddham' : 'SIDDHAM', + 'sinhala_archaic_numbers' : 'SINHALA_ARCHAIC_NUMBERS', + 'sinhala' : 'SINHALA', + 'small_form_variants' : 'SMALL_FORMS', + 'small_kana_extension' : 'SMALL_KANA_EXT', + 'sogdian' : 'SOGDIAN', + 'sora_sompeng' : 'SORA_SOMPENG', + 'soyombo' : 'SOYOMBO', + 'spacing_modifier_letters' : 'MODIFIER_LETTERS', + 'specials' : 'SPECIALS', + 'sundanese' : 'SUNDANESE', + 'sundanese_supplement' : 'SUNDANESE_SUP', + 'superscripts_and_subscripts' : 'SUPER_AND_SUB', + 'supplemental_arrows_a' : 'SUP_ARROWS_A', + 'supplemental_arrows_b' : 'SUP_ARROWS_B', + 'supplemental_arrows_c' : 'SUP_ARROWS_C', + 'supplemental_mathematical_operators' : 'SUP_MATH_OPERATORS', + 'supplemental_punctuation' : 'SUP_PUNCTUATION', + 'supplemental_symbols_and_pictographs' : 'SUP_SYMBOLS_AND_PICTOGRAPHS', + 'supplementary_private_use_area_a' : 'SUP_PUA_A', + 'supplementary_private_use_area_b' : 'SUP_PUA_B', + 'sutton_signwriting' : 'SUTTON_SIGNWRITING', + 'syloti_nagri' : 'SYLOTI_NAGRI', + 'symbols_and_pictographs_extended_a' : 'SYMBOLS_AND_PICTOGRAPHS_EXT_A', + 'symbols_for_legacy_computing' : 'SYMBOLS_FOR_LEGACY_COMPUTING', + 'syriac_supplement' : 'SYRIAC_SUP', + 'syriac' : 'SYRIAC', + 'tagalog' : 'TAGALOG', + 'tagbanwa' : 'TAGBANWA', + 'tags' : 'TAGS', + 'tai_le' : 'TAI_LE', + 'tai_tham' : 'TAI_THAM', + 'tai_viet' : 'TAI_VIET', + 'tai_xuan_jing_symbols' : 'TAI_XUAN_JING', + 'takri' : 'TAKRI', + 'tamil_supplement' : 'TAMIL_SUP', + 'tamil' : 'TAMIL', + 'tangsa' : 'TANGSA', + 'tangut_components' : 'TANGUT_COMPONENTS', + 'tangut_supplement' : 'TANGUT_SUP', + 'tangut' : 'TANGUT', + 'telugu' : 'TELUGU', + 'thaana' : 'THAANA', + 'thai' : 'THAI', + 'tibetan' : 'TIBETAN', + 'tifinagh' : 'TIFINAGH', + 'tirhuta' : 'TIRHUTA', + 'toto' : 'TOTO', + 'transport_and_map_symbols' : 'TRANSPORT_AND_MAP', + 'ugaritic' : 'UGARITIC', + 'unified_canadian_aboriginal_syllabics_extended_a': 'UCAS_EXT_A', + 'unified_canadian_aboriginal_syllabics_extended' : 'UCAS_EXT', + 'unified_canadian_aboriginal_syllabics' : 'UCAS', + 'vai' : 'VAI', + 'variation_selectors_supplement' : 'VS_SUP', + 'variation_selectors' : 'VS', + 'vedic_extensions' : 'VEDIC_EXT', + 'vertical_forms' : 'VERTICAL_FORMS', + 'vithkuqi' : 'VITHKUQI', + 'wancho' : 'WANCHO', + 'warang_citi' : 'WARANG_CITI', + 'yezidi' : 'YEZIDI', + 'yijing_hexagram_symbols' : 'YIJING', + 'yi_radicals' : 'YI_RADICALS', + 'yi_syllables' : 'YI_SYLLABLES', + 'zanabazar_square' : 'ZANABAZAR_SQUARE', + 'znamenny_musical_notation' : 'ZNAMENNY_MUSIC', } -/^[^#]/ { - n = split($1, a, /\.\./) - lo = strtonum("0X" a[1]) - hi = strtonum("0X" a[n]) +longest = 0 - for (i = lo; i <= hi; i++) { - gsub(/^; /, "", $2) - gsub(/[- ]/, "_", $2) - props[i] = "BLK_" map[tolower($2)] - } -} +def parse(file: str) -> list[bool]: + global longest -END { - print "static constexpr enum uprop_blk lookup_lat1[] = {" - for (i = 0; i < 0x100; i++) { - if (i % 8 == 0) - printf "\t" - printf "%-15s,%s", props[i] ? props[i] : 0, i % 8 == 7 ? "\n" : " " - } - print "};" - print "" + xs = ['BLK_NB'] * 0x110000 + with open(file, 'r') as f: + for line in f.readlines(): + if len(line.strip()) == 0 or line[0] == '#': + continue - print "static const struct {" - print "\trune lo, hi;" - print "\tenum uprop_blk val;" - print "} lookup[] = {" + parts = line.split(';') + ranges = [int(x, 16) for x in parts[0].strip().split('..')] + prop = 'BLK_' + MAP[( + parts[1] + .split('#')[0] + .strip() + .lower() + .replace('-', '_') + .replace(' ', '_') + )] + longest = max(longest, len(prop)) - for (i = 0x100; i <= 0x10FFFF; i++) { - if (!props[i]) - continue - lo = i - while (props[lo] == props[i + 1]) - i++ - printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[i] - } + for i in range(ranges[0], ranges[len(ranges) - 1] + 1): + xs[i] = prop + return xs - print "};" - print "" - print "_MLIB_DEFINE_BSEARCH(enum uprop_blk, lookup, BLK_NB)" - print "" - print "enum uprop_blk" - print "uprop_get_blk(rune ch)" - print "{" - print "\treturn ch <= lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup(ch);" - print "}" -} -' data/Blocks | sed 's/\s*$//' +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: + Cs = cs + cs = list(dict.fromkeys(Cs)) + + print('''\ +/* This file is autogenerated by gen/prop/blk; DO NOT EDIT. */ + +#include "unicode/prop.h" +''') + + print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') + for i, c in enumerate(Cs): + print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') + if i % 16 == 15: + print() + print('};') + + print() + + ppc = columns(blksize, longest + 1) + print(f'static constexpr enum uprop_blk stage2[][{blksize}] = {{') + for c in cs: + for i in range(blksize // ppc): + print('\t{' if i == 0 else '\t ', end='') + for j in range(ppc): + print(c[i*ppc + j], end='') + if i < blksize // ppc - 1 or j < ppc - 1: + print(',', end='') + if j < ppc - 1: + print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') + if i < blksize // ppc - 1: + print() + print('},') + print('};') + + print() + + print(f'''\ +enum uprop_blk +uprop_get_blk(rune ch) +{{ + return stage2[stage1[ch / {blksize}]][ch % {blksize}]; +}}''') + +def main() -> None: + cwd_init() + xs = parse('data/Blocks') + + blksize = -1 + smallest = math.inf + + for bs in powers_of_2(): + if bs > len(xs): + break + Cs = [tuple(x) for x in chunks(xs, bs)] + cs = set(Cs) + + sz_s1 = len(Cs) * isize(len(cs) - 1) + sz_s2 = len(cs) * bs * 2 + sz = sz_s1 + sz_s2 + + if sz < smallest: + smallest = sz + blksize = bs + + Cs = [tuple(x) for x in chunks(xs, blksize)] + with open('lib/unicode/prop/uprop_get_blk.c', 'w') as f: + sys.stdout = f + genfile(Cs, blksize) + + report_size(len(xs), smallest) + +if __name__ == '__main__': + main() diff --git a/gen/prop/bpt b/gen/prop/bpt index 1ad1741..72a9215 100755 --- a/gen/prop/bpt +++ b/gen/prop/bpt @@ -1,53 +1,101 @@ -#!/bin/sh - -set -e -cd "${0%/*}/../.." -exec >lib/unicode/prop/uprop_get_bpt.c - -gawk ' -BEGIN { - FS = " *(; *|#.*)" - - print "/* This file is autogenerated by gen/prop/bpt; DO NOT EDIT. */" - print "" - print "#include \"_bsearch.h\"" - print "#include \"macros.h\"" - print "#include \"rune.h\"" - print "#include \"unicode/prop.h\"" - print "" -} - -/^[^#]/ { - props[strtonum("0X" $1)] = "BPT_" toupper($3) -} - -END { - print "static constexpr enum uprop_bpt lookup_lat1[] = {" - for (i = 0; i < 0x100; i++) { - if (i % 8 == 0) - printf "\t" - printf "%5s,%s", props[i] ? props[i] : "BPT_N", i % 8 == 7 ? "\n" : " " - } - print "};" - print "" - print "static const struct {" - print "\trune k;" - print "\tenum uprop_bpt v;" - print "} lookup[] = {" - - for (i = 0x100; i <= 0x10FFFF; i++) { - if (props[i]) - printf "\t{RUNE_C(0x%06X), %s},\n", i, props[i] - } - - print "};" - print "" - print "_MLIB_DEFINE_BSEARCH_KV(enum uprop_bpt, lookup, BPT_N)" - print "" - print "enum uprop_bpt" - print "uprop_get_bpt(rune ch)" - print "{" - print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup_kv(ch);" - print "}" -} -' data/BidiBrackets +#!/usr/bin/python3 + +import math + +from lib import * + + +longest = 0 + +def parse(file: str) -> list[bool]: + global longest + + xs = ['BPT_N'] * 0x110000 + with open(file, 'r') as f: + for line in f.readlines(): + if len(line.strip()) == 0 or line[0] == '#': + continue + + parts = line.split(';') + ranges = [int(x, 16) for x in parts[0].strip().split('..')] + prop = 'BPT_' + parts[2].split('#')[0].strip().upper() + longest = max(longest, len(prop)) + + for i in range(ranges[0], ranges[len(ranges) - 1] + 1): + xs[i] = prop + return xs + +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: + Cs = cs + cs = list(dict.fromkeys(Cs)) + + print('''\ +/* This file is autogenerated by gen/prop/bpt; DO NOT EDIT. */ + +#include "unicode/prop.h" +''') + + print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') + for i, c in enumerate(Cs): + print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') + if i % 16 == 15: + print() + print('};') + + print() + + ppc = columns(blksize, longest + 1) + print(f'static constexpr enum uprop_bpt stage2[][{blksize}] = {{') + for c in cs: + for i in range(blksize // ppc): + print('\t{' if i == 0 else '\t ', end='') + for j in range(ppc): + print(c[i*ppc + j], end='') + if i < blksize // ppc - 1 or j < ppc - 1: + print(',', end='') + if j < ppc - 1: + print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') + if i < blksize // ppc - 1: + print() + print('},') + print('};') + + print() + + print(f'''\ +enum uprop_bpt +uprop_get_bpt(rune ch) +{{ + return stage2[stage1[ch / {blksize}]][ch % {blksize}]; +}}''') + +def main() -> None: + cwd_init() + xs = parse('data/BidiBrackets') + + blksize = -1 + smallest = math.inf + + for bs in powers_of_2(): + if bs > len(xs): + break + Cs = [tuple(x) for x in chunks(xs, bs)] + cs = set(Cs) + + sz_s1 = len(Cs) * isize(len(cs) - 1) + sz_s2 = len(cs) * bs * 2 + sz = sz_s1 + sz_s2 + + if sz < smallest: + smallest = sz + blksize = bs + + Cs = [tuple(x) for x in chunks(xs, blksize)] + with open('lib/unicode/prop/uprop_get_bpt.c', 'w') as f: + sys.stdout = f + genfile(Cs, blksize) + + report_size(len(xs), smallest) + +if __name__ == '__main__': + main() diff --git a/gen/prop/ccc b/gen/prop/ccc index 4f370e7..5339748 100755 --- a/gen/prop/ccc +++ b/gen/prop/ccc @@ -1,116 +1,163 @@ -#!/bin/sh - -set -e -cd "${0%/*}/../.." -exec >lib/unicode/prop/uprop_get_ccc.c - -gawk ' -BEGIN { - FS = ";" - - map[1] = "OV" - map[6] = "HANR" - map[7] = "NK" - map[8] = "KV" - map[9] = "VR" - map[10] = "CCC10" - map[11] = "CCC11" - map[12] = "CCC12" - map[13] = "CCC13" - map[14] = "CCC14" - map[15] = "CCC15" - map[16] = "CCC16" - map[17] = "CCC17" - map[18] = "CCC18" - map[19] = "CCC19" - map[20] = "CCC20" - map[21] = "CCC21" - map[22] = "CCC22" - map[23] = "CCC23" - map[24] = "CCC24" - map[25] = "CCC25" - map[26] = "CCC26" - map[27] = "CCC27" - map[28] = "CCC28" - map[29] = "CCC29" - map[30] = "CCC30" - map[31] = "CCC31" - map[32] = "CCC32" - map[33] = "CCC33" - map[34] = "CCC34" - map[35] = "CCC35" - map[36] = "CCC36" - map[84] = "CCC84" - map[91] = "CCC91" - map[103] = "CCC103" - map[107] = "CCC107" - map[118] = "CCC118" - map[122] = "CCC122" - map[129] = "CCC129" - map[130] = "CCC130" - map[132] = "CCC132" - map[133] = "CCC133" - map[200] = "ATBL" - map[202] = "ATB" - map[214] = "ATA" - map[216] = "ATAR" - map[218] = "BL" - map[220] = "B" - map[222] = "BR" - map[224] = "L" - map[226] = "R" - map[228] = "AL" - map[230] = "A" - map[232] = "AR" - map[233] = "DB" - map[234] = "DA" - map[240] = "IS" - - print "/* This file is autogenerated by gen/prop/ccc; DO NOT EDIT. */" - print "" - print "#include \"_bsearch.h\"" - print "#include \"macros.h\"" - print "#include \"rune.h\"" - print "#include \"unicode/prop.h\"" - print "" -} +#!/usr/bin/python3 -{ - s = "CCC_" (map[$4] ? map[$4] : "NR") - lo = strtonum("0X" $1) +import math - if ($2 ~ /First/) { - getline - hi = strtonum("0X" $1) - } else - hi = lo +from lib import * - for (i = lo; i <= hi; i++) - props[i] = s -} -END { - print "static const struct {" - print "\trune lo, hi;" - print "\tenum uprop_ccc val;" - print "} lookup[] = {" - - for (i = 0; i <= 0x10FFFF; i++) { - if (!props[i] || props[i] == "CCC_NR") - continue - for (lo = i; props[lo] == props[i + 1]; i++) - ; - printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[lo] - } - - print "};" - print "" - print "_MLIB_DEFINE_BSEARCH(enum uprop_ccc, lookup, CCC_NR)" - print "" - print "enum uprop_ccc" - print "uprop_get_ccc(rune ch)" - print "{" - print "\treturn ch < lookup[0].lo ? CCC_NR : mlib_lookup(ch);" - print "}" +MAP = { + '0' : 'NR', + '1' : 'OV', + '6' : 'HANR', + '7' : 'NK', + '8' : 'KV', + '9' : 'VR', + '10' : '10', + '11' : '11', + '12' : '12', + '13' : '13', + '14' : '14', + '15' : '15', + '16' : '16', + '17' : '17', + '18' : '18', + '19' : '19', + '20' : '20', + '21' : '21', + '22' : '22', + '23' : '23', + '24' : '24', + '25' : '25', + '26' : '26', + '27' : '27', + '28' : '28', + '29' : '29', + '30' : '30', + '31' : '31', + '32' : '32', + '33' : '33', + '34' : '34', + '35' : '35', + '36' : '36', + '84' : '84', + '91' : '91', + '103': '103', + '107': '107', + '118': '118', + '122': '122', + '129': '129', + '130': '130', + '132': '132', + '133': '133', + '200': 'ATBL', + '202': 'ATB', + '214': 'ATA', + '216': 'ATAR', + '218': 'BL', + '220': 'B', + '222': 'BR', + '224': 'L', + '226': 'R', + '228': 'AL', + '230': 'A', + '232': 'AR', + '233': 'DB', + '234': 'DA', + '240': 'IS', } -' data/UnicodeData + +longest = 0 + +def parse(file: str) -> list[bool]: + global longest + + xs = ['CCC_NR'] * 0x110000 + with open(file, 'r') as f: + for line in f.readlines(): + parts = line.split(';') + parts[0] = int(parts[0], 16) + if 'First' in parts[1]: + lo = parts[0] + elif 'Last' in parts[1]: + hi = parts[0] + for i in range(lo, hi + 1): + xs[i] = f'CCC_{MAP[parts[3]]}' + longest = max(longest, len(xs[i])) + else: + xs[parts[0]] = f'CCC_{MAP[parts[3]]}' + longest = max(longest, len(xs[parts[0]])) + return xs + +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: + Cs = cs + cs = list(dict.fromkeys(Cs)) + + print('''\ +/* This file is autogenerated by gen/prop/ccc; DO NOT EDIT. */ + +#include "unicode/prop.h" +''') + + print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') + for i, c in enumerate(Cs): + print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') + if i % 16 == 15: + print() + print('};') + + print() + + ppc = columns(blksize, longest + 1) + print(f'static constexpr enum uprop_ccc stage2[][{blksize}] = {{') + for c in cs: + for i in range(blksize // ppc): + print('\t{' if i == 0 else '\t ', end='') + for j in range(ppc): + print(c[i*ppc + j], end='') + if i < blksize // ppc - 1 or j < ppc - 1: + print(',', end='') + if j < ppc - 1: + print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') + if i < blksize // ppc - 1: + print() + print('},') + print('};') + + print() + + print(f'''\ +enum uprop_ccc +uprop_get_ccc(rune ch) +{{ + return stage2[stage1[ch / {blksize}]][ch % {blksize}]; +}}''') + +def main() -> None: + cwd_init() + sys.stdout = open('lib/unicode/prop/uprop_get_ccc.c', 'w') + xs = parse('data/UnicodeData') + + blksize = -1 + smallest = math.inf + + for bs in powers_of_2(): + if bs > len(xs): + break + Cs = [tuple(x) for x in chunks(xs, bs)] + cs = set(Cs) + + sz_s1 = len(Cs) * isize(len(cs) - 1) + sz_s2 = len(cs) * bs * 4 + sz = sz_s1 + sz_s2 + + if sz < smallest: + smallest = sz + blksize = bs + + Cs = [tuple(x) for x in chunks(xs, blksize)] + genfile(Cs, blksize) + + report_size(len(xs), smallest) + +if __name__ == '__main__': + main() diff --git a/gen/prop/dt b/gen/prop/dt index 12881c5..81503d2 100755 --- a/gen/prop/dt +++ b/gen/prop/dt @@ -1,84 +1,121 @@ -#!/bin/sh - -set -e -cd "${0%/*}/../.." -exec >lib/unicode/prop/uprop_get_dt.c - -gawk ' -BEGIN { - FS = "( *#.*| +; +)" - - map["Canonical"] = "CAN" - map["Compat"] = "COM" - map["Circle"] = "ENC" - map["Final"] = "FIN" - map["Font"] = "FONT" - map["Fraction"] = "FRA" - map["Initial"] = "INIT" - map["Isolated"] = "ISO" - map["Medial"] = "MED" - map["Narrow"] = "NAR" - map["Nobreak"] = "NB" - map["Small"] = "SML" - map["Square"] = "SQR" - map["Sub"] = "SUB" - map["Super"] = "SUP" - map["Vertical"] = "VERT" - map["Wide"] = "WIDE" - - - print "/* This file is autogenerated by gen/prop/dt; DO NOT EDIT. */" - print "" - print "#include \"_bsearch.h\"" - print "#include \"macros.h\"" - print "#include \"rune.h\"" - print "#include \"unicode/prop.h\"" - print "" -} +#!/usr/bin/python3 -/^[^#]/ { - n = split($1, a, /\.\./) - lo = strtonum("0X" a[1]) - hi = strtonum("0X" a[n]) +import math + +from lib import * - for (i = lo; i <= hi; i++) { - gsub(/^; /, "", $2) - props[i] = "DT_" map[$2] - } -} -END { - print "static constexpr enum uprop_dt lookup_lat1[] = {" - for (i = 0; i < 0x100; i++) { - if (i % 8 == 0) - printf "\t" - printf "%-8s%s", (props[i] ? props[i] : "DT_NONE") ",", \ - i % 8 == 7 ? "\n" : " " - } - print "};" - print "" - print "static const struct {" - print "\trune lo, hi;" - print "\tenum uprop_dt val;" - print "} lookup[] = {" - - for (i = 0x100; i <= 0x10FFFF; i++) { - if (!props[i]) - continue - lo = i - while (props[lo] == props[i + 1]) - i++ - printf "\t{RUNE_C(0x%06X), RUNE_C(0x%06X), %s},\n", lo, i, props[i] - } - - print "};" - print "" - print "_MLIB_DEFINE_BSEARCH(enum uprop_dt, lookup, DT_NONE)" - print "" - print "enum uprop_dt" - print "uprop_get_dt(rune ch)" - print "{" - print "\treturn ch < lengthof(lookup_lat1) ? lookup_lat1[ch] : mlib_lookup(ch);" - print "}" +MAP = { + 'Canonical': 'CAN', + 'Compat' : 'COM', + 'Circle' : 'ENC', + 'Final' : 'FIN', + 'Font' : 'FONT', + 'Fraction' : 'FRA', + 'Initial' : 'INIT', + 'Isolated' : 'ISO', + 'Medial' : 'MED', + 'Narrow' : 'NAR', + 'Nobreak' : 'NB', + 'Small' : 'SML', + 'Square' : 'SQR', + 'Sub' : 'SUB', + 'Super' : 'SUP', + 'Vertical' : 'VERT', + 'Wide' : 'WIDE', } -' data/DerivedDecompositionType | sed 's/\s*$//' + +longest = 0 + +def parse(file: str) -> list[bool]: + global longest + + xs = ['DT_NONE'] * 0x110000 + with open(file, 'r') as f: + for line in f.readlines(): + if len(line.strip()) == 0 or line[0] == '#': + continue + + parts = line.split(';') + ranges = [int(x, 16) for x in parts[0].strip().split('..')] + prop = 'DT_' + MAP[parts[1].split('#')[0].strip()] + longest = max(longest, len(prop)) + + for i in range(ranges[0], ranges[len(ranges) - 1] + 1): + xs[i] = prop + return xs + +def genfile(cs: list[tuple[bool, ...]], blksize: int) -> None: + Cs = cs + cs = list(dict.fromkeys(Cs)) + + print('''\ +/* This file is autogenerated by gen/prop/dt; DO NOT EDIT. */ + +#include "unicode/prop.h" +''') + + print(f'static constexpr {typename(len(cs) - 1)} stage1[] = {{') + for i, c in enumerate(Cs): + print(f'%c%{len(str(len(cs) - 1))}d,' % ('\t' if i % 16 == 0 else ' ', cs.index(c)), end='') + if i % 16 == 15: + print() + print('};') + + print() + + ppc = columns(blksize, longest + 1) + print(f'static constexpr enum uprop_dt stage2[][{blksize}] = {{') + for c in cs: + for i in range(blksize // ppc): + print('\t{' if i == 0 else '\t ', end='') + for j in range(ppc): + print(c[i*ppc + j], end='') + if i < blksize // ppc - 1 or j < ppc - 1: + print(',', end='') + if j < ppc - 1: + print(' ' * (longest + 1 - len(c[i*ppc + j])), end='') + if i < blksize // ppc - 1: + print() + print('},') + print('};') + + print() + + print(f'''\ +enum uprop_dt +uprop_get_dt(rune ch) +{{ + return stage2[stage1[ch / {blksize}]][ch % {blksize}]; +}}''') + +def main() -> None: + cwd_init() + xs = parse('data/DerivedDecompositionType') + + blksize = -1 + smallest = math.inf + + for bs in powers_of_2(): + if bs > len(xs): + break + Cs = [tuple(x) for x in chunks(xs, bs)] + cs = set(Cs) + + sz_s1 = len(Cs) * isize(len(cs) - 1) + sz_s2 = len(cs) * bs * 2 + sz = sz_s1 + sz_s2 + + if sz < smallest: + smallest = sz + blksize = bs + + Cs = [tuple(x) for x in chunks(xs, blksize)] + with open('lib/unicode/prop/uprop_get_dt.c', 'w') as f: + sys.stdout = f + genfile(Cs, blksize) + + report_size(len(xs), smallest) + +if __name__ == '__main__': + main() diff --git a/gen/prop/gc b/gen/prop/gc index f37cb4b..cc4d35b 100755 --- a/gen/prop/gc +++ b/gen/prop/gc @@ -77,7 +77,7 @@ def main() -> None: if bs > len(xs): break Cs = [tuple(x) for x in chunks(xs, bs)] - cs = list(dict.fromkeys(Cs)) + cs = set(Cs) sz_s1 = len(Cs) * isize(len(cs) - 1) sz_s2 = len(cs) * bs * 4 @@ -90,5 +90,7 @@ def main() -> None: Cs = [tuple(x) for x in chunks(xs, blksize)] genfile(Cs, blksize) + report_size(len(xs), smallest) + if __name__ == '__main__': main() |