# CompositionExclusions-15.1.0.txt
# Date: 2023-01-05
# © 2023 Unicode®, Inc.
# For terms of use, see https://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# For documentation, see https://www.unicode.org/reports/tr44/
#
# This file lists the characters for the Composition Exclusion Table
# defined in UAX #15, Unicode Normalization Forms.
#
# This file is a normative contributory data file in the
# Unicode Character Database.
#
# For more information, see
# https://www.unicode.org/reports/tr15/#Primary_Exclusion_List_Table
#
# For a full derivation of composition exclusions, see the derived property
# Full_Composition_Exclusion in DerivedNormalizationProps.txt
#

# ================================================
# (1) Script Specifics
#
# This list of characters cannot be derived from the UnicodeData.txt file.
#
# Included are the following subcategories:
#
# - Many precomposed characters using a nukta diacritic in the Devanagari,
#   Bangla/Bengali, Gurmukhi, or Odia/Oriya scripts.
# - Tibetan letters and subjoined letters with decompositions including 
#   U+0FB7 TIBETAN SUBJOINED LETTER HA or U+0FB5 TIBETAN SUBJOINED LETTER SSA.
# - Two two-part Tibetan vowel signs involving top and bottom pieces.
# - A large collection of compatibility precomposed characters for Hebrew
#   involving dagesh and/or other combining marks.
#
# This list is unlikely to grow.
#
# ================================================

0958    #  DEVANAGARI LETTER QA
0959    #  DEVANAGARI LETTER KHHA
095A    #  DEVANAGARI LETTER GHHA
095B    #  DEVANAGARI LETTER ZA
095C    #  DEVANAGARI LETTER DDDHA
095D    #  DEVANAGARI LETTER RHA
095E    #  DEVANAGARI LETTER FA
095F    #  DEVANAGARI LETTER YYA
09DC    #  BENGALI LETTER RRA
09DD    #  BENGALI LETTER RHA
09DF    #  BENGALI LETTER YYA
0A33    #  GURMUKHI LETTER LLA
0A36    #  GURMUKHI LETTER SHA
0A59    #  GURMUKHI LETTER KHHA
0A5A    #  GURMUKHI LETTER GHHA
0A5B    #  GURMUKHI LETTER ZA
0A5E    #  GURMUKHI LETTER FA
0B5C    #  ORIYA LETTER RRA
0B5D    #  ORIYA LETTER RHA
0F43    #  TIBETAN LETTER GHA
0F4D    #  TIBETAN LETTER DDHA
0F52    #  TIBETAN LETTER DHA
0F57    #  TIBETAN LETTER BHA
0F5C    #  TIBETAN LETTER DZHA
0F69    #  TIBETAN LETTER KSSA
0F76    #  TIBETAN VOWEL SIGN VOCALIC R
0F78    #  TIBETAN VOWEL SIGN VOCALIC L
0F93    #  TIBETAN SUBJOINED LETTER GHA
0F9D    #  TIBETAN SUBJOINED LETTER DDHA
0FA2    #  TIBETAN SUBJOINED LETTER DHA
0FA7    #  TIBETAN SUBJOINED LETTER BHA
0FAC    #  TIBETAN SUBJOINED LETTER DZHA
0FB9    #  TIBETAN SUBJOINED LETTER KSSA
FB1D    #  HEBREW LETTER YOD WITH HIRIQ
FB1F    #  HEBREW LIGATURE YIDDISH YOD YOD PATAH
FB2A    #  HEBREW LETTER SHIN WITH SHIN DOT
FB2B    #  HEBREW LETTER SHIN WITH SIN DOT
FB2C    #  HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
FB2D    #  HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
FB2E    #  HEBREW LETTER ALEF WITH PATAH
FB2F    #  HEBREW LETTER ALEF WITH QAMATS
FB30    #  HEBREW LETTER ALEF WITH MAPIQ
FB31    #  HEBREW LETTER BET WITH DAGESH
FB32    #  HEBREW LETTER GIMEL WITH DAGESH
FB33    #  HEBREW LETTER DALET WITH DAGESH
FB34    #  HEBREW LETTER HE WITH MAPIQ
FB35    #  HEBREW LETTER VAV WITH DAGESH
FB36    #  HEBREW LETTER ZAYIN WITH DAGESH
FB38    #  HEBREW LETTER TET WITH DAGESH
FB39    #  HEBREW LETTER YOD WITH DAGESH
FB3A    #  HEBREW LETTER FINAL KAF WITH DAGESH
FB3B    #  HEBREW LETTER KAF WITH DAGESH
FB3C    #  HEBREW LETTER LAMED WITH DAGESH
FB3E    #  HEBREW LETTER MEM WITH DAGESH
FB40    #  HEBREW LETTER NUN WITH DAGESH
FB41    #  HEBREW LETTER SAMEKH WITH DAGESH
FB43    #  HEBREW LETTER FINAL PE WITH DAGESH
FB44    #  HEBREW LETTER PE WITH DAGESH
FB46    #  HEBREW LETTER TSADI WITH DAGESH
FB47    #  HEBREW LETTER QOF WITH DAGESH
FB48    #  HEBREW LETTER RESH WITH DAGESH
FB49    #  HEBREW LETTER SHIN WITH DAGESH
FB4A    #  HEBREW LETTER TAV WITH DAGESH
FB4B    #  HEBREW LETTER VAV WITH HOLAM
FB4C    #  HEBREW LETTER BET WITH RAFE
FB4D    #  HEBREW LETTER KAF WITH RAFE
FB4E    #  HEBREW LETTER PE WITH RAFE

# Total code points: 67

# ================================================
# (2) Post Composition Version precomposed characters
#
# These characters cannot be derived solely from the UnicodeData.txt file
# in this version of Unicode.
#
# Note that characters added to the standard after the
# Composition Version and which have canonical decomposition mappings
# are not automatically added to this list of Post Composition
# Version precomposed characters.
# ================================================

2ADC    #  FORKING
1D15E   #  MUSICAL SYMBOL HALF NOTE
1D15F   #  MUSICAL SYMBOL QUARTER NOTE
1D160   #  MUSICAL SYMBOL EIGHTH NOTE
1D161   #  MUSICAL SYMBOL SIXTEENTH NOTE
1D162   #  MUSICAL SYMBOL THIRTY-SECOND NOTE
1D163   #  MUSICAL SYMBOL SIXTY-FOURTH NOTE
1D164   #  MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
1D1BB   #  MUSICAL SYMBOL MINIMA
1D1BC   #  MUSICAL SYMBOL MINIMA BLACK
1D1BD   #  MUSICAL SYMBOL SEMIMINIMA WHITE
1D1BE   #  MUSICAL SYMBOL SEMIMINIMA BLACK
1D1BF   #  MUSICAL SYMBOL FUSA WHITE
1D1C0   #  MUSICAL SYMBOL FUSA BLACK

# Total code points: 14

# ================================================
# (3) Singleton Decompositions
#
# These characters can be derived from the UnicodeData.txt file
# by including all canonically decomposable characters whose
# canonical decomposition consists of a single character.
#
# These characters are simply quoted here for reference.
# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
# ================================================

# 0340..0341       [2] COMBINING GRAVE TONE MARK..COMBINING ACUTE TONE MARK
# 0343                 COMBINING GREEK KORONIS
# 0374                 GREEK NUMERAL SIGN
# 037E                 GREEK QUESTION MARK
# 0387                 GREEK ANO TELEIA
# 1F71                 GREEK SMALL LETTER ALPHA WITH OXIA
# 1F73                 GREEK SMALL LETTER EPSILON WITH OXIA
# 1F75                 GREEK SMALL LETTER ETA WITH OXIA
# 1F77                 GREEK SMALL LETTER IOTA WITH OXIA
# 1F79                 GREEK SMALL LETTER OMICRON WITH OXIA
# 1F7B                 GREEK SMALL LETTER UPSILON WITH OXIA
# 1F7D                 GREEK SMALL LETTER OMEGA WITH OXIA
# 1FBB                 GREEK CAPITAL LETTER ALPHA WITH OXIA
# 1FBE                 GREEK PROSGEGRAMMENI
# 1FC9                 GREEK CAPITAL LETTER EPSILON WITH OXIA
# 1FCB                 GREEK CAPITAL LETTER ETA WITH OXIA
# 1FD3                 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
# 1FDB                 GREEK CAPITAL LETTER IOTA WITH OXIA
# 1FE3                 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
# 1FEB                 GREEK CAPITAL LETTER UPSILON WITH OXIA
# 1FEE..1FEF       [2] GREEK DIALYTIKA AND OXIA..GREEK VARIA
# 1FF9                 GREEK CAPITAL LETTER OMICRON WITH OXIA
# 1FFB                 GREEK CAPITAL LETTER OMEGA WITH OXIA
# 1FFD                 GREEK OXIA
# 2000..2001       [2] EN QUAD..EM QUAD
# 2126                 OHM SIGN
# 212A..212B       [2] KELVIN SIGN..ANGSTROM SIGN
# 2329                 LEFT-POINTING ANGLE BRACKET
# 232A                 RIGHT-POINTING ANGLE BRACKET
# F900..FA0D     [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D
# FA10                 CJK COMPATIBILITY IDEOGRAPH-FA10
# FA12                 CJK COMPATIBILITY IDEOGRAPH-FA12
# FA15..FA1E      [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E
# FA20                 CJK COMPATIBILITY IDEOGRAPH-FA20
# FA22                 CJK COMPATIBILITY IDEOGRAPH-FA22
# FA25..FA26       [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26
# FA2A..FA6D      [68] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA6D
# FA70..FAD9     [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
# 2F800..2FA1D   [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D

# Total code points: 1035

# ================================================
# (4) Non-Starter Decompositions
#
# These characters can be derived from the UnicodeData.txt file
# by including each expanding canonical decomposition
# (i.e., those which canonically decompose to a sequence
# of characters instead of a single character), such that:
#
# A. The character is not a Starter.
#
# OR (inclusive)
#
# B. The character's canonical decomposition begins
# with a character that is not a Starter.
#
# Note that a "Starter" is any character with a zero combining class.
#
# These characters are simply quoted here for reference.
# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
# ================================================

# 0344                 COMBINING GREEK DIALYTIKA TONOS
# 0F73                 TIBETAN VOWEL SIGN II
# 0F75                 TIBETAN VOWEL SIGN UU
# 0F81                 TIBETAN VOWEL SIGN REVERSED II

# Total code points: 4

# EOF