/* include/unicode/prop.h (blob 8def75b4bd179133bc5672b3cb5d8b776b457a9d) */
#ifndef MLIB_UNICODE_PROP_H
#define MLIB_UNICODE_PROP_H

#include <inttypes.h>
#include <stddef.h>

#include "__rune.h"
#include "__u8view.h"

/* Pointer-and-length pair describing a sequence of runes.  This is the
   return type of uprop_get_lc(), uprop_get_tc() and uprop_get_uc(). */
struct rview {
	const rune *p; /* First rune of the sequence (read-only) */
	size_t len;    /* Length of the sequence; presumably a count of runes,
	                  not bytes — TODO confirm against the implementation */
};

/* Context flags passed by value to uprop_get_lc().  The first group
   selects a language; the second group describes where the rune sits
   relative to its neighbours.  Every member is a 1-bit flag. */
struct lcctx {
	bool az_or_tr : 1; /* Azeri or Turkish */
	bool lt       : 1; /* Lithuanian */

	bool after_I    : 1; /* After ‘I’ */
	bool before_acc : 1; /* Before accent on ‘i’ or ‘j’ in Lithuanian */
	bool before_dot : 1; /* Before U+0307 */
	bool eow        : 1; /* End of word */
};

/* Context flags passed by value to uprop_get_tc().  Every member is a
   1-bit flag: two language selectors followed by one positional flag. */
struct tcctx {
	bool az_or_tr : 1; /* Azeri or Turkish */
	bool lt       : 1; /* Lithuanian */

	bool after_i  : 1; /* After ‘i’ */
};

/* Context flags passed by value to uprop_get_uc().  Every member is a
   1-bit flag.  Prefer designated initializers (‘.lt = true’) over
   positional ones so that callers do not depend on member order. */
struct ucctx {
	bool az_or_tr : 1; /* Azeri or Turkish */
	bool lt       : 1; /* Lithuanian */

	/* This member must be named: an unnamed bit-field is pure padding
	   and can neither be set by callers nor read by the implementation. */
	bool ẞ       : 1; /* Uppercase ‘ß’ into ‘ẞ’ (instead of ‘SS’) */
	bool after_i : 1; /* After ‘i’ */
};

/* Result type of uprop_get_bpt(): the kind of paired bracket a rune is,
   if any. */
enum uprop_bpt {
	BPT_N = 0, /* Not a paired bracket (None) */
	BPT_C = 1, /* Closing bracket (Close) */
	BPT_O = 2, /* Opening bracket (Open) */
};

/* Result type of uprop_get_dt().  Each comment gives the long name of
   the value that the enumerator abbreviates. */
enum uprop_dt {
	DT_NONE = 0, /* None */
	DT_CAN,      /* Canonical */
	DT_COM,      /* Compat */
	DT_ENC,      /* Circle */
	DT_FIN,      /* Final */
	DT_FONT,     /* Font */
	DT_FRA,      /* Fraction */
	DT_INIT,     /* Initial */
	DT_ISO,      /* Isolated */
	DT_MED,      /* Medial */
	DT_NAR,      /* Narrow */
	DT_NB,       /* Nobreak */
	DT_SML,      /* Small */
	DT_SQR,      /* Square */
	DT_SUB,      /* Sub */
	DT_SUP,      /* Super */
	DT_VERT,     /* Vertical */
	DT_WIDE,     /* Wide */
};

/* Result type of uprop_get_ea().  The enumerators are ordered by the
   long name given in each comment, which is why EA_NA (Narrow) precedes
   EA_N (Neutral). */
enum uprop_ea {
	EA_A,  /* Ambiguous */
	EA_F,  /* Fullwidth */
	EA_H,  /* Halfwidth */
	EA_NA, /* Narrow */
	EA_N,  /* Neutral */
	EA_W,  /* Wide */
};

/* Result type of uprop_get_gc().  Every individual category occupies
   its own bit (bits 0–29) so that several categories can be OR-ed
   together into a mask; the one-letter constants at the end are such
   masks covering all members of a major class.  The enum has a fixed
   underlying type of uint_fast32_t, and the clang flag_enum attribute
   marks the enumerators as combinable bit flags. */
enum [[clang::__flag_enum__]] uprop_gc : uint_fast32_t {
	GC_CN = UINT32_C(1) << 0,  /* Unassigned */
	GC_CC = UINT32_C(1) << 1,  /* Control */
	GC_CF = UINT32_C(1) << 2,  /* Format */
	GC_CO = UINT32_C(1) << 3,  /* Private Use */
	GC_CS = UINT32_C(1) << 4,  /* Surrogate */
	GC_LL = UINT32_C(1) << 5,  /* Lowercase Letter */
	GC_LM = UINT32_C(1) << 6,  /* Modifier Letter */
	GC_LO = UINT32_C(1) << 7,  /* Other Letter */
	GC_LT = UINT32_C(1) << 8,  /* Titlecase Letter */
	GC_LU = UINT32_C(1) << 9,  /* Uppercase Letter */
	GC_MC = UINT32_C(1) << 10, /* Spacing Mark */
	GC_ME = UINT32_C(1) << 11, /* Enclosing Mark */
	GC_MN = UINT32_C(1) << 12, /* Nonspacing Mark */
	GC_ND = UINT32_C(1) << 13, /* Decimal Number */
	GC_NL = UINT32_C(1) << 14, /* Letter Number */
	GC_NO = UINT32_C(1) << 15, /* Other Number */
	GC_PC = UINT32_C(1) << 16, /* Connector Punctuation */
	GC_PD = UINT32_C(1) << 17, /* Dash Punctuation */
	GC_PE = UINT32_C(1) << 18, /* Close Punctuation */
	GC_PF = UINT32_C(1) << 19, /* Final Punctuation */
	GC_PI = UINT32_C(1) << 20, /* Initial Punctuation */
	GC_PO = UINT32_C(1) << 21, /* Other Punctuation */
	GC_PS = UINT32_C(1) << 22, /* Open Punctuation */
	GC_SC = UINT32_C(1) << 23, /* Currency Symbol */
	GC_SK = UINT32_C(1) << 24, /* Modifier Symbol */
	GC_SM = UINT32_C(1) << 25, /* Math Symbol */
	GC_SO = UINT32_C(1) << 26, /* Other Symbol */
	GC_ZL = UINT32_C(1) << 27, /* Line Separator */
	GC_ZP = UINT32_C(1) << 28, /* Paragraph Separator */
	GC_ZS = UINT32_C(1) << 29, /* Space Separator */

	/* Grouped masks.  The comment for GC_P sits on its own line only
	   because the definition is too long to carry a trailing comment. */

	/* Punctuation */
	GC_P = GC_PC | GC_PD | GC_PE | GC_PF | GC_PI | GC_PO | GC_PS,
	GC_C = GC_CC | GC_CF | GC_CN | GC_CO | GC_CS, /* Other */
	GC_LC = GC_LU | GC_LL | GC_LT,                /* Cased Letter */
	GC_L = GC_LL | GC_LM | GC_LO | GC_LT | GC_LU, /* Letter */
	GC_M = GC_MC | GC_ME | GC_MN,                 /* Mark */
	GC_N = GC_ND | GC_NL | GC_NO,                 /* Number */
	GC_S = GC_SC | GC_SK | GC_SM | GC_SO,         /* Symbol */
	GC_Z = GC_ZL | GC_ZP | GC_ZS,                 /* Separator */
};

/* Result type of uprop_get_lb().  LB_XX comes first (value 0); the
   remaining classes follow in alphabetical order of their two- or
   three-letter abbreviation, except that LB_ZWJ precedes LB_ZW. */
enum uprop_lb {
	LB_XX = 0, /* XX: Unknown */
	LB_AI,     /* AI: Ambiguous */
	LB_AK,     /* AK: Aksara */
	LB_AL,     /* AL: Alphabetic */
	LB_AP,     /* AP: Aksara Prebase */
	LB_AS,     /* AS: Aksara Start */
	LB_B2,     /* B2: Break Both */
	LB_BA,     /* BA: Break After */
	LB_BB,     /* BB: Break Before */
	LB_BK,     /* BK: Mandatory Break */
	LB_CB,     /* CB: Contingent Break */
	LB_CJ,     /* CJ: Conditional Japanese_Starter */
	LB_CL,     /* CL: Close Punctuation */
	LB_CM,     /* CM: Combining Mark */
	LB_CP,     /* CP: Close Parenthesis */
	LB_CR,     /* CR: Carriage Return */
	LB_EB,     /* EB: E Base */
	LB_EM,     /* EM: E Modifier */
	LB_EX,     /* EX: Exclamation */
	LB_GL,     /* GL: Glue */
	LB_H2,     /* H2: H2 */
	LB_H3,     /* H3: H3 */
	LB_HL,     /* HL: Hebrew Letter */
	LB_HY,     /* HY: Hyphen */
	LB_ID,     /* ID: Ideographic */
	LB_IN,     /* IN: Inseparable */
	LB_IS,     /* IS: Infix Numeric */
	LB_JL,     /* JL: JL */
	LB_JT,     /* JT: JT */
	LB_JV,     /* JV: JV */
	LB_LF,     /* LF: Line Feed */
	LB_NL,     /* NL: Next Line */
	LB_NS,     /* NS: Nonstarter */
	LB_NU,     /* NU: Numeric */
	LB_OP,     /* OP: Open Punctuation */
	LB_PO,     /* PO: Postfix Numeric */
	LB_PR,     /* PR: Prefix Numeric */
	LB_QU,     /* QU: Quotation */
	LB_RI,     /* RI: Regional Indicator */
	LB_SA,     /* SA: Complex Context */
	LB_SG,     /* SG: Surrogate */
	LB_SP,     /* SP: Space */
	LB_SY,     /* SY: Break Symbols */
	LB_VF,     /* VF: Virama Final */
	LB_VI,     /* VI: Virama */
	LB_WJ,     /* WJ: Word Joiner */
	LB_ZWJ,    /* ZWJ: ZWJ */
	LB_ZW,     /* ZW: ZWSpace */
};

/* Result type of uprop_get_nt(): the kind of number a rune represents,
   if any. */
enum uprop_nt {
	NT_NONE = 0, /* Not numeric (None) */
	NT_DE   = 1, /* Decimal */
	NT_DI   = 2, /* Digit */
	NT_NU   = 3, /* Numeric */
};

/* Attribute list applied to every property function in this header:
   the result must not be discarded, and the function is declared
   unsequenced.
   NOTE(review): the macro is not #undef'd anywhere in this header, so
   it stays visible to every file that includes it — confirm that this
   is intended. */
#define __mlib_uprop_attrs __nodiscard__, __unsequenced__

/* Property getters.  Each takes the rune to look up and returns the
   property value as a double, an enumerator, a single rune, a rune
   view or a UTF-8 view.  uprop_get_lc(), uprop_get_tc() and
   uprop_get_uc() additionally take a by-value context struct.
   NOTE(review): the suffixes (nv, bpt, dt, ea, gc, lb, nt, bpb, slc,
   stc, suc, lc, tc, uc, na1, na) look like the short property aliases
   from the Unicode Character Database — confirm against the
   implementation. */
[[__mlib_uprop_attrs]] double uprop_get_nv(rune);
[[__mlib_uprop_attrs]] enum uprop_bpt uprop_get_bpt(rune);
[[__mlib_uprop_attrs]] enum uprop_dt uprop_get_dt(rune);
[[__mlib_uprop_attrs]] enum uprop_ea uprop_get_ea(rune);
[[__mlib_uprop_attrs]] enum uprop_gc uprop_get_gc(rune);
[[__mlib_uprop_attrs]] enum uprop_lb uprop_get_lb(rune);
[[__mlib_uprop_attrs]] enum uprop_nt uprop_get_nt(rune);
[[__mlib_uprop_attrs]] rune uprop_get_bpb(rune);
[[__mlib_uprop_attrs]] rune uprop_get_slc(rune);
[[__mlib_uprop_attrs]] rune uprop_get_stc(rune);
[[__mlib_uprop_attrs]] rune uprop_get_suc(rune);
[[__mlib_uprop_attrs]] struct rview uprop_get_lc(rune, struct lcctx);
[[__mlib_uprop_attrs]] struct rview uprop_get_tc(rune, struct tcctx);
[[__mlib_uprop_attrs]] struct rview uprop_get_uc(rune, struct ucctx);
[[__mlib_uprop_attrs]] struct u8view uprop_get_na1(rune);
[[__mlib_uprop_attrs]] struct u8view uprop_get_na(rune);

/* Boolean property predicates: each takes a rune and returns a bool.
   NOTE(review): the START/END marker comments below, together with the
   separate ‘Manually implemented predicates’ list that follows them,
   suggest that this list is machine-generated — confirm before editing
   it by hand, and leave the marker comments untouched. */
/* PROP PREDICATES START */
[[__mlib_uprop_attrs]] bool uprop_is_ahex(rune);
[[__mlib_uprop_attrs]] bool uprop_is_alpha(rune);
[[__mlib_uprop_attrs]] bool uprop_is_bidi_c(rune);
[[__mlib_uprop_attrs]] bool uprop_is_bidi_m(rune);
[[__mlib_uprop_attrs]] bool uprop_is_cased(rune);
[[__mlib_uprop_attrs]] bool uprop_is_ci(rune);
[[__mlib_uprop_attrs]] bool uprop_is_cwcf(rune);
[[__mlib_uprop_attrs]] bool uprop_is_cwcm(rune);
[[__mlib_uprop_attrs]] bool uprop_is_cwkcf(rune);
[[__mlib_uprop_attrs]] bool uprop_is_cwl(rune);
[[__mlib_uprop_attrs]] bool uprop_is_cwt(rune);
[[__mlib_uprop_attrs]] bool uprop_is_cwu(rune);
[[__mlib_uprop_attrs]] bool uprop_is_dash(rune);
[[__mlib_uprop_attrs]] bool uprop_is_dep(rune);
[[__mlib_uprop_attrs]] bool uprop_is_di(rune);
[[__mlib_uprop_attrs]] bool uprop_is_dia(rune);
[[__mlib_uprop_attrs]] bool uprop_is_ebase(rune);
[[__mlib_uprop_attrs]] bool uprop_is_ecomp(rune);
[[__mlib_uprop_attrs]] bool uprop_is_emod(rune);
[[__mlib_uprop_attrs]] bool uprop_is_emoji(rune);
[[__mlib_uprop_attrs]] bool uprop_is_epres(rune);
[[__mlib_uprop_attrs]] bool uprop_is_ext(rune);
[[__mlib_uprop_attrs]] bool uprop_is_extpic(rune);
[[__mlib_uprop_attrs]] bool uprop_is_gr_base(rune);
[[__mlib_uprop_attrs]] bool uprop_is_gr_ext(rune);
[[__mlib_uprop_attrs]] bool uprop_is_hex(rune);
[[__mlib_uprop_attrs]] bool uprop_is_id_compat_math_continue(rune);
[[__mlib_uprop_attrs]] bool uprop_is_id_compat_math_start(rune);
[[__mlib_uprop_attrs]] bool uprop_is_idc(rune);
[[__mlib_uprop_attrs]] bool uprop_is_ideo(rune);
[[__mlib_uprop_attrs]] bool uprop_is_ids(rune);
[[__mlib_uprop_attrs]] bool uprop_is_idsb(rune);
[[__mlib_uprop_attrs]] bool uprop_is_incb(rune);
[[__mlib_uprop_attrs]] bool uprop_is_loe(rune);
[[__mlib_uprop_attrs]] bool uprop_is_lower(rune);
[[__mlib_uprop_attrs]] bool uprop_is_math(rune);
[[__mlib_uprop_attrs]] bool uprop_is_pat_syn(rune);
[[__mlib_uprop_attrs]] bool uprop_is_pat_ws(rune);
[[__mlib_uprop_attrs]] bool uprop_is_pcm(rune);
[[__mlib_uprop_attrs]] bool uprop_is_qmark(rune);
[[__mlib_uprop_attrs]] bool uprop_is_radical(rune);
[[__mlib_uprop_attrs]] bool uprop_is_sd(rune);
[[__mlib_uprop_attrs]] bool uprop_is_sterm(rune);
[[__mlib_uprop_attrs]] bool uprop_is_term(rune);
[[__mlib_uprop_attrs]] bool uprop_is_uideo(rune);
[[__mlib_uprop_attrs]] bool uprop_is_upper(rune);
[[__mlib_uprop_attrs]] bool uprop_is_vs(rune);
[[__mlib_uprop_attrs]] bool uprop_is_wspace(rune);
[[__mlib_uprop_attrs]] bool uprop_is_xidc(rune);
[[__mlib_uprop_attrs]] bool uprop_is_xids(rune);
/* PROP PREDICATES END */

/* Manually implemented predicates.  They have the same shape as the
   predicates listed between the PROP PREDICATES markers (a rune in, a
   bool out) but are kept outside those markers. */
[[__mlib_uprop_attrs]] bool uprop_is_idst(rune);
[[__mlib_uprop_attrs]] bool uprop_is_idsu(rune);
[[__mlib_uprop_attrs]] bool uprop_is_join_c(rune);
[[__mlib_uprop_attrs]] bool uprop_is_nchar(rune);
[[__mlib_uprop_attrs]] bool uprop_is_ri(rune);

#endif /* !MLIB_UNICODE_PROP_H */