aboutsummaryrefslogtreecommitdiff
path: root/lib/unicode/string/u8title.c
blob: 380e8749df709f2c06fd7c4cb54138aabf5d6afd (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#include "mbstring.h"
#include "unicode/prop.h"
#include "unicode/string.h"

constexpr rune COMB_GRAVE = 0x0300;
constexpr rune COMB_ACUTE = 0x0301;
constexpr rune COMB_TILDE = 0x0303;
constexpr rune COMB_DOT_ABOVE = 0x0307;

size_t
u8title(char8_t *restrict dst, size_t dstn, const char8_t *src, size_t srcn,
        enum caseflags flags)
{
	struct tcctx ctx_t = {
		.az_or_tr = flags & CF_LANG_AZ,
		.lt = flags & CF_LANG_LT,
	};
	struct lcctx ctx_l = {
		.az_or_tr = ctx_t.az_or_tr,
		.lt = ctx_t.lt,
	};

	int w;
	rune ch;
	size_t n = 0;
	bool lt_special, nl_special;
	struct u8view word = {}, cpy = {src, srcn};

	lt_special = nl_special = false;

	while (w = u8next(&ch, &src, &srcn)) {
		rune next = 0;
		if (srcn > 0)
			u8tor(&next, src);
		if (src > word.p + word.len)
			u8wnext(&word, U8_ARGSP(cpy));

		bool sow = src - w == word.p;
		ctx_l.final_sigma = src == word.p + word.len;
		ctx_l.before_dot = next == COMB_DOT_ABOVE;
		ctx_l.more_above =
			next == COMB_GRAVE || next == COMB_ACUTE || next == COMB_TILDE;

		struct rview rv;
		if (nl_special && (ch == 'j' || ch == 'J'))
			rv = (struct rview){.p = U"J", .len = 1};
		else
			rv = sow || lt_special ? uprop_get_tc(ch, ctx_t)
			                       : uprop_get_lc(ch, ctx_l);
		for (size_t i = 0; i < rv.len; i++) {
			if (n >= dstn) {
				char8_t buf[U8_LEN_MAX];
				n += rtou8(buf, sizeof(buf), rv.p[i]);
			} else
				n += rtou8(dst + n, dstn - n, rv.p[i]);
		}

		if (flags & CF_LANG_NL)
			nl_special = sow && (ch == 'i' || ch == 'I');
		if (ctx_t.lt) {
			/* If the rune at SOW is Soft_Dotted, then the next rune should be
			   titlecased if it is U+0307 or if does not have ccc=0 and ccc=230.
			   If the current rune was titlecased as a result of the above rule,
			   then the rule should be applied again to the next rune.  If the
			   current rune was titlecased and is U+0307, then lowercase until
			   the next word boundary. */
			enum uprop_ccc ccc;
			if (lt_special || uprop_is_sd(ch)) {
				ctx_t.after_soft_dotted = true;
				lt_special =
					(sow || lt_special) && ch != COMB_DOT_ABOVE
					&& (next == COMB_DOT_ABOVE
				        || ((ccc = uprop_get_ccc(next)) != 0 && ccc != 230));
			} else
				ctx_t.after_soft_dotted = false;
		}

		ctx_l.after_I = ch == 'I';
	}

	return n;
}