#include #include #include #include #include #include #include #include #include #include "alloc.h" #include "errors.h" #include "lexer.h" #include "unicode.h" #define LEXEMES_DFLT_CAP (2048) #define SIZE_WDTH (sizeof(size_t) * CHAR_BIT) static bool skip_comment(const uchar **, const uchar *); static struct lexemes_soa mk_lexemes_soa(void); static void lexemes_soa_resz(struct lexemes_soa *); static const bool is_numeric_lookup[UCHAR_MAX + 1] = { ['0'] = true, ['1'] = true, ['2'] = true, ['3'] = true, ['4'] = true, ['5'] = true, ['6'] = true, ['7'] = true, ['8'] = true, ['9'] = true, ['\''] = true, }; struct lexemes_soa lexstring(const uchar *code, size_t codesz) { #if ORYX_SIMD if (!utf8_validate_simd(code, codesz)) { #endif size_t loc = utf8_validate_off(code, codesz); if (loc != 0) { err("Invalid byte ‘0x%02" PRIx8 "’ in UTF-8 input at byte %zu", code[loc - 1], loc); } #if ORYX_SIMD } #endif struct lexemes_soa data = mk_lexemes_soa(); const uchar *start = code, *end = start + codesz; while (likely(code < end)) { const uchar *spnbeg = code, *spnend; rune ch = utf8_decode(&code); switch (ch) { /* Single-byte literals */ case '&': case '(': case ')': case '*': case '+': case '-': case ':': case ';': case '=': case '[': case ']': case '{': case '|': case '}': case '~': data.kinds[data.len++] = ch; break; /* Single- or double-byte literals */ case '/': if (code < end && code[0] == '*') { if (!skip_comment(&code, end)) err("Unterminated comment at byte %td", code - start); continue; } data.kinds[data.len++] = ch; break; case '<': case '>': data.kinds[data.len++] = ch; /* See the comment in lexer.h for where 193 comes from */ if (code < end && code[0] == ch) { code++; data.kinds[data.len - 1] += 193; } break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': data.kinds[data.len] = LEXNUM; data.strs[data.len].p = spnbeg; while (likely(code < end) && is_numeric_lookup[code[0]]) { if (unlikely(code[0] == '\'' && code[-1] == '\'')) { err("Adjacent numeric separators at byte %td", code - start); } code++; } if (unlikely(code < end && code[-1] == '\'')) { err("Numeric literal ends with numeric separator at byte %td", code - start); } data.strs[data.len++].len = code - spnbeg; break; default: if (!rune_is_xids(ch)) continue; data.kinds[data.len] = LEXIDENT; data.strs[data.len].p = spnbeg; spnend = code; while (likely(code < end) && rune_is_xidc(ch)) { spnend = code; ch = utf8_decode(&code); } if (likely(code < end)) code = spnend; data.strs[data.len++].len = spnend - spnbeg; } if (unlikely(data.len == data.cap)) lexemes_soa_resz(&data); } if (unlikely(data.len == data.cap)) lexemes_soa_resz(&data); data.kinds[data.len++] = LEXEOF; return data; } bool skip_comment(const uchar **ptr, const uchar *end) { int nst = 1; const uchar *p = *ptr; for (p++; likely(p < end); p++) { if (p + 1 < end) { if (p[0] == '*' && p[1] == '/') { p++; if (--nst == 0) goto out; } else if (p[0] == '/' && p[1] == '*') { p++; nst++; } } } return false; out: *ptr = ++p; return true; } struct lexemes_soa mk_lexemes_soa(void) { struct lexemes_soa soa; static_assert(offsetof(struct lexemes_soa, kinds) < offsetof(struct lexemes_soa, strs), "KINDS is not the first field before STRS"); static_assert(LEXEMES_DFLT_CAP * sizeof(*soa.kinds) % alignof(*soa.strs) == 0, "Additional padding is required to properly align STRS"); soa.len = 0; soa.cap = LEXEMES_DFLT_CAP; soa.kinds = bufalloc(NULL, soa.cap, LEXEMES_SOA_BLKSZ); soa.strs = (void *)((char *)soa.kinds + soa.cap * sizeof(*soa.kinds)); return soa; } void lexemes_soa_resz(struct lexemes_soa *soa) { static_assert(offsetof(struct lexemes_soa, kinds) < offsetof(struct lexemes_soa, strs), "KINDS is not the first field before STRS"); size_t ncap, pad, newsz; ptrdiff_t off = (char *)soa->strs - (char *)soa->kinds; /* The capacity is always going to be a power of 2, so checking for overflow becomes pretty trivial */ if ((soa->cap >> (SIZE_WDTH - 1)) != 0) { errno = EOVERFLOW; err("lexemes_soa_resz:"); } ncap = soa->cap << 1; /* Ensure that soa->strs is properly aligned */ pad = alignof(*soa->strs) - ncap * sizeof(*soa->kinds) % alignof(*soa->strs); if (pad == alignof(*soa->strs)) pad = 0; newsz = ncap * LEXEMES_SOA_BLKSZ + pad; soa->kinds = bufalloc(soa->kinds, newsz, 1); soa->strs = (void *)((char *)soa->kinds + ncap * sizeof(*soa->kinds) + pad); memmove(soa->strs, (char *)soa->kinds + off, soa->len * sizeof(*soa->strs)); soa->cap = ncap; }