#include #include #include #include #include #include #include #include #include "errors.h" #include "lexer.h" #include "unicode.h" #ifdef __GNUC__ # define likely(x) __builtin_expect(!!(x), 1) # define unlikely(x) __builtin_expect(!!(x), 0) #else # define likely(x) (x) # define unlikely(x) (x) #endif #define SIZE_WDTH (sizeof(size_t) * 8) static bool skip_comment(const uchar **, const uchar *); static struct lexemes_soa mk_lexemes_soa(void); static void lexemes_soa_resz(struct lexemes_soa *); struct lexemes_soa lexstring(const uchar *code, size_t codesz) { #if ORYX_SIMD if (!utf8_validate_simd(code, codesz)) { #endif size_t loc = utf8_validate_off(code, codesz); if (loc != 0) { err("Invalid byte ‘0x%02" PRIx8 "’ in UTF-8 input at byte %zu", code[loc - 1], loc); } #if ORYX_SIMD } #endif struct lexemes_soa data = mk_lexemes_soa(); const uchar *start = code, *end = start + codesz; while (likely(code < end)) { const uchar *spnbeg = code, *spnend; rune ch = utf8_decode(&code); switch (ch) { /* Single-byte literals */ case '&': case '(': case ')': case '*': case '+': case '-': case ':': case ';': case '=': case '[': case ']': case '{': case '|': case '}': case '~': data.kinds[data.len++] = ch; break; /* Single- or double-byte literals */ case '/': if (code < end && code[0] == '*') { if (!skip_comment(&code, end)) err("Unterminated comment at byte %td", code - start); continue; } data.kinds[data.len++] = ch; break; case '<': case '>': data.kinds[data.len++] = ch; /* See the comment in lexer.h for where 193 comes from */ if (code < end && code[0] == ch) { code++; data.kinds[data.len - 1] += 193; } break; default: if (!rune_is_xids(ch)) continue; data.kinds[data.len] = LEXIDENT; data.strs[data.len].p = spnbeg; spnend = code; while (likely(code < end) && rune_is_xidc(ch)) { spnend = code; ch = utf8_decode(&code); } if (likely(code < end)) code = spnend; data.strs[data.len++].len = spnend - spnbeg; } if (unlikely(data.len == data.cap)) lexemes_soa_resz(&data); } return data; } bool skip_comment(const uchar **ptr, const uchar *end) { int nst = 1; const uchar *p = *ptr; for (p++; likely(p < end); p++) { if (p + 1 < end) { if (p[0] == '*' && p[1] == '/') { p++; if (--nst == 0) goto out; } else if (p[0] == '/' && p[1] == '*') { p++; nst++; } } } return false; out: *ptr = ++p; return true; } struct lexemes_soa mk_lexemes_soa(void) { static_assert(offsetof(struct lexemes_soa, kinds) < offsetof(struct lexemes_soa, strs), "KINDS is not the first field before STRS"); struct lexemes_soa soa; soa.len = 0; soa.cap = 2048; /* Ensure that soa.strs is properly aligned */ size_t pad = alignof(*soa.strs) - soa.cap * sizeof(*soa.kinds) % alignof(*soa.strs); if (pad == 8) pad = 0; if ((soa.kinds = malloc(soa.cap * LEXEMES_SOA_BLKSZ + pad)) == NULL) err("malloc:"); soa.strs = (void *)((char *)soa.kinds + soa.cap * sizeof(*soa.kinds) + pad); return soa; } void lexemes_soa_resz(struct lexemes_soa *soa) { static_assert(offsetof(struct lexemes_soa, kinds) < offsetof(struct lexemes_soa, strs), "KINDS is not the first field before STRS"); size_t ncap, pad, newsz; ptrdiff_t off = (char *)soa->strs - (char *)soa->kinds; /* The capacity is always going to be a power of 2, so checking for overflow becomes pretty trivial */ if ((soa->cap >> (SIZE_WDTH - 1)) != 0) { errno = EOVERFLOW; err("lexemes_soa_resz:"); } ncap = soa->cap << 1; /* Ensure that soa->strs is properly aligned */ pad = alignof(*soa->strs) - ncap * sizeof(*soa->kinds) % alignof(*soa->strs); if (pad == 8) pad = 0; newsz = ncap * LEXEMES_SOA_BLKSZ + pad; if ((soa->kinds = realloc(soa->kinds, newsz)) == NULL) err("realloc:"); soa->strs = (void *)((char *)soa->kinds + ncap * sizeof(*soa->kinds) + pad); memmove(soa->strs, (char *)soa->kinds + off, soa->len * sizeof(*soa->strs)); soa->cap = ncap; }