From 4b8318aed38de0372f5d249359f2455435776cc3 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Sat, 3 Aug 2024 23:50:05 +0200 Subject: Decode UTF-8 with PEXT when BMI2 is available --- make.c | 44 +++++++++++++++++++++++++++++--------------- src/unicode.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 86 insertions(+), 17 deletions(-) diff --git a/make.c b/make.c index c7ad047..79a1760 100644 --- a/make.c +++ b/make.c @@ -21,9 +21,10 @@ #endif enum { - SIMD_AVX2 = 1 << 0, - SIMD_NEON = 1 << 1, - SIMD_SSE4_1 = 1 << 2, + CPU_AVX2 = 1 << 0, + CPU_NEON = 1 << 1, + CPU_SSE4_1 = 1 << 2, + CPU_BMI2 = 1 << 3, }; static char *cflags_all[] = { @@ -60,7 +61,7 @@ static char *cflags_rls[] = { static char *argv0; static bool fflag, Fflag, rflag, Sflag; -static int simd_flags; +static int cpu_flags; static void ld(void); static void mkgmp(int); @@ -213,8 +214,10 @@ cc(void *arg) strspushenv(&cmd, "CFLAGS", cflags_dbg, lengthof(cflags_dbg)); if (!rflag && !Sflag) strspushl(&cmd, "-fsanitize=address,undefined"); - if (simd_flags != 0) + if (cpu_flags != 0) strspushl(&cmd, "-DORYX_SIMD=1"); + if (cpu_flags & CPU_BMI2) + strspushl(&cmd, "-DORYX_BMI2=1"); llvmquery(&cmd, LLVM_CFLAGS); strspushl(&cmd, "-o", dst, "-c", src); @@ -261,8 +264,10 @@ cc_test(void *arg) strspushenv(&cmd, "CFLAGS", cflags_dbg, lengthof(cflags_dbg)); if (!rflag && !Sflag) strspushl(&cmd, "-fsanitize=address,undefined"); - if (simd_flags != 0) + if (cpu_flags != 0) strspushl(&cmd, "-DORYX_SIMD=1"); + if (cpu_flags & CPU_BMI2) + strspushl(&cmd, "-DORYX_BMI2=1"); strspushl(&cmd, "-Isrc", "-o", dst, src); strspush(&cmd, d.objs, d.len); @@ -382,9 +387,9 @@ tagvalid(const char *file) char *tag; int flag; } tags[] = { - {"avx2", SIMD_AVX2 }, - {"sse4_1", SIMD_SSE4_1}, - {"neon", SIMD_NEON }, + {"avx2", CPU_AVX2 }, + {"sse4_1", CPU_SSE4_1}, + {"neon", CPU_NEON }, {"generic", 0 }, }; @@ -399,7 +404,7 @@ tagvalid(const char *file) sprintf(buf, "%.*s-%s%s", (int)(sep - file), file, tags[i].tag, ext); if (fexists(buf) - && ((simd_flags & tags[i].flag) != 0 || tags[i].flag == 0)) + && ((cpu_flags & tags[i].flag) != 0 || tags[i].flag == 0)) { want_and_have = buf; break; @@ -425,27 +430,36 @@ chk_cpu_flags(void) if (!rflag) return; + /* Test for BMI2 */ +#if __BMI2__ + cpu_flags |= CPU_BMI2; +#elif __GNUC__ && __x86_64__ + asm volatile("cpuid" : "=b"(exx) : "a"(7), "c"(0)); + if (exx & (1 << 8)) + cpu_flags |= CPU_BMI2; +#endif + /* Test for AVX512 */ #if __AVX512F__ - simd_flags |= SIMD_AVX2; + cpu_flags |= CPU_AVX2; #elif __GNUC__ && __x86_64__ asm volatile("cpuid" : "=b"(exx) : "a"(7), "c"(0)); if (exx & (1 << 5)) - simd_flags |= SIMD_AVX2; + cpu_flags |= CPU_AVX2; #endif /* Test for SSE4.1 */ #if __SSE4_1__ - simd_flags |= SIMD_SSE4_1; + cpu_flags |= CPU_SSE4_1; #elif __GNUC__ && __x86_64__ asm volatile("cpuid" : "=c"(exx) : "a"(1), "c"(0)); if (exx & (1 << 19)) - simd_flags |= SIMD_SSE4_1; + cpu_flags |= CPU_SSE4_1; #endif /* Test for NEON */ #if __ARM_NEON || __ARM_NEON__ - simd_flags |= SIMD_NEON; + cpu_flags |= CPU_NEON; #endif } diff --git a/src/unicode.c b/src/unicode.c index b32b413..6ad34a3 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -1,3 +1,7 @@ +#if ORYX_BMI2 +# include +#endif + #include "common.h" #include "types.h" #include "unicode-data.h" @@ -45,9 +49,19 @@ RUNE_IS_GEN(rune_is_xidc, xidc_stage1, xidc_stage2, 128) static const char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0}; static const rune mins[] = {RUNE_C(4194304), 0, 128, 2048, RUNE_C(65536)}; +static const int shifte[] = {0, 6, 4, 2, 0}; + +#if ORYX_BMI2 +static const uint32_t pextmsk[] = { + 0x7F000000, + 0x1F3F0000, + 0x0F3F3F00, + 0x073F3F3F, +}; +#else static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07}; static const int shiftc[] = {0, 18, 12, 6, 0}; -static const int shifte[] = {0, 6, 4, 2, 0}; +#endif rune utf8_decode(const uchar **buf) @@ -55,12 +69,32 @@ utf8_decode(const uchar **buf) const uchar *s = *buf; int len = lengths[s[0] >> 3]; *buf = s + len + !len; - +#if ORYX_BMI2 + rune c = 0; + switch (len) { + case 4: + c |= (rune)s[3] << 0; + /* fallthrough */ + case 3: + c |= (rune)s[2] << 8; + /* fallthrough */ + case 2: + c |= (rune)s[1] << 16; + /* fallthrough */ + case 1: + c |= (rune)s[0] << 24; + break; + default: + __builtin_unreachable(); + } + return _pext_u32(c, pextmsk[len - 1]); +#else rune c = (rune)(s[0] & masks[len]) << 18; c |= (rune)(s[1] & 0x3f) << 12; c |= (rune)(s[2] & 0x3f) << 6; c |= (rune)(s[3] & 0x3f) << 0; return c >> shiftc[len]; +#endif } size_t @@ -72,11 +106,32 @@ utf8_validate_off(const uchar *s, size_t len) const uchar *next = s + len + !len; +#if ORYX_BMI2 + rune c = 0; + switch (len) { + case 4: + c |= (rune)s[3] << 0; + /* fallthrough */ + case 3: + c |= (rune)s[2] << 8; + /* fallthrough */ + case 2: + c |= (rune)s[1] << 16; + /* fallthrough */ + case 1: + c |= (rune)s[0] << 24; + break; + default: + __builtin_unreachable(); + } + c = _pext_u32(c, pextmsk[len - 1]); +#else rune c = (rune)(s[0] & masks[len]) << 18; c |= (rune)(s[1] & 0x3f) << 12; c |= (rune)(s[2] & 0x3f) << 6; c |= (rune)(s[3] & 0x3f) << 0; c >>= shiftc[len]; +#endif int e = (c < mins[len]) << 6; e |= ((c >> 11) == 0x1B) << 7; -- cgit v1.2.3