aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThomas Voss <mail@thomasvoss.com> 2024-08-03 23:50:05 +0200
committerThomas Voss <mail@thomasvoss.com> 2024-08-03 23:50:05 +0200
commit4b8318aed38de0372f5d249359f2455435776cc3 (patch)
treea3f1ba03dda8d601cff8dccaf657ee816db83f82
parentcd9a85085e03a6719c9cb736396ca1d82e65d89c (diff)
Decode UTF-8 with PEXT when BMI2 is available
-rw-r--r--make.c44
-rw-r--r--src/unicode.c59
2 files changed, 86 insertions, 17 deletions
diff --git a/make.c b/make.c
index c7ad047..79a1760 100644
--- a/make.c
+++ b/make.c
@@ -21,9 +21,10 @@
#endif
enum {
- SIMD_AVX2 = 1 << 0,
- SIMD_NEON = 1 << 1,
- SIMD_SSE4_1 = 1 << 2,
+ CPU_AVX2 = 1 << 0,
+ CPU_NEON = 1 << 1,
+ CPU_SSE4_1 = 1 << 2,
+ CPU_BMI2 = 1 << 3,
};
static char *cflags_all[] = {
@@ -60,7 +61,7 @@ static char *cflags_rls[] = {
static char *argv0;
static bool fflag, Fflag, rflag, Sflag;
-static int simd_flags;
+static int cpu_flags;
static void ld(void);
static void mkgmp(int);
@@ -213,8 +214,10 @@ cc(void *arg)
strspushenv(&cmd, "CFLAGS", cflags_dbg, lengthof(cflags_dbg));
if (!rflag && !Sflag)
strspushl(&cmd, "-fsanitize=address,undefined");
- if (simd_flags != 0)
+ if (cpu_flags != 0)
strspushl(&cmd, "-DORYX_SIMD=1");
+ if (cpu_flags & CPU_BMI2)
+ strspushl(&cmd, "-DORYX_BMI2=1");
llvmquery(&cmd, LLVM_CFLAGS);
strspushl(&cmd, "-o", dst, "-c", src);
@@ -261,8 +264,10 @@ cc_test(void *arg)
strspushenv(&cmd, "CFLAGS", cflags_dbg, lengthof(cflags_dbg));
if (!rflag && !Sflag)
strspushl(&cmd, "-fsanitize=address,undefined");
- if (simd_flags != 0)
+ if (cpu_flags != 0)
strspushl(&cmd, "-DORYX_SIMD=1");
+ if (cpu_flags & CPU_BMI2)
+ strspushl(&cmd, "-DORYX_BMI2=1");
strspushl(&cmd, "-Isrc", "-o", dst, src);
strspush(&cmd, d.objs, d.len);
@@ -382,9 +387,9 @@ tagvalid(const char *file)
char *tag;
int flag;
} tags[] = {
- {"avx2", SIMD_AVX2 },
- {"sse4_1", SIMD_SSE4_1},
- {"neon", SIMD_NEON },
+ {"avx2", CPU_AVX2 },
+ {"sse4_1", CPU_SSE4_1},
+ {"neon", CPU_NEON },
{"generic", 0 },
};
@@ -399,7 +404,7 @@ tagvalid(const char *file)
sprintf(buf, "%.*s-%s%s", (int)(sep - file), file, tags[i].tag, ext);
if (fexists(buf)
- && ((simd_flags & tags[i].flag) != 0 || tags[i].flag == 0))
+ && ((cpu_flags & tags[i].flag) != 0 || tags[i].flag == 0))
{
want_and_have = buf;
break;
@@ -425,27 +430,36 @@ chk_cpu_flags(void)
if (!rflag)
return;
+ /* Test for BMI2 */
+#if __BMI2__
+ cpu_flags |= CPU_BMI2;
+#elif __GNUC__ && __x86_64__
+ asm volatile("cpuid" : "=b"(exx) : "a"(7), "c"(0));
+ if (exx & (1 << 8))
+ cpu_flags |= CPU_BMI2;
+#endif
+
/* Test for AVX512 */
#if __AVX512F__
- simd_flags |= SIMD_AVX2;
+ cpu_flags |= CPU_AVX2;
#elif __GNUC__ && __x86_64__
asm volatile("cpuid" : "=b"(exx) : "a"(7), "c"(0));
if (exx & (1 << 5))
- simd_flags |= SIMD_AVX2;
+ cpu_flags |= CPU_AVX2;
#endif
/* Test for SSE4.1 */
#if __SSE4_1__
- simd_flags |= SIMD_SSE4_1;
+ cpu_flags |= CPU_SSE4_1;
#elif __GNUC__ && __x86_64__
asm volatile("cpuid" : "=c"(exx) : "a"(1), "c"(0));
if (exx & (1 << 19))
- simd_flags |= SIMD_SSE4_1;
+ cpu_flags |= CPU_SSE4_1;
#endif
/* Test for NEON */
#if __ARM_NEON || __ARM_NEON__
- simd_flags |= SIMD_NEON;
+ cpu_flags |= CPU_NEON;
#endif
}
diff --git a/src/unicode.c b/src/unicode.c
index b32b413..6ad34a3 100644
--- a/src/unicode.c
+++ b/src/unicode.c
@@ -1,3 +1,7 @@
+#if ORYX_BMI2
+# include <immintrin.h>
+#endif
+
#include "common.h"
#include "types.h"
#include "unicode-data.h"
@@ -45,9 +49,19 @@ RUNE_IS_GEN(rune_is_xidc, xidc_stage1, xidc_stage2, 128)
static const char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
static const rune mins[] = {RUNE_C(4194304), 0, 128, 2048, RUNE_C(65536)};
+static const int shifte[] = {0, 6, 4, 2, 0};
+
+#if ORYX_BMI2
+static const uint32_t pextmsk[] = {
+ 0x7F000000,
+ 0x1F3F0000,
+ 0x0F3F3F00,
+ 0x073F3F3F,
+};
+#else
static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
static const int shiftc[] = {0, 18, 12, 6, 0};
-static const int shifte[] = {0, 6, 4, 2, 0};
+#endif
rune
utf8_decode(const uchar **buf)
@@ -55,12 +69,32 @@ utf8_decode(const uchar **buf)
const uchar *s = *buf;
int len = lengths[s[0] >> 3];
*buf = s + len + !len;
-
+#if ORYX_BMI2
+ rune c = 0;
+ switch (len) {
+ case 4:
+ c |= (rune)s[3] << 0;
+ /* fallthrough */
+ case 3:
+ c |= (rune)s[2] << 8;
+ /* fallthrough */
+ case 2:
+ c |= (rune)s[1] << 16;
+ /* fallthrough */
+ case 1:
+ c |= (rune)s[0] << 24;
+ break;
+ default:
+ __builtin_unreachable();
+ }
+ return _pext_u32(c, pextmsk[len - 1]);
+#else
rune c = (rune)(s[0] & masks[len]) << 18;
c |= (rune)(s[1] & 0x3f) << 12;
c |= (rune)(s[2] & 0x3f) << 6;
c |= (rune)(s[3] & 0x3f) << 0;
return c >> shiftc[len];
+#endif
}
size_t
@@ -72,11 +106,32 @@ utf8_validate_off(const uchar *s, size_t len)
const uchar *next = s + len + !len;
+#if ORYX_BMI2
+ rune c = 0;
+ switch (len) {
+ case 4:
+ c |= (rune)s[3] << 0;
+ /* fallthrough */
+ case 3:
+ c |= (rune)s[2] << 8;
+ /* fallthrough */
+ case 2:
+ c |= (rune)s[1] << 16;
+ /* fallthrough */
+ case 1:
+ c |= (rune)s[0] << 24;
+ break;
+ default:
+ __builtin_unreachable();
+ }
+ c = _pext_u32(c, pextmsk[len - 1]);
+#else
rune c = (rune)(s[0] & masks[len]) << 18;
c |= (rune)(s[1] & 0x3f) << 12;
c |= (rune)(s[2] & 0x3f) << 6;
c |= (rune)(s[3] & 0x3f) << 0;
c >>= shiftc[len];
+#endif
int e = (c < mins[len]) << 6;
e |= ((c >> 11) == 0x1B) << 7;