From 3db6b4427de43ec3ab54f6cec3e1a014780d6890 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Sun, 25 Aug 2024 00:24:25 +0200 Subject: Add simd-unicode --- c/simd-unicode/unicode.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 c/simd-unicode/unicode.c (limited to 'c/simd-unicode/unicode.c') diff --git a/c/simd-unicode/unicode.c b/c/simd-unicode/unicode.c new file mode 100644 index 0000000..988d9f6 --- /dev/null +++ b/c/simd-unicode/unicode.c @@ -0,0 +1,152 @@ +#if !__AVX512F__ +# error "AVX512 intrinsics are required" +#endif + +#define _GNU_SOURCE +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "unicode.h" + +#define MIN(x, y) ((x) < (y) ? (x) : (y)) + +static int unicode_generic(const char *, size_t); +static int unicode_simd(const char *, size_t); +static char *readfile(const char *, size_t *); + +int +main(int argc, char **argv) +{ + if (argc != 2) { + fprintf(stderr, "Usage: %s file\n", getprogname()); + exit(EXIT_FAILURE); + } + + size_t n; + int rv = EXIT_SUCCESS; + const char *s = readfile(argv[1], &n); + + clock_t tmbeg = clock(); + printf("%d: ", unicode_generic(s, n)); + printf("Elapsed time: %.3fs\n", (double)(clock() - tmbeg) / CLOCKS_PER_SEC); + + tmbeg = clock(); + printf("%d: ", unicode_simd(s, n)); + printf("Elapsed time: %.3fs\n", (double)(clock() - tmbeg) / CLOCKS_PER_SEC); + + return rv; +} + +int +unicode_generic(const char *s, size_t n) +{ + int acc = 0; + while (n > 0) { + rune ch; + int w = u8tor(&ch, s); + acc += stage2[stage1[ch / CCCBLKSIZ]][ch % CCCBLKSIZ]; + s += w; + n -= w; + } + return acc; +} + +int +unicode_simd(const char *s, size_t n) +{ + int acc = 0; + const __m512i modmsk = _mm512_set1_epi32(CCCBLKSIZ - 1); + + while (n > 0) { + int w = 0; + alignas(64) uint32_t data[16]; + + w += u8tor(&data[ 0], s + w); + w += u8tor(&data[ 1], s + w); + w += u8tor(&data[ 2], s + w); + w += u8tor(&data[ 3], s + w); + w += u8tor(&data[ 4], s + w); + w += u8tor(&data[ 5], s + w); + w += u8tor(&data[ 6], s + w); + w += u8tor(&data[ 7], s + w); + w += u8tor(&data[ 8], s + w); + w += u8tor(&data[ 9], s + w); + w += u8tor(&data[10], s + w); + w += u8tor(&data[11], s + w); + w += u8tor(&data[12], s + w); + w += u8tor(&data[13], s + w); + w += u8tor(&data[14], s + w); + w += u8tor(&data[15], s + w); + + s += w; + n -= w; + + __m512i vec = _mm512_load_epi32(data); + __m512i vidx = _mm512_srli_epi32(vec, CCCBLKSIZ_LOG2); /* vidx = vec / CCCBLKSZ */ +#if USE_GATHER + vidx = _mm512_i32gather_epi32(vidx, stage1, sizeof(rune)); +#else + _mm512_store_epi32(data, vidx); + for (int i = 0; i < 16; i++) + data[i] = stage1[data[i]]; + vidx = _mm512_load_epi32(data); +#endif + vidx = _mm512_slli_epi32(vidx, CCCBLKSIZ_LOG2); + vec = _mm512_and_epi32(vec, modmsk); + vidx = _mm512_add_epi32(vidx, vec); +#if USE_GATHER + vec = _mm512_i32gather_epi32(vidx, stage2, sizeof(rune)); + _mm512_store_epi32(data, vec); +#else + _mm512_store_epi32(data, vidx); + for (int i = 0; i < 16; i++) + data[i] = ((uprop_ccc_t *)stage2)[data[i]]; +#endif + + for (int i = 0; i < 16; i++) + acc += (uprop_ccc_t)data[i]; + } + + return acc; +} + +char * +readfile(const char *filename, size_t *n) +{ + int fd = open(filename, O_RDONLY); + if (fd == -1) + err(1, "open: %s", filename); + + struct stat sb; + if (fstat(fd, &sb) == -1) + err(1, "fstat: %s", filename); + + char *p = malloc(sb.st_size + 4); + if (p == NULL) + err(1, "malloc"); + + ssize_t nr; + for (size_t off = 0; (nr = read(fd, p + off, sb.st_blksize)) > 0; off += nr) + ; + if (nr == -1) + err(1, "read: %s", filename); + + p[sb.st_size + 0] = + p[sb.st_size + 1] = + p[sb.st_size + 2] = + p[sb.st_size + 3] = 0; + + *n = sb.st_size; + close(fd); + return p; +} -- cgit v1.2.3