From a89a14ef5da44684a16b204e7a70460cc8c4922a Mon Sep 17 00:00:00 2001
From: Thomas Voss
Date: Fri, 21 Jun 2024 23:36:36 +0200
Subject: Basic constant folding implementation

---
 vendor/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm | 191 +++++++++++++++++++++++++++
 1 file changed, 191 insertions(+)
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm

diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm b/vendor/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm
new file mode 100644
index 0000000..7b084f4
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm
@@ -0,0 +1,191 @@
+dnl AMD64 SSSE3/XOP mpn_popcount -- population count.
+
+dnl Copyright 2010-2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C                   cycles/limb   good for cpu?
+C AMD K8,K9             n/a
+C AMD K10               n/a
+C AMD bd1               1.27          y
+C AMD bd2               1.24          y
+C AMD bd3                ?
+C AMD bd4               1.22
+C AMD zen               n/a
+C AMD bobcat            n/a
+C AMD jaguar            n/a
+C Intel P4              n/a
+C Intel CNR             n/a
+C Intel PNR             n/a
+C Intel NHM             n/a
+C Intel SBR             n/a
+C Intel IBR             n/a
+C Intel HWL             n/a
+C Intel BWL             n/a
+C Intel SKL             n/a
+C Intel atom            n/a
+C Intel SLM             n/a
+C VIA nano              n/a
+
+C TODO
+C  * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
+C    intend to support old systems.
+
+C We use vpshlb and vpperm below, which are XOP extensions to AVX.  Some
+C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
+C We fall back to the core2 code.
+ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',`
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86_64/core2/popcount.asm')
+',`
+
+define(`up', `%rdi')
+define(`n', `%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+        TEXT
+        ALIGN(32)
+PROLOGUE(mpn_popcount)
+        FUNC_ENTRY(3)
+        lea     L(cnsts)(%rip), %r9
+
+ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)',
+             `define(`OFF1',64) define(`OFF2',80) define(`OFF3',96)')
+        movdqa  OFF1`'(%r9), %xmm7          C nibble counts table
+        movdqa  OFF2`'(%r9), %xmm6          C splat shift counts
+        movdqa  OFF3`'(%r9), %xmm9          C masks
+        pxor    %xmm4, %xmm4
+        pxor    %xmm5, %xmm5                C 0-reg
+        pxor    %xmm8, %xmm8                C grand total count
+
+        xor     R32(%rdx), R32(%rdx)
+
+        mov     R32(n), R32(%rax)
+        and     $7, R32(%rax)
+ifdef(`PIC',`
+        movslq  (%r9,%rax,4), %rax
+        add     %r9, %rax
+        jmp     *%rax
+',`
+        jmp     *(%r9,%rax,8)
+')
+
+L(1):   .byte   0xf3,0x48,0x0f,0xb8,0x17    C popcnt (up), %rdx
+        add     $8, up
+        dec     n
+        jnz     L(top)
+        mov     %rdx, %rax
+        FUNC_EXIT()
+        ret
+
+L(2):   add     $-48, up
+        jmp     L(e2)
+
+L(3):   .byte   0xf3,0x48,0x0f,0xb8,0x17    C popcnt (up), %rdx
+        add     $-40, up
+        jmp     L(e2)
+
+L(4):   add     $-32, up
+        jmp     L(e4)
+
+L(5):   .byte   0xf3,0x48,0x0f,0xb8,0x17    C popcnt (up), %rdx
+        add     $-24, up
+        jmp     L(e4)
+
+L(6):   add     $-16, up
+        jmp     L(e6)
+
+L(7):   .byte   0xf3,0x48,0x0f,0xb8,0x17    C popcnt (up), %rdx
+        add     $-8, up
+        jmp     L(e6)
+
+        ALIGN(32)
+L(top): lddqu   (up), %xmm0
+        .byte   0x8f,0xe9,0x48,0x94,0xc8        C vpshlb %xmm6, %xmm0, %xmm1
+        pand    %xmm9, %xmm0
+        pand    %xmm9, %xmm1
+        .byte   0x8f,0xe8,0x40,0xa3,0xd7,0x00   C vpperm %xmm0, %xmm7, %xmm7, %xmm2
+        .byte   0x8f,0xe8,0x40,0xa3,0xdf,0x10   C vpperm %xmm1, %xmm7, %xmm7, %xmm3
+        paddb   %xmm2, %xmm3
+        paddb   %xmm3, %xmm4
+L(e6):  lddqu   16(up), %xmm0
+        .byte   0x8f,0xe9,0x48,0x94,0xc8        C vpshlb %xmm6, %xmm0, %xmm1
+        pand    %xmm9, %xmm0
+        pand    %xmm9, %xmm1
+        .byte   0x8f,0xe8,0x40,0xa3,0xd7,0x00   C vpperm %xmm0, %xmm7, %xmm7, %xmm2
+        .byte   0x8f,0xe8,0x40,0xa3,0xdf,0x10   C vpperm %xmm1, %xmm7, %xmm7, %xmm3
+        paddb   %xmm2, %xmm3
+        paddb   %xmm3, %xmm4
+L(e4):  lddqu   32(up), %xmm0
+        .byte   0x8f,0xe9,0x48,0x94,0xc8        C vpshlb %xmm6, %xmm0, %xmm1
+        pand    %xmm9, %xmm0
+        pand    %xmm9, %xmm1
+        .byte   0x8f,0xe8,0x40,0xa3,0xd7,0x00   C vpperm %xmm0, %xmm7, %xmm7, %xmm2
+        .byte   0x8f,0xe9,0x78,0xd3,0xec        C vphaddubq %xmm4, %xmm5
+        .byte   0x8f,0xe8,0x40,0xa3,0xe7,0x10   C vpperm %xmm1, %xmm7, %xmm7, %xmm4
+        paddb   %xmm2, %xmm4
+L(e2):  popcnt  48(up), %r8
+        popcnt  56(up), %r9
+        add     $64, up
+        paddq   %xmm5, %xmm8                C sum to 2 x 64-bit counts
+        add     %r8, %rdx
+        add     %r9, %rdx
+        sub     $8, n
+        jg      L(top)
+
+        .byte   0x8f,0xe9,0x78,0xd3,0xec        C vphaddubq %xmm4, %xmm5
+        paddq   %xmm5, %xmm8
+        pshufd  $14, %xmm8, %xmm0
+        paddq   %xmm8, %xmm0
+        movd    %xmm0, %rax
+        add     %rdx, %rax
+        FUNC_EXIT()
+        ret
+EPILOGUE()
+DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
+        JMPENT( L(top), L(cnsts))
+        JMPENT( L(1), L(cnsts))
+        JMPENT( L(2), L(cnsts))
+        JMPENT( L(3), L(cnsts))
+        JMPENT( L(4), L(cnsts))
+        JMPENT( L(5), L(cnsts))
+        JMPENT( L(6), L(cnsts))
+        JMPENT( L(7), L(cnsts))
+        .byte   0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+        .byte   0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
+        .byte   -4,-4,-4,-4,-4,-4,-4,-4
+        .byte   -4,-4,-4,-4,-4,-4,-4,-4
+        .byte   0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+        .byte   0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+END_OBJECT(L(cnsts))
+')
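Note on the technique (this note is commentary, not part of the upstream file): the XOP
loop above is a SIMD form of the classic nibble-table popcount.  The constant block
L(cnsts) ends with exactly the three vectors it needs: a 16-entry table of per-nibble
bit counts (0,1,1,2,...), a vector of -4 shift counts for vpshlb (in XOP's vpshlb a
negative count shifts right, so this extracts high nibbles), and 0x0f masks to isolate
low nibbles.  vpperm then acts as sixteen parallel table lookups, and vphaddubq widens
the byte accumulators into 64-bit lanes before they can overflow.  A minimal scalar C
sketch of the same idea, with hypothetical names (popcount_nibbles is not a GMP
function):

#include <stddef.h>
#include <stdint.h>

/* Same 16 per-nibble bit counts as the first two .byte rows of L(cnsts). */
static const uint8_t nibble_count[16] = {
    0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4
};

/* Scalar analogue of the vpshlb/pand/vpperm sequence: split every byte
   into its low nibble (the 0x0f mask) and high nibble (the -4 shift
   counts), then sum table lookups -- vpperm performs 16 such lookups at
   once, and vphaddubq plays the role of the running 64-bit total. */
uint64_t popcount_nibbles(const uint64_t *up, size_t n)
{
    uint64_t total = 0;
    for (size_t i = 0; i < n; i++) {
        uint64_t limb = up[i];
        for (int b = 0; b < 64; b += 8) {
            uint8_t byte = (limb >> b) & 0xff;
            total += nibble_count[byte & 0x0f];  /* low nibble  */
            total += nibble_count[byte >> 4];    /* high nibble */
        }
    }
    return total;
}

The jump table at the start of L(cnsts) serves the remainder-dispatch role: n mod 8
selects an entry point (L(1) through L(7)) that consumes the leftover limbs before
the unrolled 8-limb loop at L(top) takes over.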