From a89a14ef5da44684a16b204e7a70460cc8c4922a Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Fri, 21 Jun 2024 23:36:36 +0200 Subject: Basic constant folding implementation --- vendor/gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm | 203 +++++++++++ vendor/gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm | 198 +++++++++++ vendor/gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm | 310 ++++++++++++++++ vendor/gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm | 388 +++++++++++++++++++++ vendor/gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm | 34 ++ 5 files changed, 1133 insertions(+) create mode 100644 vendor/gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm create mode 100644 vendor/gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm create mode 100644 vendor/gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm create mode 100644 vendor/gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm create mode 100644 vendor/gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm (limited to 'vendor/gmp-6.3.0/mpn/powerpc32/vmx') diff --git a/vendor/gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm b/vendor/gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm new file mode 100644 index 0000000..dee7266 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm @@ -0,0 +1,203 @@ +dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd. + +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C 16-byte coaligned unaligned +C cycles/limb cycles/limb +C 7400,7410 (G4): 0.5 0.64 +C 744x,745x (G4+): 0.75 0.82 +C 970 (G5): 0.78 1.02 (64-bit limbs) + +C STATUS +C * Works for all sizes and alignments. + +C TODO +C * Optimize unaligned case. Some basic tests with 2-way and 4-way unrolling +C indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80 +C c/l for 970. +C * Consider using VMX instructions also for head and tail, by using some +C read-modify-write tricks. +C * The VMX code is used from the smallest sizes it handles, but measurements +C show a large speed bump at the cutoff points. Small copying (perhaps +C using some read-modify-write technique) should be optimized. +C * Make an mpn_com based on this code. + +define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8)) +define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES)) +define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES)) + + +ifelse(GMP_LIMB_BITS,32,` + define(`LIMB32',` $1') + define(`LIMB64',`') +',` + define(`LIMB32',`') + define(`LIMB64',` $1') +') + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') + +define(`us', `v4') + + +ASM_START() +PROLOGUE(mpn_copyd) + +LIMB32(`slwi. r0, n, 2 ') +LIMB64(`sldi. 
r0, n, 3 ') + add rp, rp, r0 + add up, up, r0 + +LIMB32(`cmpi cr7, n, 11 ') +LIMB64(`cmpdi cr7, n, 5 ') + bge cr7, L(big) + + beqlr cr0 + +C Handle small cases with plain operations + mtctr n +L(topS): +LIMB32(`lwz r0, -4(up) ') +LIMB64(`ld r0, -8(up) ') + addi up, up, -GMP_LIMB_BYTES +LIMB32(`stw r0, -4(rp) ') +LIMB64(`std r0, -8(rp) ') + addi rp, rp, -GMP_LIMB_BYTES + bdnz L(topS) + blr + +C Handle large cases with VMX operations +L(big): + addi rp, rp, -16 + addi up, up, -16 + mfspr r12, 256 + oris r0, r12, 0xf800 C Set VRSAVE bit 0-4 + mtspr 256, r0 + +LIMB32(`rlwinm. r7, rp, 30,30,31') C (rp >> 2) mod 4 +LIMB64(`rlwinm. r7, rp, 29,31,31') C (rp >> 3) mod 2 + beq L(rp_aligned) + + subf n, r7, n +L(top0): +LIMB32(`lwz r0, 12(up) ') +LIMB64(`ld r0, 8(up) ') + addi up, up, -GMP_LIMB_BYTES +LIMB32(`addic. r7, r7, -1 ') +LIMB32(`stw r0, 12(rp) ') +LIMB64(`std r0, 8(rp) ') + addi rp, rp, -GMP_LIMB_BYTES +LIMB32(`bne L(top0) ') + +L(rp_aligned): + +LIMB32(`rlwinm. r0, up, 30,30,31') C (up >> 2) mod 4 +LIMB64(`rlwinm. r0, up, 29,31,31') C (up >> 3) mod 2 + +LIMB64(`srdi r7, n, 2 ') C loop count corresponding to n +LIMB32(`srwi r7, n, 3 ') C loop count corresponding to n + mtctr r7 C copy n to count register + + li r10, -16 + + beq L(up_aligned) + + lvsl us, 0, up + + addi up, up, 16 +LIMB32(`andi. r0, n, 0x4 ') +LIMB64(`andi. r0, n, 0x2 ') + beq L(1) + lvx v0, 0, up + lvx v2, r10, up + vperm v3, v2, v0, us + stvx v3, 0, rp + addi up, up, -32 + addi rp, rp, -16 + b L(lpu) +L(1): lvx v2, 0, up + addi up, up, -16 + b L(lpu) + + ALIGN(32) +L(lpu): lvx v0, 0, up + vperm v3, v0, v2, us + stvx v3, 0, rp + lvx v2, r10, up + addi up, up, -32 + vperm v3, v2, v0, us + stvx v3, r10, rp + addi rp, rp, -32 + bdnz L(lpu) + + b L(tail) + +L(up_aligned): + +LIMB32(`andi. r0, n, 0x4 ') +LIMB64(`andi. r0, n, 0x2 ') + beq L(lpa) + lvx v0, 0, up + stvx v0, 0, rp + addi up, up, -16 + addi rp, rp, -16 + b L(lpa) + + ALIGN(32) +L(lpa): lvx v0, 0, up + lvx v1, r10, up + addi up, up, -32 + nop + stvx v0, 0, rp + stvx v1, r10, rp + addi rp, rp, -32 + bdnz L(lpa) + +L(tail): +LIMB32(`rlwinm. r7, n, 0,30,31 ') C r7 = n mod 4 +LIMB64(`rlwinm. r7, n, 0,31,31 ') C r7 = n mod 2 + beq L(ret) +LIMB32(`li r10, 12 ') +L(top2): +LIMB32(`lwzx r0, r10, up ') +LIMB64(`ld r0, 8(up) ') +LIMB32(`addic. r7, r7, -1 ') +LIMB32(`stwx r0, r10, rp ') +LIMB64(`std r0, 8(rp) ') +LIMB32(`addi r10, r10, -GMP_LIMB_BYTES') +LIMB32(`bne L(top2) ') + +L(ret): mtspr 256, r12 + blr +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm b/vendor/gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm new file mode 100644 index 0000000..992b468 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm @@ -0,0 +1,198 @@ +dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_copyi. + +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C 16-byte coaligned unaligned +C cycles/limb cycles/limb +C 7400,7410 (G4): 0.5 0.64 +C 744x,745x (G4+): 0.75 0.82 +C 970 (G5): 0.78 1.02 (64-bit limbs) + +C STATUS +C * Works for all sizes and alignments. + +C TODO +C * Optimize unaligned case. Some basic tests with 2-way and 4-way unrolling +C indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80 +C c/l for 970. +C * Consider using VMX instructions also for head and tail, by using some +C read-modify-write tricks. +C * The VMX code is used from the smallest sizes it handles, but measurements +C show a large speed bump at the cutoff points. Small copying (perhaps +C using some read-modify-write technique) should be optimized. +C * Make an mpn_com based on this code. + +define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8)) +define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES)) +define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES)) + + +ifelse(GMP_LIMB_BITS,32,` + define(`LIMB32',` $1') + define(`LIMB64',`') +',` + define(`LIMB32',`') + define(`LIMB64',` $1') +') + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') + +define(`us', `v4') + + +ASM_START() +PROLOGUE(mpn_copyi) + +LIMB32(`cmpi cr7, n, 11 ') +LIMB64(`cmpdi cr7, n, 5 ') + bge cr7, L(big) + + or. r0, n, n + beqlr cr0 + +C Handle small cases with plain operations + mtctr n +L(topS): +LIMB32(`lwz r0, 0(up) ') +LIMB64(`ld r0, 0(up) ') + addi up, up, GMP_LIMB_BYTES +LIMB32(`stw r0, 0(rp) ') +LIMB64(`std r0, 0(rp) ') + addi rp, rp, GMP_LIMB_BYTES + bdnz L(topS) + blr + +C Handle large cases with VMX operations +L(big): + mfspr r12, 256 + oris r0, r12, 0xf800 C Set VRSAVE bit 0-4 + mtspr 256, r0 + +LIMB32(`rlwinm. r7, rp, 30,30,31') C (rp >> 2) mod 4 +LIMB64(`rlwinm. r7, rp, 29,31,31') C (rp >> 3) mod 2 + beq L(rp_aligned) + + subfic r7, r7, LIMBS_PER_VR + subf n, r7, n +L(top0): +LIMB32(`lwz r0, 0(up) ') +LIMB64(`ld r0, 0(up) ') + addi up, up, GMP_LIMB_BYTES +LIMB32(`addic. r7, r7, -1 ') +LIMB32(`stw r0, 0(rp) ') +LIMB64(`std r0, 0(rp) ') + addi rp, rp, GMP_LIMB_BYTES +LIMB32(`bne L(top0) ') + +L(rp_aligned): + +LIMB32(`rlwinm. r0, up, 30,30,31') C (up >> 2) mod 4 +LIMB64(`rlwinm. r0, up, 29,31,31') C (up >> 3) mod 2 + +LIMB64(`srdi r7, n, 2 ') C loop count corresponding to n +LIMB32(`srwi r7, n, 3 ') C loop count corresponding to n + mtctr r7 C copy n to count register + + li r10, 16 + + beq L(up_aligned) + + lvsl us, 0, up + +LIMB32(`andi. r0, n, 0x4 ') +LIMB64(`andi. r0, n, 0x2 ') + beq L(1) + lvx v0, 0, up + lvx v2, r10, up + vperm v3, v0, v2, us + stvx v3, 0, rp + addi up, up, 32 + addi rp, rp, 16 + b L(lpu) +L(1): lvx v2, 0, up + addi up, up, 16 + b L(lpu) + + ALIGN(32) +L(lpu): lvx v0, 0, up + vperm v3, v2, v0, us + stvx v3, 0, rp + lvx v2, r10, up + addi up, up, 32 + vperm v3, v0, v2, us + stvx v3, r10, rp + addi rp, rp, 32 + bdnz L(lpu) + + addi up, up, -16 + b L(tail) + +L(up_aligned): + +LIMB32(`andi. r0, n, 0x4 ') +LIMB64(`andi. 
r0, n, 0x2 ') + beq L(lpa) + lvx v0, 0, up + stvx v0, 0, rp + addi up, up, 16 + addi rp, rp, 16 + b L(lpa) + + ALIGN(32) +L(lpa): lvx v0, 0, up + lvx v1, r10, up + addi up, up, 32 + nop + stvx v0, 0, rp + stvx v1, r10, rp + addi rp, rp, 32 + bdnz L(lpa) + +L(tail): +LIMB32(`rlwinm. r7, n, 0,30,31 ') C r7 = n mod 4 +LIMB64(`rlwinm. r7, n, 0,31,31 ') C r7 = n mod 2 + beq L(ret) +LIMB32(`li r10, 0 ') +L(top2): +LIMB32(`lwzx r0, r10, up ') +LIMB64(`ld r0, 0(up) ') +LIMB32(`addic. r7, r7, -1 ') +LIMB32(`stwx r0, r10, rp ') +LIMB64(`std r0, 0(rp) ') +LIMB32(`addi r10, r10, GMP_LIMB_BYTES') +LIMB32(`bne L(top2) ') + +L(ret): mtspr 256, r12 + blr +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm b/vendor/gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm new file mode 100644 index 0000000..d656d3b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm @@ -0,0 +1,310 @@ +dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n, +dnl mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise +dnl logical operations. + +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C and,ior,andn,nior,xor iorn,xnor nand +C cycles/limb cycles/limb cycles/limb +C 7400,7410 (G4): 1.39 ? ? +C 744x,745x (G4+): 1.14 1.39 1.39 +C 970: 1.7 2.0 2.0 + +C STATUS +C * Works for all sizes and alignment for 32-bit limbs. +C * Works for n >= 4 for 64-bit limbs; untested for smaller operands. +C * Current performance makes this pointless for 970 + +C TODO +C * Might want to make variants when just one of the source operands needs +C vperm, and when neither needs it. The latter runs 50% faster on 7400. +C * Idea: If the source operands are equally aligned, we could do the logops +C first, then vperm before storing! That means we never need more than one +C vperm, ever! +C * Perhaps align `rp' after initial alignment loop? +C * Instead of having scalar code in the beginning and end, consider using +C read-modify-write vector code. +C * Software pipeline? Hopefully not too important, this is hairy enough +C already. +C * At least be more clever about operand loading, i.e., load v operands before +C u operands, since v operands are sometimes negated. 
+ +define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8)) +define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES)) +define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES)) + +define(`vnegb', `') C default neg-before to null +define(`vnega', `') C default neg-before to null + +ifdef(`OPERATION_and_n', +` define(`func', `mpn_and_n') + define(`logopS',`and $1,$2,$3') + define(`logop', `vand $1,$2,$3')') +ifdef(`OPERATION_andn_n', +` define(`func', `mpn_andn_n') + define(`logopS',`andc $1,$2,$3') + define(`logop', `vandc $1,$2,$3')') +ifdef(`OPERATION_nand_n', +` define(`func', `mpn_nand_n') + define(`logopS',`nand $1,$2,$3') + define(`logop', `vand $1,$2,$3') + define(`vnega', `vnor $1,$2,$2')') +ifdef(`OPERATION_ior_n', +` define(`func', `mpn_ior_n') + define(`logopS',`or $1,$2,$3') + define(`logop', `vor $1,$2,$3')') +ifdef(`OPERATION_iorn_n', +` define(`func', `mpn_iorn_n') + define(`logopS',`orc $1,$2,$3') + define(`vnegb', `vnor $1,$2,$2') + define(`logop', `vor $1,$2,$3')') +ifdef(`OPERATION_nior_n', +` define(`func', `mpn_nior_n') + define(`logopS',`nor $1,$2,$3') + define(`logop', `vnor $1,$2,$3')') +ifdef(`OPERATION_xor_n', +` define(`func', `mpn_xor_n') + define(`logopS',`xor $1,$2,$3') + define(`logop', `vxor $1,$2,$3')') +ifdef(`OPERATION_xnor_n', +` define(`func',`mpn_xnor_n') + define(`logopS',`eqv $1,$2,$3') + define(`vnegb', `vnor $1,$2,$2') + define(`logop', `vxor $1,$2,$3')') + +ifelse(GMP_LIMB_BITS,`32',` + define(`LIMB32',` $1') + define(`LIMB64',`') +',` + define(`LIMB32',`') + define(`LIMB64',` $1') +') + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`vp', `r5') +define(`n', `r6') + +define(`us', `v8') +define(`vs', `v9') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +ASM_START() +PROLOGUE(func) + +LIMB32(`cmpwi cr0, n, 8 ') +LIMB64(`cmpdi cr0, n, 4 ') + bge L(big) + + mtctr n + +LIMB32(`lwz r8, 0(up) ') +LIMB32(`lwz r9, 0(vp) ') +LIMB32(`logopS( r0, r8, r9) ') +LIMB32(`stw r0, 0(rp) ') +LIMB32(`bdz L(endS) ') + +L(topS): +LIMB32(`lwzu r8, 4(up) ') +LIMB64(`ld r8, 0(up) ') +LIMB64(`addi up, up, GMP_LIMB_BYTES ') +LIMB32(`lwzu r9, 4(vp) ') +LIMB64(`ld r9, 0(vp) ') +LIMB64(`addi vp, vp, GMP_LIMB_BYTES ') + logopS( r0, r8, r9) +LIMB32(`stwu r0, 4(rp) ') +LIMB64(`std r0, 0(rp) ') +LIMB64(`addi rp, rp, GMP_LIMB_BYTES ') + bdnz L(topS) +L(endS): + blr + +L(big): mfspr r12, 256 + oris r0, r12, 0xfffc C Set VRSAVE bit 0-13 FIXME + mtspr 256, r0 + +C First loop until the destination is 16-byte aligned. This will execute 0 or 1 +C times for 64-bit machines, and 0 to 3 times for 32-bit machines. + +LIMB32(`rlwinm. r0, rp, 30,30,31') C (rp >> 2) mod 4 +LIMB64(`rlwinm. r0, rp, 29,31,31') C (rp >> 3) mod 2 + beq L(aligned) + + subfic r7, r0, LIMBS_PER_VR +LIMB32(`li r10, 0 ') + subf n, r7, n +L(top0): +LIMB32(`lwz r8, 0(up) ') +LIMB64(`ld r8, 0(up) ') + addi up, up, GMP_LIMB_BYTES +LIMB32(`lwz r9, 0(vp) ') +LIMB64(`ld r9, 0(vp) ') + addi vp, vp, GMP_LIMB_BYTES +LIMB32(`addic. 
r7, r7, -1 ') + logopS( r0, r8, r9) +LIMB32(`stwx r0, r10, rp ') +LIMB64(`std r0, 0(rp) ') +LIMB32(`addi r10, r10, GMP_LIMB_BYTES') +LIMB32(`bne L(top0) ') + + addi rp, rp, 16 C update rp, but preserve its alignment + +L(aligned): +LIMB64(`srdi r7, n, 1 ') C loop count corresponding to n +LIMB32(`srwi r7, n, 2 ') C loop count corresponding to n + mtctr r7 C copy n to count register + + li r10, 16 + lvsl us, 0, up + lvsl vs, 0, vp + + lvx v2, 0, up + lvx v3, 0, vp + bdnz L(gt1) + lvx v0, r10, up + lvx v1, r10, vp + vperm v4, v2, v0, us + vperm v5, v3, v1, vs + vnegb( v5, v5) + logop( v6, v4, v5) + vnega( v6, v6) + stvx v6, 0, rp + addi up, up, 16 + addi vp, vp, 16 + addi rp, rp, 4 + b L(tail) + +L(gt1): addi up, up, 16 + addi vp, vp, 16 + +L(top): lvx v0, 0, up + lvx v1, 0, vp + vperm v4, v2, v0, us + vperm v5, v3, v1, vs + vnegb( v5, v5) + logop( v6, v4, v5) + vnega( v6, v6) + stvx v6, 0, rp + bdz L(end) + lvx v2, r10, up + lvx v3, r10, vp + vperm v4, v0, v2, us + vperm v5, v1, v3, vs + vnegb( v5, v5) + logop( v6, v4, v5) + vnega( v6, v6) + stvx v6, r10, rp + addi up, up, 32 + addi vp, vp, 32 + addi rp, rp, 32 + bdnz L(top) + + andi. r0, up, 15 + vxor v0, v0, v0 + beq 1f + lvx v0, 0, up +1: andi. r0, vp, 15 + vxor v1, v1, v1 + beq 1f + lvx v1, 0, vp +1: vperm v4, v2, v0, us + vperm v5, v3, v1, vs + vnegb( v5, v5) + logop( v6, v4, v5) + vnega( v6, v6) + stvx v6, 0, rp + addi rp, rp, 4 + b L(tail) + +L(end): andi. r0, up, 15 + vxor v2, v2, v2 + beq 1f + lvx v2, r10, up +1: andi. r0, vp, 15 + vxor v3, v3, v3 + beq 1f + lvx v3, r10, vp +1: vperm v4, v0, v2, us + vperm v5, v1, v3, vs + vnegb( v5, v5) + logop( v6, v4, v5) + vnega( v6, v6) + stvx v6, r10, rp + + addi up, up, 16 + addi vp, vp, 16 + addi rp, rp, 20 + +L(tail): +LIMB32(`rlwinm. r7, n, 0,30,31 ') C r7 = n mod 4 +LIMB64(`rlwinm. r7, n, 0,31,31 ') C r7 = n mod 2 + beq L(ret) + addi rp, rp, 15 +LIMB32(`rlwinm rp, rp, 0,0,27 ') +LIMB64(`rldicr rp, rp, 0,59 ') + li r10, 0 +L(top2): +LIMB32(`lwzx r8, r10, up ') +LIMB64(`ldx r8, r10, up ') +LIMB32(`lwzx r9, r10, vp ') +LIMB64(`ldx r9, r10, vp ') +LIMB32(`addic. r7, r7, -1 ') + logopS( r0, r8, r9) +LIMB32(`stwx r0, r10, rp ') +LIMB64(`std r0, 0(rp) ') +LIMB32(`addi r10, r10, GMP_LIMB_BYTES') +LIMB32(`bne L(top2) ') + +L(ret): mtspr 256, r12 + blr +EPILOGUE() + +C This works for 64-bit PowerPC, since a limb ptr can only be aligned +C in 2 relevant ways, which means we can always find a pair of aligned +C pointers of rp, up, and vp. +C process words until rp is 16-byte aligned +C if (((up | vp) & 15) == 0) +C process with VMX without any vperm +C else if ((up & 15) != 0 && (vp & 15) != 0) +C process with VMX using vperm on store data +C else if ((up & 15) != 0) +C process with VMX using vperm on up data +C else +C process with VMX using vperm on vp data +C +C rlwinm, r0, up, 0,28,31 +C rlwinm r0, vp, 0,28,31 +C cmpwi cr7, r0, 0 +C cror cr6, cr0, cr7 +C crand cr0, cr0, cr7 diff --git a/vendor/gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm new file mode 100644 index 0000000..2bb11cd --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm @@ -0,0 +1,388 @@ +dnl PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1. + +dnl Copyright 2002, 2003, 2005-2007, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + + +C cycles/limb +C 603e: - +C 604e: - +C 75x (G3): - +C 7400,7410 (G4): 1 simple load-use scheduling results in 0.75 +C 744x,745x (G4+): 0.75 +C ppc970: 0.75 +C power4: - +C power5: - + +C TODO +C * Either start using the low-end masking constants, or remove them. +C * Merge multiple feed-in cases into a parameterized code block. +C * Reduce register usage. It should be possible to almost halve it. + +define(`up', `r3') +define(`n', `r4') + +define(`a0', `v3') +define(`a1', `v4') +define(`a2', `v5') +define(`c0', `v6') +define(`c1', `v7') +define(`c2', `v8') +define(`z', `v9') +define(`x0', `v10') +define(`x1', `v11') +define(`x2', `v12') +define(`x3', `v13') +define(`pv', `v14') +define(`y0', `v0') +define(`y1', `v1') +define(`y2', `v2') +define(`y3', `v15') + +ASM_START() +PROLOGUE(mpn_mod_34lsub1) + cmpwi cr0, n, 20 C tuned cutoff point + bge L(large) + + li r9, 0 C result accumulator + mulli r10, n, 0xb C 0xb = ceil(32/3) + srwi. r10, r10, 5 C r10 = floor(n/3), n < 32 + beq L(small_tail) + mtctr r10 + lwz r6, 0(up) + lwz r7, 4(up) + lwzu r8, 8(up) + subf n, r10, n + subf n, r10, n + subf n, r10, n + bdz L(small_end) + + ALIGN(16) +L(los): rlwinm r0, r6, 0,8,31 + add r9, r9, r0 C add 24b from u0 + srwi r0, r6, 24 + lwz r6, 4(up) + rlwimi r0, r7, 8, 0x00ffff00 C --111100 + add r9, r9, r0 C add 8b from u0 and 16b from u1 + srwi r0, r7, 16 + lwz r7, 8(up) + rlwimi r0, r8, 16, 0x00ff0000 C --221111 + add r9, r9, r0 C add 16b from u1 and 8b from u2 + srwi r0, r8, 8 C --222222 + lwzu r8, 12(up) + add r9, r9, r0 C add 24b from u2 + bdnz L(los) +L(small_end): + rlwinm r0, r6, 0,8,31 + add r9, r9, r0 C add 24b from u0 + srwi r0, r6, 24 + rlwimi r0, r7, 8, 0x00ffff00 C --111100 + add r9, r9, r0 C add 8b from u0 and 16b from u1 + srwi r0, r7, 16 + rlwimi r0, r8, 16, 0x00ff0000 C --221111 + add r9, r9, r0 C add 16b from u1 and 8b from u2 + srwi r0, r8, 8 C --222222 + add r9, r9, r0 C add 24b from u2 + + addi up, up, 4 + rlwinm r0, r9, 0,8,31 + srwi r9, r9, 24 + add r9, r9, r0 + +L(small_tail): + cmpi cr0, n, 1 + blt L(ret) + + lwz r6, 0(up) + rlwinm r0, r6, 0,8,31 + srwi r6, r6, 24 + add r9, r9, r0 + add r9, r9, r6 + + beq L(ret) + + lwz r6, 4(up) + rlwinm r0, r6, 8,8,23 + srwi r6, r6, 16 + add r9, r9, r0 + add r9, r9, r6 + +L(ret): mr r3, r9 + blr + + +L(large): + stwu r1, -32(r1) + mfspr r10, 256 + oris r0, r10, 0xffff C Set VRSAVE bit 0-15 + mtspr 256, r0 + + andi. 
r7, up, 15 + vxor a0, v0, v0 + lis r9, 0xaaaa + vxor a1, v0, v0 + ori r9, r9, 0xaaab + vxor a2, v0, v0 + li r5, 16 + vxor c0, v0, v0 + li r6, 32 + vxor c1, v0, v0 + LEAL( r11, cnsts) C CAUTION clobbers r0 for elf, darwin + vxor c2, v0, v0 + vxor z, v0, v0 + + beq L(aligned16) + + cmpwi cr7, r7, 8 + bge cr7, L(na4) + + lvx a2, 0, up + addi up, up, 16 + vsldoi a2, a2, z, 4 + vsldoi a2, z, a2, 12 + + addi n, n, 9 + mulhwu r0, n, r9 + srwi r0, r0, 3 C r0 = floor(n/12) + mtctr r0 + + mulli r8, r0, 12 + subf n, r8, n + b L(2) + +L(na4): bne cr7, L(na8) + + lvx a1, 0, up + addi up, up, -16 + vsldoi a1, a1, z, 8 + vsldoi a1, z, a1, 8 + + addi n, n, 6 + mulhwu r0, n, r9 + srwi r0, r0, 3 C r0 = floor(n/12) + mtctr r0 + + mulli r8, r0, 12 + subf n, r8, n + b L(1) + +L(na8): + lvx a0, 0, up + vsldoi a0, a0, z, 12 + vsldoi a0, z, a0, 4 + + addi n, n, 3 + mulhwu r0, n, r9 + srwi r0, r0, 3 C r0 = floor(n/12) + mtctr r0 + + mulli r8, r0, 12 + subf n, r8, n + b L(0) + +L(aligned16): + mulhwu r0, n, r9 + srwi r0, r0, 3 C r0 = floor(n/12) + mtctr r0 + + mulli r8, r0, 12 + subf n, r8, n + + lvx a0, 0, up +L(0): lvx a1, r5, up +L(1): lvx a2, r6, up + addi up, up, 48 +L(2): bdz L(end) + li r12, 256 + li r9, 288 + ALIGN(32) +L(top): + lvx v0, 0, up + vaddcuw v10, a0, v0 + vadduwm a0, a0, v0 + vadduwm c0, c0, v10 + + lvx v1, r5, up + vaddcuw v10, a1, v1 + vadduwm a1, a1, v1 + vadduwm c1, c1, v10 + + lvx v2, r6, up + dcbt up, r12 + dcbt up, r9 + addi up, up, 48 + vaddcuw v10, a2, v2 + vadduwm a2, a2, v2 + vadduwm c2, c2, v10 + bdnz L(top) + +L(end): +C n = 0...11 + cmpwi cr0, n, 0 + beq L(sum) + cmpwi cr0, n, 4 + ble L(tail.1..4) + cmpwi cr0, n, 8 + ble L(tail.5..8) + +L(tail.9..11): + lvx v0, 0, up + vaddcuw v10, a0, v0 + vadduwm a0, a0, v0 + vadduwm c0, c0, v10 + + lvx v1, r5, up + vaddcuw v10, a1, v1 + vadduwm a1, a1, v1 + vadduwm c1, c1, v10 + + lvx v2, r6, up + + addi r8, r11, 96 + rlwinm r3, n ,4,26,27 + lvx v11, r3, r8 + vand v2, v2, v11 + + vaddcuw v10, a2, v2 + vadduwm a2, a2, v2 + vadduwm c2, c2, v10 + b L(sum) + +L(tail.5..8): + lvx v0, 0, up + vaddcuw v10, a0, v0 + vadduwm a0, a0, v0 + vadduwm c0, c0, v10 + + lvx v1, r5, up + + addi r8, r11, 96 + rlwinm r3, n ,4,26,27 + lvx v11, r3, r8 + vand v1, v1, v11 + + vaddcuw v10, a1, v1 + vadduwm a1, a1, v1 + vadduwm c1, c1, v10 + b L(sum) + +L(tail.1..4): + lvx v0, 0, up + + addi r8, r11, 96 + rlwinm r3, n ,4,26,27 + lvx v11, r3, r8 + vand v0, v0, v11 + + vaddcuw v10, a0, v0 + vadduwm a0, a0, v0 + vadduwm c0, c0, v10 + +L(sum): lvx pv, 0, r11 + vperm x0, a0, z, pv C extract 4 24-bit field from a0 + vperm y0, c2, z, pv + lvx pv, r5, r11 + vperm x1, a1, z, pv C extract 4 24-bit field from a1 + vperm y1, c0, z, pv C extract 4 24-bit field from a1 + lvx pv, r6, r11 + vperm x2, a2, z, pv C extract 4 24-bit field from a1 + vperm y2, c1, z, pv C extract 4 24-bit field from a1 + li r10, 48 + lvx pv, r10, r11 + vperm x3, a0, z, pv C extract remaining/partial a0 fields + vperm y3, c2, z, pv C extract remaining/partial a0 fields + li r10, 64 + lvx pv, r10, r11 + vperm x3, a1, x3, pv C insert remaining/partial a1 fields + vperm y3, c0, y3, pv C insert remaining/partial a1 fields + li r10, 80 + lvx pv, r10, r11 + vperm x3, a2, x3, pv C insert remaining/partial a2 fields + vperm y3, c1, y3, pv C insert remaining/partial a2 fields + +C We now have 4 128-bit accumulators to sum + vadduwm x0, x0, x1 + vadduwm x2, x2, x3 + vadduwm x0, x0, x2 + + vadduwm y0, y0, y1 + vadduwm y2, y2, y3 + vadduwm y0, y0, y2 + + vadduwm x0, x0, y0 + +C Reduce 32-bit fields + vsumsws x0, x0, z + + 
li r7, 16 + stvx x0, r7, r1 + lwz r3, 28(r1) + + mtspr 256, r10 + addi r1, r1, 32 + blr +EPILOGUE() + +C load | v0 | v1 | v2 | +C acc | a0 | a1 | a2 | +C carry | c0 | c1 | c2 | +C | 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 128 +C |---|---|---|---|---|---|---|---|---|---|---|---| 32 +C | | | | | | | | | | | | | | | | | 24 +C | | | | | | | | | 48 + +C $---------------$---------------$---------------$---------------$ +C | . . . . . . . . . . . . . . . | +C |_______________________________________________________________| +C | | | | | | | +C <-hi16-> <--- 24 --> <--- 24 --> <--- 24 --> <--- 24 --> <-lo16-> + + +DEF_OBJECT(cnsts,16) +C Permutation vectors in the order they are used above +C # 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f + .byte 0x10,0x01,0x02,0x03, 0x10,0x06,0x07,0x00, 0x10,0x0b,0x04,0x05, 0x10,0x08,0x09,0x0a C a0 + .byte 0x10,0x07,0x00,0x01, 0x10,0x04,0x05,0x06, 0x10,0x09,0x0a,0x0b, 0x10,0x0e,0x0f,0x08 C a1 + .byte 0x10,0x00,0x01,0x02, 0x10,0x05,0x06,0x07, 0x10,0x0a,0x0b,0x04, 0x10,0x0f,0x08,0x09 C a2 + .byte 0x10,0x0d,0x0e,0x0f, 0x10,0x10,0x10,0x0c, 0x10,0x10,0x10,0x10, 0x10,0x10,0x10,0x10 C part a0 + .byte 0x10,0x11,0x12,0x13, 0x10,0x02,0x03,0x17, 0x10,0x10,0x0c,0x0d, 0x10,0x10,0x10,0x10 C part a1 + .byte 0x10,0x11,0x12,0x13, 0x10,0x15,0x16,0x17, 0x10,0x03,0x1a,0x1b, 0x10,0x0c,0x0d,0x0e C part a2 +C Masks for high end of number + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +C Masks for low end of number +C .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +C .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +C .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +C .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff +END_OBJECT(cnsts) diff --git a/vendor/gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm b/vendor/gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm new file mode 100644 index 0000000..943c92d --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm @@ -0,0 +1,34 @@ +dnl PowerPC-32/VMX mpn_popcount. + +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`powerpc64/vmx/popcount.asm')
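
The VMX sources above are easier to follow with a plain-C picture of what the most involved routine, mpn_mod_34lsub1, computes. The sketch below is not GMP code and its function name is invented for illustration; it models only the 32-bit-limb case and, like the real function, returns some value congruent to the operand {up, n} modulo 2^24 - 1 rather than a fully reduced remainder. The underlying idea, which the assembly realizes with 24-bit field extraction and vector accumulators, is that 2^24 == 1 (mod 2^24 - 1), so every 24-bit slice of the number can simply be added up.

#include <stdint.h>
#include <stddef.h>

/* Illustrative reference only; not the GMP implementation.
   Assumes n < 2^39 so the 64-bit accumulator cannot overflow. */
static uint32_t mod_2pow24m1_ref(const uint32_t *up, size_t n)
{
    uint64_t acc = 0;
    unsigned shift = 0;          /* limb i has weight 2^((32*i) mod 24) */

    for (size_t i = 0; i < n; i++) {
        uint64_t v = (uint64_t)up[i] << shift;  /* at most 48 bits wide */
        acc += (v & 0xffffff) + (v >> 24);      /* fold: 2^24 == 1 (mod 2^24-1) */
        shift = (shift + 32) % 24;              /* weight advances by 8 bits per limb */
    }
    while (acc >> 24)                           /* fold the accumulator below 2^24 */
        acc = (acc & 0xffffff) + (acc >> 24);
    return (uint32_t)acc;                       /* congruent to {up, n} mod 2^24-1 */
}

The assembly reaches the same result three limbs (96 bits, i.e. four 24-bit fields) at a time in its scalar path, while the VMX path keeps three vector sum accumulators plus three carry accumulators (a0-a2, c0-c2) and only performs the field extraction once at the end, via the vperm tables in cnsts followed by vsumsws.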