diff options
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm')
-rw-r--r-- | vendor/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm | 164 |
1 files changed, 164 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm b/vendor/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm new file mode 100644 index 0000000..8733730 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm @@ -0,0 +1,164 @@ +dnl PowerPC-64 mpn_mod_1_1p + +dnl Copyright 2010, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 17 +C POWER5 16 +C POWER6 30 +C POWER7 10.2 + +C TODO +C * Optimise, in particular the cps function. This was compiler-generated and +C then hand optimised. 
+
+C INPUT PARAMETERS
+C Symbolic names for the arguments of mpn_mod_1_1p.  The instructions below
+C refer to the raw registers r3..r6 rather than to these names.
+define(`ap', `r3')
+define(`n', `r4')
+define(`d', `r5')
+define(`cps', `r6')
+
+ASM_START()
+
+EXTERN_FUNC(mpn_invert_limb)
+
+C mpn_mod_1_1p
+C
+C Folds the n limbs at ap, most significant first, into a two-limb value
+C using the multipliers B1modb and B2modb from the cps table, then reduces
+C that value to a single limb with the inverse stored in the table.
+C
+C In:   r3 = ap, r4 = n, r5 = d, r6 = cps
+C       cps layout as read here (and as stored by mpn_mod_1_1p_cps below):
+C         0(cps)  inverse used by the final reduction
+C         8(cps)  shift count cnt (only its low 32 bits are read)
+C        16(cps)  B1modb
+C        24(cps)  B2modb
+C Out:  r3 = remainder, shifted right by cnt
+C Uses: r0, r4, r7-r12, cr7, CTR and the carry bit; no stack is touched.
+C
+C The two most significant limbs are loaded unconditionally, so n >= 2 is
+C presumably a precondition -- TODO confirm against the callers.
+C NOTE(review): the final reduction pairs r5 with the inverse from the table,
+C which the cps routine computes for b << cnt, so r5 is presumably expected to
+C be the divisor already shifted left by cnt -- confirm against the callers.
+PROLOGUE(mpn_mod_1_1p)
+	sldi	r10, r4, 3		C r10 = 8*n, size of the operand in bytes
+	addi	r4, r4, -1		C r4 = n - 1
+	add	r3, r3, r10		C r3 = ap + 8*n, just past the top limb
+	ld	r0, 16(r6)		C r0 = B1modb
+	ld	r12, 24(r6)		C r12 = B2modb
+	ld	r9, -8(r3)		C r9 = ap[n-1]
+	ld	r10, -16(r3)		C r10 = ap[n-2]
+	mtctr	r4			C CTR = n - 1
+	mulhdu	r8, r9, r0		C r8 = high limb of ap[n-1] * B1modb
+	mulld	r7, r9, r0		C r7 = low limb of ap[n-1] * B1modb
+	addc	r11, r7, r10		C r11 = low limb + ap[n-2], sets carry
+	addze	r9, r8			C r9 = high limb + carry
+	bdz	L(end)			C CTR = n - 2; nothing left to fold if zero
+
+C Main loop, executed n - 2 times.  On entry to each pass r9:r11 holds the
+C running two-limb value (high:low) and -24(r3) is the next lower limb.
+C Each pass computes  r9:r11 = limb + r11 * B1modb + r9 * B2modb.
+	ALIGN(16)
+L(top):	ld	r4, -24(r3)		C r4 = next lower limb
+	addi	r3, r3, -8		C step the pointer down one limb
+	nop				C presumably scheduling padding -- not verified
+	mulld	r10, r11, r0		C r10 = low(r11 * B1modb)
+	mulld	r8, r9, r12		C r8 = low(r9 * B2modb)
+	mulhdu	r11, r11, r0		C r11 = high(r11 * B1modb)
+	mulhdu	r9, r9, r12		C r9 = high(r9 * B2modb)
+	addc	r7, r10, r4		C r7 = low(r11 * B1modb) + limb
+	addze	r10, r11		C r10 = high(r11 * B1modb) + carry
+	addc	r11, r8, r7		C new low limb
+	adde	r9, r9, r10		C new high limb, including carry
+	bdnz	L(top)
+
+C Final reduction of r9:r11.  r0 receives the low 32 bits of the 64-bit
+C shift count stored at 8(cps); which half holds them depends on byte order.
+L(end):
+ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
+`	lwz	r0, 8(r6)',
+`	lwz	r0, 12(r6)')
+	ld	r3, 0(r6)		C r3 = inverse from the table
+	cmpdi	cr7, r0, 0
+	beq-	cr7, L(4)		C cnt = 0: skip the shift across limbs
+	subfic	r10, r0, 64		C r10 = 64 - cnt
+	sld	r9, r9, r0		C high limb <<= cnt
+	srd	r10, r11, r10		C bits of the low limb that move up
+	or	r9, r10, r9		C r9 = (high << cnt) | (low >> (64 - cnt))
+L(4):	subfc	r10, r5, r9		C carry set iff r9 >= r5 (unsigned)
+	subfe	r10, r10, r10		C r10 = 0 if r9 >= r5, else -1
+	nand	r10, r10, r10		C invert: r10 = -1 if r9 >= r5, else 0
+	sld	r11, r11, r0		C low limb <<= cnt
+	and	r10, r10, r5		C r10 = r5 if r9 >= r5, else 0
+	subf	r9, r10, r9		C conditionally subtract r5 from the high limb
+	mulhdu	r10, r9, r3		C r10 = high(r9 * inverse)
+	mulld	r3, r9, r3		C r3 = low(r9 * inverse)
+	addi	r9, r9, 1		C r9 = high limb + 1
+	addc	r8, r3, r11		C r8 = low(r9 * inverse) + low limb, sets carry
+	adde	r3, r10, r9		C r3 = quotient estimate, including carry
+	mulld	r3, r3, r5		C r3 = estimate * r5 (low 64 bits)
+	subf	r3, r3, r11		C r3 = low limb - estimate * r5 = candidate remainder
+	cmpld	cr7, r8, r3
+	bge	cr7, L(5)		C FIXME: Make branch-less
+	add	r3, r3, r5		C candidate > r8: add r5 back
+L(5):	cmpld	cr7, r3, r5
+	bge-	cr7, L(10)		C candidate >= r5: one more subtraction needed
+	srd	r3, r3, r0		C undo the shift: result = candidate >> cnt
+	blr
+
+L(10):	subf	r3, r5, r3		C candidate -= r5
+	srd	r3, r3, r0		C undo the shift: result = candidate >> cnt
+	blr
+EPILOGUE()
+
+C mpn_mod_1_1p_cps
+C
+C Fills the four-limb table at r3 with the constants for divisor r4.
+C
+C In:   r3 = pointer to the table, r4 = b (the divisor)
+C Out:  0(table) = value returned by mpn_invert_limb(b << cnt)
+C       8(table) = cnt, the number of leading zero bits of b
+C      16(table) = B1modb >> cnt
+C      24(table) = B2modb >> cnt
+C
+C r29-r31 are saved below the incoming stack pointer and the return address
+C at 16(r1); a 144-byte frame is held across the call to mpn_invert_limb and
+C everything is restored before returning.
+PROLOGUE(mpn_mod_1_1p_cps,toc)
+	mflr	r0			C r0 = return address
+	std	r29, -24(r1)		C save r29-r31, restored before return
+	std	r30, -16(r1)
+	std	r31, -8(r1)
+	cntlzd	r31, r4			C r31 = cnt = leading zero bits of b
+	std	r0, 16(r1)		C save the return address
+	extsw	r31, r31		C sign-extend the low word of cnt
+	mr	r29, r3			C r29 = table pointer, kept across the call
+	stdu	r1, -144(r1)		C allocate a 144-byte frame
+	sld	r30, r4, r31		C r30 = b << cnt
+	mr	r3, r30			C argument for mpn_invert_limb
+	CALL(	mpn_invert_limb)
+	cmpdi	cr7, r31, 0
+	neg	r0, r30			C r0 = -(b << cnt); this is B1modb when cnt = 0
+	beq-	cr7, L(13)
+	subfic	r11, r31, 64		C r11 = 64 - cnt
+	li	r0, 1
+	neg	r9, r30			C r9 = -(b << cnt)
+	srd	r11, r3, r11		C r11 = inverse >> (64 - cnt)
+	sld	r0, r0, r31		C r0 = 1 << cnt
+	or	r0, r11, r0		C r0 = (inverse >> (64 - cnt)) | (1 << cnt)
+	mulld	r0, r0, r9		C r0 = B1modb = -(b << cnt) * r0
+L(13):	mulhdu	r9, r0, r3		C r9 = high(B1modb * inverse)
+	mulld	r11, r0, r3		C r11 = low(B1modb * inverse)
+	add	r9, r0, r9		C r9 = B1modb + high(B1modb * inverse)
+	nor	r9, r9, r9		C r9 = ~r9, i.e. -(r9 + 1)
+	mulld	r9, r9, r30		C r9 = r9 * (b << cnt) = candidate B2modb
+	cmpld	cr7, r11, r9
+	bge	cr7, L(14)
+	add	r9, r9, r30		C candidate > r11: add (b << cnt) back
+L(14):	addi	r1, r1, 144		C release the frame
+	srd	r0, r0, r31		C r0 = B1modb >> cnt
+	std	r31, 8(r29)		C table[1] = cnt
+	std	r3, 0(r29)		C table[0] = inverse
+	std	r0, 16(r29)		C table[2] = B1modb >> cnt
+	ld	r0, 16(r1)		C reload the return address
+	srd	r9, r9, r31		C r9 = B2modb >> cnt
+	ld	r30, -16(r1)		C restore r30 and r31
+	ld	r31, -8(r1)
+	std	r9, 24(r29)		C table[3] = B2modb >> cnt
+	ld	r29, -24(r1)		C restore r29 last, after its final use
+	mtlr	r0
+	blr
+EPILOGUE() |