Diffstat (limited to 'vendor/gmp-6.3.0/mpn/alpha/ev6/nails')
-rw-r--r--  vendor/gmp-6.3.0/mpn/alpha/ev6/nails/README       |  65
-rw-r--r--  vendor/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm | 396
-rw-r--r--  vendor/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm | 146
-rw-r--r--  vendor/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm | 169
-rw-r--r--  vendor/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm | 210
-rw-r--r--  vendor/gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm   | 233
-rw-r--r--  vendor/gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h |  72
-rw-r--r--  vendor/gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm    | 364
-rw-r--r--  vendor/gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm | 396
9 files changed, 2051 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/README b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/README
new file mode 100644
index 0000000..b214ac5
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/README
@@ -0,0 +1,65 @@
+Copyright 2002, 2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains assembly code for nails-enabled 21264. The code is not
+very well optimized.
+
+For addmul_N, as N grows larger, we could make multiple loads together, then do
+about 3.3 i/c. 10 cycles after the last load, we can increase to 4 i/c. This
+would surely allow addmul_4 to run at 2 c/l, but the same should be possible
+also for addmul_3 and perhaps even addmul_2.
+
+
+             current         fair                  best
+Routine      c/l  unroll     c/l   unroll  cycles  c/l   i/c
+mul_1        3.25            2.75                  2.75  3.273
+addmul_1     4.0  4          3.5   4       14      3.25  3.385
+addmul_2     4.0  1          2.5   2       10      2.25  3.333
+addmul_3     3.0  1          2.33  2       14      2     3.333
+addmul_4     2.5  1          2.125 2       17      2     3.135
+
+addmul_5                     2     1       10
+addmul_6                     2     1       12
+addmul_7                     2     1       14
+
+(The "best" column doesn't account for bookkeeping instructions and
+thereby assumes infinite unrolling.)
+
+Basecase usages:
+
+1 addmul_1
+2 addmul_2
+3 addmul_3
+4 addmul_4
+5 addmul_3 + addmul_2   2.3998
+6 addmul_4 + addmul_2
+7 addmul_4 + addmul_3
diff --git a/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm
new file mode 100644
index 0000000..711d4e6
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm
@@ -0,0 +1,396 @@
+dnl  Alpha ev6 nails mpn_addmul_1.
+
+dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl  see https://www.gnu.org/licenses/.
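A note for readers unfamiliar with nails builds: each 64-bit limb keeps its top
GMP_NAIL_BITS zero and carries GMP_NUMB_BITS of payload, and the spare bits let
carries be extracted with plain shifts instead of carry flags. Below is a
minimal C model of what this routine computes. It is a sketch only: limb_t,
addmul_1_nails and the NAIL_BITS value are our names and choices, not GMP's,
and it assumes GCC/Clang's unsigned __int128 for the double-limb product.

#include <stdint.h>

#define NAIL_BITS 4                    /* example width; this file allows 2-63 */
#define NUMB_BITS (64 - NAIL_BITS)     /* payload bits per limb */
#define NUMB_MASK (UINT64_MAX >> NAIL_BITS)

typedef uint64_t limb_t;

/* {rp,n} += {up,n} * v; all limbs in nails form; returns the carry limb. */
static limb_t
addmul_1_nails (limb_t *rp, const limb_t *up, long n, limb_t v)
{
  limb_t cy = 0;                       /* high product half plus nail carry */
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v;  /* < 2^(2*NUMB_BITS) */
      limb_t t = rp[i] + ((limb_t) p & NUMB_MASK) + cy;     /* needs 2 nail bits */
      rp[i] = t & NUMB_MASK;
      cy = (limb_t) (p >> NUMB_BITS) + (t >> NUMB_BITS);
    }
  return cy;
}

The assembly avoids any 128-bit temporary by pre-shifting vl0 left by
NAIL_BITS: umulh then yields p >> NUMB_BITS directly, and srl m0a,NAIL_BITS
recovers p mod 2^NUMB_BITS. Also note that t sums three numb-sized values, so
two spare bits are required; that matches the NAILS_SUPPORT(2-63) declaration
here, whereas mul_1.asm, whose column sum has no rp[i] term, can declare
NAILS_SUPPORT(1-63).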
+ +include(`../config.m4') + +C cycles/limb +C EV4: 42 +C EV5: 18 +C EV6: 4 + +C TODO +C * Reroll loop for 3.75 c/l with current 4-way unrolling. +C * The loop is overscheduled wrt loads and wrt multiplies, in particular +C umulh. +C * Use FP loop count and multiple exit points, that would simplify feed-in lp0 +C and would work since the loop structure is really regular. + +C INPUT PARAMETERS +define(`rp',`r16') +define(`up',`r17') +define(`n', `r18') +define(`vl0',`r19') + +define(`numb_mask',`r6') + +define(`m0a',`r0') +define(`m0b',`r1') +define(`m1a',`r2') +define(`m1b',`r3') +define(`m2a',`r20') +define(`m2b',`r21') +define(`m3a',`r22') +define(`m3b',`r23') + +define(`acc0',`r25') +define(`acc1',`r27') + +define(`ul0',`r4') +define(`ul1',`r5') +define(`ul2',`r4') +define(`ul3',`r5') + +define(`rl0',`r24') +define(`rl1',`r24') +define(`rl2',`r24') +define(`rl3',`r24') + +define(`t0',`r7') +define(`t1',`r8') + +define(`NAIL_BITS',`GMP_NAIL_BITS') +define(`NUMB_BITS',`GMP_NUMB_BITS') + +dnl This declaration is munged by configure +NAILS_SUPPORT(2-63) + +ASM_START() +PROLOGUE(mpn_addmul_1) + sll vl0, NAIL_BITS, vl0 + lda numb_mask, -1(r31) + srl numb_mask, NAIL_BITS, numb_mask + + and n, 3, r25 + cmpeq r25, 1, r21 + bne r21, L(1m4) + cmpeq r25, 2, r21 + bne r21, L(2m4) + beq r25, L(0m4) + +L(3m4): ldq ul3, 0(up) + lda n, -4(n) + ldq ul0, 8(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq ul1, 16(up) + lda up, 24(up) + lda rp, -8(rp) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge3) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + addq t0, r31, acc1 + addq rl3, acc1, acc1 + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + addq t0, m3b, acc0 + srl acc1,NUMB_BITS, t1 + br r31, L(ta3) + +L(ge3): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + addq t0, r31, acc1 + umulh vl0, ul2, m2b + addq rl3, acc1, acc1 + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + ldq ul0, 16(up) + mulq vl0, ul3, m3a + addq t0, m3b, acc0 + srl acc1,NUMB_BITS, t1 + br r31, L(el3) + +L(0m4): lda n, -8(n) + ldq ul2, 0(up) + ldq ul3, 8(up) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq ul0, 16(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq ul1, 24(up) + lda up, 32(up) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge4) + + ldq rl2, 0(rp) + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, r31, acc0 + umulh vl0, ul1, m1b + addq rl2, acc0, acc0 + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + addq t0, m2b, acc1 + srl acc0,NUMB_BITS, t1 + br r31, L(ta4) + +L(ge4): ldq rl2, 0(rp) + srl m2a,NAIL_BITS, t0 + ldq ul2, 0(up) + mulq vl0, ul1, m1a + addq t0, r31, acc0 + umulh vl0, ul1, m1b + addq rl2, acc0, acc0 + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + addq t0, m2b, acc1 + srl acc0,NUMB_BITS, t1 + br r31, L(el0) + +L(2m4): lda n, -4(n) + ldq ul0, 0(up) + ldq ul1, 8(up) + lda up, 16(up) + lda rp, -16(rp) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge2) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + addq t0, r31, acc0 + addq rl0, acc0, acc0 + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + br r31, L(ta2) + +L(ge2): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + ldq ul0, 16(up) + mulq vl0, ul3, m3a + addq t0, 
r31, acc0 + umulh vl0, ul3, m3b + addq rl0, acc0, acc0 + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + ldq ul1, 24(up) + lda up, 32(up) + lda rp, 32(rp) + mulq vl0, ul0, m0a + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + bge n, L(el2) + + br r31, L(ta6) + +L(1m4): lda n, -4(n) + ldq ul1, 0(up) + lda up, 8(up) + lda rp, -24(rp) + bge n, L(ge1) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + addq rl1, t0, acc1 + and acc1,numb_mask, r28 + srl acc1,NUMB_BITS, t1 + stq r28, 24(rp) + addq t1, m1b, r0 + ret r31, (r26), 1 + +L(ge1): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq ul0, 16(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + ldq ul1, 24(up) + lda up, 32(up) + lda rp, 32(rp) + mulq vl0, ul0, m0a + addq t0, r31, acc1 + umulh vl0, ul0, m0b + addq rl1, acc1, acc1 + ldq rl2, 0(rp) + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, m1b, acc0 + srl acc1,NUMB_BITS, t1 + blt n, L(ta5) + +L(ge5): ldq ul2, 0(up) + br r31, L(el1) + + ALIGN(16) +L(top): mulq vl0, ul0, m0a C U1 + addq t0, m0b, acc1 C L0 + srl acc0,NUMB_BITS, t1 C U0 + stq r28, -24(rp) C L1 +C +L(el2): umulh vl0, ul0, m0b C U1 + and acc0,numb_mask, r28 C L0 + addq rl1, acc1, acc1 C U0 + ldq rl2, 0(rp) C L1 +C + unop C U1 + addq t1, acc1, acc1 C L0 + srl m2a,NAIL_BITS, t0 C U0 + ldq ul2, 0(up) C L1 +C + mulq vl0, ul1, m1a C U1 + addq t0, m1b, acc0 C L0 + srl acc1,NUMB_BITS, t1 C U0 + stq r28, -16(rp) C L1 +C +L(el1): umulh vl0, ul1, m1b C U1 + and acc1,numb_mask, r28 C L0 + addq rl2, acc0, acc0 C U0 + ldq rl3, 8(rp) C L1 +C + lda n, -4(n) C L1 + addq t1, acc0, acc0 C L0 + srl m3a,NAIL_BITS, t0 C U0 + ldq ul3, 8(up) C L1 +C + mulq vl0, ul2, m2a C U1 + addq t0, m2b, acc1 C L0 + srl acc0,NUMB_BITS, t1 C U0 + stq r28, -8(rp) C L1 +C +L(el0): umulh vl0, ul2, m2b C U1 + and acc0,numb_mask, r28 C L0 + addq rl3, acc1, acc1 C U0 + ldq rl0, 16(rp) C L1 +C + unop C U1 + addq t1, acc1, acc1 C L0 + srl m0a,NAIL_BITS, t0 C U0 + ldq ul0, 16(up) C L1 +C + mulq vl0, ul3, m3a C U1 + addq t0, m3b, acc0 C L0 + srl acc1,NUMB_BITS, t1 C U0 + stq r28, 0(rp) C L1 +C +L(el3): umulh vl0, ul3, m3b C U1 + and acc1,numb_mask, r28 C L0 + addq rl0, acc0, acc0 C U0 + ldq rl1, 24(rp) C L1 +C + unop C U1 + addq t1, acc0, acc0 C L0 + srl m1a,NAIL_BITS, t0 C U0 + ldq ul1, 24(up) C L1 +C + lda up, 32(up) C L0 + unop C U1 + lda rp, 32(rp) C L1 + bge n, L(top) C U0 + +L(end): mulq vl0, ul0, m0a + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + stq r28, -24(rp) +L(ta6): umulh vl0, ul0, m0b + and acc0,numb_mask, r28 + addq rl1, acc1, acc1 + ldq rl2, 0(rp) + addq t1, acc1, acc1 + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, m1b, acc0 + srl acc1,NUMB_BITS, t1 + stq r28, -16(rp) +L(ta5): umulh vl0, ul1, m1b + and acc1,numb_mask, r28 + addq rl2, acc0, acc0 + ldq rl3, 8(rp) + addq t1, acc0, acc0 + srl m3a,NAIL_BITS, t0 + addq t0, m2b, acc1 + srl acc0,NUMB_BITS, t1 + stq r28, -8(rp) + unop + ALIGN(16) +L(ta4): and acc0,numb_mask, r28 + addq rl3, acc1, acc1 + ldq rl0, 16(rp) + addq t1, acc1, acc1 + srl m0a,NAIL_BITS, t0 + addq t0, m3b, acc0 + srl acc1,NUMB_BITS, t1 + stq r28, 0(rp) + unop + ALIGN(16) +L(ta3): and acc1,numb_mask, r28 + addq rl0, acc0, acc0 + ldq rl1, 24(rp) + addq t1, acc0, acc0 + srl m1a,NAIL_BITS, t0 + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + stq r28, 8(rp) + unop + ALIGN(16) +L(ta2): and acc0,numb_mask, r28 + addq rl1, acc1, acc1 + addq t1, acc1, acc1 + srl acc1,NUMB_BITS, t1 + 
stq r28, 16(rp) + and acc1,numb_mask, r28 + addq t1, m1b, r0 + stq r28, 24(rp) + ret r31, (r26), 1 +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm new file mode 100644 index 0000000..6ff6b3a --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm @@ -0,0 +1,146 @@ +dnl Alpha ev6 nails mpn_addmul_2. + +dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C Runs at 4.0 cycles/limb. + +C We could either go for 2-way unrolling over 11 cycles, or 2.75 c/l, +C or 4-way unrolling over 20 cycles, for 2.5 c/l. + + +C INPUT PARAMETERS +define(`rp',`r16') +define(`up',`r17') +define(`n',`r18') +define(`vp',`r19') + +C Useful register aliases +define(`numb_mask',`r24') +define(`ulimb',`r25') +define(`rlimb',`r27') + +define(`m0a',`r0') +define(`m0b',`r1') +define(`m1a',`r2') +define(`m1b',`r3') + +define(`acc0',`r4') +define(`acc1',`r5') + +define(`v0',`r6') +define(`v1',`r7') + +C Used for temps: r8 r19 r28 + +define(`NAIL_BITS',`GMP_NAIL_BITS') +define(`NUMB_BITS',`GMP_NUMB_BITS') + +C This declaration is munged by configure +NAILS_SUPPORT(3-63) + +ASM_START() +PROLOGUE(mpn_addmul_2) + lda numb_mask,-1(r31) + srl numb_mask,NAIL_BITS,numb_mask + + ldq v0, 0(vp) + ldq v1, 8(vp) + + bis r31, r31, acc0 C zero acc0 + sll v0,NAIL_BITS, v0 + bis r31, r31, acc1 C zero acc1 + sll v1,NAIL_BITS, v1 + bis r31, r31, r19 + + ldq ulimb, 0(up) + lda up, 8(up) + mulq v0, ulimb, m0a C U1 + umulh v0, ulimb, m0b C U1 + mulq v1, ulimb, m1a C U1 + umulh v1, ulimb, m1b C U1 + lda n, -1(n) + beq n, L(end) C U0 + + ALIGN(16) +L(top): bis r31, r31, r31 C U1 nop + addq r19, acc0, acc0 C U0 propagate nail + ldq rlimb, 0(rp) C L0 + ldq ulimb, 0(up) C L1 + + lda rp, 8(rp) C L1 + srl m0a,NAIL_BITS, r8 C U0 + lda up, 8(up) C L0 + mulq v0, ulimb, m0a C U1 + + addq r8, acc0, r19 C U0 + addq m0b, acc1, acc0 C L1 + umulh v0, ulimb, m0b C U1 + bis r31, r31, r31 C L0 nop + + addq rlimb, r19, r19 C L1 FINAL PROD-SUM + srl m1a,NAIL_BITS, r8 C U0 + lda n, -1(n) C L0 + mulq v1, ulimb, m1a C U1 + + addq r8, acc0, acc0 C U0 + bis r31, m1b, acc1 C L1 + umulh v1, ulimb, m1b C U1 + and r19,numb_mask, r28 C L0 extract numb part + + unop + srl r19,NUMB_BITS, r19 C U1 extract nail part + stq r28, -8(rp) C L1 + bne n, L(top) C U0 + +L(end): ldq rlimb, 0(rp) + addq r19, acc0, acc0 C propagate nail + lda rp, 8(rp) + srl m0a,NAIL_BITS, r8 C U0 + addq r8, acc0, r19 + addq m0b, acc1, acc0 + addq 
rlimb, r19, r19 + srl m1a,NAIL_BITS, r8 C U0 + addq r8, acc0, acc0 + bis r31, m1b, acc1 + and r19,numb_mask, r28 C extract limb + + srl r19,NUMB_BITS, r19 C extract nail + stq r28, -8(rp) + + addq r19, acc0, acc0 C propagate nail + and acc0,numb_mask, r28 + stq r28, 0(rp) + srl acc0,NUMB_BITS, r19 + addq r19, acc1, r0 + + ret r31, (r26), 1 +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm new file mode 100644 index 0000000..a1ffb68 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm @@ -0,0 +1,169 @@ +dnl Alpha ev6 nails mpn_addmul_3. + +dnl Copyright 2002, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C Runs at 3.0 cycles/limb. + +C With 2-way unrolling, we could probably reach 2.25 c/l (3.33 i/c). 
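The acc0/acc1 (and, in this file, acc2) registers implement a rotating column
accumulator: each u-limb's low product halves finalize the current column while
the high halves are parked for the next ones. A hedged C rendering of the
2-limb case, reusing limb_t, NUMB_BITS and NUMB_MASK from the addmul_1 sketch
earlier; the {rp, n+2} interface is ours for illustration and is not claimed to
match mpn_addmul_2's exact contract:

/* {rp, n+2} += {up, n} * {v0, v1}, everything in nails form; returns the
   final carry.  addmul_3 adds a third accumulator in the same pattern. */
static limb_t
addmul_2_nails (limb_t *rp, const limb_t *up, long n, limb_t v0, limb_t v1)
{
  limb_t acc0 = 0, acc1 = 0, cy = 0;
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 p0 = (unsigned __int128) up[i] * v0;
      unsigned __int128 p1 = (unsigned __int128) up[i] * v1;
      /* finalize column i: old limb + pending sum + low half of p0 + carry */
      limb_t t = rp[i] + acc0 + ((limb_t) p0 & NUMB_MASK) + cy;
      rp[i] = t & NUMB_MASK;
      cy = t >> NUMB_BITS;
      /* rotate: column i+1 collects hi(p0) + lo(p1); column i+2 gets hi(p1) */
      acc0 = acc1 + (limb_t) (p0 >> NUMB_BITS) + ((limb_t) p1 & NUMB_MASK);
      acc1 = (limb_t) (p1 >> NUMB_BITS);
    }
  limb_t t = rp[n] + acc0 + cy;        /* flush the two pending columns */
  rp[n] = t & NUMB_MASK;
  t = rp[n + 1] + acc1 + (t >> NUMB_BITS);
  rp[n + 1] = t & NUMB_MASK;
  return t >> NUMB_BITS;
}

acc0 can hold three numb-sized terms and t five, so plain 64-bit arithmetic
suffices only with at least three nail bits; that is exactly the
NAILS_SUPPORT(3-63) declared by both addmul_2 and addmul_3.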
+ + +C INPUT PARAMETERS +define(`rp',`r16') +define(`up',`r17') +define(`n',`r18') +define(`vp',`r19') + +C Useful register aliases +define(`numb_mask',`r24') +define(`ulimb',`r25') +define(`rlimb',`r27') + +define(`m0a',`r0') +define(`m0b',`r1') +define(`m1a',`r2') +define(`m1b',`r3') +define(`m2a',`r20') +define(`m2b',`r21') + +define(`acc0',`r4') +define(`acc1',`r5') +define(`acc2',`r22') + +define(`v0',`r6') +define(`v1',`r7') +define(`v2',`r23') + +C Used for temps: r8 r19 r28 + +define(`NAIL_BITS',`GMP_NAIL_BITS') +define(`NUMB_BITS',`GMP_NUMB_BITS') + +C This declaration is munged by configure +NAILS_SUPPORT(3-63) + +ASM_START() +PROLOGUE(mpn_addmul_3) + lda numb_mask,-1(r31) + srl numb_mask,NAIL_BITS,numb_mask + + ldq v0, 0(vp) + ldq v1, 8(vp) + ldq v2, 16(vp) + + bis r31, r31, acc0 C zero acc0 + sll v0,NAIL_BITS, v0 + bis r31, r31, acc1 C zero acc1 + sll v1,NAIL_BITS, v1 + bis r31, r31, acc2 C zero acc2 + sll v2,NAIL_BITS, v2 + bis r31, r31, r19 + + ldq ulimb, 0(up) + lda up, 8(up) + mulq v0, ulimb, m0a C U1 + umulh v0, ulimb, m0b C U1 + mulq v1, ulimb, m1a C U1 + umulh v1, ulimb, m1b C U1 + lda n, -1(n) + mulq v2, ulimb, m2a C U1 + umulh v2, ulimb, m2b C U1 + beq n, L(end) C U0 + + ALIGN(16) +L(top): ldq rlimb, 0(rp) C L1 + ldq ulimb, 0(up) C L0 + bis r31, r31, r31 C U0 nop + addq r19, acc0, acc0 C U1 propagate nail + + lda rp, 8(rp) C L1 + srl m0a,NAIL_BITS, r8 C U0 + lda up, 8(up) C L0 + mulq v0, ulimb, m0a C U1 + + addq r8, acc0, r19 C U0 + addq m0b, acc1, acc0 C L1 + umulh v0, ulimb, m0b C U1 + bis r31, r31, r31 C L0 nop + + addq rlimb, r19, r19 C L1 + srl m1a,NAIL_BITS, r8 C U0 + bis r31, r31, r31 C L0 nop + mulq v1, ulimb, m1a C U1 + + addq r8, acc0, acc0 C U0 + addq m1b, acc2, acc1 C L1 + umulh v1, ulimb, m1b C U1 + and r19,numb_mask, r28 C L0 extract numb part + + bis r31, r31, r31 C L1 nop + srl m2a,NAIL_BITS, r8 C U0 + lda n, -1(n) C L0 + mulq v2, ulimb, m2a C U1 + + addq r8, acc1, acc1 C L0 + bis r31, m2b, acc2 C L1 + umulh v2, ulimb, m2b C U1 + srl r19,NUMB_BITS, r19 C U0 extract nail part + + stq r28, -8(rp) C L + bne n, L(top) C U0 + +L(end): ldq rlimb, 0(rp) + addq r19, acc0, acc0 C propagate nail + lda rp, 8(rp) + srl m0a,NAIL_BITS, r8 C U0 + addq r8, acc0, r19 + addq m0b, acc1, acc0 + addq rlimb, r19, r19 + srl m1a,NAIL_BITS, r8 C U0 + addq r8, acc0, acc0 + addq m1b, acc2, acc1 + and r19,numb_mask, r28 C extract limb + srl m2a,NAIL_BITS, r8 C U0 + addq r8, acc1, acc1 + bis r31, m2b, acc2 + srl r19,NUMB_BITS, r19 C extract nail + stq r28, -8(rp) + + addq r19, acc0, acc0 C propagate nail + and acc0,numb_mask, r28 + stq r28, 0(rp) + srl acc0,NUMB_BITS, r19 + addq r19, acc1, acc1 + + and acc1,numb_mask, r28 + stq r28, 8(rp) + srl acc1,NUMB_BITS, r19 + addq r19, acc2, m0a + + ret r31, (r26), 1 +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm new file mode 100644 index 0000000..77e02a4 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm @@ -0,0 +1,210 @@ +dnl Alpha ev6 nails mpn_addmul_4. + +dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C Runs at 2.5 cycles/limb. + +C We should go for 2-way unrolling over 17 cycles, for 2.125 c/l corresponding +C to 3.24 insn/cycle. + + +C INPUT PARAMETERS +define(`rp',`r16') +define(`up',`r17') +define(`n',`r18') +define(`vp',`r19') + +C Useful register aliases +define(`numb_mask',`r24') +define(`ulimb',`r25') +define(`rlimb',`r27') + +define(`m0a',`r0') +define(`m0b',`r1') +define(`m1a',`r2') +define(`m1b',`r3') +define(`m2a',`r20') +define(`m2b',`r21') +define(`m3a',`r12') +define(`m3b',`r13') + +define(`acc0',`r4') +define(`acc1',`r5') +define(`acc2',`r22') +define(`acc3',`r14') + +define(`v0',`r6') +define(`v1',`r7') +define(`v2',`r23') +define(`v3',`r15') + +C Used for temps: r8 r19 r28 + +define(`NAIL_BITS',`GMP_NAIL_BITS') +define(`NUMB_BITS',`GMP_NUMB_BITS') + +C This declaration is munged by configure +NAILS_SUPPORT(4-63) + +ASM_START() +PROLOGUE(mpn_addmul_4) + lda r30, -240(r30) + stq r12, 32(r30) + stq r13, 40(r30) + stq r14, 48(r30) + stq r15, 56(r30) + + lda numb_mask,-1(r31) + srl numb_mask,NAIL_BITS,numb_mask + + ldq v0, 0(vp) + ldq v1, 8(vp) + ldq v2, 16(vp) + ldq v3, 24(vp) + + bis r31, r31, acc0 C zero acc0 + sll v0,NAIL_BITS, v0 + bis r31, r31, acc1 C zero acc1 + sll v1,NAIL_BITS, v1 + bis r31, r31, acc2 C zero acc2 + sll v2,NAIL_BITS, v2 + bis r31, r31, acc3 C zero acc3 + sll v3,NAIL_BITS, v3 + bis r31, r31, r19 + + ldq ulimb, 0(up) + lda up, 8(up) + mulq v0, ulimb, m0a C U1 + umulh v0, ulimb, m0b C U1 + mulq v1, ulimb, m1a C U1 + umulh v1, ulimb, m1b C U1 + lda n, -1(n) + mulq v2, ulimb, m2a C U1 + umulh v2, ulimb, m2b C U1 + mulq v3, ulimb, m3a C U1 + umulh v3, ulimb, m3b C U1 + beq n, L(end) C U0 + + ALIGN(16) +L(top): bis r31, r31, r31 C U1 nop + ldq rlimb, 0(rp) C L0 + ldq ulimb, 0(up) C L1 + addq r19, acc0, acc0 C U0 propagate nail + + bis r31, r31, r31 C L0 nop + bis r31, r31, r31 C U1 nop + bis r31, r31, r31 C L1 nop + bis r31, r31, r31 C U0 nop + + lda rp, 8(rp) C L0 + srl m0a,NAIL_BITS, r8 C U0 + lda up, 8(up) C L1 + mulq v0, ulimb, m0a C U1 + + addq r8, acc0, r19 C U0 + addq m0b, acc1, acc0 C L0 + umulh v0, ulimb, m0b C U1 + bis r31, r31, r31 C L1 nop + + addq rlimb, r19, r19 C L0 + srl m1a,NAIL_BITS, r8 C U0 + bis r31, r31, r31 C L1 nop + mulq v1, ulimb, m1a C U1 + + addq r8, acc0, acc0 C U0 + addq m1b, acc2, acc1 C L0 + umulh v1, ulimb, m1b C U1 + and r19,numb_mask, r28 C L1 extract numb part + + bis r31, r31, r31 C L0 nop + srl m2a,NAIL_BITS, r8 C U0 + lda n, -1(n) C L1 + mulq v2, ulimb, m2a C U1 + + addq r8, acc1, acc1 C L1 + addq m2b, acc3, acc2 C L0 + umulh v2, ulimb, m2b C U1 + srl r19,NUMB_BITS, r19 C U0 extract nail part + + bis r31, r31, r31 C L0 nop + srl m3a,NAIL_BITS, r8 C U0 + stq r28, -8(rp) C L1 + mulq v3, ulimb, m3a C U1 + + addq r8, acc2, acc2 C L0 + bis r31, m3b, acc3 C L1 + umulh v3, ulimb, m3b C U1 + bne n, L(top) C U0 + +L(end): 
ldq rlimb, 0(rp) + addq r19, acc0, acc0 C propagate nail + lda rp, 8(rp) C FIXME: DELETE + srl m0a,NAIL_BITS, r8 C U0 + addq r8, acc0, r19 + addq m0b, acc1, acc0 + addq rlimb, r19, r19 + srl m1a,NAIL_BITS, r8 C U0 + addq r8, acc0, acc0 + addq m1b, acc2, acc1 + and r19,numb_mask, r28 C extract limb + srl m2a,NAIL_BITS, r8 C U0 + addq r8, acc1, acc1 + addq m2b, acc3, acc2 + srl r19,NUMB_BITS, r19 C extract nail + srl m3a,NAIL_BITS, r8 C U0 + stq r28, -8(rp) + addq r8, acc2, acc2 + bis r31, m3b, acc3 + + addq r19, acc0, acc0 C propagate nail + and acc0,numb_mask, r28 + stq r28, 0(rp) + srl acc0,NUMB_BITS, r19 + addq r19, acc1, acc1 + + and acc1,numb_mask, r28 + stq r28, 8(rp) + srl acc1,NUMB_BITS, r19 + addq r19, acc2, acc2 + + and acc2,numb_mask, r28 + stq r28, 16(rp) + srl acc2,NUMB_BITS, r19 + addq r19, acc3, r0 + + ldq r12, 32(r30) + ldq r13, 40(r30) + ldq r14, 48(r30) + ldq r15, 56(r30) + lda r30, 240(r30) + ret r31, (r26), 1 +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm new file mode 100644 index 0000000..f658677 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm @@ -0,0 +1,233 @@ +dnl Alpha ev6 nails mpn_add_n and mpn_sub_n. + +dnl Copyright 2002, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl Runs at 2.5 cycles/limb. It would be possible to reach 2.0 cycles/limb +dnl with 8-way unrolling. 
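With nails no carry flag is needed at all: the carry out of each position is
just the nail part of the limbwise sum, and a single spare bit is enough, hence
NAILS_SUPPORT(1-63). A C model, reusing the definitions from the addmul_1
sketch earlier (illustration only, not GMP's code):

static limb_t
add_n_nails (limb_t *rp, const limb_t *up, const limb_t *vp, long n)
{
  limb_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      limb_t s = up[i] + vp[i] + cy;   /* < 2^(NUMB_BITS+1), always fits */
      rp[i] = s & NUMB_MASK;
      cy = s >> NUMB_BITS;             /* CYSH = GMP_NUMB_BITS in the m4 */
    }
  return cy;
}

static limb_t
sub_n_nails (limb_t *rp, const limb_t *up, const limb_t *vp, long n)
{
  limb_t cy = 0;                       /* borrow */
  for (long i = 0; i < n; i++)
    {
      limb_t s = up[i] - vp[i] - cy;   /* wraps mod 2^64 when negative */
      rp[i] = s & NUMB_MASK;
      cy = s >> 63;                    /* sign bit = borrow; CYSH = 63 */
    }
  return cy;
}

This is why the m4 selects CYSH = GMP_NUMB_BITS for mpn_add_n but CYSH = 63 for
mpn_sub_n: shifting the wrapped difference right by 63 extracts its sign bit,
i.e. the borrow, and the final "and r20, 1, r0" normalizes the returned carry
either way.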
+ +include(`../config.m4') + +dnl INPUT PARAMETERS +define(`rp',`r16') +define(`up',`r17') +define(`vp',`r18') +define(`n',`r19') + +define(`rl0',`r0') +define(`rl1',`r1') +define(`rl2',`r2') +define(`rl3',`r3') + +define(`ul0',`r4') +define(`ul1',`r5') +define(`ul2',`r6') +define(`ul3',`r7') + +define(`vl0',`r22') +define(`vl1',`r23') +define(`vl2',`r24') +define(`vl3',`r25') + +define(`numb_mask',`r21') + +define(`NAIL_BITS',`GMP_NAIL_BITS') +define(`CYSH',`GMP_NUMB_BITS') + +dnl This declaration is munged by configure +NAILS_SUPPORT(1-63) + +ifdef(`OPERATION_add_n', ` + define(`OP', addq) + define(`CYSH',`GMP_NUMB_BITS') + define(`func', mpn_add_n)') +ifdef(`OPERATION_sub_n', ` + define(`OP', subq) + define(`CYSH',63) + define(`func', mpn_sub_n)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n) + +ASM_START() +PROLOGUE(func) + lda numb_mask, -1(r31) + srl numb_mask, NAIL_BITS, numb_mask + bis r31, r31, r20 + + and n, 3, r25 + lda n, -4(n) + beq r25, L(ge4) + +L(lp0): ldq ul0, 0(up) + lda up, 8(up) + ldq vl0, 0(vp) + lda vp, 8(vp) + lda rp, 8(rp) + lda r25, -1(r25) + OP ul0, vl0, rl0 + OP rl0, r20, rl0 + and rl0, numb_mask, r28 + stq r28, -8(rp) + srl rl0, CYSH, r20 + bne r25, L(lp0) + + blt n, L(ret) + +L(ge4): ldq ul0, 0(up) + ldq vl0, 0(vp) + ldq ul1, 8(up) + ldq vl1, 8(vp) + ldq ul2, 16(up) + ldq vl2, 16(vp) + ldq ul3, 24(up) + ldq vl3, 24(vp) + lda up, 32(up) + lda vp, 32(vp) + lda n, -4(n) + bge n, L(ge8) + + OP ul0, vl0, rl0 C main-add 0 + OP rl0, r20, rl0 C cy-add 0 + OP ul1, vl1, rl1 C main-add 1 + srl rl0, CYSH, r20 C gen cy 0 + OP rl1, r20, rl1 C cy-add 1 + and rl0,numb_mask, r27 + br r31, L(cj0) + +L(ge8): OP ul0, vl0, rl0 C main-add 0 + ldq ul0, 0(up) + ldq vl0, 0(vp) + OP rl0, r20, rl0 C cy-add 0 + OP ul1, vl1, rl1 C main-add 1 + srl rl0, CYSH, r20 C gen cy 0 + ldq ul1, 8(up) + ldq vl1, 8(vp) + OP rl1, r20, rl1 C cy-add 1 + and rl0,numb_mask, r27 + OP ul2, vl2, rl2 C main-add 2 + srl rl1, CYSH, r20 C gen cy 1 + ldq ul2, 16(up) + ldq vl2, 16(vp) + OP rl2, r20, rl2 C cy-add 2 + and rl1,numb_mask, r28 + stq r27, 0(rp) + OP ul3, vl3, rl3 C main-add 3 + srl rl2, CYSH, r20 C gen cy 2 + ldq ul3, 24(up) + ldq vl3, 24(vp) + OP rl3, r20, rl3 C cy-add 3 + and rl2,numb_mask, r27 + stq r28, 8(rp) + lda rp, 32(rp) + lda up, 32(up) + lda vp, 32(vp) + lda n, -4(n) + blt n, L(end) + + ALIGN(32) +L(top): OP ul0, vl0, rl0 C main-add 0 + srl rl3, CYSH, r20 C gen cy 3 + ldq ul0, 0(up) + ldq vl0, 0(vp) + + OP rl0, r20, rl0 C cy-add 0 + and rl3,numb_mask, r28 + stq r27, -16(rp) + bis r31, r31, r31 + + OP ul1, vl1, rl1 C main-add 1 + srl rl0, CYSH, r20 C gen cy 0 + ldq ul1, 8(up) + ldq vl1, 8(vp) + + OP rl1, r20, rl1 C cy-add 1 + and rl0,numb_mask, r27 + stq r28, -8(rp) + bis r31, r31, r31 + + OP ul2, vl2, rl2 C main-add 2 + srl rl1, CYSH, r20 C gen cy 1 + ldq ul2, 16(up) + ldq vl2, 16(vp) + + OP rl2, r20, rl2 C cy-add 2 + and rl1,numb_mask, r28 + stq r27, 0(rp) + bis r31, r31, r31 + + OP ul3, vl3, rl3 C main-add 3 + srl rl2, CYSH, r20 C gen cy 2 + ldq ul3, 24(up) + ldq vl3, 24(vp) + + OP rl3, r20, rl3 C cy-add 3 + and rl2,numb_mask, r27 + stq r28, 8(rp) + bis r31, r31, r31 + + bis r31, r31, r31 + lda n, -4(n) + lda up, 32(up) + lda vp, 32(vp) + + bis r31, r31, r31 + bis r31, r31, r31 + lda rp, 32(rp) + bge n, L(top) + +L(end): OP ul0, vl0, rl0 C main-add 0 + srl rl3, CYSH, r20 C gen cy 3 + OP rl0, r20, rl0 C cy-add 0 + and rl3,numb_mask, r28 + stq r27, -16(rp) + OP ul1, vl1, rl1 C main-add 1 + srl rl0, CYSH, r20 C gen cy 0 + OP rl1, r20, rl1 C cy-add 1 + and rl0,numb_mask, r27 + stq r28, -8(rp) +L(cj0): 
OP ul2, vl2, rl2 C main-add 2 + srl rl1, CYSH, r20 C gen cy 1 + OP rl2, r20, rl2 C cy-add 2 + and rl1,numb_mask, r28 + stq r27, 0(rp) + OP ul3, vl3, rl3 C main-add 3 + srl rl2, CYSH, r20 C gen cy 2 + OP rl3, r20, rl3 C cy-add 3 + and rl2,numb_mask, r27 + stq r28, 8(rp) + + srl rl3, CYSH, r20 C gen cy 3 + and rl3,numb_mask, r28 + stq r27, 16(rp) + stq r28, 24(rp) + +L(ret): and r20, 1, r0 + ret r31, (r26), 1 +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h new file mode 100644 index 0000000..7949fe8 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h @@ -0,0 +1,72 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Generated by tuneup.c, 2004-02-07, gcc 3.3 */ + +#define MUL_TOOM22_THRESHOLD 40 +#define MUL_TOOM33_THRESHOLD 236 + +#define SQR_BASECASE_THRESHOLD 7 /* karatsuba */ +#define SQR_TOOM2_THRESHOLD 0 /* never sqr_basecase */ +#define SQR_TOOM3_THRESHOLD 120 + +#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ +#define DIV_DC_THRESHOLD 48 +#define POWM_THRESHOLD 113 + +#define HGCD_THRESHOLD 78 +#define GCD_ACCEL_THRESHOLD 3 +#define GCD_DC_THRESHOLD 392 +#define JACOBI_BASE_METHOD 1 + +#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ +#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ +#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ +#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ +#define USE_PREINV_DIVREM_1 0 /* no preinv with nails */ +#define USE_PREINV_MOD_1 0 /* no preinv with nails */ +#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ + +#define GET_STR_DC_THRESHOLD 15 +#define GET_STR_PRECOMPUTE_THRESHOLD 24 +#define SET_STR_THRESHOLD 6336 + +#define MUL_FFT_TABLE { 688, 1440, 3648, 6400, 25600, 0 } +#define MUL_FFT_MODF_THRESHOLD 488 +#define MUL_FFT_THRESHOLD 3712 + +#define SQR_FFT_TABLE { 432, 864, 3136, 6400, 25600, 0 } +#define SQR_FFT_MODF_THRESHOLD 480 +#define SQR_FFT_THRESHOLD 2976 diff --git a/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm new file mode 100644 index 0000000..da2ee3d --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm @@ -0,0 +1,364 @@ +dnl Alpha ev6 nails mpn_mul_1. 
+ +dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: 42 +C EV5: 18 +C EV6: 3.25 + +C TODO +C * Reroll loop for 3.0 c/l with current 4-way unrolling. +C * The loop is overscheduled wrt loads and wrt multiplies, in particular +C umulh. +C * Use FP loop count and multiple exit points, that would simplify feed-in lp0 +C and would work since the loop structure is really regular. + +C INPUT PARAMETERS +define(`rp',`r16') +define(`up',`r17') +define(`n', `r18') +define(`vl0',`r19') + +define(`numb_mask',`r6') + +define(`m0a',`r0') +define(`m0b',`r1') +define(`m1a',`r2') +define(`m1b',`r3') +define(`m2a',`r20') +define(`m2b',`r21') +define(`m3a',`r22') +define(`m3b',`r23') + +define(`acc0',`r25') +define(`acc1',`r27') + +define(`ul0',`r4') +define(`ul1',`r5') +define(`ul2',`r4') +define(`ul3',`r5') + +define(`rl0',`r24') +define(`rl1',`r24') +define(`rl2',`r24') +define(`rl3',`r24') + +define(`t0',`r7') +define(`t1',`r8') + +define(`NAIL_BITS',`GMP_NAIL_BITS') +define(`NUMB_BITS',`GMP_NUMB_BITS') + +dnl This declaration is munged by configure +NAILS_SUPPORT(1-63) + +ASM_START() +PROLOGUE(mpn_mul_1) + sll vl0, NAIL_BITS, vl0 + lda numb_mask, -1(r31) + srl numb_mask, NAIL_BITS, numb_mask + + and n, 3, r25 + cmpeq r25, 1, r21 + bne r21, L(1m4) + cmpeq r25, 2, r21 + bne r21, L(2m4) + beq r25, L(0m4) + +L(3m4): ldq ul3, 0(up) + lda n, -4(n) + ldq ul0, 8(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq ul1, 16(up) + lda up, 24(up) + lda rp, -8(rp) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge3) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + srl m3a,NAIL_BITS, t0 + addq t0, r31, acc1 + srl m0a,NAIL_BITS, t0 + addq t0, m3b, acc0 + srl acc1,NUMB_BITS, t1 + br r31, L(ta3) + +L(ge3): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + srl m3a,NAIL_BITS, t0 + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + addq t0, r31, acc1 + umulh vl0, ul2, m2b + srl m0a,NAIL_BITS, t0 + ldq ul0, 16(up) + mulq vl0, ul3, m3a + addq t0, m3b, acc0 + srl acc1,NUMB_BITS, t1 + br r31, L(el3) + +L(0m4): lda n, -8(n) + ldq ul2, 0(up) + ldq ul3, 8(up) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq ul0, 16(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq ul1, 24(up) + lda up, 32(up) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge4) + + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, r31, acc0 + umulh vl0, ul1, m1b + srl m3a,NAIL_BITS, t0 + addq t0, m2b, acc1 + srl acc0,NUMB_BITS, t1 + br r31, L(ta4) + 
+L(ge4): srl m2a,NAIL_BITS, t0 + ldq ul2, 0(up) + mulq vl0, ul1, m1a + addq t0, r31, acc0 + umulh vl0, ul1, m1b + srl m3a,NAIL_BITS, t0 + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + addq t0, m2b, acc1 + srl acc0,NUMB_BITS, t1 + br r31, L(el0) + +L(2m4): lda n, -4(n) + ldq ul0, 0(up) + ldq ul1, 8(up) + lda up, 16(up) + lda rp, -16(rp) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge2) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + srl m0a,NAIL_BITS, t0 + addq t0, r31, acc0 + srl m1a,NAIL_BITS, t0 + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + br r31, L(ta2) + +L(ge2): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + srl m0a,NAIL_BITS, t0 + ldq ul0, 16(up) + mulq vl0, ul3, m3a + addq t0, r31, acc0 + umulh vl0, ul3, m3b + srl m1a,NAIL_BITS, t0 + ldq ul1, 24(up) + lda up, 32(up) + lda rp, 32(rp) + mulq vl0, ul0, m0a + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + bge n, L(el2) + + br r31, L(ta6) + +L(1m4): lda n, -4(n) + ldq ul1, 0(up) + lda up, 8(up) + lda rp, -24(rp) + bge n, L(ge1) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + srl m1a,NAIL_BITS, t0 + addq t0, r31, acc1 + and acc1,numb_mask, r28 + srl acc1,NUMB_BITS, t1 + stq r28, 24(rp) + addq t1, m1b, r0 + ret r31, (r26), 1 + +L(ge1): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq ul0, 16(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + srl m1a,NAIL_BITS, t0 + ldq ul1, 24(up) + lda up, 32(up) + lda rp, 32(rp) + mulq vl0, ul0, m0a + addq t0, r31, acc1 + umulh vl0, ul0, m0b + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, m1b, acc0 + srl acc1,NUMB_BITS, t1 + blt n, L(ta5) + +L(ge5): ldq ul2, 0(up) + br r31, L(el1) + + ALIGN(16) +L(top): mulq vl0, ul0, m0a C U1 + addq t0, m0b, acc1 C L0 + srl acc0,NUMB_BITS, t1 C U0 + stq r28, -24(rp) C L1 +C +L(el2): umulh vl0, ul0, m0b C U1 + and acc0,numb_mask, r28 C L0 + unop C U0 + unop C L1 +C + unop C U1 + addq t1, acc1, acc1 C L0 + srl m2a,NAIL_BITS, t0 C U0 + ldq ul2, 0(up) C L1 +C + mulq vl0, ul1, m1a C U1 + addq t0, m1b, acc0 C L0 + srl acc1,NUMB_BITS, t1 C U0 + stq r28, -16(rp) C L1 +C +L(el1): umulh vl0, ul1, m1b C U1 + and acc1,numb_mask, r28 C L0 + unop C U0 + lda n, -4(n) C L1 +C + unop C U1 + addq t1, acc0, acc0 C L0 + srl m3a,NAIL_BITS, t0 C U0 + ldq ul3, 8(up) C L1 +C + mulq vl0, ul2, m2a C U1 + addq t0, m2b, acc1 C L0 + srl acc0,NUMB_BITS, t1 C U0 + stq r28, -8(rp) C L1 +C +L(el0): umulh vl0, ul2, m2b C U1 + and acc0,numb_mask, r28 C L0 + unop C U0 + unop C L1 +C + unop C U1 + addq t1, acc1, acc1 C L0 + srl m0a,NAIL_BITS, t0 C U0 + ldq ul0, 16(up) C L1 +C + mulq vl0, ul3, m3a C U1 + addq t0, m3b, acc0 C L0 + srl acc1,NUMB_BITS, t1 C U0 + stq r28, 0(rp) C L1 +C +L(el3): umulh vl0, ul3, m3b C U1 + and acc1,numb_mask, r28 C L0 + unop C U0 + unop C L1 +C + unop C U1 + addq t1, acc0, acc0 C L0 + srl m1a,NAIL_BITS, t0 C U0 + ldq ul1, 24(up) C L1 +C + lda up, 32(up) C L0 + unop C U1 + lda rp, 32(rp) C L1 + bge n, L(top) C U0 + +L(end): mulq vl0, ul0, m0a + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + stq r28, -24(rp) +L(ta6): umulh vl0, ul0, m0b + and acc0,numb_mask, r28 + addq t1, acc1, acc1 + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, m1b, acc0 + srl acc1,NUMB_BITS, t1 + stq r28, -16(rp) +L(ta5): umulh vl0, ul1, m1b + and acc1,numb_mask, r28 + addq t1, acc0, acc0 + srl m3a,NAIL_BITS, t0 + addq t0, m2b, acc1 + srl acc0,NUMB_BITS, t1 + stq r28, -8(rp) + ALIGN(16) +L(ta4): and 
acc0,numb_mask, r28 + addq t1, acc1, acc1 + srl m0a,NAIL_BITS, t0 + addq t0, m3b, acc0 + srl acc1,NUMB_BITS, t1 + stq r28, 0(rp) + unop + ALIGN(16) +L(ta3): and acc1,numb_mask, r28 + addq t1, acc0, acc0 + srl m1a,NAIL_BITS, t0 + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + stq r28, 8(rp) + unop + ALIGN(16) +L(ta2): and acc0,numb_mask, r28 + addq t1, acc1, acc1 + srl acc1,NUMB_BITS, t1 + stq r28, 16(rp) + and acc1,numb_mask, r28 + addq t1, m1b, r0 + stq r28, 24(rp) + ret r31, (r26), 1 +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm new file mode 100644 index 0000000..f473a59 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm @@ -0,0 +1,396 @@ +dnl Alpha ev6 nails mpn_submul_1. + +dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: 42 +C EV5: 18 +C EV6: 4 + +C TODO +C * Reroll loop for 3.75 c/l with current 4-way unrolling. +C * The loop is overscheduled wrt loads and wrt multiplies, in particular +C umulh. +C * Use FP loop count and multiple exit points, that would simplify feed-in lp0 +C and would work since the loop structure is really regular. 
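The structural difference from addmul_1 is that the running column value can go
negative, so the carry shifts are arithmetic (sra) where addmul_1 used logical
(srl). A C model, again reusing the definitions from the addmul_1 sketch; the
interface is ours, and it assumes arithmetic right shift of signed values
(which mainstream compilers provide, matching sra):

/* {rp,n} -= {up,n} * v; all limbs in nails form; returns the borrow limb. */
static limb_t
submul_1_nails (limb_t *rp, const limb_t *up, long n, limb_t v)
{
  int64_t cy = 0;                      /* pending high half plus borrow, >= 0 */
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v;
      int64_t t = (int64_t) rp[i] - (int64_t) ((limb_t) p & NUMB_MASK) - cy;
      rp[i] = (limb_t) t & NUMB_MASK;
      /* t >> NUMB_BITS is 0, -1 or -2, the C analogue of the sra result */
      cy = (int64_t) (limb_t) (p >> NUMB_BITS) - (t >> NUMB_BITS);
    }
  return (limb_t) cy;
}

With NAIL_BITS >= 2, this file's NAILS_SUPPORT(2-63), every intermediate
magnitude stays below 2^63, so the signed arithmetic cannot overflow.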
+ +C INPUT PARAMETERS +define(`rp',`r16') +define(`up',`r17') +define(`n', `r18') +define(`vl0',`r19') + +define(`numb_mask',`r6') + +define(`m0a',`r0') +define(`m0b',`r1') +define(`m1a',`r2') +define(`m1b',`r3') +define(`m2a',`r20') +define(`m2b',`r21') +define(`m3a',`r22') +define(`m3b',`r23') + +define(`acc0',`r25') +define(`acc1',`r27') + +define(`ul0',`r4') +define(`ul1',`r5') +define(`ul2',`r4') +define(`ul3',`r5') + +define(`rl0',`r24') +define(`rl1',`r24') +define(`rl2',`r24') +define(`rl3',`r24') + +define(`t0',`r7') +define(`t1',`r8') + +define(`NAIL_BITS',`GMP_NAIL_BITS') +define(`NUMB_BITS',`GMP_NUMB_BITS') + +dnl This declaration is munged by configure +NAILS_SUPPORT(2-63) + +ASM_START() +PROLOGUE(mpn_submul_1) + sll vl0, NAIL_BITS, vl0 + lda numb_mask, -1(r31) + srl numb_mask, NAIL_BITS, numb_mask + + and n, 3, r25 + cmpeq r25, 1, r21 + bne r21, L(1m4) + cmpeq r25, 2, r21 + bne r21, L(2m4) + beq r25, L(0m4) + +L(3m4): ldq ul3, 0(up) + lda n, -4(n) + ldq ul0, 8(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq ul1, 16(up) + lda up, 24(up) + lda rp, -8(rp) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge3) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + addq t0, r31, acc1 + subq rl3, acc1, acc1 + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + addq t0, m3b, acc0 + sra acc1,NUMB_BITS, t1 + br r31, L(ta3) + +L(ge3): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + addq t0, r31, acc1 + umulh vl0, ul2, m2b + subq rl3, acc1, acc1 + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + ldq ul0, 16(up) + mulq vl0, ul3, m3a + addq t0, m3b, acc0 + sra acc1,NUMB_BITS, t1 + br r31, L(el3) + +L(0m4): lda n, -8(n) + ldq ul2, 0(up) + ldq ul3, 8(up) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq ul0, 16(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq ul1, 24(up) + lda up, 32(up) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge4) + + ldq rl2, 0(rp) + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, r31, acc0 + umulh vl0, ul1, m1b + subq rl2, acc0, acc0 + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + addq t0, m2b, acc1 + sra acc0,NUMB_BITS, t1 + br r31, L(ta4) + +L(ge4): ldq rl2, 0(rp) + srl m2a,NAIL_BITS, t0 + ldq ul2, 0(up) + mulq vl0, ul1, m1a + addq t0, r31, acc0 + umulh vl0, ul1, m1b + subq rl2, acc0, acc0 + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + addq t0, m2b, acc1 + sra acc0,NUMB_BITS, t1 + br r31, L(el0) + +L(2m4): lda n, -4(n) + ldq ul0, 0(up) + ldq ul1, 8(up) + lda up, 16(up) + lda rp, -16(rp) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge2) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + addq t0, r31, acc0 + subq rl0, acc0, acc0 + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + addq t0, m0b, acc1 + sra acc0,NUMB_BITS, t1 + br r31, L(ta2) + +L(ge2): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + ldq ul0, 16(up) + mulq vl0, ul3, m3a + addq t0, r31, acc0 + umulh vl0, ul3, m3b + subq rl0, acc0, acc0 + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + ldq ul1, 24(up) + lda up, 32(up) + lda rp, 32(rp) + mulq vl0, ul0, m0a + addq t0, m0b, acc1 + sra acc0,NUMB_BITS, t1 + bge n, L(el2) + + br r31, L(ta6) + +L(1m4): lda n, -4(n) + ldq ul1, 0(up) + lda up, 8(up) + lda rp, -24(rp) + bge n, L(ge1) + + mulq vl0, ul1, m1a + umulh 
vl0, ul1, m1b + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + subq rl1, t0, acc1 + and acc1,numb_mask, r28 + sra acc1,NUMB_BITS, t1 + stq r28, 24(rp) + subq m1b, t1, r0 + ret r31, (r26), 1 + +L(ge1): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq ul0, 16(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + ldq ul1, 24(up) + lda up, 32(up) + lda rp, 32(rp) + mulq vl0, ul0, m0a + addq t0, r31, acc1 + umulh vl0, ul0, m0b + subq rl1, acc1, acc1 + ldq rl2, 0(rp) + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, m1b, acc0 + sra acc1,NUMB_BITS, t1 + blt n, L(ta5) + +L(ge5): ldq ul2, 0(up) + br r31, L(el1) + + ALIGN(16) +L(top): mulq vl0, ul0, m0a C U1 + addq t0, m0b, acc1 C L0 + sra acc0,NUMB_BITS, t1 C U0 + stq r28, -24(rp) C L1 +C +L(el2): umulh vl0, ul0, m0b C U1 + and acc0,numb_mask, r28 C L0 + subq rl1, acc1, acc1 C U0 + ldq rl2, 0(rp) C L1 +C + unop C U1 + addq t1, acc1, acc1 C L0 + srl m2a,NAIL_BITS, t0 C U0 + ldq ul2, 0(up) C L1 +C + mulq vl0, ul1, m1a C U1 + addq t0, m1b, acc0 C L0 + sra acc1,NUMB_BITS, t1 C U0 + stq r28, -16(rp) C L1 +C +L(el1): umulh vl0, ul1, m1b C U1 + and acc1,numb_mask, r28 C L0 + subq rl2, acc0, acc0 C U0 + ldq rl3, 8(rp) C L1 +C + lda n, -4(n) C L1 + addq t1, acc0, acc0 C L0 + srl m3a,NAIL_BITS, t0 C U0 + ldq ul3, 8(up) C L1 +C + mulq vl0, ul2, m2a C U1 + addq t0, m2b, acc1 C L0 + sra acc0,NUMB_BITS, t1 C U0 + stq r28, -8(rp) C L1 +C +L(el0): umulh vl0, ul2, m2b C U1 + and acc0,numb_mask, r28 C L0 + subq rl3, acc1, acc1 C U0 + ldq rl0, 16(rp) C L1 +C + unop C U1 + addq t1, acc1, acc1 C L0 + srl m0a,NAIL_BITS, t0 C U0 + ldq ul0, 16(up) C L1 +C + mulq vl0, ul3, m3a C U1 + addq t0, m3b, acc0 C L0 + sra acc1,NUMB_BITS, t1 C U0 + stq r28, 0(rp) C L1 +C +L(el3): umulh vl0, ul3, m3b C U1 + and acc1,numb_mask, r28 C L0 + subq rl0, acc0, acc0 C U0 + ldq rl1, 24(rp) C L1 +C + unop C U1 + addq t1, acc0, acc0 C L0 + srl m1a,NAIL_BITS, t0 C U0 + ldq ul1, 24(up) C L1 +C + lda up, 32(up) C L0 + unop C U1 + lda rp, 32(rp) C L1 + bge n, L(top) C U0 + +L(end): mulq vl0, ul0, m0a + addq t0, m0b, acc1 + sra acc0,NUMB_BITS, t1 + stq r28, -24(rp) +L(ta6): umulh vl0, ul0, m0b + and acc0,numb_mask, r28 + subq rl1, acc1, acc1 + ldq rl2, 0(rp) + addq t1, acc1, acc1 + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, m1b, acc0 + sra acc1,NUMB_BITS, t1 + stq r28, -16(rp) +L(ta5): umulh vl0, ul1, m1b + and acc1,numb_mask, r28 + subq rl2, acc0, acc0 + ldq rl3, 8(rp) + addq t1, acc0, acc0 + srl m3a,NAIL_BITS, t0 + addq t0, m2b, acc1 + sra acc0,NUMB_BITS, t1 + stq r28, -8(rp) + unop + ALIGN(16) +L(ta4): and acc0,numb_mask, r28 + subq rl3, acc1, acc1 + ldq rl0, 16(rp) + addq t1, acc1, acc1 + srl m0a,NAIL_BITS, t0 + addq t0, m3b, acc0 + sra acc1,NUMB_BITS, t1 + stq r28, 0(rp) + unop + ALIGN(16) +L(ta3): and acc1,numb_mask, r28 + subq rl0, acc0, acc0 + ldq rl1, 24(rp) + addq t1, acc0, acc0 + srl m1a,NAIL_BITS, t0 + addq t0, m0b, acc1 + sra acc0,NUMB_BITS, t1 + stq r28, 8(rp) + unop + ALIGN(16) +L(ta2): and acc0,numb_mask, r28 + subq rl1, acc1, acc1 + addq t1, acc1, acc1 + sra acc1,NUMB_BITS, t1 + stq r28, 16(rp) + and acc1,numb_mask, r28 + subq m1b, t1, r0 + stq r28, 24(rp) + ret r31, (r26), 1 +EPILOGUE() +ASM_END() |
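The sketches above are easy to smoke-test against plain 128-bit arithmetic. For
example, for the addmul_1 model (a hypothetical driver, not part of this diff):

#include <assert.h>

int
main (void)
{
  limb_t rp[1] = { 123 };
  limb_t up[1] = { NUMB_MASK };        /* largest valid nails limb */
  limb_t v = NUMB_MASK;
  unsigned __int128 ref = rp[0] + (unsigned __int128) up[0] * v;

  limb_t cy = addmul_1_nails (rp, up, 1, v);

  /* old rp + u*v == new rp + (cy << NUMB_BITS), and the nail stays zero */
  assert ((rp[0] | ((unsigned __int128) cy << NUMB_BITS)) == ref);
  assert ((rp[0] & ~NUMB_MASK) == 0);
  return 0;
}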