From a89a14ef5da44684a16b204e7a70460cc8c4922a Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Fri, 21 Jun 2024 23:36:36 +0200 Subject: Basic constant folding implementation --- .../gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm | 241 ++++++++ .../mpn/sparc64/ultrasparc1234/addmul_1.asm | 606 +++++++++++++++++++++ .../mpn/sparc64/ultrasparc1234/addmul_2.asm | 551 +++++++++++++++++++ .../mpn/sparc64/ultrasparc1234/lshiftc.asm | 165 ++++++ .../gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm | 580 ++++++++++++++++++++ .../mpn/sparc64/ultrasparc1234/sqr_diagonal.asm | 342 ++++++++++++ .../gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm | 241 ++++++++ .../mpn/sparc64/ultrasparc1234/submul_1.asm | 68 +++ 8 files changed, 2794 insertions(+) create mode 100644 vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm create mode 100644 vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm create mode 100644 vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm create mode 100644 vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm create mode 100644 vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm create mode 100644 vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm create mode 100644 vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm create mode 100644 vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm (limited to 'vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234') diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm new file mode 100644 index 0000000..92374d2 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm @@ -0,0 +1,241 @@ +dnl SPARC v9 mpn_add_n -- Add two limb vectors of the same length > 0 and +dnl store sum in a third limb vector. + +dnl Copyright 2001-2003, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 4 +C UltraSPARC 3: 4.5 + +C Compute carry-out from the most significant bits of u,v, and r, where +C r=u+v+carry_in, using logic operations. + +C This code runs at 4 cycles/limb on UltraSPARC 1 and 2. It has a 4 insn +C recurrency, and the UltraSPARC 1 and 2 the IE units are 100% saturated. +C Therefore, it seems futile to try to optimize this any further... 
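As an aside, the logic-operation carry trick described in the comment above can be written in portable C roughly as follows. This is only an illustrative sketch, not GMP code: the helper name and types are invented here, and it shows how the loop derives the carry from the top bits of u, v and r without any conditional.

#include <stdint.h>

typedef uint64_t mp_limb;

/* With r = u + v + cy_in (mod 2^64) and cy_in in {0,1}, the carry out of
   bit 63 is the top bit of (u & v) | ((u | v) & ~r) -- exactly what the
   and/andn/or/srlx sequence in .Loop computes.  */
static mp_limb add_limb (mp_limb u, mp_limb v, mp_limb cy_in, mp_limb *cy_out)
{
  mp_limb r = u + v + cy_in;
  *cy_out = ((u & v) | ((u | v) & ~r)) >> 63;
  return r;
}

A straightforward driver applying this limb by limb reproduces the function's semantics, just without the 4 cycles/limb software pipeline.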
+ +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`vp', `%i2') +define(`n', `%i3') + +define(`u0', `%l0') +define(`u1', `%l2') +define(`u2', `%l4') +define(`u3', `%l6') +define(`v0', `%l1') +define(`v1', `%l3') +define(`v2', `%l5') +define(`v3', `%l7') + +define(`cy',`%i4') + +define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe +define(`fmnop',`fmuld %f0,%f0,%f4') dnl A quasi nop running in the FM pipe + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_add_nc) + save %sp,-160,%sp + + fitod %f0,%f0 C make sure f0 contains small, quiet number + subcc n,4,%g0 + bl,pn %xcc,.Loop0 + nop + b,a L(com) +EPILOGUE() + +PROLOGUE(mpn_add_n) + save %sp,-160,%sp + + fitod %f0,%f0 C make sure f0 contains small, quiet number + subcc n,4,%g0 + bl,pn %xcc,.Loop0 + mov 0,cy +L(com): + ldx [up+0],u0 + ldx [vp+0],v0 + add up,32,up + ldx [up-24],u1 + ldx [vp+8],v1 + add vp,32,vp + ldx [up-16],u2 + ldx [vp-16],v2 + ldx [up-8],u3 + ldx [vp-8],v3 + subcc n,8,n + add u0,v0,%g1 C main add + add %g1,cy,%g5 C carry add + or u0,v0,%g2 + bl,pn %xcc,.Lend4567 + fanop + b,a .Loop + + .align 16 +C START MAIN LOOP +.Loop: andn %g2,%g5,%g2 + and u0,v0,%g3 + ldx [up+0],u0 + fanop +C -- + or %g3,%g2,%g2 + ldx [vp+0],v0 + add up,32,up + fanop +C -- + srlx %g2,63,cy + add u1,v1,%g1 + stx %g5,[rp+0] + fanop +C -- + add %g1,cy,%g5 + or u1,v1,%g2 + fmnop + fanop +C -- + andn %g2,%g5,%g2 + and u1,v1,%g3 + ldx [up-24],u1 + fanop +C -- + or %g3,%g2,%g2 + ldx [vp+8],v1 + add vp,32,vp + fanop +C -- + srlx %g2,63,cy + add u2,v2,%g1 + stx %g5,[rp+8] + fanop +C -- + add %g1,cy,%g5 + or u2,v2,%g2 + fmnop + fanop +C -- + andn %g2,%g5,%g2 + and u2,v2,%g3 + ldx [up-16],u2 + fanop +C -- + or %g3,%g2,%g2 + ldx [vp-16],v2 + add rp,32,rp + fanop +C -- + srlx %g2,63,cy + add u3,v3,%g1 + stx %g5,[rp-16] + fanop +C -- + add %g1,cy,%g5 + or u3,v3,%g2 + fmnop + fanop +C -- + andn %g2,%g5,%g2 + and u3,v3,%g3 + ldx [up-8],u3 + fanop +C -- + or %g3,%g2,%g2 + subcc n,4,n + ldx [vp-8],v3 + fanop +C -- + srlx %g2,63,cy + add u0,v0,%g1 + stx %g5,[rp-8] + fanop +C -- + add %g1,cy,%g5 + or u0,v0,%g2 + bge,pt %xcc,.Loop + fanop +C END MAIN LOOP +.Lend4567: + andn %g2,%g5,%g2 + and u0,v0,%g3 + or %g3,%g2,%g2 + srlx %g2,63,cy + add u1,v1,%g1 + stx %g5,[rp+0] + add %g1,cy,%g5 + or u1,v1,%g2 + andn %g2,%g5,%g2 + and u1,v1,%g3 + or %g3,%g2,%g2 + srlx %g2,63,cy + add u2,v2,%g1 + stx %g5,[rp+8] + add %g1,cy,%g5 + or u2,v2,%g2 + andn %g2,%g5,%g2 + and u2,v2,%g3 + or %g3,%g2,%g2 + add rp,32,rp + srlx %g2,63,cy + add u3,v3,%g1 + stx %g5,[rp-16] + add %g1,cy,%g5 + or u3,v3,%g2 + andn %g2,%g5,%g2 + and u3,v3,%g3 + or %g3,%g2,%g2 + srlx %g2,63,cy + stx %g5,[rp-8] + + addcc n,4,n + bz,pn %xcc,.Lret + fanop + +.Loop0: ldx [up],u0 + add up,8,up + ldx [vp],v0 + add vp,8,vp + add rp,8,rp + subcc n,1,n + add u0,v0,%g1 + or u0,v0,%g2 + add %g1,cy,%g5 + and u0,v0,%g3 + andn %g2,%g5,%g2 + stx %g5,[rp-8] + or %g3,%g2,%g2 + bnz,pt %xcc,.Loop0 + srlx %g2,63,cy + +.Lret: mov cy,%i0 + ret + restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm new file mode 100644 index 0000000..48a9414 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm @@ -0,0 +1,606 @@ +dnl SPARC v9 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright 1998, 2000-2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 14 +C UltraSPARC 3: 17.5 + +C Algorithm: We use eight floating-point multiplies per limb product, with the +C invariant v operand split into four 16-bit pieces, and the up operand split +C into 32-bit pieces. We sum pairs of 48-bit partial products using +C floating-point add, then convert the four 49-bit product-sums and transfer +C them to the integer unit. + +C Possible optimizations: +C 0. Rewrite to use algorithm of mpn_addmul_2. +C 1. Align the stack area where we transfer the four 49-bit product-sums +C to a 32-byte boundary. That would minimize the cache collision. +C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would +C be to align the area to map to the area immediately before up?) +C 2. Sum the 4 49-bit quantities using 32-bit operations, as in the +C develop mpn_addmul_2. This would save many integer instructions. +C 3. Unrolling. Questionable if it is worth the code expansion, given that +C it could only save 1 cycle/limb. +C 4. Specialize for particular v values. If its upper 32 bits are zero, we +C could save many operations, in the FPU (fmuld), but more so in the IEU +C since we'll be summing 48-bit quantities, which might be simpler. +C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and +C the i00,i16,i32,i48 RAW less apart. The latter apart-scheduling should +C not be greater than needed for L2 cache latency, and also not so great +C that i16 needs to be copied. +C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want +C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU +C ops.) + +C Instruction classification (as per UltraSPARC-1/2 functional units): +C 8 FM +C 10 FA +C 12 MEM +C 10 ISHIFT + 14 IADDLOG +C 1 BRANCH +C 55 insns totally (plus one mov insn that should be optimized out) + +C The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e we +C sustain the peak execution rate of 4 instructions/cycle. 
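For readers not fluent in the pipelined FP code, here is a portable C model of what one loop iteration computes. It is a sketch only, not GMP code: the helper name is invented, and unsigned __int128 (a GCC/Clang extension) stands in for the shift-and-add reassembly the assembly performs in the IEU.

#include <stdint.h>

typedef uint64_t mp_limb;

/* One addmul_1 step: return the low 64 bits of rp[i] + up[i]*v + cy and
   leave the high part in *cy.  v is split into four 16-bit pieces and u
   into two 32-bit pieces, so each of the eight partial products fits in
   48 bits and is exact in a double in the real code.  */
static mp_limb addmul_1_step (mp_limb u, mp_limb v, mp_limb r, mp_limb *cy)
{
  mp_limb u00 = u & 0xffffffff, u32 = u >> 32;
  mp_limb v00 = v & 0xffff,         v16 = (v >> 16) & 0xffff,
          v32 = (v >> 32) & 0xffff, v48 = v >> 48;

  unsigned __int128 acc = (unsigned __int128) r + *cy
    +  (unsigned __int128) (u00 * v00)
    + ((unsigned __int128) (u00 * v16) << 16)
    + ((unsigned __int128) (u00 * v32) << 32)
    + ((unsigned __int128) (u32 * v00) << 32)
    + ((unsigned __int128) (u00 * v48) << 48)
    + ((unsigned __int128) (u32 * v16) << 48)
    + ((unsigned __int128) (u32 * v32) << 64)
    + ((unsigned __int128) (u32 * v48) << 80);

  *cy = (mp_limb) (acc >> 64);
  return (mp_limb) acc;
}

The loop proper instead folds u32*v32 and u32*v48 of limb i into limb i+1's product-sums at offsets 0 and 16, which is what keeps every quantity converted with fdtox within 49 bits.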
+ +C INPUT PARAMETERS +C rp i0 +C up i1 +C n i2 +C v i3 + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) + +define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14') +define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22') +define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30') +define(`u00',`%f32') define(`u32', `%f34') +define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42') +define(`cy',`%g1') +define(`rlimb',`%g3') +define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3') +define(`xffffffff',`%l7') +define(`xffff',`%o0') + +PROLOGUE(mpn_addmul_1) + +C Initialization. (1) Split v operand into four 16-bit chunks and store them +C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs +C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'. + + save %sp, -256, %sp + mov -1, %g4 + srlx %g4, 48, xffff C store mask in register `xffff' + and %i3, xffff, %g2 + stx %g2, [%sp+2223+0] + srlx %i3, 16, %g3 + and %g3, xffff, %g3 + stx %g3, [%sp+2223+8] + srlx %i3, 32, %g2 + and %g2, xffff, %g2 + stx %g2, [%sp+2223+16] + srlx %i3, 48, %g3 + stx %g3, [%sp+2223+24] + srlx %g4, 32, xffffffff C store mask in register `xffffffff' + + sllx %i2, 3, %i2 + mov 0, cy C clear cy + add %i0, %i2, %i0 + add %i1, %i2, %i1 + neg %i2 + add %i1, 4, %i5 + add %i0, -32, %i4 + add %i0, -16, %i0 + + ldd [%sp+2223+0], v00 + ldd [%sp+2223+8], v16 + ldd [%sp+2223+16], v32 + ldd [%sp+2223+24], v48 + ld [%sp+2223+0],%f2 C zero f2 + ld [%sp+2223+0],%f4 C zero f4 + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fxtod v00, v00 + fxtod v16, v16 + fxtod v32, v32 + fxtod v48, v48 + +C Start real work. (We sneakingly read f3 and f5 above...) +C The software pipeline is very deep, requiring 4 feed-in stages. 
+ + fxtod %f2, u00 + fxtod %f4, u32 + fmuld u00, v00, a00 + fmuld u00, v16, a16 + fmuld u00, v32, p32 + fmuld u32, v00, r32 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %xcc, .L_two_or_more + fmuld u32, v16, r48 + +.L_one: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + fdtox a32, a32 + fdtox a48, a48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + std a32, [%sp+2223+16] + std a48, [%sp+2223+24] + add %i2, 8, %i2 + + fdtox r64, a00 + ldx [%i0+%i2], rlimb C read rp[i] + fdtox r80, a16 + ldx [%sp+2223+0], i00 + ldx [%sp+2223+8], i16 + ldx [%sp+2223+16], i32 + ldx [%sp+2223+24], i48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + add %i2, 8, %i2 + + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + add i00, %g5, %g5 C i00+ now in g5 + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + sllx i48, 32, %l6 C (i48 << 32) + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_1 + add %i2, 8, %i2 + +.L_two_or_more: + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + fdtox a32, a32 + fxtod %f2, u00 + fxtod %f4, u32 + fdtox a48, a48 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + faddd p16, r80, a16 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %xcc, .L_three_or_more + fmuld u32, v16, r48 + +.L_two: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + ldx [%sp+2223+8], i16 + ldx [%sp+2223+16], i32 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + std a32, [%sp+2223+16] + std a48, [%sp+2223+24] + add %i2, 8, %i2 + + fdtox r64, a00 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + add i00, %g5, %g5 C i00+ now in g5 + fdtox r80, a16 + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_2 + add %i2, 8, %i2 + +.L_three_or_more: + ld [%i5+%i2], %f3 C read low 32 
bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + faddd p16, r80, a16 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %xcc, .L_four_or_more + fmuld u32, v16, r48 + +.L_three: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 + add i00, %g5, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_3 + add %i2, 8, %i2 + +.L_four_or_more: + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 + add i00, %g5, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + faddd p16, r80, a16 + fmuld u00, v48, p48 + add cy, %g5, %o4 C x = prev(i00) + cy + addcc %i2, 8, %i2 + bnz,pt %xcc, .Loop + fmuld u32, v16, r48 + +.L_four: + b,a .L_out_4 + +C BEGIN MAIN LOOP + .align 16 +.Loop: +C 00 + srlx %o4, 16, %o5 C (x >> 16) + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 +C 01 + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + ld 
[%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 +C 02 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 +C 03 + srlx %o2, 48, %o7 C (mi64 >> 48) + add i00, %g5, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 +C 04 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + fdtox a32, a32 +C 05 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 +C 06 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 +C 07 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + ldx [%sp+2223+24], i48 + fdtox a48, a48 +C 08 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 +C 09 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + fmuld u00, v16, p16 +C 10 + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 +C 11 + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 +C 12 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + faddd p16, r80, a16 + fmuld u00, v48, p48 +C 13 + add cy, %g5, %o4 C x = prev(i00) + cy + addcc %i2, 8, %i2 + bnz,pt %xcc, .Loop + fmuld u32, v16, r48 +C END MAIN LOOP + +.L_out_4: + srlx %o4, 16, %o5 C (x >> 16) + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + fdtox a00, a00 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 + srlx %o2, 48, %o7 C (mi64 >> 48) + add i00, %g5, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_3: + srlx %o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + fdtox r64, a00 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + srlx %o2, 48, %o7 C (mi64 >> 48) + add i00, %g5, %g5 C i00+ now in g5 + fdtox r80, a16 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + ldx [%sp+2223+24], i48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C 
hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_2: + srlx %o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + srlx %o2, 48, %o7 C (mi64 >> 48) + add i00, %g5, %g5 C i00+ now in g5 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_1: + srlx %o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + srlx %o2, 48, %o7 C (mi64 >> 48) + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + or %i3, %o5, %o5 + stx %o5, [%i4+%i2] + + sllx i00, 0, %g2 + add %g2, cy, cy + sllx i16, 16, %g3 + add %g3, cy, cy + + return %i7+8 + mov cy, %o0 +EPILOGUE(mpn_addmul_1) diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm new file mode 100644 index 0000000..37674d7 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm @@ -0,0 +1,551 @@ +dnl SPARC v9 64-bit mpn_addmul_2 -- Multiply an n limb number with 2-limb +dnl number and add the result to a n limb vector. + +dnl Copyright 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 9 +C UltraSPARC 3: 10 + +C Algorithm: We use 16 floating-point multiplies per limb product, with the +C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand +C split into 32-bit pieces. 
We sum four 48-bit partial products using +C floating-point add, then convert the resulting four 50-bit quantities and +C transfer them to the integer unit. + +C Possible optimizations: +C 1. Align the stack area where we transfer the four 50-bit product-sums +C to a 32-byte boundary. That would minimize the cache collision. +C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would +C be to align the area to map to the area immediately before up?) +C 2. Perform two of the fp->int conversions with integer instructions. We +C can get almost ten free IEU slots, if we clean up bookkeeping and the +C silly carry-limb code. +C 3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb +C code. + +C OSP (Overlapping software pipeline) version of mpn_mul_basecase: +C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles. +C FI = 20 +C L = 9 x un * vn +C WDFI = 10 x vn / 2 +C WD = 4 + +C Instruction classification (as per UltraSPARC functional units). +C Assuming silly carry code is fixed. Includes bookkeeping. +C +C mpn_addmul_X mpn_mul_X +C 1 2 1 2 +C ========== ========== +C FM 8 16 8 16 +C FA 10 18 10 18 +C MEM 12 12 10 10 +C ISHIFT 6 6 6 6 +C IADDLOG 11 11 10 10 +C BRANCH 1 1 1 1 +C +C TOTAL IEU 17 17 16 16 +C TOTAL 48 64 45 61 +C +C IEU cycles 8.5 8.5 8 8 +C MEM cycles 12 12 10 10 +C ISSUE cycles 12 16 11.25 15.25 +C FPU cycles 10 18 10 18 +C cycles/loop 12 18 12 18 +C cycles/limb 12 9 12 9 + + +C INPUT PARAMETERS +C rp[n + 1] i0 +C up[n] i1 +C n i2 +C vp[2] i3 + + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) + +C Combine registers: +C u00_hi= u32_hi +C u00_lo= u32_lo +C a000 = out000 +C a016 = out016 +C Free: f52 f54 + + +define(`p000', `%f8') define(`p016',`%f10') +define(`p032',`%f12') define(`p048',`%f14') +define(`p064',`%f16') define(`p080',`%f18') +define(`p096a',`%f20') define(`p112a',`%f22') +define(`p096b',`%f56') define(`p112b',`%f58') + +define(`out000',`%f0') define(`out016',`%f6') + +define(`v000',`%f24') define(`v016',`%f26') +define(`v032',`%f28') define(`v048',`%f30') +define(`v064',`%f44') define(`v080',`%f46') +define(`v096',`%f48') define(`v112',`%f50') + +define(`u00',`%f32') define(`u32', `%f34') + +define(`a000',`%f36') define(`a016',`%f38') +define(`a032',`%f40') define(`a048',`%f42') +define(`a064',`%f60') define(`a080',`%f62') + +define(`u00_hi',`%f2') define(`u32_hi',`%f4') +define(`u00_lo',`%f3') define(`u32_lo',`%f5') + +define(`cy',`%g1') +define(`rlimb',`%g3') +define(`i00',`%l0') define(`i16',`%l1') +define(`r00',`%l2') define(`r32',`%l3') +define(`xffffffff',`%l7') +define(`xffff',`%o0') + + +PROLOGUE(mpn_addmul_2) + +C Initialization. (1) Split v operand into eight 16-bit chunks and store them +C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs +C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'. +C This code could be better scheduled. 
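A minimal C sketch of just this initialization step, assuming the non-VIS path below (the function name is invented for illustration): each of the two v limbs is cut into four 16-bit chunks and every chunk is converted to an IEEE double, which is exact since 16-bit integers, and the later partial products of at most 48 bits, fit comfortably in a 53-bit mantissa.

#include <stdint.h>

typedef uint64_t mp_limb;

/* Split vp[0] and vp[1] into eight 16-bit chunks, least significant first,
   and convert each chunk to a double (v000, v016, ..., v112 in the code). */
static void split_v (const mp_limb vp[2], double v[8])
{
  for (int i = 0; i < 2; i++)
    for (int j = 0; j < 4; j++)
      v[4 * i + j] = (double) ((vp[i] >> (16 * j)) & 0xffff);
}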
+ + save %sp, -256, %sp + +ifdef(`HAVE_VIS', +` mov -1, %g4 + wr %g0, 0xD2, %asi + srlx %g4, 32, xffffffff C store mask in register `xffffffff' + ldda [%i3+6] %asi, v000 + ldda [%i3+4] %asi, v016 + ldda [%i3+2] %asi, v032 + ldda [%i3+0] %asi, v048 + fxtod v000, v000 + ldda [%i3+14] %asi, v064 + fxtod v016, v016 + ldda [%i3+12] %asi, v080 + fxtod v032, v032 + ldda [%i3+10] %asi, v096 + fxtod v048, v048 + ldda [%i3+8] %asi, v112 + fxtod v064, v064 + fxtod v080, v080 + fxtod v096, v096 + fxtod v112, v112 + fzero u00_hi + fzero u32_hi +', +` mov -1, %g4 + ldx [%i3+0], %l0 C vp[0] + srlx %g4, 48, xffff C store mask in register `xffff' + ldx [%i3+8], %l1 C vp[1] + + and %l0, xffff, %g2 + stx %g2, [%sp+2223+0] + srlx %l0, 16, %g3 + and %g3, xffff, %g3 + stx %g3, [%sp+2223+8] + srlx %l0, 32, %g2 + and %g2, xffff, %g2 + stx %g2, [%sp+2223+16] + srlx %l0, 48, %g3 + stx %g3, [%sp+2223+24] + and %l1, xffff, %g2 + stx %g2, [%sp+2223+32] + srlx %l1, 16, %g3 + and %g3, xffff, %g3 + stx %g3, [%sp+2223+40] + srlx %l1, 32, %g2 + and %g2, xffff, %g2 + stx %g2, [%sp+2223+48] + srlx %l1, 48, %g3 + stx %g3, [%sp+2223+56] + + srlx %g4, 32, xffffffff C store mask in register `xffffffff' + + ldd [%sp+2223+0], v000 + ldd [%sp+2223+8], v016 + ldd [%sp+2223+16], v032 + ldd [%sp+2223+24], v048 + fxtod v000, v000 + ldd [%sp+2223+32], v064 + fxtod v016, v016 + ldd [%sp+2223+40], v080 + fxtod v032, v032 + ldd [%sp+2223+48], v096 + fxtod v048, v048 + ldd [%sp+2223+56], v112 + fxtod v064, v064 + ld [%sp+2223+0], u00_hi C zero u00_hi + fxtod v080, v080 + ld [%sp+2223+0], u32_hi C zero u32_hi + fxtod v096, v096 + fxtod v112, v112 +') +C Initialization done. + mov 0, %g2 + mov 0, rlimb + mov 0, %g4 + add %i0, -8, %i0 C BOOKKEEPING + +C Start software pipeline. + + ld [%i1+4], u00_lo C read low 32 bits of up[i] + fxtod u00_hi, u00 +C mid + ld [%i1+0], u32_lo C read high 32 bits of up[i] + fmuld u00, v000, a000 + fmuld u00, v016, a016 + fmuld u00, v032, a032 + fmuld u00, v048, a048 + add %i2, -1, %i2 C BOOKKEEPING + fmuld u00, v064, p064 + add %i1, 8, %i1 C BOOKKEEPING + fxtod u32_hi, u32 + fmuld u00, v080, p080 + fmuld u00, v096, p096a + brnz,pt %i2, .L_2_or_more + fmuld u00, v112, p112a + +.L1: fdtox a000, out000 + fmuld u32, v000, p000 + fdtox a016, out016 + fmuld u32, v016, p016 + fmovd p064, a064 + fmuld u32, v032, p032 + fmovd p080, a080 + fmuld u32, v048, p048 + std out000, [%sp+2223+16] + faddd p000, a032, a000 + fmuld u32, v064, p064 + std out016, [%sp+2223+24] + fxtod u00_hi, u00 + faddd p016, a048, a016 + fmuld u32, v080, p080 + faddd p032, a064, a032 + fmuld u32, v096, p096b + faddd p048, a080, a048 + fmuld u32, v112, p112b +C mid + fdtox a000, out000 + fdtox a016, out016 + faddd p064, p096a, a064 + faddd p080, p112a, a080 + std out000, [%sp+2223+0] + b .L_wd2 + std out016, [%sp+2223+8] + +.L_2_or_more: + ld [%i1+4], u00_lo C read low 32 bits of up[i] + fdtox a000, out000 + fmuld u32, v000, p000 + fdtox a016, out016 + fmuld u32, v016, p016 + fmovd p064, a064 + fmuld u32, v032, p032 + fmovd p080, a080 + fmuld u32, v048, p048 + std out000, [%sp+2223+16] + faddd p000, a032, a000 + fmuld u32, v064, p064 + std out016, [%sp+2223+24] + fxtod u00_hi, u00 + faddd p016, a048, a016 + fmuld u32, v080, p080 + faddd p032, a064, a032 + fmuld u32, v096, p096b + faddd p048, a080, a048 + fmuld u32, v112, p112b +C mid + ld [%i1+0], u32_lo C read high 32 bits of up[i] + fdtox a000, out000 + fmuld u00, v000, p000 + fdtox a016, out016 + fmuld u00, v016, p016 + faddd p064, p096a, a064 + fmuld u00, v032, p032 + faddd p080, p112a, a080 + 
fmuld u00, v048, p048 + add %i2, -1, %i2 C BOOKKEEPING + std out000, [%sp+2223+0] + faddd p000, a032, a000 + fmuld u00, v064, p064 + add %i1, 8, %i1 C BOOKKEEPING + std out016, [%sp+2223+8] + fxtod u32_hi, u32 + faddd p016, a048, a016 + fmuld u00, v080, p080 + faddd p032, a064, a032 + fmuld u00, v096, p096a + faddd p048, a080, a048 + brnz,pt %i2, .L_3_or_more + fmuld u00, v112, p112a + + b .Lend + nop + +C 64 32 0 +C . . . +C . |__rXXX_| 32 +C . |___cy___| 34 +C . |_______i00__| 50 +C |_______i16__| . 50 + + +C BEGIN MAIN LOOP + .align 16 +.L_3_or_more: +.Loop: ld [%i1+4], u00_lo C read low 32 bits of up[i] + and %g2, xffffffff, %g2 + fdtox a000, out000 + fmuld u32, v000, p000 +C + lduw [%i0+4+8], r00 C read low 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a016, out016 + fmuld u32, v016, p016 +C + srlx %l5, 32, cy + ldx [%sp+2223+16], i00 + faddd p064, p096b, a064 + fmuld u32, v032, p032 +C + add %g4, cy, cy C new cy + ldx [%sp+2223+24], i16 + faddd p080, p112b, a080 + fmuld u32, v048, p048 +C + nop + std out000, [%sp+2223+16] + faddd p000, a032, a000 + fmuld u32, v064, p064 +C + add i00, r00, rlimb + add %i0, 8, %i0 C BOOKKEEPING + std out016, [%sp+2223+24] + fxtod u00_hi, u00 +C + sllx i16, 16, %g2 + add cy, rlimb, rlimb + faddd p016, a048, a016 + fmuld u32, v080, p080 +C + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + faddd p032, a064, a032 + fmuld u32, v096, p096b +C + stw %l5, [%i0+4] + nop + faddd p048, a080, a048 + fmuld u32, v112, p112b +C midloop + ld [%i1+0], u32_lo C read high 32 bits of up[i] + and %g2, xffffffff, %g2 + fdtox a000, out000 + fmuld u00, v000, p000 +C + lduw [%i0+0], r32 C read high 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a016, out016 + fmuld u00, v016, p016 +C + srlx %l5, 32, cy + ldx [%sp+2223+0], i00 + faddd p064, p096a, a064 + fmuld u00, v032, p032 +C + add %g4, cy, cy C new cy + ldx [%sp+2223+8], i16 + faddd p080, p112a, a080 + fmuld u00, v048, p048 +C + add %i2, -1, %i2 C BOOKKEEPING + std out000, [%sp+2223+0] + faddd p000, a032, a000 + fmuld u00, v064, p064 +C + add i00, r32, rlimb + add %i1, 8, %i1 C BOOKKEEPING + std out016, [%sp+2223+8] + fxtod u32_hi, u32 +C + sllx i16, 16, %g2 + add cy, rlimb, rlimb + faddd p016, a048, a016 + fmuld u00, v080, p080 +C + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + faddd p032, a064, a032 + fmuld u00, v096, p096a +C + stw %l5, [%i0+0] + faddd p048, a080, a048 + brnz,pt %i2, .Loop + fmuld u00, v112, p112a +C END MAIN LOOP + +C WIND-DOWN PHASE 1 +.Lend: and %g2, xffffffff, %g2 + fdtox a000, out000 + fmuld u32, v000, p000 + lduw [%i0+4+8], r00 C read low 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a016, out016 + fmuld u32, v016, p016 + srlx %l5, 32, cy + ldx [%sp+2223+16], i00 + faddd p064, p096b, a064 + fmuld u32, v032, p032 + add %g4, cy, cy C new cy + ldx [%sp+2223+24], i16 + faddd p080, p112b, a080 + fmuld u32, v048, p048 + std out000, [%sp+2223+16] + faddd p000, a032, a000 + fmuld u32, v064, p064 + add i00, r00, rlimb + add %i0, 8, %i0 C BOOKKEEPING + std out016, [%sp+2223+24] + sllx i16, 16, %g2 + add cy, rlimb, rlimb + faddd p016, a048, a016 + fmuld u32, v080, p080 + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + faddd p032, a064, a032 + fmuld u32, v096, p096b + stw %l5, [%i0+4] + faddd p048, a080, a048 + fmuld u32, v112, p112b +C mid + and %g2, xffffffff, %g2 + fdtox a000, out000 + lduw [%i0+0], r32 C read high 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a016, out016 + srlx %l5, 32, cy + ldx [%sp+2223+0], i00 + faddd p064, p096a, a064 + add %g4, cy, cy C new cy + ldx [%sp+2223+8], i16 + faddd p080, p112a, a080 + 
std out000, [%sp+2223+0] + add i00, r32, rlimb + std out016, [%sp+2223+8] + sllx i16, 16, %g2 + add cy, rlimb, rlimb + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + stw %l5, [%i0+0] + +C WIND-DOWN PHASE 2 +.L_wd2: and %g2, xffffffff, %g2 + fdtox a032, out000 + lduw [%i0+4+8], r00 C read low 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a048, out016 + srlx %l5, 32, cy + ldx [%sp+2223+16], i00 + add %g4, cy, cy C new cy + ldx [%sp+2223+24], i16 + std out000, [%sp+2223+16] + add i00, r00, rlimb + add %i0, 8, %i0 C BOOKKEEPING + std out016, [%sp+2223+24] + sllx i16, 16, %g2 + add cy, rlimb, rlimb + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + stw %l5, [%i0+4] +C mid + and %g2, xffffffff, %g2 + fdtox a064, out000 + lduw [%i0+0], r32 C read high 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a080, out016 + srlx %l5, 32, cy + ldx [%sp+2223+0], i00 + add %g4, cy, cy C new cy + ldx [%sp+2223+8], i16 + std out000, [%sp+2223+0] + add i00, r32, rlimb + std out016, [%sp+2223+8] + sllx i16, 16, %g2 + add cy, rlimb, rlimb + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + stw %l5, [%i0+0] + +C WIND-DOWN PHASE 3 +.L_wd3: and %g2, xffffffff, %g2 + fdtox p096b, out000 + add %g2, rlimb, %l5 + fdtox p112b, out016 + srlx %l5, 32, cy + ldx [%sp+2223+16], rlimb + add %g4, cy, cy C new cy + ldx [%sp+2223+24], i16 + std out000, [%sp+2223+16] + add %i0, 8, %i0 C BOOKKEEPING + std out016, [%sp+2223+24] + sllx i16, 16, %g2 + add cy, rlimb, rlimb + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + stw %l5, [%i0+4] +C mid + and %g2, xffffffff, %g2 + add %g2, rlimb, %l5 + srlx %l5, 32, cy + ldx [%sp+2223+0], rlimb + add %g4, cy, cy C new cy + ldx [%sp+2223+8], i16 + sllx i16, 16, %g2 + add cy, rlimb, rlimb + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + stw %l5, [%i0+0] + + and %g2, xffffffff, %g2 + add %g2, rlimb, %l5 + srlx %l5, 32, cy + ldx [%sp+2223+16], i00 + add %g4, cy, cy C new cy + ldx [%sp+2223+24], i16 + + sllx i16, 16, %g2 + add i00, cy, cy + return %i7+8 + add %g2, cy, %o0 +EPILOGUE(mpn_addmul_2) diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm new file mode 100644 index 0000000..47286d5 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm @@ -0,0 +1,165 @@ +dnl SPARC v9 mpn_lshiftc + +dnl Copyright 1996, 2000-2003, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 3 +C UltraSPARC 3: 2.67 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`cnt',`%i3') + +define(`u0', `%l0') +define(`u1', `%l2') +define(`u2', `%l4') +define(`u3', `%l6') + +define(`tnc',`%i4') + +define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_lshiftc) + save %sp,-160,%sp + + sllx n,3,%g1 + sub %g0,cnt,tnc C negate shift count + add up,%g1,up C make %o1 point at end of src + add rp,%g1,rp C make %o0 point at end of res + ldx [up-8],u3 C load first limb + subcc n,5,n + srlx u3,tnc,%i5 C compute function result + bl,pn %xcc,.Lend1234 + sllx u3,cnt,%g3 + + subcc n,4,n + ldx [up-16],u0 + ldx [up-24],u1 + add up,-32,up + ldx [up-0],u2 + ldx [up-8],u3 + srlx u0,tnc,%g2 + bl,pn %xcc,.Lend5678 + not %g3, %g3 + + b,a .Loop + ALIGN(16) +.Loop: + sllx u0,cnt,%g1 + andn %g3,%g2,%g3 + ldx [up-16],u0 + fanop +C -- + srlx u1,tnc,%g2 + subcc n,4,n + stx %g3,[rp-8] + not %g1, %g1 +C -- + sllx u1,cnt,%g3 + andn %g1,%g2,%g1 + ldx [up-24],u1 + fanop +C -- + srlx u2,tnc,%g2 + stx %g1,[rp-16] + add up,-32,up + not %g3, %g3 +C -- + sllx u2,cnt,%g1 + andn %g3,%g2,%g3 + ldx [up-0],u2 + fanop +C -- + srlx u3,tnc,%g2 + stx %g3,[rp-24] + add rp,-32,rp + not %g1, %g1 +C -- + sllx u3,cnt,%g3 + andn %g1,%g2,%g1 + ldx [up-8],u3 + fanop +C -- + srlx u0,tnc,%g2 + stx %g1,[rp-0] + bge,pt %xcc,.Loop + not %g3, %g3 +C -- +.Lend5678: + sllx u0,cnt,%g1 + andn %g3,%g2,%g3 + srlx u1,tnc,%g2 + stx %g3,[rp-8] + not %g1, %g1 + sllx u1,cnt,%g3 + andn %g1,%g2,%g1 + srlx u2,tnc,%g2 + stx %g1,[rp-16] + not %g3, %g3 + sllx u2,cnt,%g1 + andn %g3,%g2,%g3 + srlx u3,tnc,%g2 + stx %g3,[rp-24] + add rp,-32,rp + not %g1, %g1 + sllx u3,cnt,%g3 C carry... + andn %g1,%g2,%g1 + stx %g1,[rp-0] + +.Lend1234: + addcc n,4,n + bz,pn %xcc,.Lret + fanop +.Loop0: + add rp,-8,rp + subcc n,1,n + ldx [up-16],u3 + add up,-8,up + srlx u3,tnc,%g2 + not %g3, %g3 + andn %g3,%g2,%g3 + stx %g3,[rp] + sllx u3,cnt,%g3 + bnz,pt %xcc,.Loop0 + fanop +.Lret: + not %g3, %g3 + stx %g3,[rp-8] + mov %i5,%i0 + ret + restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm new file mode 100644 index 0000000..871d562 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm @@ -0,0 +1,580 @@ +dnl SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright 1998, 2000-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 14 +C UltraSPARC 3: 18.5 + +C Algorithm: We use eight floating-point multiplies per limb product, with the +C invariant v operand split into four 16-bit pieces, and the s1 operand split +C into 32-bit pieces. We sum pairs of 48-bit partial products using +C floating-point add, then convert the four 49-bit product-sums and transfer +C them to the integer unit. + +C Possible optimizations: +C 1. Align the stack area where we transfer the four 49-bit product-sums +C to a 32-byte boundary. That would minimize the cache collision. +C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would +C be to align the area to map to the area immediately before s1?) +C 2. Sum the 4 49-bit quantities using 32-bit operations, as in the +C develop mpn_addmul_2. This would save many integer instructions. +C 3. Unrolling. Questionable if it is worth the code expansion, given that +C it could only save 1 cycle/limb. +C 4. Specialize for particular v values. If its upper 32 bits are zero, we +C could save many operations, in the FPU (fmuld), but more so in the IEU +C since we'll be summing 48-bit quantities, which might be simpler. +C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and +C the i00,i16,i32,i48 RAW less apart. The latter apart-scheduling should +C not be greater than needed for L2 cache latency, and also not so great +C that i16 needs to be copied. +C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want +C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU +C ops.) + +C Instruction classification (as per UltraSPARC-1/2 functional units): +C 8 FM +C 10 FA +C 11 MEM +C 9 ISHIFT + 10? IADDLOG +C 1 BRANCH +C 49 insns totally (plus three mov insns that should be optimized out) + +C The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e we +C sustain 3.79 instructions/cycle. + +C INPUT PARAMETERS +C rp i0 +C up i1 +C n i2 +C v i3 + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) + +define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14') +define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22') +define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30') +define(`u00',`%f32') define(`u32', `%f34') +define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42') +define(`cy',`%g1') +define(`rlimb',`%g3') +define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3') +define(`xffffffff',`%l7') +define(`xffff',`%o0') + +PROLOGUE(mpn_mul_1) + +C Initialization. (1) Split v operand into four 16-bit chunks and store them +C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs +C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'. 
+ + save %sp, -256, %sp + mov -1, %g4 + srlx %g4, 48, xffff C store mask in register `xffff' + and %i3, xffff, %g2 + stx %g2, [%sp+2223+0] + srlx %i3, 16, %g3 + and %g3, xffff, %g3 + stx %g3, [%sp+2223+8] + srlx %i3, 32, %g2 + and %g2, xffff, %g2 + stx %g2, [%sp+2223+16] + srlx %i3, 48, %g3 + stx %g3, [%sp+2223+24] + srlx %g4, 32, xffffffff C store mask in register `xffffffff' + + sllx %i2, 3, %i2 + mov 0, cy C clear cy + add %i0, %i2, %i0 + add %i1, %i2, %i1 + neg %i2 + add %i1, 4, %i5 + add %i0, -32, %i4 + add %i0, -16, %i0 + + ldd [%sp+2223+0], v00 + ldd [%sp+2223+8], v16 + ldd [%sp+2223+16], v32 + ldd [%sp+2223+24], v48 + ld [%sp+2223+0],%f2 C zero f2 + ld [%sp+2223+0],%f4 C zero f4 + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fxtod v00, v00 + fxtod v16, v16 + fxtod v32, v32 + fxtod v48, v48 + +C Start real work. (We sneakingly read f3 and f5 above...) +C The software pipeline is very deep, requiring 4 feed-in stages. + + fxtod %f2, u00 + fxtod %f4, u32 + fmuld u00, v00, a00 + fmuld u00, v16, a16 + fmuld u00, v32, p32 + fmuld u32, v00, r32 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %xcc, .L_two_or_more + fmuld u32, v16, r48 + +.L_one: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + fdtox a32, a32 + fdtox a48, a48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + std a32, [%sp+2223+16] + std a48, [%sp+2223+24] + add %i2, 8, %i2 + + fdtox r64, a00 + fdtox r80, a16 + ldx [%sp+2223+0], i00 + ldx [%sp+2223+8], i16 + ldx [%sp+2223+16], i32 + ldx [%sp+2223+24], i48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + add %i2, 8, %i2 + + mov i00, %g5 C i00+ now in g5 + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + sllx i48, 32, %l6 C (i48 << 32) + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_1 + add %i2, 8, %i2 + +.L_two_or_more: + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + fdtox a32, a32 + fxtod %f2, u00 + fxtod %f4, u32 + fdtox a48, a48 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + faddd p16, r80, a16 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %xcc, .L_three_or_more + fmuld u32, v16, r48 + +.L_two: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + ldx [%sp+2223+8], i16 + ldx [%sp+2223+16], i32 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + std a32, [%sp+2223+16] + std a48, [%sp+2223+24] + add %i2, 8, %i2 + + fdtox r64, a00 + mov i00, %g5 C i00+ now in g5 + fdtox r80, a16 + ldx 
[%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_2 + add %i2, 8, %i2 + +.L_three_or_more: + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + faddd p16, r80, a16 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %xcc, .L_four_or_more + fmuld u32, v16, r48 + +.L_three: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + faddd p48, r48, a48 + mov i00, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_3 + add %i2, 8, %i2 + +.L_four_or_more: + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + faddd p48, r48, a48 + mov i00, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + faddd p16, r80, 
a16 + fmuld u00, v48, p48 + add cy, %g5, %o4 C x = prev(i00) + cy + addcc %i2, 8, %i2 + bnz,pt %xcc, .Loop + fmuld u32, v16, r48 + +.L_four: + b,a .L_out_4 + +C BEGIN MAIN LOOP + .align 16 +.Loop: +C 00 + srlx %o4, 16, %o5 C (x >> 16) + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 +C 01 + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 +C 02 + faddd p48, r48, a48 +C 03 + srlx %o2, 48, %o7 C (mi64 >> 48) + mov i00, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 +C 04 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + fdtox a32, a32 +C 05 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 +C 06 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 +C 07 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + ldx [%sp+2223+24], i48 + fdtox a48, a48 +C 08 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 +C 09 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + fmuld u00, v16, p16 +C 10 + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 +C 11 + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 +C 12 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + faddd p16, r80, a16 + fmuld u00, v48, p48 +C 13 + add cy, %g5, %o4 C x = prev(i00) + cy + addcc %i2, 8, %i2 + bnz,pt %xcc, .Loop + fmuld u32, v16, r48 +C END MAIN LOOP + +.L_out_4: + srlx %o4, 16, %o5 C (x >> 16) + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + fdtox a00, a00 + faddd p48, r48, a48 + srlx %o2, 48, %o7 C (mi64 >> 48) + mov i00, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_3: + srlx %o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + fdtox r64, a00 + srlx %o2, 48, %o7 C (mi64 >> 48) + mov i00, %g5 C i00+ now in g5 + fdtox r80, a16 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, 
%o5 + ldx [%sp+2223+24], i48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_2: + srlx %o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + srlx %o2, 48, %o7 C (mi64 >> 48) + mov i00, %g5 C i00+ now in g5 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_1: + srlx %o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + srlx %o2, 48, %o7 C (mi64 >> 48) + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + or %i3, %o5, %o5 + stx %o5, [%i4+%i2] + + sllx i00, 0, %g2 + add %g2, cy, cy + sllx i16, 16, %g3 + add %g3, cy, cy + + return %i7+8 + mov cy, %o0 +EPILOGUE(mpn_mul_1) diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm new file mode 100644 index 0000000..43c69d3 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm @@ -0,0 +1,342 @@ +dnl SPARC v9 64-bit mpn_sqr_diagonal. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 22 +C UltraSPARC 3: 36 + +C This was generated by the Sun C compiler. It runs at 22 cycles/limb on the +C UltraSPARC-1/2, three cycles slower than theoretically possible for optimal +C code using the same algorithm. For 1-3 limbs, a special loop was generated, +C which causes performance problems in particular for 2 and 3 limbs. 
+C Ultimately, this should be replaced by hand-written code in the same software +C pipeline style as e.g., addmul_1.asm. + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_sqr_diagonal) + save %sp, -240, %sp + + sethi %hi(0x1ffc00), %o0 + sethi %hi(0x3ffc00), %o1 + add %o0, 1023, %o7 + cmp %i2, 4 + add %o1, 1023, %o4 + or %g0, %i1, %g1 + or %g0, %i0, %o0 + bl,pn %xcc, .Lsmall + or %g0, 0, %g2 + + ldx [%i1], %o1 + add %i1, 24, %g1 + or %g0, 3, %g2 + srlx %o1, 42, %g3 + stx %g3, [%sp+2279] + and %o1, %o7, %o2 + stx %o2, [%sp+2263] + srlx %o1, 21, %o1 + ldd [%sp+2279], %f0 + and %o1, %o7, %o1 + stx %o1, [%sp+2271] + ldx [%i1+8], %o2 + fxtod %f0, %f12 + srlx %o2, 21, %o1 + and %o2, %o7, %g3 + ldd [%sp+2263], %f2 + fmuld %f12, %f12, %f10 + srlx %o2, 42, %o2 + ldd [%sp+2271], %f0 + and %o1, %o7, %o1 + fxtod %f2, %f8 + stx %o2, [%sp+2279] + stx %o1, [%sp+2271] + fxtod %f0, %f0 + stx %g3, [%sp+2263] + fdtox %f10, %f14 + fmuld %f12, %f8, %f6 + ldx [%i1+16], %o2 + std %f14, [%sp+2255] + fmuld %f0, %f0, %f2 + fmuld %f8, %f8, %f10 + srlx %o2, 42, %o1 + faddd %f6, %f6, %f6 + fmuld %f12, %f0, %f12 + fmuld %f0, %f8, %f8 + ldd [%sp+2279], %f0 + ldd [%sp+2263], %f4 + fdtox %f10, %f10 + std %f10, [%sp+2239] + faddd %f2, %f6, %f6 + ldd [%sp+2271], %f2 + fdtox %f12, %f12 + std %f12, [%sp+2247] + fdtox %f8, %f8 + std %f8, [%sp+2231] + fdtox %f6, %f6 + std %f6, [%sp+2223] + +.Loop: srlx %o2, 21, %g3 + stx %o1, [%sp+2279] + add %g2, 1, %g2 + and %g3, %o7, %o1 + ldx [%sp+2255], %g4 + cmp %g2, %i2 + stx %o1, [%sp+2271] + add %g1, 8, %g1 + add %o0, 16, %o0 + ldx [%sp+2239], %o1 + fxtod %f0, %f10 + fxtod %f4, %f14 + ldx [%sp+2231], %i0 + ldx [%sp+2223], %g5 + ldx [%sp+2247], %g3 + and %o2, %o7, %o2 + fxtod %f2, %f8 + fmuld %f10, %f10, %f0 + stx %o2, [%sp+2263] + fmuld %f10, %f14, %f6 + ldx [%g1-8], %o2 + fmuld %f10, %f8, %f12 + fdtox %f0, %f2 + ldd [%sp+2279], %f0 + fmuld %f8, %f8, %f4 + faddd %f6, %f6, %f6 + fmuld %f14, %f14, %f10 + std %f2, [%sp+2255] + sllx %g4, 20, %g4 + ldd [%sp+2271], %f2 + fmuld %f8, %f14, %f8 + sllx %i0, 22, %i1 + fdtox %f12, %f12 + std %f12, [%sp+2247] + sllx %g5, 42, %i0 + add %o1, %i1, %o1 + faddd %f4, %f6, %f6 + ldd [%sp+2263], %f4 + add %o1, %i0, %o1 + add %g3, %g4, %g3 + fdtox %f10, %f10 + std %f10, [%sp+2239] + srlx %o1, 42, %g4 + and %g5, %o4, %i0 + fdtox %f8, %f8 + std %f8, [%sp+2231] + srlx %g5, 22, %g5 + sub %g4, %i0, %g4 + fdtox %f6, %f6 + std %f6, [%sp+2223] + srlx %g4, 63, %g4 + add %g3, %g5, %g3 + add %g3, %g4, %g3 + stx %o1, [%o0-16] + srlx %o2, 42, %o1 + bl,pt %xcc, .Loop + stx %g3, [%o0-8] + + stx %o1, [%sp+2279] + srlx %o2, 21, %o1 + fxtod %f0, %f16 + ldx [%sp+2223], %g3 + fxtod %f4, %f6 + and %o2, %o7, %o3 + stx %o3, [%sp+2263] + fxtod %f2, %f4 + and %o1, %o7, %o1 + ldx [%sp+2231], %o2 + sllx %g3, 42, %g4 + fmuld %f16, %f16, %f14 + stx %o1, [%sp+2271] + fmuld %f16, %f6, %f8 + add %o0, 48, %o0 + ldx [%sp+2239], %o1 + sllx %o2, 22, %o2 + fmuld %f4, %f4, %f10 + ldx [%sp+2255], %o3 + fdtox %f14, %f14 + fmuld %f4, %f6, %f2 + std %f14, [%sp+2255] + faddd %f8, %f8, %f12 + add %o1, %o2, %o2 + fmuld %f16, %f4, %f4 + ldd [%sp+2279], %f0 + sllx %o3, 20, %g5 + add %o2, %g4, %o2 + fmuld %f6, %f6, %f6 + srlx %o2, 42, %o3 + and %g3, %o4, %g4 + srlx %g3, 22, %g3 + faddd %f10, %f12, %f16 + ldd [%sp+2271], %f12 + ldd [%sp+2263], %f8 + fxtod %f0, %f0 + sub %o3, %g4, %o3 + ldx [%sp+2247], %o1 + srlx %o3, 63, %o3 + fdtox %f2, %f10 + fxtod %f8, %f8 + std %f10, [%sp+2231] + fdtox %f6, %f6 + std %f6, [%sp+2239] + add %o1, %g5, %o1 + fmuld %f0, %f0, %f2 + fdtox %f16, 
%f16 + std %f16, [%sp+2223] + add %o1, %g3, %o1 + fdtox %f4, %f4 + std %f4, [%sp+2247] + fmuld %f0, %f8, %f10 + fxtod %f12, %f12 + add %o1, %o3, %o1 + stx %o2, [%o0-48] + fmuld %f8, %f8, %f6 + stx %o1, [%o0-40] + fdtox %f2, %f2 + ldx [%sp+2231], %o2 + faddd %f10, %f10, %f10 + ldx [%sp+2223], %g3 + fmuld %f12, %f12, %f4 + fdtox %f6, %f6 + ldx [%sp+2239], %o1 + sllx %o2, 22, %o2 + fmuld %f12, %f8, %f8 + sllx %g3, 42, %g5 + ldx [%sp+2255], %o3 + fmuld %f0, %f12, %f0 + add %o1, %o2, %o2 + faddd %f4, %f10, %f4 + ldx [%sp+2247], %o1 + add %o2, %g5, %o2 + and %g3, %o4, %g4 + fdtox %f8, %f8 + sllx %o3, 20, %g5 + std %f8, [%sp+2231] + fdtox %f0, %f0 + srlx %o2, 42, %o3 + add %o1, %g5, %o1 + fdtox %f4, %f4 + srlx %g3, 22, %g3 + sub %o3, %g4, %o3 + std %f6, [%sp+2239] + std %f4, [%sp+2223] + srlx %o3, 63, %o3 + add %o1, %g3, %o1 + std %f2, [%sp+2255] + add %o1, %o3, %o1 + std %f0, [%sp+2247] + stx %o2, [%o0-32] + stx %o1, [%o0-24] + ldx [%sp+2231], %o2 + ldx [%sp+2223], %o3 + ldx [%sp+2239], %o1 + sllx %o2, 22, %o2 + sllx %o3, 42, %g5 + ldx [%sp+2255], %g4 + and %o3, %o4, %g3 + add %o1, %o2, %o2 + ldx [%sp+2247], %o1 + add %o2, %g5, %o2 + stx %o2, [%o0-16] + sllx %g4, 20, %g4 + srlx %o2, 42, %o2 + add %o1, %g4, %o1 + srlx %o3, 22, %o3 + sub %o2, %g3, %o2 + srlx %o2, 63, %o2 + add %o1, %o3, %o1 + add %o1, %o2, %o1 + stx %o1, [%o0-8] + ret + restore %g0, %g0, %g0 +.Lsmall: + ldx [%g1], %o2 +.Loop0: + and %o2, %o7, %o1 + stx %o1, [%sp+2263] + add %g2, 1, %g2 + srlx %o2, 21, %o1 + add %g1, 8, %g1 + srlx %o2, 42, %o2 + stx %o2, [%sp+2279] + and %o1, %o7, %o1 + ldd [%sp+2263], %f0 + cmp %g2, %i2 + stx %o1, [%sp+2271] + fxtod %f0, %f6 + ldd [%sp+2279], %f0 + ldd [%sp+2271], %f4 + fxtod %f0, %f2 + fmuld %f6, %f6, %f0 + fxtod %f4, %f10 + fmuld %f2, %f6, %f4 + fdtox %f0, %f0 + std %f0, [%sp+2239] + fmuld %f10, %f6, %f8 + fmuld %f10, %f10, %f0 + faddd %f4, %f4, %f6 + fmuld %f2, %f2, %f4 + fdtox %f8, %f8 + std %f8, [%sp+2231] + fmuld %f2, %f10, %f2 + faddd %f0, %f6, %f0 + fdtox %f4, %f4 + std %f4, [%sp+2255] + fdtox %f2, %f2 + std %f2, [%sp+2247] + fdtox %f0, %f0 + std %f0, [%sp+2223] + ldx [%sp+2239], %o1 + ldx [%sp+2255], %g4 + ldx [%sp+2231], %o2 + sllx %g4, 20, %g4 + ldx [%sp+2223], %o3 + sllx %o2, 22, %o2 + sllx %o3, 42, %g5 + add %o1, %o2, %o2 + ldx [%sp+2247], %o1 + add %o2, %g5, %o2 + stx %o2, [%o0] + and %o3, %o4, %g3 + srlx %o2, 42, %o2 + add %o1, %g4, %o1 + srlx %o3, 22, %o3 + sub %o2, %g3, %o2 + srlx %o2, 63, %o2 + add %o1, %o3, %o1 + add %o1, %o2, %o1 + stx %o1, [%o0+8] + add %o0, 16, %o0 + bl,a,pt %xcc, .Loop0 + ldx [%g1], %o2 + ret + restore %g0, %g0, %g0 +EPILOGUE(mpn_sqr_diagonal) diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm new file mode 100644 index 0000000..9fb7f70 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm @@ -0,0 +1,241 @@ +dnl SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright 2001-2003, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 4 +C UltraSPARC 3: 4.5 + +C Compute carry-out from the most significant bits of u,v, and r, where +C r=u-v-carry_in, using logic operations. + +C This code runs at 4 cycles/limb on UltraSPARC 1 and 2. It has a 4 insn +C recurrency, and the UltraSPARC 1 and 2 the IE units are 100% saturated. +C Therefore, it seems futile to try to optimize this any further... + +C INPUT PARAMETERS +define(`rp',`%i0') +define(`up',`%i1') +define(`vp',`%i2') +define(`n',`%i3') + +define(`u0',`%l0') +define(`u1',`%l2') +define(`u2',`%l4') +define(`u3',`%l6') +define(`v0',`%l1') +define(`v1',`%l3') +define(`v2',`%l5') +define(`v3',`%l7') + +define(`cy',`%i4') + +define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe +define(`fmnop',`fmuld %f0,%f0,%f4') dnl A quasi nop running in the FM pipe + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_sub_nc) + save %sp,-160,%sp + + fitod %f0,%f0 C make sure f0 contains small, quiet number + subcc n,4,%g0 + bl,pn %xcc,.Loop0 + nop + b,a L(com) +EPILOGUE() + +PROLOGUE(mpn_sub_n) + save %sp,-160,%sp + + fitod %f0,%f0 C make sure f0 contains small, quiet number + subcc n,4,%g0 + bl,pn %xcc,.Loop0 + mov 0,cy +L(com): + ldx [up+0],u0 + ldx [vp+0],v0 + add up,32,up + ldx [up-24],u1 + ldx [vp+8],v1 + add vp,32,vp + ldx [up-16],u2 + ldx [vp-16],v2 + ldx [up-8],u3 + ldx [vp-8],v3 + subcc n,8,n + sub u0,v0,%g1 C main sub + sub %g1,cy,%g5 C carry sub + orn u0,v0,%g2 + bl,pn %xcc,.Lend4567 + fanop + b,a .Loop + + .align 16 +C START MAIN LOOP +.Loop: orn %g5,%g2,%g2 + andn u0,v0,%g3 + ldx [up+0],u0 + fanop +C -- + andn %g2,%g3,%g2 + ldx [vp+0],v0 + add up,32,up + fanop +C -- + srlx %g2,63,cy + sub u1,v1,%g1 + stx %g5,[rp+0] + fanop +C -- + sub %g1,cy,%g5 + orn u1,v1,%g2 + fmnop + fanop +C -- + orn %g5,%g2,%g2 + andn u1,v1,%g3 + ldx [up-24],u1 + fanop +C -- + andn %g2,%g3,%g2 + ldx [vp+8],v1 + add vp,32,vp + fanop +C -- + srlx %g2,63,cy + sub u2,v2,%g1 + stx %g5,[rp+8] + fanop +C -- + sub %g1,cy,%g5 + orn u2,v2,%g2 + fmnop + fanop +C -- + orn %g5,%g2,%g2 + andn u2,v2,%g3 + ldx [up-16],u2 + fanop +C -- + andn %g2,%g3,%g2 + ldx [vp-16],v2 + add rp,32,rp + fanop +C -- + srlx %g2,63,cy + sub u3,v3,%g1 + stx %g5,[rp-16] + fanop +C -- + sub %g1,cy,%g5 + orn u3,v3,%g2 + fmnop + fanop +C -- + orn %g5,%g2,%g2 + andn u3,v3,%g3 + ldx [up-8],u3 + fanop +C -- + andn %g2,%g3,%g2 + subcc n,4,n + ldx [vp-8],v3 + fanop +C -- + srlx %g2,63,cy + sub u0,v0,%g1 + stx %g5,[rp-8] + fanop +C -- + sub %g1,cy,%g5 + orn u0,v0,%g2 + bge,pt %xcc,.Loop + fanop +C END MAIN LOOP +.Lend4567: + orn %g5,%g2,%g2 + andn u0,v0,%g3 + andn %g2,%g3,%g2 + srlx %g2,63,cy + sub u1,v1,%g1 + stx %g5,[rp+0] + sub %g1,cy,%g5 + orn u1,v1,%g2 + orn %g5,%g2,%g2 + andn u1,v1,%g3 + andn %g2,%g3,%g2 + srlx %g2,63,cy + sub u2,v2,%g1 + stx %g5,[rp+8] + sub 
%g1,cy,%g5
+	orn	u2,v2,%g2
+	orn	%g5,%g2,%g2
+	andn	u2,v2,%g3
+	andn	%g2,%g3,%g2
+	add	rp,32,rp
+	srlx	%g2,63,cy
+	sub	u3,v3,%g1
+	stx	%g5,[rp-16]
+	sub	%g1,cy,%g5
+	orn	u3,v3,%g2
+	orn	%g5,%g2,%g2
+	andn	u3,v3,%g3
+	andn	%g2,%g3,%g2
+	srlx	%g2,63,cy
+	stx	%g5,[rp-8]
+
+	addcc	n,4,n
+	bz,pn	%xcc,.Lret
+	fanop
+
+.Loop0:	ldx	[up],u0
+	add	up,8,up
+	ldx	[vp],v0
+	add	vp,8,vp
+	add	rp,8,rp
+	subcc	n,1,n
+	sub	u0,v0,%g1
+	orn	u0,v0,%g2
+	sub	%g1,cy,%g5
+	andn	u0,v0,%g3
+	orn	%g5,%g2,%g2
+	stx	%g5,[rp-8]
+	andn	%g2,%g3,%g2
+	bnz,pt	%xcc,.Loop0
+	srlx	%g2,63,cy
+
+.Lret:	mov	cy,%i0
+	ret
+	restore
+EPILOGUE(mpn_sub_n)
diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm
new file mode 100644
index 0000000..0bdb566
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm
@@ -0,0 +1,68 @@
+dnl SPARC v9 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright 2001-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC 1&2:     18
+C UltraSPARC 3:	      23
+
+C INPUT PARAMETERS
+C rp	i0
+C up	i1
+C n	i2
+C v	i3
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+
+PROLOGUE(mpn_submul_1)
+	save	%sp,-176,%sp
+
+	sllx	%i2, 3, %g2
+	or	%g0, %i1, %o1
+	add	%g2, 15, %o0
+	or	%g0, %i2, %o2
+	and	%o0, -16, %o0
+	sub	%sp, %o0, %sp
+	add	%sp, 2223, %o0
+	or	%g0, %o0, %l0
+	call	mpn_mul_1
+	or	%g0, %i3, %o3
+	or	%g0, %o0, %l1	C preserve carry value from mpn_mul_1
+	or	%g0, %i0, %o0
+	or	%g0, %i0, %o1
+	or	%g0, %l0, %o2
+	call	mpn_sub_n
+	or	%g0, %i2, %o3
+	ret
+	restore	%l1, %o0, %o0	C sum carry values
+EPILOGUE(mpn_submul_1)
--
cgit v1.2.3
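Two functional notes on the routines that close this patch. mpn_sub_n computes each borrow without touching the condition codes: as its header comment says, the borrow out of r = u - v - b depends only on the most significant bits of u, v and r, which is what the orn/andn/srlx sequence in the loop evaluates. mpn_submul_1 is not a pipelined loop at all but a wrapper: it multiplies {up,n} by v into a stack temporary via mpn_mul_1, subtracts that temporary from {rp,n} via mpn_sub_n, and returns the two carries summed (the final restore %l1,%o0,%o0). The C sketch below mirrors both ideas; mul_1_ref, sub_n_ref and submul_1_ref are illustrative stand-ins, not GMP's exported functions, and 64-bit limbs with the GCC/Clang unsigned __int128 extension are assumed.

    #include <stddef.h>
    #include <stdint.h>

    typedef uint64_t limb;

    /* {rp,n} = {up,n} * v; return the high limb carried out of the top,
       as mpn_mul_1 does.  */
    static limb
    mul_1_ref (limb *rp, const limb *up, size_t n, limb v)
    {
      limb cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * v + cy;
          rp[i] = (limb) p;
          cy = (limb) (p >> 64);
        }
      return cy;
    }

    /* {rp,n} = {up,n} - {vp,n}; return the final borrow.  The borrow is
       taken from the top bits of u, v and r only, using an expression
       equivalent (at bit 63) to the one the sub_n loop builds with
       orn/andn and extracts with srlx 63.  */
    static limb
    sub_n_ref (limb *rp, const limb *up, const limb *vp, size_t n)
    {
      limb cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          limb u = up[i], v = vp[i], r = u - v - cy;
          cy = ((~u & v) | ((~u | v) & r)) >> 63;
          rp[i] = r;
        }
      return cy;
    }

    /* {rp,n} -= {up,n} * v; return the product's high limb plus the borrow,
       mirroring the mul_1 + sub_n composition of the wrapper above.  */
    static limb
    submul_1_ref (limb *rp, const limb *up, size_t n, limb v)
    {
      limb tmp[n];               /* C99 VLA standing in for the stack area */
      limb cy_mul = mul_1_ref (tmp, up, n, v);
      limb cy_sub = sub_n_ref (rp, rp, tmp, n);
      return cy_mul + cy_sub;
    }

Since the high limb returned by the multiplication is at most v - 1 and the borrow is at most 1, their sum always fits in a single limb, which is why the assembly can fold the addition into the restore instruction.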