From a89a14ef5da44684a16b204e7a70460cc8c4922a Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Fri, 21 Jun 2024 23:36:36 +0200 Subject: Basic constant folding implementation --- vendor/gmp-6.3.0/mpn/sparc32/v9/README | 4 + vendor/gmp-6.3.0/mpn/sparc32/v9/add_n.asm | 129 +++++++ vendor/gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm | 306 +++++++++++++++ vendor/gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h | 204 ++++++++++ vendor/gmp-6.3.0/mpn/sparc32/v9/mul_1.asm | 287 ++++++++++++++ vendor/gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm | 462 +++++++++++++++++++++++ vendor/gmp-6.3.0/mpn/sparc32/v9/sub_n.asm | 129 +++++++ vendor/gmp-6.3.0/mpn/sparc32/v9/submul_1.asm | 316 ++++++++++++++++ vendor/gmp-6.3.0/mpn/sparc32/v9/udiv.asm | 52 +++ 9 files changed, 1889 insertions(+) create mode 100644 vendor/gmp-6.3.0/mpn/sparc32/v9/README create mode 100644 vendor/gmp-6.3.0/mpn/sparc32/v9/add_n.asm create mode 100644 vendor/gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm create mode 100644 vendor/gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h create mode 100644 vendor/gmp-6.3.0/mpn/sparc32/v9/mul_1.asm create mode 100644 vendor/gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm create mode 100644 vendor/gmp-6.3.0/mpn/sparc32/v9/sub_n.asm create mode 100644 vendor/gmp-6.3.0/mpn/sparc32/v9/submul_1.asm create mode 100644 vendor/gmp-6.3.0/mpn/sparc32/v9/udiv.asm (limited to 'vendor/gmp-6.3.0/mpn/sparc32/v9') diff --git a/vendor/gmp-6.3.0/mpn/sparc32/v9/README b/vendor/gmp-6.3.0/mpn/sparc32/v9/README new file mode 100644 index 0000000..9b39713 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc32/v9/README @@ -0,0 +1,4 @@ +Code for SPARC processors implementing version 9 of the SPARC architecture. +This code is for systems that don't preserve the full 64-bit contents of +integer registers at context switch. For other systems (such as Solaris 7 or +later) use the code in ../../sparc64.
diff --git a/vendor/gmp-6.3.0/mpn/sparc32/v9/add_n.asm b/vendor/gmp-6.3.0/mpn/sparc32/v9/add_n.asm new file mode 100644 index 0000000..7bd5974 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc32/v9/add_n.asm @@ -0,0 +1,129 @@ +dnl SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store +dnl sum in a third limb vector. + +dnl Copyright 2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +define(rp,%o0) +define(s1p,%o1) +define(s2p,%o2) +define(n,%o3) +define(cy,%g1) + +C This code uses 64-bit operations on `o' and `g' registers. It doesn't +C require that `o' registers' upper 32 bits are preserved by the operating +C system, but if they are not, they must be zeroed. That is indeed what +C happens at least on Solaris 2.5 and 2.6. + +C On UltraSPARC 1 and 2, this code runs at 3 cycles/limb from the Dcache and at +C about 10 cycles/limb from the Ecache.
+ +ASM_START() +PROLOGUE(mpn_add_n) + lduw [s1p+0],%o4 + lduw [s2p+0],%o5 + addcc n,-2,n C n -= 2, setting icc for the two branches below + bl,pn %icc,L(end1) C n-2 < 0: single-limb case + lduw [s1p+4],%g2 + lduw [s2p+4],%g3 + be,pn %icc,L(end2) C n-2 == 0: exactly two limbs + mov 0,cy C delay slot: carry = 0 + + .align 16 +L(loop): + add %o4,%o5,%g4 + add rp,8,rp + lduw [s1p+8],%o4 + fitod %f0,%f2 C %f2 is never read in this routine; presumably an FP-pipe filler +C --- + add cy,%g4,%g4 + addcc n,-1,n + lduw [s2p+8],%o5 + fitod %f0,%f2 +C --- + srlx %g4,32,cy C carry = bits 32 and up of the 64-bit sum + add s2p,8,s2p + stw %g4,[rp-8] + be,pn %icc,L(exito)+4 C enter exito one insn late; the delay-slot add below stands in for its first add +C --- + add %g2,%g3,%g4 + addcc n,-1,n + lduw [s1p+12],%g2 + fitod %f0,%f2 +C --- + add cy,%g4,%g4 + add s1p,8,s1p + lduw [s2p+4],%g3 + fitod %f0,%f2 +C --- + srlx %g4,32,cy + bne,pt %icc,L(loop) + stw %g4,[rp-4] +C --- +L(exite): + add %o4,%o5,%g4 + add cy,%g4,%g4 + srlx %g4,32,cy + stw %g4,[rp+0] + add %g2,%g3,%g4 + add cy,%g4,%g4 + stw %g4,[rp+4] + retl + srlx %g4,32,%o0 C delay slot: return the carry-out + +L(exito): + add %g2,%g3,%g4 + add cy,%g4,%g4 + srlx %g4,32,cy + stw %g4,[rp-4] + add %o4,%o5,%g4 + add cy,%g4,%g4 + stw %g4,[rp+0] + retl + srlx %g4,32,%o0 + +L(end1): + add %o4,%o5,%g4 + stw %g4,[rp+0] + retl + srlx %g4,32,%o0 + +L(end2): + add %o4,%o5,%g4 + srlx %g4,32,cy + stw %g4,[rp+0] + add %g2,%g3,%g4 + add cy,%g4,%g4 + stw %g4,[rp+4] + retl + srlx %g4,32,%o0 +EPILOGUE(mpn_add_n) diff --git a/vendor/gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm b/vendor/gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm new file mode 100644 index 0000000..2adf7a8 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm @@ -0,0 +1,306 @@ +dnl SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version.
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C Algorithm: We use two floating-point multiplies per limb product, with the +C invariant v operand split into two 16-bit pieces, and the u operand split +C into 32-bit pieces. We convert the two 48-bit products and transfer them to +C the integer unit. + +C cycles/limb +C UltraSPARC 1&2: 6.5 +C UltraSPARC 3: ? + +C Possible optimizations: +C 1. Combine 32-bit memory operations into 64-bit operations. Since we're +C memory bandwidth limited, this could save 1.5 cycles/limb. +C 2. Unroll the inner loop. Since we already use alternate temporary areas, +C it is very straightforward to unroll, using an exit branch midways. +C Unrolling would allow deeper scheduling which could improve speed for L2 +C cache case. +C 3. For mpn_mul_1: Use more alternating temp areas. The std'es and ldx'es +C aren't sufficiently apart-scheduled with just two temp areas. +C 4. Specialize for particular v values. If its upper 16 bits are zero, we +C could save many operations. 
+ +C INPUT PARAMETERS +C rp o0 +C up o1 +C n o2 +C v o3 + +define(`FSIZE',224) + +ASM_START() +PROLOGUE(mpn_addmul_1) + add %sp, -FSIZE, %sp C leaf routine: reserve FSIZE bytes of stack, no save + sethi %hi(0xffff), %g1 + srl %o3, 16, %g2 C g2 = v >> 16 + or %g1, %lo(0xffff), %g1 + and %o3, %g1, %g1 C g1 = v & 0xffff + stx %g1, [%sp+104] + stx %g2, [%sp+112] + ldd [%sp+104], %f6 + ldd [%sp+112], %f8 + fxtod %f6, %f6 C f6 = (double) low 16 bits of v + fxtod %f8, %f8 C f8 = (double) high 16 bits of v + ld [%sp+104], %f10 C zero f10 + + mov 0, %g3 C cy = 0 + +define(`fanop', `fitod %f18, %f0') C A quasi nop running in the FA pipe + + add %sp, 160, %o5 C point in scratch area + and %o5, -32, %o5 C align at 0 (mod 32) in scratch area + + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_two_or_more + fxtod %f10, %f2 C delay slot: f2 = (double) up[i], taken from the f10:f11 pair + + fmuld %f2, %f8, %f16 + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + std %f12, [%o5+24] + ldx [%o5+16], %g2 C p16 + ldx [%o5+24], %g1 C p0 + lduw [%o0], %g5 C read rp[i] + b .L1 + add %o0, -16, %o0 C delay slot: bias rp, since .L1 stores at offset 16 + + .align 16 +.L_two_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fmuld %f2, %f8, %f16 + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_three_or_more + fxtod %f10, %f2 + + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + std %f12, [%o5+8] + lduw [%o0], %g5 C read rp[i] + ldx [%o5+16], %g2 C p16 + ldx [%o5+24], %g1 C p0 + b .L2 + add %o0, -12, %o0 + + .align 16 +.L_three_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_four_or_more + fxtod %f10, %f2 + + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + std %f12, [%o5+24] +
lduw [%o0], %g5 C read rp[i] + b .L3 + add %o0, -8, %o0 + + .align 16 +.L_four_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_five_or_more + fxtod %f10, %f2 + + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ (up is not read again on this path) + lduw [%o0], %g5 C read rp[i] + b .L4 + add %o0, -4, %o0 + + .align 16 +.L_five_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + lduw [%o0], %g5 C read rp[i] + bne,pt %icc, .Loop + fxtod %f10, %f2 + b,a .L5 + +C BEGIN MAIN LOOP + .align 16 +C -- 0 +.Loop: nop + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 +C -- 1 + sllx %g2, 16, %g4 C (p16 << 16) + add %o0, 4, %o0 C rp++ + ldx [%o5+0], %g2 C p16 + fdtox %f4, %f12 +C -- 2 + nop + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + fanop +C -- 3 + nop + add %g3, %g4, %g4 C p += cy + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 +C -- 4 + nop + add %g5, %g4, %g4 C p += rp[i] + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 +C -- 5 + xor %o5, 16, %o5 C alternate scratch variables + add %o1, 4, %o1 C up++ + stw %g4, [%o0-4] + fanop +C -- 6 + srlx %g4, 32, %g3 C new cy + lduw [%o0], %g5 C read rp[i] + bne,pt %icc, .Loop + fxtod %f10, %f2 +C END MAIN LOOP + +.L5: fdtox %f16, %f14 + sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + fdtox %f4, %f12 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g4, %g3, %g4 C p += cy + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + add %g5, %g4, %g4 C p += rp[i] + std %f12, [%o5+8] + fmuld %f2, %f6,
%f4 + xor %o5, 16, %o5 + stw %g4, [%o0+0] + srlx %g4, 32, %g3 C new cy + lduw [%o0+4], %g5 C read rp[i] + +.L4: fdtox %f16, %f14 + sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + fdtox %f4, %f12 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + std %f14, [%o5+0] + add %g5, %g4, %g4 C p += rp[i] + std %f12, [%o5+8] + xor %o5, 16, %o5 + stw %g4, [%o0+4] + srlx %g4, 32, %g3 C new cy + lduw [%o0+8], %g5 C read rp[i] + +.L3: sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + add %g5, %g4, %g4 C p += rp[i] + xor %o5, 16, %o5 + stw %g4, [%o0+8] + srlx %g4, 32, %g3 C new cy + lduw [%o0+12], %g5 C read rp[i] + +.L2: sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + add %g5, %g4, %g4 C p += rp[i] + stw %g4, [%o0+12] + srlx %g4, 32, %g3 C new cy + lduw [%o0+16], %g5 C read rp[i] + +.L1: sllx %g2, 16, %g4 C (p16 << 16) + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + add %g3, %g4, %g4 C p += cy + add %g5, %g4, %g4 C p += rp[i] + stw %g4, [%o0+16] + srlx %g4, 32, %g3 C new cy + + mov %g3, %o0 C return value = final carry + retl + sub %sp, -FSIZE, %sp C delay slot: release the FSIZE-byte frame +EPILOGUE(mpn_addmul_1) diff --git a/vendor/gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h new file mode 100644 index 0000000..f909e2c --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h @@ -0,0 +1,204 @@ +/* SPARC v9 32-bit gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004, 2009-2011, 2014 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library.
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1593 MHz ultrasparc3 running Solaris 10 (swift.nada.kth.se) */ +/* FFT tuning limit = 25000000 */ +/* Generated by tuneup.c, 2014-03-16, gcc 3.4 */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 4 +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 13 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 12 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 32 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 43 +#define MUL_TOOM44_THRESHOLD 126 +#define MUL_TOOM6H_THRESHOLD 161 +#define 
MUL_TOOM8H_THRESHOLD 208 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 80 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 55 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 72 + +#define SQR_BASECASE_THRESHOLD 4 +#define SQR_TOOM2_THRESHOLD 64 +#define SQR_TOOM3_THRESHOLD 85 +#define SQR_TOOM4_THRESHOLD 152 +#define SQR_TOOM6_THRESHOLD 185 +#define SQR_TOOM8_THRESHOLD 324 + +#define MULMID_TOOM42_THRESHOLD 64 + +#define MULMOD_BNM1_THRESHOLD 12 +#define SQRMOD_BNM1_THRESHOLD 16 + +#define MUL_FFT_MODF_THRESHOLD 288 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 288, 5}, { 9, 4}, { 19, 5}, { 11, 6}, \ + { 6, 5}, { 14, 6}, { 8, 5}, { 17, 6}, \ + { 9, 5}, { 20, 6}, { 13, 7}, { 7, 6}, \ + { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 15, 6}, \ + { 31, 7}, { 19, 8}, { 11, 7}, { 23, 9}, \ + { 7, 8}, { 15, 7}, { 31, 8}, { 19, 7}, \ + { 39, 8}, { 27, 9}, { 15, 8}, { 31, 7}, \ + { 63, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47,10}, { 31, 9}, { 71, 8}, \ + { 143, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135, 8}, { 271, 9}, \ + { 143, 8}, { 287,10}, { 79, 9}, { 175,10}, \ + { 95, 9}, { 191, 8}, { 383,10}, { 111,11}, \ + { 63,10}, { 143, 9}, { 287, 8}, { 575,10}, \ + { 175,11}, { 95,10}, { 191, 9}, { 415, 8}, \ + { 831,12}, { 63,11}, { 127,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 351, 9}, { 703,11}, \ + { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447, 9}, { 895, 8}, { 1791,12}, { 127,11}, \ + { 287,10}, { 607, 9}, { 1215, 8}, { 2431,11}, \ + { 319, 9}, { 1279,11}, { 351,12}, { 191,11}, \ + { 415,10}, { 831,11}, { 447,10}, { 895, 9}, \ + { 1791,11}, { 479,13}, { 127,12}, { 255,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 703,12}, { 383,11}, { 831,12}, { 447,11}, \ + { 895,10}, { 1791,11}, { 959,13}, { 255,12}, \ + { 575,11}, { 1215,10}, { 2431,12}, { 703,13}, \ + { 383,12}, { 959,14}, { 
255,13}, { 511,12}, \ + { 1087,11}, { 2175,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1407,11}, { 2943,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1151,12}, { 2431,13}, \ + { 1407,14}, { 767,13}, { 1791,15}, { 511,14}, \ + { 1023,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 143 +#define MUL_FFT_THRESHOLD 2240 + +#define SQR_FFT_MODF_THRESHOLD 244 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 244, 5}, { 8, 4}, { 17, 5}, { 17, 6}, \ + { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \ + { 20, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \ + { 7, 7}, { 19, 8}, { 11, 7}, { 25, 9}, \ + { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \ + { 39, 8}, { 23, 9}, { 15, 8}, { 39, 9}, \ + { 23,10}, { 15, 9}, { 31, 8}, { 63, 9}, \ + { 47,10}, { 31, 9}, { 63, 8}, { 127, 9}, \ + { 71, 8}, { 143, 7}, { 287, 9}, { 79,10}, \ + { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 143, 8}, { 287,10}, { 79, 9}, \ + { 159, 8}, { 319, 9}, { 175, 8}, { 351, 7}, \ + { 703,10}, { 95, 9}, { 191, 8}, { 383, 9}, \ + { 207, 8}, { 415, 9}, { 223,11}, { 63,10}, \ + { 127, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575,10}, { 159, 9}, { 319,10}, { 175, 9}, \ + { 351, 8}, { 703,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207, 9}, { 415, 8}, { 831,10}, \ + { 223,12}, { 63,11}, { 127,10}, { 271, 9}, \ + { 543,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 351, 9}, { 703, 8}, \ + { 1407,11}, { 191,10}, { 415, 9}, { 831,11}, \ + { 223,10}, { 447, 9}, { 895,10}, { 479,12}, \ + { 127,11}, { 255,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 319,10}, { 639,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 415,10}, { 831,11}, \ + { 447,10}, { 895, 9}, { 1791,13}, { 127,12}, \ + { 255,11}, { 575,12}, { 319,11}, { 703,10}, \ + { 1407,12}, { 383,11}, { 831,12}, { 447,11}, \ + { 959,10}, { 1919, 9}, { 3839,13}, { 255,12}, \ + { 575,11}, { 1151,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 
1407,13}, \ + { 767,12}, { 1599,13}, { 895,12}, { 1919,14}, \ + { 511,13}, { 1151,12}, { 2431,13}, { 1407,12}, \ + { 2815,14}, { 767,13}, { 1535,12}, { 3071,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2431,14}, \ + { 1279,13}, { 2943,12}, { 5887,14}, { 16384,15}, \ + { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 153 +#define SQR_FFT_THRESHOLD 2112 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 144 +#define MULLO_MUL_N_THRESHOLD 4292 + +#define DC_DIV_QR_THRESHOLD 74 +#define DC_DIVAPPR_Q_THRESHOLD 406 +#define DC_BDIV_QR_THRESHOLD 63 +#define DC_BDIV_Q_THRESHOLD 363 + +#define INV_MULMOD_BNM1_THRESHOLD 108 +#define INV_NEWTON_THRESHOLD 351 +#define INV_APPR_THRESHOLD 303 + +#define BINV_NEWTON_THRESHOLD 354 +#define REDC_1_TO_REDC_N_THRESHOLD 61 + +#define MU_DIV_QR_THRESHOLD 998 +#define MU_DIVAPPR_Q_THRESHOLD 1099 +#define MUPI_DIV_QR_THRESHOLD 118 +#define MU_BDIV_QR_THRESHOLD 807 +#define MU_BDIV_Q_THRESHOLD 979 + +#define POWM_SEC_TABLE 3,22,127,624,779,2351 + +#define MATRIX22_STRASSEN_THRESHOLD 7 +#define HGCD_THRESHOLD 90 +#define HGCD_APPR_THRESHOLD 123 +#define HGCD_REDUCE_THRESHOLD 1494 +#define GCD_DC_THRESHOLD 283 +#define GCDEXT_DC_THRESHOLD 192 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define SET_STR_DC_THRESHOLD 290 +#define SET_STR_PRECOMPUTE_THRESHOLD 634 + +#define FAC_DSC_THRESHOLD 156 +#define FAC_ODD_THRESHOLD 25 diff --git a/vendor/gmp-6.3.0/mpn/sparc32/v9/mul_1.asm b/vendor/gmp-6.3.0/mpn/sparc32/v9/mul_1.asm new file mode 100644 index 0000000..40aeffa --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc32/v9/mul_1.asm @@ -0,0 +1,287 @@ +dnl SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C Algorithm: We use two floating-point multiplies per limb product, with the +C invariant v operand split into two 16-bit pieces, and the u operand split +C into 32-bit pieces. We convert the two 48-bit products and transfer them to +C the integer unit. + +C cycles/limb +C UltraSPARC 1&2: 6.5 +C UltraSPARC 3: ? + +C Possible optimizations: +C 1. Combine 32-bit memory operations into 64-bit operations. Since we're +C memory bandwidth limited, this could save 1.5 cycles/limb. +C 2. Unroll the inner loop. Since we already use alternate temporary areas, +C it is very straightforward to unroll, using an exit branch midways. +C Unrolling would allow deeper scheduling which could improve speed for L2 +C cache case. +C 3. For mpn_mul_1: Use more alternating temp areas. The std'es and ldx'es +C aren't sufficiently apart-scheduled with just two temp areas. +C 4. Specialize for particular v values. If its upper 16 bits are zero, we +C could save many operations. 
+ +C INPUT PARAMETERS +C rp o0 +C up o1 +C n o2 +C v o3 + +define(`FSIZE',224) + +ASM_START() +PROLOGUE(mpn_mul_1) + add %sp, -FSIZE, %sp C leaf routine: reserve FSIZE bytes of stack, no save + sethi %hi(0xffff), %g1 + srl %o3, 16, %g2 C g2 = v >> 16 + or %g1, %lo(0xffff), %g1 + and %o3, %g1, %g1 C g1 = v & 0xffff + stx %g1, [%sp+104] + stx %g2, [%sp+112] + ldd [%sp+104], %f6 + ldd [%sp+112], %f8 + fxtod %f6, %f6 C f6 = (double) low 16 bits of v + fxtod %f8, %f8 C f8 = (double) high 16 bits of v + ld [%sp+104], %f10 C zero f10 + + mov 0, %g3 C cy = 0 + +define(`fanop', `fitod %f18, %f0') C A quasi nop running in the FA pipe + + add %sp, 160, %o5 C point in scratch area + and %o5, -32, %o5 C align at 0 (mod 32) in scratch area + + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_two_or_more + fxtod %f10, %f2 C delay slot: f2 = (double) up[i], taken from the f10:f11 pair + + fmuld %f2, %f8, %f16 + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + std %f12, [%o5+24] + ldx [%o5+16], %g2 C p16 + ldx [%o5+24], %g1 C p0 + b .L1 + add %o0, -16, %o0 C delay slot: bias rp, since .L1 stores at offset 16 + + .align 16 +.L_two_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fmuld %f2, %f8, %f16 + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_three_or_more + fxtod %f10, %f2 + + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + std %f12, [%o5+8] + ldx [%o5+16], %g2 C p16 + ldx [%o5+24], %g1 C p0 + b .L2 + add %o0, -12, %o0 + + .align 16 +.L_three_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_four_or_more + fxtod %f10, %f2 + + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + std %f12, [%o5+24] + b .L3 + add %o0, -8, %o0 + + .align 16 +.L_four_or_more: + subcc
%o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_five_or_more + fxtod %f10, %f2 + + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ (up is not read again on this path) + b .L4 + add %o0, -4, %o0 + + .align 16 +.L_five_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .Loop + fxtod %f10, %f2 + b,a .L5 + +C BEGIN MAIN LOOP + .align 16 +C -- 0 +.Loop: nop + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 +C -- 1 + sllx %g2, 16, %g4 C (p16 << 16) + add %o0, 4, %o0 C rp++ + ldx [%o5+0], %g2 C p16 + fdtox %f4, %f12 +C -- 2 + nop + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + fanop +C -- 3 + nop + add %g3, %g4, %g4 C p += cy + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 +C -- 4 + srlx %g4, 32, %g3 C new cy + add %o1, 4, %o1 C up++ + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 +C -- 5 + xor %o5, 16, %o5 C alternate scratch variables + stw %g4, [%o0-4] + bne,pt %icc, .Loop + fxtod %f10, %f2 +C END MAIN LOOP + +.L5: fdtox %f16, %f14 + sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + fdtox %f4, %f12 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g4, %g3, %g4 C p += cy + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 + xor %o5, 16, %o5 + stw %g4, [%o0+0] + srlx %g4, 32, %g3 C new cy + +.L4: fdtox %f16, %f14 + sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + fdtox %f4, %f12 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + std
%f14, [%o5+0] + std %f12, [%o5+8] + xor %o5, 16, %o5 + stw %g4, [%o0+4] + srlx %g4, 32, %g3 C new cy + +.L3: sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + xor %o5, 16, %o5 + stw %g4, [%o0+8] + srlx %g4, 32, %g3 C new cy + +.L2: sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + stw %g4, [%o0+12] + srlx %g4, 32, %g3 C new cy + +.L1: sllx %g2, 16, %g4 C (p16 << 16) + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + add %g3, %g4, %g4 C p += cy + stw %g4, [%o0+16] + srlx %g4, 32, %g3 C new cy + + mov %g3, %o0 C return value = final carry + retl + sub %sp, -FSIZE, %sp C delay slot: release the FSIZE-byte frame +EPILOGUE(mpn_mul_1) diff --git a/vendor/gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm b/vendor/gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm new file mode 100644 index 0000000..e024279 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm @@ -0,0 +1,462 @@ +dnl SPARC v9 32-bit mpn_sqr_diagonal. + +dnl Copyright 2001, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details.
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C rp i0 +C up i1 +C n i2 + +C This code uses a very deep software pipeline, due to the need for moving data +C forth and back between the integer registers and floating-point registers. +C +C A VIS variant of this code would make the pipeline less deep, since the +C masking now done in the integer unit could take place in the floating-point +C unit using the FAND instruction. It would be possible to save several cycles +C too. +C +C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and +C not much slower from the Ecache. It would perhaps be possible to shave off +C one cycle, but not easily. We cannot do better than 10 cycles/limb with the +C used instructions, since we have 10 memory operations per limb. But a VIS +C variant could run three cycles faster than the corresponding non-VIS code. 
+
+C This is non-pipelined code showing the algorithm:
+C
+C .Loop:
+C	lduw	[up+0],%g4		C 00000000hhhhllll
+C	sllx	%g4,16,%g3		C 0000hhhhllll0000
+C	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+C	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+C	stx	%g2,[%fp+80]
+C	ldd	[%fp+80],%f0
+C	fitod	%f0,%f4			C hi16
+C	fitod	%f1,%f6			C lo16
+C	ld	[up+0],%f9
+C	fxtod	%f8,%f2
+C	fmuld	%f2,%f4,%f4
+C	fmuld	%f2,%f6,%f6
+C	fdtox	%f4,%f4
+C	fdtox	%f6,%f6
+C	std	%f4,[%fp-24]
+C	std	%f6,[%fp-16]
+C	ldx	[%fp-24],%g2
+C	ldx	[%fp-16],%g1
+C	sllx	%g2,16,%g2
+C	add	%g2,%g1,%g1
+C	stw	%g1,[rp+0]
+C	srlx	%g1,32,%l0
+C	stw	%l0,[rp+4]
+C	add	up,4,up
+C	subcc	n,1,n
+C	bne,pt	%icc,.Loop
+C	add	rp,8,rp
+
+define(`fanop',`fitod %f12,%f10')	dnl  A quasi nop running in the FA pipe
+
+ASM_START()
+
+	TEXT
+	ALIGN(4)
+.Lnoll:
+	.word	0			C zero word, loaded into %f8 below
+
+PROLOGUE(mpn_sqr_diagonal)
+	save	%sp,-256,%sp		C new register window, 256-byte frame
+
+ifdef(`PIC',
+`.Lpc:	rd	%pc,%o7
+	ld	[%o7+.Lnoll-.Lpc],%f8',
+`	sethi	%hi(.Lnoll),%g1
+	ld	[%g1+%lo(.Lnoll)],%f8')
+
+	sethi	%hi(0xffff0000),%g5	C mask: the andn's use it to clear XXXX
+	add	%i1,-8,%i1		C bias up; integer loads address [%i1+8]
+
+	lduw	[%i1+8],%g4
+	add	%i1,4,%i1		C s1_ptr++
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	bne,pt	%icc,.L_grt_1		C taken unless n was 1
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+
+	add	%i1,4,%i1		C s1_ptr++
+	stx	%g2,[%fp+80]		C stage split limb for the FP unit
+	ld	[%i1],%f9		C %f8:%f9 = limb with a zero upper word
+	ldd	[%fp+80],%f0		C %f0 = hi16, %f1 = lo16
+	fxtod	%f8,%f2			C %f2 = limb as a double
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4		C limb * hi16
+	fmuld	%f2,%f6,%f6		C limb * lo16
+	fdtox	%f4,%f4
+	fdtox	%f6,%f6
+	std	%f4,[%fp-24]		C p16 slot
+	std	%f6,[%fp-16]		C p0 slot
+
+	add	%fp, 80, %l3
+	add	%fp, -24, %l4
+	add	%fp, 72, %l5
+	b	.L1
+	add	%fp, -40, %l6
+
+.L_grt_1:
+	stx	%g2,[%fp+80]
+	lduw	[%i1+8],%g4
+	add	%i1,4,%i1		C s1_ptr++
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	bne,pt	%icc,.L_grt_2		C taken unless n was 2
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+
+	stx	%g2,[%fp+72]
+	ld	[%i1],%f9
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+80],%f0
+	fxtod	%f8,%f2
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	ldd	[%fp+72],%f0
+	fdtox	%f4,%f4
+	fdtox	%f6,%f6
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-16]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	fmuld	%f2,%f6,%f6
+	fdtox	%f4,%f4
+
+	add	%fp, 72, %l3
+	add	%fp, -40, %l4
+	add	%fp, 80, %l5
+	b	.L2
+	add	%fp, -24, %l6
+
+.L_grt_2:
+	stx	%g2,[%fp+72]
+	lduw	[%i1+8],%g4
+	ld	[%i1],%f9
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+80],%f0
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	fxtod	%f8,%f2
+	bne,pt	%icc,.L_grt_3		C taken unless n was 3
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+
+	stx	%g2,[%fp+80]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+72],%f0
+	fdtox	%f4,%f4
+	fdtox	%f6,%f6
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-16]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	add	%fp, 80, %l3
+	fmuld	%f2,%f6,%f6
+	add	%fp, -24, %l4
+	ldd	[%fp+80],%f0
+	add	%fp, 72, %l5
+	fdtox	%f4,%f4
+	b	.L3
+	add	%fp, -40, %l6
+
+.L_grt_3:
+	stx	%g2,[%fp+80]
+	fitod	%f0,%f4
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+72],%f0
+	fdtox	%f4,%f4
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	fdtox	%f6,%f6
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-16]
+	bne,pt	%icc,.L_grt_4		C taken unless n was 4
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+
+	stx	%g2,[%fp+72]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	add	%fp, 72, %l3
+	fmuld	%f2,%f4,%f4
+	add	%fp, -40, %l4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+80],%f0
+	add	%fp, 80, %l5
+	fdtox	%f4,%f4
+	b	.L4
+	add	%fp, -24, %l6
+
+.L_grt_4:
+	stx	%g2,[%fp+72]
+	fitod	%f0,%f4
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+80],%f0
+	fdtox	%f4,%f4
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	fdtox	%f6,%f6
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-40]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-32]
+	be,pn	%icc,.L5		C taken when n was 5
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+
+	b,a	.Loop
+
+	.align	16
+C --- LOOP BEGIN
+.Loop:	nop
+	nop
+	stx	%g2,[%fp+80]
+	fitod	%f0,%f4
+C ---
+	nop
+	nop
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+C ---
+	nop
+	nop
+	ldx	[%fp-24],%g2		C p16
+	fanop
+C ---
+	nop
+	nop
+	ldx	[%fp-16],%g1		C p0
+	fmuld	%f2,%f4,%f4
+C ---
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+C ---
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+72],%f0
+	fanop
+C ---
+	srlx	%g1,32,%l0		C high word of the 64-bit square
+	nop
+	stw	%g1,[%i0-8]		C store low word
+	fdtox	%f4,%f4
+C ---
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	nop
+	stw	%l0,[%i0-4]		C store high word
+	fdtox	%f6,%f6
+C ---
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+C ---
+	std	%f6,[%fp-16]
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+	be,pn	%icc,.Lend
+	fanop
+C --- LOOP MIDDLE
+	nop
+	nop
+	stx	%g2,[%fp+72]
+	fitod	%f0,%f4
+C ---
+	nop
+	nop
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+C ---
+	nop
+	nop
+	ldx	[%fp-40],%g2		C p16
+	fanop
+C ---
+	nop
+	nop
+	ldx	[%fp-32],%g1		C p0
+	fmuld	%f2,%f4,%f4
+C ---
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+C ---
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+80],%f0
+	fanop
+C ---
+	srlx	%g1,32,%l0
+	nop
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+C ---
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	nop
+	stw	%l0,[%i0-4]
+	fdtox	%f6,%f6
+C ---
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-40]
+	fxtod	%f8,%f2
+C ---
+	std	%f6,[%fp-32]
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+	bne,pt	%icc,.Loop
+	fanop
+C --- LOOP END
+
+.L5:	add	%fp, 80, %l3		C %l3/%l5: staging slots used from .Ltail on
+	add	%fp, -24, %l4		C %l4/%l6: product slots (p16 at +0, p0 at +8)
+	add	%fp, 72, %l5
+	b	.Ltail
+	add	%fp, -40, %l6
+
+.Lend:	add	%fp, 72, %l3		C same roles as at .L5, slots swapped
+	add	%fp, -40, %l4
+	add	%fp, 80, %l5
+	add	%fp, -24, %l6
+.Ltail:	stx	%g2,[%l3]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	ldx	[%l4],%g2		C p16
+	ldx	[%l4+8],%g1		C p0
+	fmuld	%f2,%f4,%f4
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%l5],%f0
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+	stw	%l0,[%i0-4]
+.L4:	fdtox	%f6,%f6
+	std	%f4,[%l4]
+	fxtod	%f8,%f2
+	std	%f6,[%l4+8]
+
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	ldx	[%l6],%g2		C p16
+	ldx	[%l6+8],%g1		C p0
+	fmuld	%f2,%f4,%f4
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	ldd	[%l3],%f0
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+	stw	%l0,[%i0-4]
+.L3:	fdtox	%f6,%f6
+	std	%f4,[%l6]
+	fxtod	%f8,%f2
+	std	%f6,[%l6+8]
+
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	ldx	[%l4],%g2		C p16
+	ldx	[%l4+8],%g1		C p0
+	fmuld	%f2,%f4,%f4
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	fmuld	%f2,%f6,%f6
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+	stw	%l0,[%i0-4]
+.L2:	fdtox	%f6,%f6
+	std	%f4,[%l4]
+	std	%f6,[%l4+8]
+
+	ldx	[%l6],%g2		C p16
+	ldx	[%l6+8],%g1		C p0
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	stw	%l0,[%i0-4]
+
+.L1:	ldx	[%l4],%g2		C p16
+	ldx	[%l4+8],%g1		C p0
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	stw	%l0,[%i0-4]
+
+	ret
+	restore	%g0,%g0,%o0		C pop the window; %o0 = 0 for the caller
+
+EPILOGUE(mpn_sqr_diagonal)
diff --git a/vendor/gmp-6.3.0/mpn/sparc32/v9/sub_n.asm b/vendor/gmp-6.3.0/mpn/sparc32/v9/sub_n.asm
new file mode 100644
index 0000000..636c73b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/sparc32/v9/sub_n.asm
@@ -0,0 +1,129 @@
+dnl  SPARC mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl  store difference in a third limb vector.
+
+dnl  Copyright 2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(rp,%o0)		C destination limb vector (receives the difference)
+define(s1p,%o1)		C minuend limb vector
+define(s2p,%o2)		C subtrahend limb vector
+define(n,%o3)		C limb count, > 0
+define(cy,%g1)		C running borrow, 0 or 1
+
+C This code uses 64-bit operations on `o' and `g' registers.  It doesn't
+C require that `o' registers' upper 32 bits are preserved by the operating
+C system, but if they are not, they must be zeroed.  That is indeed what
+C happens at least on Slowaris 2.5 and 2.6.
+
+C On UltraSPARC 1 and 2, this code runs at 3 cycles/limb from the Dcache and at
+C about 10 cycles/limb from the Ecache.
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+	lduw	[s1p+0],%o4		C zero-extending load of s1[0]
+	lduw	[s2p+0],%o5		C zero-extending load of s2[0]
+	addcc	n,-2,n
+	bl,pn	%icc,L(end1)		C n was 1
+	lduw	[s1p+4],%g2
+	lduw	[s2p+4],%g3
+	be,pn	%icc,L(end2)		C n was 2
+	mov	0,cy			C delay slot: no borrow yet
+
+	.align	16
+L(loop):
+	sub	%o4,%o5,%g4		C 64-bit difference of 32-bit limbs
+	add	rp,8,rp
+	lduw	[s1p+8],%o4
+	fitod	%f0,%f2			C %f2 is never read here; presumably a filler
+C ---
+	sub	%g4,cy,%g4		C subtract incoming borrow
+	addcc	n,-1,n
+	lduw	[s2p+8],%o5
+	fitod	%f0,%f2
+C ---
+	srlx	%g4,63,cy		C borrow out = sign bit of the difference
+	add	s2p,8,s2p
+	stw	%g4,[rp-8]
+	be,pn	%icc,L(exito)+4		C +4: the next sub, run as delay slot, replaces exito's first insn
+C ---
+	sub	%g2,%g3,%g4
+	addcc	n,-1,n
+	lduw	[s1p+12],%g2
+	fitod	%f0,%f2
+C ---
+	sub	%g4,cy,%g4
+	add	s1p,8,s1p
+	lduw	[s2p+4],%g3
+	fitod	%f0,%f2
+C ---
+	srlx	%g4,63,cy
+	bne,pt	%icc,L(loop)
+	stw	%g4,[rp-4]
+C ---
+L(exite):
+	sub	%o4,%o5,%g4		C two limbs left: %o4-%o5, then %g2-%g3
+	sub	%g4,cy,%g4
+	srlx	%g4,63,cy
+	stw	%g4,[rp+0]
+	sub	%g2,%g3,%g4
+	sub	%g4,cy,%g4
+	stw	%g4,[rp+4]
+	retl
+	srlx	%g4,63,%o0		C return the final borrow
+
+L(exito):
+	sub	%g2,%g3,%g4		C two limbs left: %g2-%g3, then %o4-%o5
+	sub	%g4,cy,%g4
+	srlx	%g4,63,cy
+	stw	%g4,[rp-4]
+	sub	%o4,%o5,%g4
+	sub	%g4,cy,%g4
+	stw	%g4,[rp+0]
+	retl
+	srlx	%g4,63,%o0		C return the final borrow
+
+L(end1):
+	sub	%o4,%o5,%g4		C single-limb case
+	stw	%g4,[rp+0]
+	retl
+	srlx	%g4,63,%o0		C return the borrow
+
+L(end2):
+	sub	%o4,%o5,%g4		C two-limb case
+	srlx	%g4,63,cy
+	stw	%g4,[rp+0]
+	sub	%g2,%g3,%g4
+	sub	%g4,cy,%g4
+	stw	%g4,[rp+4]
+	retl
+	srlx	%g4,63,%o0		C return the final borrow
+EPILOGUE(mpn_sub_n)
diff --git a/vendor/gmp-6.3.0/mpn/sparc32/v9/submul_1.asm b/vendor/gmp-6.3.0/mpn/sparc32/v9/submul_1.asm
new file mode 100644
index 0000000..92d0ce7
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/sparc32/v9/submul_1.asm
@@ -0,0 +1,316 @@
+dnl  SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl  subtract the result from a second limb vector.
+
+dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Algorithm: We use two floating-point multiplies per limb product, with the
+C invariant v operand split into two 16-bit pieces, and the u operand split
+C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
+C the integer unit.
+
+C		   cycles/limb
+C UltraSPARC 1&2:     6.5
+C UltraSPARC 3:	      ?
+
+C Possible optimizations:
+C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
+C      memory bandwidth limited, this could save 1.5 cycles/limb.
+C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
+C      it is very straightforward to unroll, using an exit branch midways.
+C      Unrolling would allow deeper scheduling which could improve speed for L2
+C      cache case.
+C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
+C      aren't sufficiently apart-scheduled with just two temp areas.
+C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
+C      could save many operations.
+
+C INPUT PARAMETERS
+C rp	o0
+C up	o1
+C n	o2
+C v	o3
+
+define(`FSIZE',224)
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	add	%sp, -FSIZE, %sp	C leaf routine: reserve FSIZE bytes of stack
+	sethi	%hi(0xffff), %g1
+	srl	%o3, 16, %g2		C %g2 = v >> 16
+	or	%g1, %lo(0xffff), %g1	C %g1 = 0xffff
+	and	%o3, %g1, %g1		C %g1 = v & 0xffff
+	stx	%g1, [%sp+104]
+	stx	%g2, [%sp+112]
+	ldd	[%sp+104], %f6
+	ldd	[%sp+112], %f8
+	fxtod	%f6, %f6		C %f6 = low 16 bits of v as a double
+	fxtod	%f8, %f8		C %f8 = high 16 bits of v as a double
+	ld	[%sp+104], %f10		C zero f10
+
+	mov	0, %g3			C cy = 0
+
+define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe
+
+	add	%sp, 160, %o5		C point in scratch area
+	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
+
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_two_or_more
+	fxtod	%f10, %f2		C %f2 = up[i] as a double (%f10 is zero)
+
+	fmuld	%f2, %f8, %f16		C up[i] * (v >> 16)
+	fmuld	%f2, %f6, %f4		C up[i] * (v & 0xffff)
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	std	%f12, [%o5+24]
+	ldx	[%o5+16], %g2		C p16
+	ldx	[%o5+24], %g1		C p0
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L1
+	add	%o0, -16, %o0		C bias rp for the [%o0+16] store at .L1
+
+	.align	16
+.L_two_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fmuld	%f2, %f8, %f16
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_three_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	std	%f12, [%o5+8]
+	lduw	[%o0], %g5		C read rp[i]
+	ldx	[%o5+16], %g2		C p16
+	ldx	[%o5+24], %g1		C p0
+	b	.L2
+	add	%o0, -12, %o0		C bias rp for the [%o0+12] store at .L2
+
+	.align	16
+.L_three_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_four_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	std	%f12, [%o5+24]
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L3
+	add	%o0, -8, %o0		C bias rp for the [%o0+8] store at .L3
+
+	.align	16
+.L_four_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_five_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L4
+	add	%o0, -4, %o0		C bias rp for the [%o0+4] store at .L4
+
+	.align	16
+.L_five_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	lduw	[%o0], %g5		C read rp[i]
+	bne,pt	%icc, .Loop
+	fxtod	%f10, %f2
+	b,a	.L5
+
+C BEGIN MAIN LOOP
+	.align 16
+C -- 0
+.Loop:	sub	%g0, %g3, %g3		C negate the high word saved by srlx below
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+C -- 1
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	add	%o0, 4, %o0		C rp++
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+C -- 2
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	fanop
+C -- 3
+	nop
+	add	%g3, %g4, %g4		C p += cy
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+C -- 4
+	nop
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+C -- 5
+	xor	%o5, 16, %o5		C alternate scratch variables
+	add	%o1, 4, %o1		C up++
+	stw	%g4, [%o0-4]
+	fanop
+C -- 6
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0], %g5		C read rp[i]
+	bne,pt	%icc, .Loop
+	fxtod	%f10, %f2
+C END MAIN LOOP
+
+.L5:	sub	%g0, %g3, %g3
+	fdtox	%f16, %f14
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g4, %g3, %g4		C p += cy
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+0]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+4], %g5		C read rp[i]
+
+	sub	%g0, %g3, %g3
+.L4:	fdtox	%f16, %f14
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	std	%f14, [%o5+0]
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	std	%f12, [%o5+8]
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+4]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+8], %g5		C read rp[i]
+
+	sub	%g0, %g3, %g3
+.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+8]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+12], %g5		C read rp[i]
+
+	sub	%g0, %g3, %g3
+.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	stw	%g4, [%o0+12]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+16], %g5		C read rp[i]
+
+	sub	%g0, %g3, %g3
+.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	add	%g3, %g4, %g4		C p += cy
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	stw	%g4, [%o0+16]
+	srlx	%g4, 32, %g3		C new cy
+
+	sub	%g0, %g3, %o0		C return value: negated high word of the last difference
+	retl
+	sub	%sp, -FSIZE, %sp	C delay slot: release the stack area
+EPILOGUE(mpn_submul_1)
diff --git a/vendor/gmp-6.3.0/mpn/sparc32/v9/udiv.asm b/vendor/gmp-6.3.0/mpn/sparc32/v9/udiv.asm
new file mode 100644
index 0000000..61dde97
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/sparc32/v9/udiv.asm
@@ -0,0 +1,52 @@
+dnl  SPARC v9 32-bit mpn_udiv_qrnnd - division
support for longlong.h.
+
+dnl  Copyright 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr	o0	(the 32-bit remainder is stored here)
+C n1		o1	(upper dividend limb)
+C n0		o2	(lower dividend limb)
+C d		o3	(divisor)
+
+ASM_START()
+PROLOGUE(mpn_udiv_qrnnd)
+	srl	%o3, 0, %g3		C d, zero-extended to 64 bits
+	srl	%o2, 0, %g2		C n0, zero-extended to 64 bits
+	sllx	%o1, 32, %g4		C n1 moved into the upper half
+	or	%g4, %g2, %g4		C 64-bit dividend n1:n0
+	udivx	%g4, %g3, %g1		C q = dividend / d
+	mulx	%g1, %g3, %g4		C q * d
+	sub	%g2, %g4, %g2		C low word of n0 - q*d is the remainder
+	st	%g2, [%o0]		C *rem_ptr = remainder
+	retl
+	mov	%g1, %o0		C delay slot: quotient is the return value
+EPILOGUE(mpn_udiv_qrnnd)
-- 
cgit v1.2.3