Diffstat (limited to 'vendor/gmp-6.3.0/mpn/sparc64')
61 files changed, 9471 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/sparc64/README b/vendor/gmp-6.3.0/mpn/sparc64/README
new file mode 100644
index 0000000..e2c051a
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/sparc64/README
@@ -0,0 +1,125 @@
+Copyright 1997, 1999-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains mpn functions for 64-bit V9 SPARC.
+
+RELEVANT OPTIMIZATION ISSUES
+
+Notation:
+  IANY = shift/add/sub/logical/sethi
+  IADDLOG = add/sub/logical/sethi
+  MEM = ld*/st*
+  FA = fadd*/fsub*/f*to*/fmov*
+  FM = fmul*
+
+UltraSPARC can issue four instructions per cycle, with these restrictions:
+* Two IANY instructions, but only one of these may be a shift.  If there is a
+  shift and an IANY instruction, the shift must precede the IANY instruction.
+* One FA.
+* One FM.
+* One branch.
+* One MEM.
+* IANY/IADDLOG/MEM must be insn 1, 2, or 3 in an issue bundle.  Taken branches
+  should not be in slot 4, since that makes the delay insn come from a
+  separate bundle.
+* If two IANY/IADDLOG instructions are to be executed in the same cycle and one
+  of these is setting the condition codes, that instruction must be the second
+  one.
+
+To summarize, ignoring branches, these are the bundles that can reach the peak
+execution speed:
+
+insn1	iany	iany	mem	iany	iany	mem	iany	iany	mem
+insn2	iaddlog	mem	iany	mem	iaddlog	iany	mem	iaddlog	iany
+insn3	mem	iaddlog	iaddlog	fa	fa	fa	fm	fm	fm
+insn4	fa/fm	fa/fm	fa/fm	fm	fm	fm	fa	fa	fa
+
+The 64-bit integer multiply instruction mulx takes from 5 cycles to 35 cycles,
+depending on the position of the most significant bit of the first source
+operand.  When used for 32x32->64 multiplication, it needs 20 cycles.
+Furthermore, it stalls the processor while executing.  We stay away from that
+instruction, and instead use floating-point operations.
+
+Floating-point add and multiply units are fully pipelined.  The latency for
+UltraSPARC-1/2 is 3 cycles and for UltraSPARC-3 it is 4 cycles.
+
+Integer conditional move instructions cannot dual-issue with other integer
+instructions.  No conditional move can issue 1-5 cycles after a load.  (This
+might have been fixed for UltraSPARC-3.)
+
+The UltraSPARC-3 pipeline is very similar to that of UltraSPARC-1/2, but is
+somewhat slower.  Branches execute more slowly, and there may be other new
+stalls.  Integer multiply no longer stalls the entire CPU and has a much
+lower latency, but it is still not pipelined, and thus useless for our needs.
+
+STATUS
+
+* mpn_lshift, mpn_rshift: The current code runs at 2.0 cycles/limb on
+  UltraSPARC-1/2 and 2.65 on UltraSPARC-3.  For UltraSPARC-1/2, the IEU0
+  functional unit is saturated with shifts.
+
+* mpn_add_n, mpn_sub_n: The current code runs at 4 cycles/limb on
+  UltraSPARC-1/2 and 4.5 cycles/limb on UltraSPARC-3.  The 4-instruction
+  recurrence is the speed limiter.
+
+* mpn_addmul_1: The current code runs at 14 cycles/limb asymptotically on
+  UltraSPARC-1/2 and 17.5 cycles/limb on UltraSPARC-3.  On UltraSPARC-1/2, the
+  code sustains 4 instructions/cycle.  It might be possible to invent a better
+  way of summing the intermediate 49-bit operands, but it is unlikely that it
+  will save enough instructions to save an entire cycle.
+
+  The load-use of the u operand is not scheduled far enough ahead for good L2
+  cache performance.  The UltraSPARC-1/2 L1 cache is direct mapped, and since
+  we use temporary stack slots that will conflict with the u and r operands,
+  we miss all the way to L2 very often.  The load-use of the std/ldx pairs via
+  the stack is perhaps over-scheduled.
+
+  It would be possible to save two instructions: (1) The mov could be avoided
+  if the std/ldx were less scheduled.  (2) The ldx of the r operand could be
+  split into two ld instructions, saving the shifts/masks.
+
+  It should be possible to reach 14 cycles/limb for UltraSPARC-3 if the fp
+  operations were rescheduled for this processor's 4-cycle latency.
+
+* mpn_mul_1: The current code is a straightforward edit of the mpn_addmul_1
+  code.  It would be possible to shave one or two cycles from it, with some
+  labour.
+
+* mpn_submul_1: Simpleminded code just calling mpn_mul_1 + mpn_sub_n.  This
+  means that it runs at 18 cycles/limb on UltraSPARC-1/2 and 23 cycles/limb on
+  UltraSPARC-3.  It would be possible to either match the mpn_addmul_1
+  performance, or in the worst case use one more instruction group.
+
+* US1/US2 cache conflict resolution.  The direct-mapped L1 data cache of
+  US1/US2 is a problem for mul_1, addmul_1 (and a prospective submul_1).  We
+  should allocate a larger cache area, and put the stack temp area in a place
+  that doesn't cause cache conflicts.
diff --git a/vendor/gmp-6.3.0/mpn/sparc64/copyd.asm b/vendor/gmp-6.3.0/mpn/sparc64/copyd.asm
new file mode 100644
index 0000000..ab105d3
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/sparc64/copyd.asm
@@ -0,0 +1,89 @@
+dnl  SPARC v9 mpn_copyd -- Copy a limb vector, decrementing.
+
+dnl  Copyright 1999-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
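The unrolled loop that follows copies eight limbs per iteration from the top of the vector downwards.  As a reference for what it computes, here is a portable C sketch (assuming GMP's mp_limb_t and mp_size_t types from gmp-impl.h; this is an illustration, not GMP's actual generic code).  mpn_copyi further below is the same operation walking upwards:

/* Reference semantics for mpn_copyd, sketch only: copy n limbs from
   up to rp, walking downwards, the safe direction when the regions
   overlap with rp > up.  */
static void
copyd_ref (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
{
  mp_size_t i;
  for (i = n - 1; i >= 0; i--)
    rp[i] = up[i];
}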
+ +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC 1&2:	 2 +C UltraSPARC 3:		 2.5 +C UltraSPARC T1:	17 +C UltraSPARC T3:	 6 +C UltraSPARC T4/T5:	 2 + +C INPUT PARAMETERS +C rptr	%o0 +C sptr	%o1 +C n	%o2 + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_copyd) +	sllx	%o2,3,%g1 +	add	%g1,%o0,%o0 +	add	%g1,%o1,%o1 +	addcc	%o2,-8,%o2 +	bl,pt	%xcc,L(end01234567) +	nop +L(loop1): +	ldx	[%o1-8],%g1 +	ldx	[%o1-16],%g2 +	ldx	[%o1-24],%g3 +	ldx	[%o1-32],%g4 +	ldx	[%o1-40],%g5 +	ldx	[%o1-48],%o3 +	ldx	[%o1-56],%o4 +	ldx	[%o1-64],%o5 +	add	%o1,-64,%o1 +	stx	%g1,[%o0-8] +	stx	%g2,[%o0-16] +	stx	%g3,[%o0-24] +	stx	%g4,[%o0-32] +	stx	%g5,[%o0-40] +	stx	%o3,[%o0-48] +	stx	%o4,[%o0-56] +	stx	%o5,[%o0-64] +	addcc	%o2,-8,%o2 +	bge,pt	%xcc,L(loop1) +	add	%o0,-64,%o0 +L(end01234567): +	addcc	%o2,8,%o2 +	bz,pn	%xcc,L(end) +	nop +L(loop2): +	ldx	[%o1-8],%g1 +	add	%o1,-8,%o1 +	addcc	%o2,-1,%o2 +	stx	%g1,[%o0-8] +	bg,pt	%xcc,L(loop2) +	add	%o0,-8,%o0 +L(end):	retl +	nop +EPILOGUE(mpn_copyd) diff --git a/vendor/gmp-6.3.0/mpn/sparc64/copyi.asm b/vendor/gmp-6.3.0/mpn/sparc64/copyi.asm new file mode 100644 index 0000000..45663dc --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/copyi.asm @@ -0,0 +1,86 @@ +dnl  SPARC v9 mpn_copyi -- Copy a limb vector, incrementing. + +dnl  Copyright 1999-2003 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC 1&2:	 2 +C UltraSPARC 3:		 2.5 +C UltraSPARC T1:	17 +C UltraSPARC T3:	 6 +C UltraSPARC T4/T5:	 2 + +C INPUT PARAMETERS +C rptr	%o0 +C sptr	%o1 +C n	%o2 + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_copyi) +	addcc	%o2,-8,%o2 +	bl,pt	%xcc,L(end01234567) +	nop +L(loop1): +	ldx	[%o1+0],%g1 +	ldx	[%o1+8],%g2 +	ldx	[%o1+16],%g3 +	ldx	[%o1+24],%g4 +	ldx	[%o1+32],%g5 +	ldx	[%o1+40],%o3 +	ldx	[%o1+48],%o4 +	ldx	[%o1+56],%o5 +	add	%o1,64,%o1 +	stx	%g1,[%o0+0] +	stx	%g2,[%o0+8] +	stx	%g3,[%o0+16] +	stx	%g4,[%o0+24] +	stx	%g5,[%o0+32] +	stx	%o3,[%o0+40] +	stx	%o4,[%o0+48] +	stx	%o5,[%o0+56] +	addcc	%o2,-8,%o2 +	bge,pt	%xcc,L(loop1) +	add	%o0,64,%o0 +L(end01234567): +	addcc	%o2,8,%o2 +	bz,pn	%xcc,L(end) +	nop +L(loop2): +	ldx	[%o1+0],%g1 +	add	%o1,8,%o1 +	addcc	%o2,-1,%o2 +	stx	%g1,[%o0+0] +	bg,pt	%xcc,L(loop2) +	add	%o0,8,%o0 +L(end):	retl +	nop +EPILOGUE(mpn_copyi) diff --git a/vendor/gmp-6.3.0/mpn/sparc64/dive_1.c b/vendor/gmp-6.3.0/mpn/sparc64/dive_1.c new file mode 100644 index 0000000..4264f29 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/dive_1.c @@ -0,0 +1,161 @@ +/* UltraSPARC 64 mpn_divexact_1 -- mpn by limb exact division. + +   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST +   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN +   FUTURE GNU MP RELEASES. + +Copyright 2000, 2001, 2003, 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + +  * the GNU Lesser General Public License as published by the Free +    Software Foundation; either version 3 of the License, or (at your +    option) any later version. + +or + +  * the GNU General Public License as published by the Free Software +    Foundation; either version 2 of the License, or (at your option) any +    later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library.  If not, +see https://www.gnu.org/licenses/.  */ + +#include "gmp-impl.h" +#include "longlong.h" + +#include "mpn/sparc64/sparc64.h" + + +/*                 64-bit divisor   32-bit divisor +                    cycles/limb      cycles/limb +                     (approx)         (approx) +   Ultrasparc 2i:      110               70 +*/ + + +/* There are two key ideas here to reduce mulx's.  Firstly when the divisor +   is 32-bits the high of q*d can be calculated without the two 32x32->64 +   cross-products involving the high 32-bits of the divisor, that being zero +   of course.  Secondly umul_ppmm_lowequal and umul_ppmm_half_lowequal save +   one mulx (each) knowing the low of q*d is equal to the input limb l. + +   For size==1, a simple udivx is used.  This is faster than calculating an +   inverse. + +   For a 32-bit divisor and small sizes, an attempt was made at a simple +   udivx loop (two per 64-bit limb), but it turned out to be slower than +   mul-by-inverse.  At size==2 the inverse is about 260 cycles total +   compared to a udivx at 291.  
Perhaps the latter would suit when size==2 +   but the high 32-bits of the second limb is zero (saving one udivx), but +   it doesn't seem worth a special case just for that.  */ + +void +mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor) +{ +  mp_limb_t  inverse, s, s_next, c, l, ls, q; +  unsigned   rshift, lshift; +  mp_limb_t  lshift_mask; +  mp_limb_t  divisor_h; + +  ASSERT (size >= 1); +  ASSERT (divisor != 0); +  ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size)); +  ASSERT_MPN (src, size); +  ASSERT_LIMB (divisor); + +  s = *src++;                 /* src low limb */ +  size--; +  if (size == 0) +    { +      *dst = s / divisor; +      return; +    } + +  if ((divisor & 1) == 0) +    { +      count_trailing_zeros (rshift, divisor); +      divisor >>= rshift; +      lshift = 64 - rshift; + +      lshift_mask = MP_LIMB_T_MAX; +    } +  else +    { +      rshift = 0; + +      /* rshift==0 means no shift, so must mask out other part in this case */ +      lshift = 0; +      lshift_mask = 0; +    } + +  binvert_limb (inverse, divisor); + +  c = 0; +  divisor_h = HIGH32 (divisor); + +  if (divisor_h == 0) +    { +      /* 32-bit divisor */ +      do +        { +          s_next = *src++; +          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask); +          s = s_next; + +          SUBC_LIMB (c, l, ls, c); + +          q = l * inverse; +          *dst++ = q; + +          umul_ppmm_half_lowequal (l, q, divisor, l); +          c += l; + +          size--; +        } +      while (size != 0); + +      ls = s >> rshift; +      l = ls - c; +      q = l * inverse; +      *dst = q; +    } +  else +    { +      /* 64-bit divisor */ +      mp_limb_t  divisor_l = LOW32 (divisor); +      do +        { +          s_next = *src++; +          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask); +          s = s_next; + +          SUBC_LIMB (c, l, ls, c); + +          q = l * inverse; +          *dst++ = q; + +          umul_ppmm_lowequal (l, q, divisor, divisor_h, divisor_l, l); +          c += l; + +          size--; +        } +      while (size != 0); + +      ls = s >> rshift; +      l = ls - c; +      q = l * inverse; +      *dst = q; +    } +} diff --git a/vendor/gmp-6.3.0/mpn/sparc64/divrem_1.c b/vendor/gmp-6.3.0/mpn/sparc64/divrem_1.c new file mode 100644 index 0000000..ac94565 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/divrem_1.c @@ -0,0 +1,242 @@ +/* UltraSparc 64 mpn_divrem_1 -- mpn by limb division. + +Copyright 1991, 1993, 1994, 1996, 1998-2001, 2003 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + +  * the GNU Lesser General Public License as published by the Free +    Software Foundation; either version 3 of the License, or (at your +    option) any later version. + +or + +  * the GNU General Public License as published by the Free Software +    Foundation; either version 2 of the License, or (at your option) any +    later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library.  If not, +see https://www.gnu.org/licenses/.  
*/
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#include "mpn/sparc64/sparc64.h"
+
+
+/*                   64-bit divisor       32-bit divisor
+                       cycles/limb          cycles/limb
+                        (approx)             (approx)
+                   integer  fraction    integer  fraction
+   Ultrasparc 2i:    160      160          122      96
+*/
+
+
+/* 32-bit divisors are treated in special case code.  This requires 4 mulx
+   per limb instead of 8 in the general case.
+
+   For big endian systems we need HALF_ENDIAN_ADJ included in the src[i]
+   addressing, to get the two halves of each limb read in the correct order.
+   This is kept in an adj variable.  Doing that measures about 4 c/l faster
+   than just writing HALF_ENDIAN_ADJ(i) in the integer loop.  The latter
+   shouldn't be 6 cycles worth of work, but perhaps it doesn't schedule well
+   (on gcc 3.2.1 at least).  The fraction loop doesn't seem affected, but we
+   still use a variable since that ought to work out best.  */
+
+mp_limb_t
+mpn_divrem_1 (mp_ptr qp_limbptr, mp_size_t xsize_limbs,
+              mp_srcptr ap_limbptr, mp_size_t size_limbs, mp_limb_t d_limb)
+{
+  mp_size_t  total_size_limbs;
+  mp_size_t  i;
+
+  ASSERT (xsize_limbs >= 0);
+  ASSERT (size_limbs >= 0);
+  ASSERT (d_limb != 0);
+  /* FIXME: What's the correct overlap rule when xsize!=0? */
+  ASSERT (MPN_SAME_OR_SEPARATE_P (qp_limbptr + xsize_limbs,
+                                  ap_limbptr, size_limbs));
+
+  total_size_limbs = size_limbs + xsize_limbs;
+  if (UNLIKELY (total_size_limbs == 0))
+    return 0;
+
+  /* udivx is good for total_size==1, and no need to bother checking
+     limb<divisor, since if that's likely the caller should check */
+  if (UNLIKELY (total_size_limbs == 1))
+    {
+      mp_limb_t  a, q;
+      a = (LIKELY (size_limbs != 0) ? ap_limbptr[0] : 0);
+      q = a / d_limb;
+      qp_limbptr[0] = q;
+      return a - q*d_limb;
+    }
+
+  if (d_limb <= CNST_LIMB(0xFFFFFFFF))
+    {
+      mp_size_t  size, xsize, total_size, adj;
+      unsigned   *qp, n1, n0, q, r, nshift, norm_rmask;
+      mp_limb_t  dinv_limb;
+      const unsigned *ap;
+      int        norm, norm_rshift;
+
+      size = 2 * size_limbs;
+      xsize = 2 * xsize_limbs;
+      total_size = size + xsize;
+
+      ap = (unsigned *) ap_limbptr;
+      qp = (unsigned *) qp_limbptr;
+
+      qp += xsize;
+      r = 0;        /* initial remainder */
+
+      if (LIKELY (size != 0))
+        {
+          n1 = ap[size-1 + HALF_ENDIAN_ADJ(1)];
+
+          /* If the length of the source is uniformly distributed, then
+             there's a 50% chance of the high 32 bits being zero, which we
+             can skip.  */
+          if (n1 == 0)
+            {
+              n1 = ap[size-2 + HALF_ENDIAN_ADJ(0)];
+              total_size--;
+              size--;
+              ASSERT (size > 0);  /* because always even */
+              qp[size + HALF_ENDIAN_ADJ(1)] = 0;
+            }
+
+          /* Skip a division if high < divisor (high quotient 0).  Testing
+             here before normalizing will still skip as often as
+             possible.  */
+          if (n1 < d_limb)
+            {
+              r = n1;
+              size--;
+              qp[size + HALF_ENDIAN_ADJ(size)] = 0;
+              total_size--;
+              if (total_size == 0)
+                return r;
+            }
+        }
+
+      count_leading_zeros_32 (norm, d_limb);
+      norm -= 32;
+      d_limb <<= norm;
+      r <<= norm;
+
+      norm_rshift = 32 - norm;
+      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);
+
+      invert_half_limb (dinv_limb, d_limb);
+
+      if (LIKELY (size != 0))
+        {
+          i = size - 1;
+          adj = HALF_ENDIAN_ADJ (i);
+          n1 = ap[i + adj];
+          adj = -adj;
+          r |= ((n1 >> norm_rshift) & norm_rmask);
+          for ( ; i > 0; i--)
+            {
+              n0 = ap[i-1 + adj];
+              adj = -adj;
+              nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
+              udiv_qrnnd_half_preinv (q, r, r, nshift, d_limb, dinv_limb);
+              qp[i + adj] = q;
+              n1 = n0;
+            }
+          nshift = n1 << norm;
+          udiv_qrnnd_half_preinv (q, r, r, nshift, d_limb, dinv_limb);
+          qp[0 + HALF_ENDIAN_ADJ(0)] = q;
+        }
+      qp -= xsize;
+      adj = HALF_ENDIAN_ADJ (0);
+      for (i = xsize-1; i >= 0; i--)
+        {
+          udiv_qrnnd_half_preinv (q, r, r, 0, d_limb, dinv_limb);
+          adj = -adj;
+          qp[i + adj] = q;
+        }
+
+      return r >> norm;
+    }
+  else
+    {
+      mp_srcptr  ap;
+      mp_ptr     qp;
+      mp_size_t  size, xsize, total_size;
+      mp_limb_t  d, n1, n0, q, r, dinv, nshift, norm_rmask;
+      int        norm, norm_rshift;
+
+      ap = ap_limbptr;
+      qp = qp_limbptr;
+      size = size_limbs;
+      xsize = xsize_limbs;
+      total_size = total_size_limbs;
+      d = d_limb;
+
+      qp += total_size;   /* above high limb */
+      r = 0;              /* initial remainder */
+
+      if (LIKELY (size != 0))
+        {
+          /* Skip a division if high < divisor (high quotient 0).  Testing
+             here before normalizing will still skip as often as
+             possible.  */
+          n1 = ap[size-1];
+          if (n1 < d)
+            {
+              r = n1;
+              *--qp = 0;
+              total_size--;
+              if (total_size == 0)
+                return r;
+              size--;
+            }
+        }
+
+      count_leading_zeros (norm, d);
+      d <<= norm;
+      r <<= norm;
+
+      norm_rshift = GMP_LIMB_BITS - norm;
+      norm_rmask = (norm == 0 ?
0 : ~CNST_LIMB(0)); + +      invert_limb (dinv, d); + +      if (LIKELY (size != 0)) +        { +          n1 = ap[size-1]; +          r |= ((n1 >> norm_rshift) & norm_rmask); +          for (i = size-2; i >= 0; i--) +            { +              n0 = ap[i]; +              nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask); +              udiv_qrnnd_preinv (q, r, r, nshift, d, dinv); +              *--qp = q; +              n1 = n0; +            } +          nshift = n1 << norm; +          udiv_qrnnd_preinv (q, r, r, nshift, d, dinv); +          *--qp = q; +        } +      for (i = 0; i < xsize; i++) +        { +          udiv_qrnnd_preinv (q, r, r, CNST_LIMB(0), d, dinv); +          *--qp = q; +        } +      return r >> norm; +    } +} diff --git a/vendor/gmp-6.3.0/mpn/sparc64/gcd_11.asm b/vendor/gmp-6.3.0/mpn/sparc64/gcd_11.asm new file mode 100644 index 0000000..2dd200d --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/gcd_11.asm @@ -0,0 +1,87 @@ +dnl  SPARC64 mpn_gcd_11. + +dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for SPARC by Torbjörn +dnl  Granlund. + +dnl  Copyright 2000-2002, 2005, 2009, 2011-2013, 2021 Free Software Foundation, +dnl  Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C		  cycles/bit (approx) +C UltraSPARC 1&2:	 5.1 +C UltraSPARC 3:		 5.0 +C UltraSPARC T1:	11.4 +C UltraSPARC T3:	10 +C UltraSPARC T4:	 6 +C Numbers measured with: speed -CD -s32-64 -t32 mpn_gcd_1 + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. 
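The deflit/forloop lines below expand that 128-byte table at assembly time via m4.  The same values can be produced with a small standalone C program (an illustration only, not part of GMP):

#include <stdio.h>

int
main (void)
{
  enum { MAXSHIFT = 7, MASK = (1 << MAXSHIFT) - 1 };
  int i, j, c;

  printf ("%d\n", MAXSHIFT);	/* ctz_table[0], the n==0 special case */
  for (i = 1; i <= MASK; i++)
    {
      for (c = 0, j = i; (j & 1) == 0; j >>= 1)	/* count trailing zeros */
	c++;
      printf ("%d\n", c);
    }
  return 0;
}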
+
+deflit(MAXSHIFT, 7)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+	RODATA
+	TYPE(ctz_table,object)
+ctz_table:
+	.byte	MAXSHIFT
+forloop(i,1,MASK,
+`	.byte	m4_count_trailing_zeros(i)
+')
+	SIZE(ctz_table,.-ctz_table)
+
+define(`u0',    `%o0')
+define(`v0',    `%o1')
+
+ASM_START()
+PROLOGUE(mpn_gcd_11)
+	LEA64(ctz_table, o5, g4)
+	b	L(odd)
+	 mov	u0, %o4
+
+	ALIGN(16)
+L(top):	movcc	%xcc, %o4, v0		C v = min(u,v)
+	movcc	%xcc, %o2, %o0		C u = |v - u|
+L(mid):	ldub	[%o5+%g1], %g5		C
+	brz,pn %g1, L(shift_alot)	C
+	 srlx	%o0, %g5, %o4		C new u, odd
+L(odd):	subcc	v0, %o4, %o2		C v - u, set flags for branch and movcc
+	sub	%o4, v0, %o0		C u - v
+	bnz,pt	%xcc, L(top)		C
+	 and	%o2, MASK, %g1		C extract low MAXSHIFT bits from (v-u)
+
+	retl
+	 mov	v0, %o0
+
+L(shift_alot):
+	mov	%o4, %o0
+	b	L(mid)
+	 and	%o4, MASK, %g1		C
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/sparc64/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/sparc64/gmp-mparam.h
new file mode 100644
index 0000000..5ac2c46
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/sparc64/gmp-mparam.h
@@ -0,0 +1,139 @@
+/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2006, 2008-2010 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 500 MHz ultrasparc2 running GNU/Linux */
+
+#define DIVREM_1_NORM_THRESHOLD              3
+#define DIVREM_1_UNNORM_THRESHOLD            4
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD      MP_SIZE_T_MAX  /* never */
+#define MOD_1U_TO_MOD_1_1_THRESHOLD         22
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        27
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD  MP_SIZE_T_MAX  /* never */
+#define USE_PREINV_DIVREM_1                  1
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define MUL_TOOM22_THRESHOLD                30
+#define MUL_TOOM33_THRESHOLD               187
+#define MUL_TOOM44_THRESHOLD               278
+#define MUL_TOOM6H_THRESHOLD               278
+#define MUL_TOOM8H_THRESHOLD               357
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     201
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     199
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     154
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     107
+
+#define SQR_BASECASE_THRESHOLD              13
+#define SQR_TOOM2_THRESHOLD                 69
+#define SQR_TOOM3_THRESHOLD                116
+#define SQR_TOOM4_THRESHOLD                336
+#define SQR_TOOM6_THRESHOLD                336
+#define SQR_TOOM8_THRESHOLD                454
+
+#define MULMOD_BNM1_THRESHOLD               17
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define MUL_FFT_MODF_THRESHOLD             248  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    248, 5}, {      9, 4}, {     19, 6}, {      5, 5}, \
+    {     15, 6}, {      8, 5}, {     17, 6}, {     21, 7}, \
+    {     19, 8}, {     11, 7}, {     25, 8}, {     15, 7}, \
+    {     31, 8}, {     27, 9}, {     15, 8}, {     33, 9}, \
+    {     19, 8}, {     39, 9}, {     27,10}, {     15, 9}, \
+    {     39,10}, {     23, 9}, {     47,11}, {     15,10}, \
+    {     31, 9}, {     67,10}, {     39, 9}, {     79,10}, \
+    {     47,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255,10}, {     71, 9}, {    143, 8}, {    287,10}, \
+    {     79,11}, {     47,12}, {   4096,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 50
+#define MUL_FFT_THRESHOLD                 1984
+
+#define SQR_FFT_MODF_THRESHOLD             236  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    236, 5}, {      8, 4}, {     17, 5}, {     19, 6}, \
+    {     10, 5}, {     21, 6}, {     19, 7}, {     10, 6}, \
+    {     21, 7}, {     21, 8}, {     21, 9}, {     11, 8}, \
+    {     23, 9}, {     19, 8}, {     43, 9}, {     23,10}, \
+    {     15, 9}, {     43,10}, {     23,11}, {     15,10}, \
+    {     31, 9}, {     63,10}, {     47, 8}, {    191,11}, \
+    {     31,10}, {     63, 8}, {    255, 7}, {    511, 9}, \
+    {    135, 8}, {    271,10}, {     71, 9}, {    143, 8}, \
+    {    287, 7}, {    575,11}, {     47, 9}, {    191, 8}, \
+    {    383,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 49
+#define SQR_FFT_THRESHOLD                 1120
+
+#define MULLO_BASECASE_THRESHOLD            16
+#define MULLO_DC_THRESHOLD                  41
+#define MULLO_MUL_N_THRESHOLD             3791
+
+#define DC_DIV_QR_THRESHOLD                 27
+#define DC_DIVAPPR_Q_THRESHOLD             100
+#define DC_BDIV_QR_THRESHOLD                47
+#define DC_BDIV_Q_THRESHOLD                174
+
+#define INV_MULMOD_BNM1_THRESHOLD           58
+#define INV_NEWTON_THRESHOLD                13
+#define INV_APPR_THRESHOLD                   9
+
+#define BINV_NEWTON_THRESHOLD              187
+#define REDC_1_TO_REDC_2_THRESHOLD          10
+#define REDC_2_TO_REDC_N_THRESHOLD         115
+
+#define MU_DIV_QR_THRESHOLD                680
+#define MU_DIVAPPR_Q_THRESHOLD             618
+#define MUPI_DIV_QR_THRESHOLD                0  /* always */
+#define MU_BDIV_QR_THRESHOLD               748
+#define MU_BDIV_Q_THRESHOLD                889
+
+#define MATRIX22_STRASSEN_THRESHOLD         13
+#define HGCD_THRESHOLD                      53
+#define GCD_DC_THRESHOLD                   283
+#define GCDEXT_DC_THRESHOLD                186
+#define JACOBI_BASE_METHOD                   2
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        16
+#define SET_STR_DC_THRESHOLD               390
+#define SET_STR_PRECOMPUTE_THRESHOLD      1665
diff --git a/vendor/gmp-6.3.0/mpn/sparc64/lshift.asm b/vendor/gmp-6.3.0/mpn/sparc64/lshift.asm
new file mode 100644
index 0000000..90bbb45
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/sparc64/lshift.asm
@@ -0,0 +1,140 @@
+dnl  SPARC v9 mpn_lshift
+
+dnl  Contributed to the GNU project by David Miller.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
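The software-pipelined loop that follows keeps two limbs in flight and addresses both operands off the shrinking index n.  What it computes is captured by this C sketch (assuming GMP's types, 64-bit limbs and 1 <= cnt <= 63; an illustration, not GMP's generic code):

/* Reference semantics for mpn_lshift, sketch only: shift n limbs left
   by cnt bits, returning the bits shifted out of the top limb.  */
static mp_limb_t
lshift_ref (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, unsigned cnt)
{
  unsigned   tcnt = 64 - cnt;	/* GMP_LIMB_BITS - cnt */
  mp_limb_t  retval = up[n - 1] >> tcnt;
  mp_size_t  i;

  for (i = n - 1; i > 0; i--)
    rp[i] = (up[i] << cnt) | (up[i - 1] >> tcnt);
  rp[0] = up[0] << cnt;
  return retval;
}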
+ +include(`../config.m4') + +C		    cycles/limb +C UltraSPARC 1&2:	 2 +C UltraSPARC 3:		 2.5 +C UltraSPARC T1:	17.5 +C UltraSPARC T3:	 8 +C UltraSPARC T4:	 3 + +C INPUT PARAMETERS +define(`rp',     `%i0') +define(`up',     `%i1') +define(`n',      `%i2') +define(`cnt',    `%i3') + +define(`tcnt',   `%i4') +define(`retval', `%i5') +define(`u0',     `%l0') +define(`u1',     `%l1') +define(`r0',     `%l6') +define(`r1',     `%l7') +define(`u0_off', `%o0') +define(`u1_off', `%o1') +define(`r0_off', `%o2') +define(`r1_off', `%o3') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_lshift) +	save	%sp, -176, %sp + +	sllx	n, 3, n +	sub	%g0, cnt, tcnt + +	sub	up, 8, u1_off +	add	rp, (5 * 8), r1_off + +	ldx	[n + u1_off], u1	C WAS: up - 8 +	add	u1_off, (3 * 8), u1_off + +	sub	r1_off, 8, r0_off +	sub	u1_off, 8, u0_off + +	subcc	n, (3 * 8), n +	srlx	u1, tcnt, retval + +	bl,pn	%xcc, L(end12) +	 sllx	u1, cnt, %l3 + +	ldx	[n + u0_off], u0	C WAS: up - 16 +	subcc	n, (2 * 8), n + +	ldx	[n + u1_off], u1	C WAS: up - 24 + +	bl,pn	%xcc, L(end34) +	 srlx	u0, tcnt, %l4 + +	b,a	L(top) +	ALIGN(16) +L(top): +	sllx	u0, cnt, %l2 +	or	%l4, %l3, r0 + +	ldx	[n + u0_off], u0	C WAS: up - 16 +	srlx	u1, tcnt, %l5 + +	stx	r0, [n + r0_off]	C WAS: rp - 8 +	subcc	n, (2 * 8), n + +	sllx	u1, cnt, %l3 +	or	%l2, %l5, r1 + +	ldx	[n + u1_off], u1	C WAS: up - 24 +	srlx	u0, tcnt, %l4 + +	bge,pt	%xcc, L(top) +	 stx	r1, [n + r1_off]	C WAS: rp - 16 + +L(end34): +	sllx	u0, cnt, %l2 +	or	%l4, %l3, r0 + +	srlx	u1, tcnt, %l5 +	stx	r0, [n + r0_off]	C WAS: rp - 8 + +	or	%l2, %l5, r1 +	sub	n, (2 * 8), %o5 + +	sllx	u1, cnt, %l3 +	stx	r1, [%o5 + r1_off]	C WAS: rp - 16 + +L(end12): +	andcc	n, 8, %g0 +	bz,pn	%xcc, L(done) +	 nop + +	ldx	[n + u0_off], u1 +	srlx	u1, tcnt, %l4 +	or	%l4, %l3, r0 +	stx	r0, [r0_off - 24] +	sllx	u1, cnt, %l3 +L(done): +	stx	%l3, [r0_off - 32] + +	ret +	restore retval, 0, %o0 +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/lshiftc.asm b/vendor/gmp-6.3.0/mpn/sparc64/lshiftc.asm new file mode 100644 index 0000000..4a0f0a3 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/lshiftc.asm @@ -0,0 +1,147 @@ +dnl  SPARC v9 mpn_lshiftc + +dnl  Contributed to the GNU project by David Miller. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
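mpn_lshiftc is mpn_lshift with every result limb complemented; the not/andn pairs in the loop below exploit ~a & ~b == ~(a | b) to fold the complement into the combine step.  As a sketch under the same assumptions as the mpn_lshift sketch above:

/* Reference semantics for mpn_lshiftc, sketch only.  */
static mp_limb_t
lshiftc_ref (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, unsigned cnt)
{
  unsigned   tcnt = 64 - cnt;
  mp_limb_t  retval = up[n - 1] >> tcnt;
  mp_size_t  i;

  for (i = n - 1; i > 0; i--)
    rp[i] = ~((up[i] << cnt) | (up[i - 1] >> tcnt));
  rp[0] = ~(up[0] << cnt);
  return retval;
}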
+ +include(`../config.m4') + +C		    cycles/limb +C UltraSPARC 1&2:	 3 +C UltraSPARC 3:		 3 +C UltraSPARC T1:	17 +C UltraSPARC T3:	10 +C UltraSPARC T4:	 3.5 + +C INPUT PARAMETERS +define(`rp',     `%i0') +define(`up',     `%i1') +define(`n',      `%i2') +define(`cnt',    `%i3') + +define(`tcnt',   `%i4') +define(`retval', `%i5') +define(`u0',     `%l0') +define(`u1',     `%l1') +define(`r0',     `%l6') +define(`r1',     `%l7') +define(`u0_off', `%o0') +define(`u1_off', `%o1') +define(`r0_off', `%o2') +define(`r1_off', `%o3') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_lshiftc) +	save	%sp, -176, %sp + +	sllx	n, 3, n +	sub	%g0, cnt, tcnt + +	sub	up, 8, u1_off +	add	rp, (5 * 8), r1_off + +	ldx	[n + u1_off], u1	C WAS: up - 8 +	add	u1_off, (3 * 8), u1_off + +	sub	r1_off, 8, r0_off +	sub	u1_off, 8, u0_off + +	subcc	n, (3 * 8), n +	srlx	u1, tcnt, retval + +	bl,pn	%xcc, L(end12) +	 sllx	u1, cnt, %l3 + +	ldx	[n + u0_off], u0	C WAS: up - 16 +	subcc	n, (2 * 8), n + +	ldx	[n + u1_off], u1	C WAS: up - 24 + +	bl,pn	%xcc, L(end34) +	 srlx	u0, tcnt, %l4 + +	b,a	L(top) +	ALIGN(16) +L(top): +	not	%l3, %l3 +	sllx	u0, cnt, %l2 + +	andn	%l3, %l4, r0 +	ldx	[n + u0_off], u0	C WAS: up - 16 + +	srlx	u1, tcnt, %l5 +	stx	r0, [n + r0_off]	C WAS: rp - 8 + +	subcc	n, (2 * 8), n +	not	%l2, %l2 + +	sllx	u1, cnt, %l3 +	andn	%l2, %l5, r1 + +	ldx	[n + u1_off], u1	C WAS: up - 24 +	srlx	u0, tcnt, %l4 + +	bge,pt	%xcc, L(top) +	 stx	r1, [n + r1_off]	C WAS: rp - 16 + +L(end34): +	not	%l3, %l3 +	sllx	u0, cnt, %l2 + +	andn	%l3, %l4, r0 +	srlx	u1, tcnt, %l5 + +	stx	r0, [n + r0_off]	C WAS: rp - 8 +	not	%l2, %l2 + +	andn	%l2, %l5, r1 +	sub	n, (2 * 8), %o5 + +	sllx	u1, cnt, %l3 +	stx	r1, [%o5 + r1_off]	C WAS: rp - 16 + +L(end12): +	andcc	n, 8, %g0 +	bz	%xcc, L(done)+4 +	 not	%l3, %l3 + +	ldx	[n + u0_off], u1 +	srlx	u1, tcnt, %l4 +	andn	%l3, %l4, r0 +	stx	r0, [r0_off - 24] +	sllx	u1, cnt, %l3 +L(done): +	not	%l3, %l3 +	stx	%l3, [r0_off - 32] + +	ret +	restore retval, 0, %o0 +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/mod_1.c b/vendor/gmp-6.3.0/mpn/sparc64/mod_1.c new file mode 100644 index 0000000..ab53f9d --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/mod_1.c @@ -0,0 +1,238 @@ +/* UltraSPARC 64 mpn_mod_1 -- mpn by limb remainder. + +Copyright 1991, 1993, 1994, 1999-2001, 2003, 2010 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + +  * the GNU Lesser General Public License as published by the Free +    Software Foundation; either version 3 of the License, or (at your +    option) any later version. + +or + +  * the GNU General Public License as published by the Free Software +    Foundation; either version 2 of the License, or (at your option) any +    later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library.  If not, +see https://www.gnu.org/licenses/.  
*/ + +#include "gmp-impl.h" +#include "longlong.h" + +#include "mpn/sparc64/sparc64.h" + + +/*                 64-bit divisor   32-bit divisor +                    cycles/limb      cycles/limb +                     (approx)         (approx) +   Ultrasparc 2i:      160               120 +*/ + + +/* 32-bit divisors are treated in special case code.  This requires 4 mulx +   per limb instead of 8 in the general case. + +   For big endian systems we need HALF_ENDIAN_ADJ included in the src[i] +   addressing, to get the two halves of each limb read in the correct order. +   This is kept in an adj variable.  Doing that measures about 6 c/l faster +   than just writing HALF_ENDIAN_ADJ(i) in the loop.  The latter shouldn't +   be 6 cycles worth of work, but perhaps it doesn't schedule well (on gcc +   3.2.1 at least). + +   A simple udivx/umulx loop for the 32-bit case was attempted for small +   sizes, but at size==2 it was only about the same speed and at size==3 was +   slower.  */ + +static mp_limb_t +mpn_mod_1_anynorm (mp_srcptr src_limbptr, mp_size_t size_limbs, mp_limb_t d_limb) +{ +  int        norm, norm_rshift; +  mp_limb_t  src_high_limb; +  mp_size_t  i; + +  ASSERT (size_limbs >= 0); +  ASSERT (d_limb != 0); + +  if (UNLIKELY (size_limbs == 0)) +    return 0; + +  src_high_limb = src_limbptr[size_limbs-1]; + +  /* udivx is good for size==1, and no need to bother checking limb<divisor, +     since if that's likely the caller should check */ +  if (UNLIKELY (size_limbs == 1)) +    return src_high_limb % d_limb; + +  if (d_limb <= CNST_LIMB(0xFFFFFFFF)) +    { +      unsigned   *src, n1, n0, r, dummy_q, nshift, norm_rmask; +      mp_size_t  size, adj; +      mp_limb_t  dinv_limb; + +      size = 2 * size_limbs;    /* halfwords */ +      src = (unsigned *) src_limbptr; + +      /* prospective initial remainder, if < d */ +      r = src_high_limb >> 32; + +      /* If the length of the source is uniformly distributed, then there's +         a 50% chance of the high 32-bits being zero, which we can skip.  */ +      if (r == 0) +        { +          r = (unsigned) src_high_limb; +          size--; +          ASSERT (size > 0);  /* because always even */ +        } + +      /* Skip a division if high < divisor.  Having the test here before +         normalizing will still skip as often as possible.  */ +      if (r < d_limb) +        { +          size--; +          ASSERT (size > 0);  /* because size==1 handled above */ +        } +      else +        r = 0; + +      count_leading_zeros_32 (norm, d_limb); +      norm -= 32; +      d_limb <<= norm; + +      norm_rshift = 32 - norm; +      norm_rmask = (norm == 0 ? 
0 : 0xFFFFFFFF); +      i = size-1; +      adj = HALF_ENDIAN_ADJ (i); +      n1 = src [i + adj]; +      r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask); + +      invert_half_limb (dinv_limb, d_limb); +      adj = -adj; + +      for (i--; i >= 0; i--) +        { +          n0 = src [i + adj]; +          adj = -adj; +          nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask); +          udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb); +          n1 = n0; +        } + +      /* same as loop, but without n0 */ +      nshift = n1 << norm; +      udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb); + +      ASSERT ((r & ((1 << norm) - 1)) == 0); +      return r >> norm; +    } +  else +    { +      mp_srcptr  src; +      mp_size_t  size; +      mp_limb_t  n1, n0, r, dinv, dummy_q, nshift, norm_rmask; + +      src = src_limbptr; +      size = size_limbs; +      r = src_high_limb;  /* initial remainder */ + +      /* Skip a division if high < divisor.  Having the test here before +         normalizing will still skip as often as possible.  */ +      if (r < d_limb) +        { +          size--; +          ASSERT (size > 0);  /* because size==1 handled above */ +        } +      else +        r = 0; + +      count_leading_zeros (norm, d_limb); +      d_limb <<= norm; + +      norm_rshift = GMP_LIMB_BITS - norm; +      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF); + +      src += size; +      n1 = *--src; +      r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask); + +      invert_limb (dinv, d_limb); + +      for (i = size-2; i >= 0; i--) +        { +          n0 = *--src; +          nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask); +          udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv); +          n1 = n0; +        } + +      /* same as loop, but without n0 */ +      nshift = n1 << norm; +      udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv); + +      ASSERT ((r & ((CNST_LIMB(1) << norm) - 1)) == 0); +      return r >> norm; +    } +} + +mp_limb_t +mpn_mod_1 (mp_srcptr ap, mp_size_t n, mp_limb_t b) +{ +  ASSERT (n >= 0); +  ASSERT (b != 0); + +  /* Should this be handled at all?  Rely on callers?  Note un==0 is currently +     required by mpz/fdiv_r_ui.c and possibly other places.  */ +  if (n == 0) +    return 0; + +  if (UNLIKELY ((b & GMP_NUMB_HIGHBIT) != 0)) +    { +      if (BELOW_THRESHOLD (n, MOD_1N_TO_MOD_1_1_THRESHOLD)) +	{ +	  return mpn_mod_1_anynorm (ap, n, b); +	} +      else +	{ +	  mp_limb_t pre[4]; +	  mpn_mod_1_1p_cps (pre, b); +	  return mpn_mod_1_1p (ap, n, b, pre); +	} +    } +  else +    { +      if (BELOW_THRESHOLD (n, MOD_1U_TO_MOD_1_1_THRESHOLD)) +	{ +	  return mpn_mod_1_anynorm (ap, n, b); +	} +      else if (BELOW_THRESHOLD (n, MOD_1_1_TO_MOD_1_2_THRESHOLD)) +	{ +	  mp_limb_t pre[4]; +	  mpn_mod_1_1p_cps (pre, b); +	  return mpn_mod_1_1p (ap, n, b << pre[1], pre); +	} +      else if (BELOW_THRESHOLD (n, MOD_1_2_TO_MOD_1_4_THRESHOLD) || UNLIKELY (b > GMP_NUMB_MASK / 4)) +	{ +	  mp_limb_t pre[5]; +	  mpn_mod_1s_2p_cps (pre, b); +	  return mpn_mod_1s_2p (ap, n, b << pre[1], pre); +	} +      else +	{ +	  mp_limb_t pre[7]; +	  mpn_mod_1s_4p_cps (pre, b); +	  return mpn_mod_1s_4p (ap, n, b << pre[1], pre); +	} +    } +} diff --git a/vendor/gmp-6.3.0/mpn/sparc64/mod_1_4.c b/vendor/gmp-6.3.0/mpn/sparc64/mod_1_4.c new file mode 100644 index 0000000..735a402 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/mod_1_4.c @@ -0,0 +1,235 @@ +/* mpn_mod_1s_4p (ap, n, b, cps) +   Divide (ap,,n) by b.  
Return the single-limb remainder. +   Requires that d < B / 4. + +   Contributed to the GNU project by Torbjorn Granlund. +   Based on a suggestion by Peter L. Montgomery. + +   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY +   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST +   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2008-2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + +  * the GNU Lesser General Public License as published by the Free +    Software Foundation; either version 3 of the License, or (at your +    option) any later version. + +or + +  * the GNU General Public License as published by the Free Software +    Foundation; either version 2 of the License, or (at your option) any +    later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library.  If not, +see https://www.gnu.org/licenses/.  */ + +#include "gmp-impl.h" +#include "longlong.h" + +#include "mpn/sparc64/sparc64.h" + +void +mpn_mod_1s_4p_cps (mp_limb_t cps[7], mp_limb_t b) +{ +  mp_limb_t bi; +  mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb; +  int cnt; + +  ASSERT (b <= (~(mp_limb_t) 0) / 4); + +  count_leading_zeros (cnt, b); + +  b <<= cnt; +  invert_limb (bi, b); + +  cps[0] = bi; +  cps[1] = cnt; + +  B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt)); +  ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */ +  cps[2] = B1modb >> cnt; + +  udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi); +  cps[3] = B2modb >> cnt; + +  udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi); +  cps[4] = B3modb >> cnt; + +  udiv_rnnd_preinv (B4modb, B3modb, CNST_LIMB(0), b, bi); +  cps[5] = B4modb >> cnt; + +  udiv_rnnd_preinv (B5modb, B4modb, CNST_LIMB(0), b, bi); +  cps[6] = B5modb >> cnt; + +#if WANT_ASSERT +  { +    int i; +    b = cps[2]; +    for (i = 3; i <= 6; i++) +      { +	b += cps[i]; +	ASSERT (b >= cps[i]); +      } +  } +#endif +} + +mp_limb_t +mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[7]) +{ +  mp_limb_t rh, rl, bi, ph, pl, ch, cl, r; +  mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb; +  mp_size_t i; +  int cnt; + +  ASSERT (n >= 1); + +  B1modb = cps[2]; +  B2modb = cps[3]; +  B3modb = cps[4]; +  B4modb = cps[5]; +  B5modb = cps[6]; + +  if ((b >> 32) == 0) +    { +      switch (n & 3) +	{ +	case 0: +	  umul_ppmm_s (ph, pl, ap[n - 3], B1modb); +	  add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 4]); +	  umul_ppmm_s (ch, cl, ap[n - 2], B2modb); +	  add_ssaaaa (ph, pl, ph, pl, ch, cl); +	  umul_ppmm_s (rh, rl, ap[n - 1], B3modb); +	  add_ssaaaa (rh, rl, rh, rl, ph, pl); +	  n -= 4; +	  break; +	case 1: +	  rh = 0; +	  rl = ap[n - 1]; +	  n -= 1; +	  break; +	case 2: +	  rh = ap[n - 1]; +	  rl = ap[n - 2]; +	  n -= 2; +	  break; +	case 3: +	  umul_ppmm_s (ph, pl, ap[n - 2], B1modb); +	  add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]); +	  umul_ppmm_s (rh, rl, ap[n - 1], B2modb); +	  add_ssaaaa (rh, rl, rh, rl, ph, pl); +	  n -= 3; +	  break; +	
} + +      for (i = n - 4; i >= 0; i -= 4) +	{ +	  /* rr = ap[i]				< B +		+ ap[i+1] * (B mod b)		<= (B-1)(b-1) +		+ ap[i+2] * (B^2 mod b)		<= (B-1)(b-1) +		+ ap[i+3] * (B^3 mod b)		<= (B-1)(b-1) +		+ LO(rr)  * (B^4 mod b)		<= (B-1)(b-1) +		+ HI(rr)  * (B^5 mod b)		<= (B-1)(b-1) +	  */ +	  umul_ppmm_s (ph, pl, ap[i + 1], B1modb); +	  add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]); + +	  umul_ppmm_s (ch, cl, ap[i + 2], B2modb); +	  add_ssaaaa (ph, pl, ph, pl, ch, cl); + +	  umul_ppmm_s (ch, cl, ap[i + 3], B3modb); +	  add_ssaaaa (ph, pl, ph, pl, ch, cl); + +	  umul_ppmm_s (ch, cl, rl, B4modb); +	  add_ssaaaa (ph, pl, ph, pl, ch, cl); + +	  umul_ppmm_s (rh, rl, rh, B5modb); +	  add_ssaaaa (rh, rl, rh, rl, ph, pl); +	} + +      umul_ppmm_s (rh, cl, rh, B1modb); +      add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl); +    } +  else +    { +      switch (n & 3) +	{ +	case 0: +	  umul_ppmm (ph, pl, ap[n - 3], B1modb); +	  add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 4]); +	  umul_ppmm (ch, cl, ap[n - 2], B2modb); +	  add_ssaaaa (ph, pl, ph, pl, ch, cl); +	  umul_ppmm (rh, rl, ap[n - 1], B3modb); +	  add_ssaaaa (rh, rl, rh, rl, ph, pl); +	  n -= 4; +	  break; +	case 1: +	  rh = 0; +	  rl = ap[n - 1]; +	  n -= 1; +	  break; +	case 2: +	  rh = ap[n - 1]; +	  rl = ap[n - 2]; +	  n -= 2; +	  break; +	case 3: +	  umul_ppmm (ph, pl, ap[n - 2], B1modb); +	  add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 3]); +	  umul_ppmm (rh, rl, ap[n - 1], B2modb); +	  add_ssaaaa (rh, rl, rh, rl, ph, pl); +	  n -= 3; +	  break; +	} + +      for (i = n - 4; i >= 0; i -= 4) +	{ +	  /* rr = ap[i]				< B +		+ ap[i+1] * (B mod b)		<= (B-1)(b-1) +		+ ap[i+2] * (B^2 mod b)		<= (B-1)(b-1) +		+ ap[i+3] * (B^3 mod b)		<= (B-1)(b-1) +		+ LO(rr)  * (B^4 mod b)		<= (B-1)(b-1) +		+ HI(rr)  * (B^5 mod b)		<= (B-1)(b-1) +	  */ +	  umul_ppmm (ph, pl, ap[i + 1], B1modb); +	  add_ssaaaa (ph, pl, ph, pl, 0, ap[i + 0]); + +	  umul_ppmm (ch, cl, ap[i + 2], B2modb); +	  add_ssaaaa (ph, pl, ph, pl, ch, cl); + +	  umul_ppmm (ch, cl, ap[i + 3], B3modb); +	  add_ssaaaa (ph, pl, ph, pl, ch, cl); + +	  umul_ppmm (ch, cl, rl, B4modb); +	  add_ssaaaa (ph, pl, ph, pl, ch, cl); + +	  umul_ppmm (rh, rl, rh, B5modb); +	  add_ssaaaa (rh, rl, rh, rl, ph, pl); +	} + +      umul_ppmm (rh, cl, rh, B1modb); +      add_ssaaaa (rh, rl, rh, rl, 0, cl); +    } + +  bi = cps[0]; +  cnt = cps[1]; + +  r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt)); +  udiv_rnnd_preinv (r, r, rl << cnt, b, bi); + +  return r >> cnt; +} diff --git a/vendor/gmp-6.3.0/mpn/sparc64/mode1o.c b/vendor/gmp-6.3.0/mpn/sparc64/mode1o.c new file mode 100644 index 0000000..771c999 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/mode1o.c @@ -0,0 +1,196 @@ +/* UltraSPARC 64 mpn_modexact_1c_odd -- mpn by limb exact style remainder. + +   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST +   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN +   FUTURE GNU MP RELEASES. + +Copyright 2000-2003 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + +  * the GNU Lesser General Public License as published by the Free +    Software Foundation; either version 3 of the License, or (at your +    option) any later version. + +or + +  * the GNU General Public License as published by the Free Software +    Foundation; either version 2 of the License, or (at your option) any +    later version. + +or both in parallel, as here. 
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#include "mpn/sparc64/sparc64.h"
+
+
+/*                 64-bit divisor   32-bit divisor
+                    cycles/limb      cycles/limb
+                     (approx)         (approx)
+   Ultrasparc 2i:       ?                ?
+*/
+
+
+/* This implementation reduces the number of multiplies done, knowing that
+   on ultrasparc 1 and 2 the mulx instruction stalls the whole chip.
+
+   The key idea is to use the fact that the low limb of q*d equals l, this
+   being the whole purpose of the q calculated.  It means there's no need to
+   calculate the lowest 32x32->64 part of q*d, instead it can be
+   inferred from l and the other three 32x32->64 parts.  See sparc64.h for
+   details.
+
+   When d is 32 bits, the same applies, but in this case there's only one
+   other 32x32->64 part (ie. HIGH(q)*d).
+
+   The net effect is that for a 64-bit divisor each limb is 4 mulx, or for
+   a 32-bit divisor each is 2 mulx.
+
+   Enhancements:
+
+   No doubt this could be done in assembler, if that helped the scheduling,
+   or perhaps guaranteed good code irrespective of the compiler.
+
+   Alternatives:
+
+   It might be possible to use floating point.  The loop is dominated by
+   multiply latency, so it is not clear that floats would improve that.  One
+   possibility would be to take two limbs at a time, with a 128 bit inverse,
+   if there are enough registers, which could effectively use float throughput
+   to reduce total latency across two limbs.  */
+
+#define ASSERT_RETVAL(r)                \
+  ASSERT (orig_c < d ? r < d : r <= d)
+
+mp_limb_t
+mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t orig_c)
+{
+  mp_limb_t  c = orig_c;
+  mp_limb_t  s, l, q, h, inverse;
+
+  ASSERT (size >= 1);
+  ASSERT (d & 1);
+  ASSERT_MPN (src, size);
+  ASSERT_LIMB (d);
+  ASSERT_LIMB (c);
+
+  /* udivx is faster than 10 or 12 mulx's for one limb via an inverse */
+  if (size == 1)
+    {
+      s = src[0];
+      if (s > c)
+	{
+	  l = s-c;
+	  h = l % d;
+	  if (h != 0)
+	    h = d - h;
+	}
+      else
+	{
+	  l = c-s;
+	  h = l % d;
+	}
+      return h;
+    }
+
+  binvert_limb (inverse, d);
+
+  if (d <= 0xFFFFFFFF)
+    {
+      s = *src++;
+      size--;
+      do
+        {
+          SUBC_LIMB (c, l, s, c);
+          s = *src++;
+          q = l * inverse;
+          umul_ppmm_half_lowequal (h, q, d, l);
+          c += h;
+          size--;
+        }
+      while (size != 0);
+
+      if (s <= d)
+        {
+          /* With high s <= d the final step can be a subtract and addback.
+             If c==0 then the addback will restore to l>=0.  If c==d then
+             will get l==d if s==0, but that's ok per the function
+             definition.  */
+
+          l = c - s;
+          l += (l > c ? d : 0);
+
+          ASSERT_RETVAL (l);
+          return l;
+        }
+      else
+        {
+          /* Can't skip a divide, just do the loop code once more.
*/ +          SUBC_LIMB (c, l, s, c); +          q = l * inverse; +          umul_ppmm_half_lowequal (h, q, d, l); +          c += h; + +          ASSERT_RETVAL (c); +          return c; +        } +    } +  else +    { +      mp_limb_t  dl = LOW32 (d); +      mp_limb_t  dh = HIGH32 (d); +      long i; + +      s = *src++; +      size--; +      do +        { +          SUBC_LIMB (c, l, s, c); +          s = *src++; +          q = l * inverse; +          umul_ppmm_lowequal (h, q, d, dh, dl, l); +          c += h; +          size--; +        } +      while (size != 0); + +      if (s <= d) +        { +          /* With high s <= d the final step can be a subtract and addback. +             If c==0 then the addback will restore to l>=0.  If c==d then +             will get l==d if s==0, but that's ok per the function +             definition.  */ + +          l = c - s; +          l += (l > c ? d : 0); + +          ASSERT_RETVAL (l); +          return l; +        } +      else +        { +          /* Can't skip a divide, just do the loop code once more. */ +          SUBC_LIMB (c, l, s, c); +          q = l * inverse; +          umul_ppmm_lowequal (h, q, d, dh, dl, l); +          c += h; + +          ASSERT_RETVAL (c); +          return c; +        } +    } +} diff --git a/vendor/gmp-6.3.0/mpn/sparc64/rshift.asm b/vendor/gmp-6.3.0/mpn/sparc64/rshift.asm new file mode 100644 index 0000000..3f8e11f --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/rshift.asm @@ -0,0 +1,142 @@ +dnl  SPARC v9 mpn_rshift + +dnl  Contributed to the GNU project by David Miller. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C		    cycles/limb +C UltraSPARC 1&2:	 2 +C UltraSPARC 3:		 2.5 +C UltraSPARC T1:	17.5 +C UltraSPARC T3:	 8 +C UltraSPARC T4:	 3 + +C INPUT PARAMETERS +define(`rp',     `%i0') +define(`up',     `%i1') +define(`n',      `%i2') +define(`cnt',    `%i3') + +define(`tcnt',   `%i4') +define(`retval', `%i5') +define(`u0',     `%l0') +define(`u1',     `%l1') +define(`r0',     `%l6') +define(`r1',     `%l7') +define(`u0_off', `%o0') +define(`u1_off', `%o1') +define(`r0_off', `%o2') +define(`r1_off', `%o3') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_rshift) +	save	%sp, -176, %sp + +	sllx	n, 3, n +	sub	%g0, cnt, tcnt + +	add	up, n, up +	add	rp, n, rp + +	neg	n, n +	sub	up, (2 * 8), u0_off +	sub	rp, (5 * 8), r0_off + +	ldx	[n + up], u1		C WAS: up + 0 +	sub	u0_off, (1 * 8), u1_off +	sub	r0_off, (1 * 8), r1_off + +	subcc	n, -(3 * 8), n +	sllx	u1, tcnt, retval + +	bg,pn	%xcc, L(end12) +	 srlx	u1, cnt, %l3 + +	ldx	[n + u0_off], u0	C WAS: up + 0 +	subcc	n, -(2 * 8), n + +	ldx	[n + u1_off], u1	C WAS: up + 8 + +	bg,pn	%xcc, L(end34) +	 sllx	u0, tcnt, %l4 + +	b,a	L(top) +	ALIGN(16) +L(top): +	srlx	u0, cnt, %l2 +	or	%l3, %l4, r0 + +	ldx	[n + u0_off], u0	C WAS: up + 0 +	sllx	u1, tcnt, %l5 + +	stx	r0, [n + r0_off]	C WAS: rp + 0 +	subcc	n, -(2 * 8), n + +	srlx	u1, cnt, %l3 +	or	%l2, %l5, r1 + +	ldx	[n + u1_off], u1	C WAS: up + 8 +	sllx	u0, tcnt, %l4 + +	ble,pt	%xcc, L(top) +	 stx	r1, [n + r1_off]	C WAS: rp + 8 + +L(end34): +	srlx	u0, cnt, %l2 +	or	%l3, %l4, r0 + +	sllx	u1, tcnt, %l5 +	stx	r0, [n + r0_off]	C WAS: rp + 0 + +	or	%l2, %l5, r1 +	sub	n, -(2 * 8), %o5 + +	srlx	u1, cnt, %l3 +	stx	r1, [%o5 + r1_off]	C WAS: rp + 8 + +L(end12): +	andcc	n, 8, %g0 +	bz,pn	%xcc, L(done) +	 nop + +	ldx	[n + u0_off], u1 +	sllx	u1, tcnt, %l4 +	or	%l3, %l4, r0 +	stx	r0, [r0_off + 24] +	srlx	u1, cnt, %l3 +L(done): +	stx	%l3, [r0_off + 32] + +	ret +	restore retval, 0, %o0 +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/sec_tabselect.asm b/vendor/gmp-6.3.0/mpn/sparc64/sec_tabselect.asm new file mode 100644 index 0000000..22e0dc5 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/sec_tabselect.asm @@ -0,0 +1,162 @@ +dnl  SPARC v9 mpn_sec_tabselect. + +dnl  Contributed to the GNU project by Torbjörn Granlund and David Miller. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
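
mpn_sec_tabselect copies entry number `which' out of a table of nents entries of n limbs each, while reading every entry, so that neither the memory access pattern nor the timing depends on `which'.  A C reference model of the idea (ref_sec_tabselect is a hypothetical name; the asm produces the all-ones/all-zeros mask branch-free with its subcc/subc pair, which the expression -(mp_limb_t) (k == which) merely models):

    void
    ref_sec_tabselect (mp_limb_t *rp, const mp_limb_t *tp,
                       mp_size_t n, mp_size_t nents, mp_size_t which)
    {
      mp_size_t  k, i;
      for (i = 0; i < n; i++)
        rp[i] = 0;
      for (k = 0; k < nents; k++)
        {
          mp_limb_t mask = -(mp_limb_t) (k == which);  /* ~0 iff k == which */
          for (i = 0; i < n; i++)
            rp[i] |= tp[k * n + i] & mask;   /* every entry is touched */
        }
    }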
+ +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC 1&2:	 2 hopefully +C UltraSPARC 3:		 3 +C UltraSPARC T1:	17 +C UltraSPARC T3:	 ? +C UltraSPARC T4/T5:	 2.25 hopefully + +C INPUT PARAMETERS +define(`rp',     `%i0') +define(`tp',     `%i1') +define(`n',      `%i2') +define(`nents',  `%i3') +define(`which',  `%i4') + +define(`i',      `%g1') +define(`j',      `%g3') +define(`stride', `%g4') +define(`tporig', `%g5') +define(`mask',   `%o0') + +define(`data0',  `%l0') +define(`data1',  `%l1') +define(`data2',  `%l2') +define(`data3',  `%l3') +define(`t0',     `%l4') +define(`t1',     `%l5') +define(`t2',     `%l6') +define(`t3',     `%l7') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_sec_tabselect) +	save	%sp, -176, %sp + +	sllx	n, 3, stride +	sub	n, 4, j +	brlz	j, L(outer_end) +	 mov	tp, tporig + +L(outer_loop): +	clr	data0 +	clr	data1 +	clr	data2 +	clr	data3 +	mov	tporig, tp +	mov	nents, i +	mov	which, %o1 + +L(top):	subcc	%o1, 1, %o1		C set carry iff o1 = 0 +	ldx	[tp + 0], t0 +	subc	%g0, %g0, mask +	ldx	[tp + 8], t1 +	sub	i, 1, i +	ldx	[tp + 16], t2 +	ldx	[tp + 24], t3 +	add	tp, stride, tp +	and	t0, mask, t0 +	and	t1, mask, t1 +	or	t0, data0, data0 +	and	t2, mask, t2 +	or	t1, data1, data1 +	and	t3, mask, t3 +	or	t2, data2, data2 +	brnz	i, L(top) +	 or	t3, data3, data3 + +	stx	data0, [rp + 0] +	subcc	j, 4, j +	stx	data1, [rp + 8] +	stx	data2, [rp + 16] +	stx	data3, [rp + 24] +	add	tporig, (4 * 8), tporig + +	brgez	j, L(outer_loop) +	 add	rp, (4 * 8), rp +L(outer_end): + + +	andcc	n, 2, %g0 +	be	L(b0x) +	 nop +L(b1x):	clr	data0 +	clr	data1 +	mov	tporig, tp +	mov	nents, i +	mov	which, %o1 + +L(tp2):	subcc	%o1, 1, %o1 +	ldx	[tp + 0], t0 +	subc	%g0, %g0, mask +	ldx	[tp + 8], t1 +	sub	i, 1, i +	add	tp, stride, tp +	and	t0, mask, t0 +	and	t1, mask, t1 +	or	t0, data0, data0 +	brnz	i, L(tp2) +	 or	t1, data1, data1 + +	stx	data0, [rp + 0] +	stx	data1, [rp + 8] +	add	tporig, (2 * 8), tporig +	add	rp, (2 * 8), rp + + +L(b0x):	andcc	n, 1, %g0 +	be	L(b00) +	 nop +L(b01):	clr	data0 +	mov	tporig, tp +	mov	nents, i +	mov	which, %o1 + +L(tp1):	subcc	%o1, 1, %o1 +	ldx	[tp + 0], t0 +	subc	%g0, %g0, mask +	sub	i, 1, i +	add	tp, stride, tp +	and	t0, mask, t0 +	brnz	i, L(tp1) +	 or	t0, data0, data0 + +	stx	data0, [rp + 0] + +L(b00):	 ret +	  restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/sparc64.h b/vendor/gmp-6.3.0/mpn/sparc64/sparc64.h new file mode 100644 index 0000000..8698a82 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/sparc64.h @@ -0,0 +1,217 @@ +/* UltraSPARC 64 support macros. + +   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST +   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN +   FUTURE GNU MP RELEASES. + +Copyright 2003 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + +  * the GNU Lesser General Public License as published by the Free +    Software Foundation; either version 3 of the License, or (at your +    option) any later version. + +or + +  * the GNU General Public License as published by the Free Software +    Foundation; either version 2 of the License, or (at your option) any +    later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE.  
See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library.  If not, +see https://www.gnu.org/licenses/.  */ + + +#define LOW32(x)   ((x) & 0xFFFFFFFF) +#define HIGH32(x)  ((x) >> 32) + + +/* Halfword number i in src is accessed as src[i+HALF_ENDIAN_ADJ(i)]. +   Plain src[i] would be incorrect in big endian, HALF_ENDIAN_ADJ has the +   effect of swapping the two halves in this case.  */ +#if HAVE_LIMB_BIG_ENDIAN +#define HALF_ENDIAN_ADJ(i)  (1 - (((i) & 1) << 1))   /* +1 even, -1 odd */ +#endif +#if HAVE_LIMB_LITTLE_ENDIAN +#define HALF_ENDIAN_ADJ(i)  0                        /* no adjust */ +#endif +#ifndef HALF_ENDIAN_ADJ +Error, error, unknown limb endianness; +#endif + + +/* umul_ppmm_lowequal sets h to the high limb of q*d, assuming the low limb +   of that product is equal to l.  dh and dl are the 32-bit halves of d. + +   |-----high----||----low-----| +   +------+------+ +   |             |                 ph = qh * dh +   +------+------+ +          +------+------+ +          |             |          pm1 = ql * dh +          +------+------+ +          +------+------+ +          |             |          pm2 = qh * dl +          +------+------+ +                 +------+------+ +                 |             |   pl = ql * dl (not calculated) +                 +------+------+ + +   Knowing that the low 64 bits is equal to l means that LOW(pm1) + LOW(pm2) +   + HIGH(pl) == HIGH(l).  The only thing we need from those product parts +   is whether they produce a carry into the high. + +   pm_l = LOW(pm1)+LOW(pm2) is done to contribute its carry, then the only +   time there's a further carry from LOW(pm_l)+HIGH(pl) is if LOW(pm_l) > +   HIGH(l).  pl is never actually calculated.  */ + +#define umul_ppmm_lowequal(h, q, d, dh, dl, l)  \ +  do {                                          \ +    mp_limb_t  ql, qh, ph, pm1, pm2, pm_l;      \ +    ASSERT (dh == HIGH32(d));                   \ +    ASSERT (dl == LOW32(d));                    \ +    ASSERT (q*d == l);                          \ +                                                \ +    ql = LOW32 (q);                             \ +    qh = HIGH32 (q);                            \ +                                                \ +    pm1 = ql * dh;                              \ +    pm2 = qh * dl;                              \ +    ph  = qh * dh;                              \ +                                                \ +    pm_l = LOW32 (pm1) + LOW32 (pm2);           \ +                                                \ +    (h) = ph + HIGH32 (pm1) + HIGH32 (pm2)      \ +      + HIGH32 (pm_l) + ((pm_l << 32) > l);     \ +                                                \ +    ASSERT_HIGH_PRODUCT (h, q, d);              \ +  } while (0) + + +/* Set h to the high of q*d, assuming the low limb of that product is equal +   to l, and that d fits in 32-bits. + +   |-----high----||----low-----| +          +------+------+ +          |             |          pm = qh * dl +          +------+------+ +                 +------+------+ +                 |             |   pl = ql * dl (not calculated) +                 +------+------+ + +   Knowing that LOW(pm) + HIGH(pl) == HIGH(l) (mod 2^32) means that the only +   time there's a carry from that sum is when LOW(pm) > HIGH(l).  There's no +   need to calculate pl to determine this.  
*/ + +#define umul_ppmm_half_lowequal(h, q, d, l)     \ +  do {                                          \ +    mp_limb_t pm;                               \ +    ASSERT (q*d == l);                          \ +    ASSERT (HIGH32(d) == 0);                    \ +                                                \ +    pm = HIGH32(q) * d;                         \ +    (h) = HIGH32(pm) + ((pm << 32) > l);        \ +    ASSERT_HIGH_PRODUCT (h, q, d);              \ +  } while (0) + + +/* check that h is the high limb of x*y */ +#if WANT_ASSERT +#define ASSERT_HIGH_PRODUCT(h, x, y)    \ +  do {                                  \ +    mp_limb_t  want_h, dummy;           \ +    umul_ppmm (want_h, dummy, x, y);    \ +    ASSERT (h == want_h);               \ +  } while (0) +#else +#define ASSERT_HIGH_PRODUCT(h, q, d)    \ +  do { } while (0) +#endif + + +/* Multiply u anv v, where v < 2^32.  */ +#define umul_ppmm_s(w1, w0, u, v)					\ +  do {									\ +    UWtype __x0, __x2;							\ +    UWtype __ul, __vl, __uh;						\ +    UWtype __u = (u), __v = (v);					\ +									\ +    __ul = __ll_lowpart (__u);						\ +    __uh = __ll_highpart (__u);						\ +    __vl = __ll_lowpart (__v);						\ +									\ +    __x0 = (UWtype) __ul * __vl;					\ +    __x2 = (UWtype) __uh * __vl;					\ +									\ +    (w1) = (__x2 + (__x0 >> W_TYPE_SIZE/2)) >> W_TYPE_SIZE/2;		\ +    (w0) = (__x2 << W_TYPE_SIZE/2) + __x0;				\ +  } while (0) + +/* Count the leading zeros on a limb, but assuming it fits in 32 bits. +   The count returned will be in the range 32 to 63. +   This is the 32-bit generic C count_leading_zeros from longlong.h. */ +#define count_leading_zeros_32(count, x)                                      \ +  do {                                                                        \ +    mp_limb_t  __xr = (x);                                                    \ +    unsigned   __a;                                                           \ +    ASSERT ((x) != 0);                                                        \ +    ASSERT ((x) <= CNST_LIMB(0xFFFFFFFF));                                    \ +    __a = __xr < ((UWtype) 1 << 16) ? (__xr < ((UWtype) 1 << 8) ? 1 : 8 + 1)  \ +      : (__xr < ((UWtype) 1 << 24)  ? 16 + 1 : 24 + 1);                       \ +                                                                              \ +    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];                 \ +  } while (0) + + +/* Set inv to a 32-bit inverse floor((b*(b-d)-1) / d), knowing that d fits +   32 bits and is normalized (high bit set).  */ +#define invert_half_limb(inv, d)                \ +  do {                                          \ +    mp_limb_t  _n;                              \ +    ASSERT ((d) <= 0xFFFFFFFF);                 \ +    ASSERT ((d) & 0x80000000);                  \ +    _n = (((mp_limb_t) -(d)) << 32) - 1;        \ +    (inv) = (mp_limb_t) (unsigned) (_n / (d));  \ +  } while (0) + + +/* Divide nh:nl by d, setting q to the quotient and r to the remainder. +   q, r, nh and nl are 32-bits each, d_limb is 32-bits but in an mp_limb_t, +   dinv_limb is similarly a 32-bit inverse but in an mp_limb_t.  
*/ + +#define udiv_qrnnd_half_preinv(q, r, nh, nl, d_limb, dinv_limb)         \ +  do {                                                                  \ +    unsigned   _n2, _n10, _n1, _nadj, _q11n, _xh, _r, _q;               \ +    mp_limb_t  _n, _x;                                                  \ +    ASSERT (d_limb <= 0xFFFFFFFF);                                      \ +    ASSERT (dinv_limb <= 0xFFFFFFFF);                                   \ +    ASSERT (d_limb & 0x80000000);                                       \ +    ASSERT (nh < d_limb);                                               \ +    _n10 = (nl);                                                        \ +    _n2 = (nh);                                                         \ +    _n1 = (int) _n10 >> 31;                                             \ +    _nadj = _n10 + (_n1 & d_limb);                                      \ +    _x = dinv_limb * (_n2 - _n1) + _nadj;                               \ +    _q11n = ~(_n2 + HIGH32 (_x));             /* -q1-1 */               \ +    _n = ((mp_limb_t) _n2 << 32) + _n10;                                \ +    _x = _n + d_limb * _q11n;                 /* n-q1*d-d */            \ +    _xh = HIGH32 (_x) - d_limb;               /* high(n-q1*d-d) */      \ +    ASSERT (_xh == 0 || _xh == ~0);                                     \ +    _r = _x + (d_limb & _xh);                 /* addback */             \ +    _q = _xh - _q11n;                         /* q1+1-addback */        \ +    ASSERT (_r < d_limb);                                               \ +    ASSERT (d_limb * _q + _r == _n);                                    \ +    (r) = _r;                                                           \ +    (q) = _q;                                                           \ +  } while (0) diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm new file mode 100644 index 0000000..92374d2 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm @@ -0,0 +1,241 @@ +dnl  SPARC v9 mpn_add_n -- Add two limb vectors of the same length > 0 and +dnl  store sum in a third limb vector. + +dnl  Copyright 2001-2003, 2011 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC 1&2:     4 +C UltraSPARC 3:	      4.5 + +C Compute carry-out from the most significant bits of u,v, and r, where +C r=u+v+carry_in, using logic operations. 
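
The logic in question is the standard branch-free carry recovery: for r = u + v + cy computed modulo 2^64, a carry out occurred exactly when the top bit of (u & v) | ((u | v) & ~r) is set.  At bit 63 the carry out is (u & v) | (c & (u ^ v)) where c is the incoming carry, and c can be read back from the sum as r ^ u ^ v, which reduces to the expression above.  A one-function C sketch of the identity that the loop's or/and/andn/srlx sequence implements (hypothetical helper name):

    static inline mp_limb_t
    carry_out (mp_limb_t u, mp_limb_t v, mp_limb_t r)  /* r = u+v+cy mod 2^64 */
    {
      return ((u & v) | ((u | v) & ~r)) >> 63;
    }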
+ +C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4 insn +C recurrency, and the UltraSPARC 1 and 2 the IE units are 100% saturated. +C Therefore, it seems futile to try to optimize this any further... + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`vp', `%i2') +define(`n',  `%i3') + +define(`u0', `%l0') +define(`u1', `%l2') +define(`u2', `%l4') +define(`u3', `%l6') +define(`v0', `%l1') +define(`v1', `%l3') +define(`v2', `%l5') +define(`v3', `%l7') + +define(`cy',`%i4') + +define(`fanop',`fitod %f0,%f2')		dnl  A quasi nop running in the FA pipe +define(`fmnop',`fmuld %f0,%f0,%f4')	dnl  A quasi nop running in the FM pipe + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_add_nc) +	save	%sp,-160,%sp + +	fitod	%f0,%f0		C make sure f0 contains small, quiet number +	subcc	n,4,%g0 +	bl,pn	%xcc,.Loop0 +	nop +	b,a	L(com) +EPILOGUE() + +PROLOGUE(mpn_add_n) +	save	%sp,-160,%sp + +	fitod	%f0,%f0		C make sure f0 contains small, quiet number +	subcc	n,4,%g0 +	bl,pn	%xcc,.Loop0 +	mov	0,cy +L(com): +	ldx	[up+0],u0 +	ldx	[vp+0],v0 +	add	up,32,up +	ldx	[up-24],u1 +	ldx	[vp+8],v1 +	add	vp,32,vp +	ldx	[up-16],u2 +	ldx	[vp-16],v2 +	ldx	[up-8],u3 +	ldx	[vp-8],v3 +	subcc	n,8,n +	add	u0,v0,%g1	C main add +	add	%g1,cy,%g5	C carry add +	or	u0,v0,%g2 +	bl,pn	%xcc,.Lend4567 +	fanop +	b,a	.Loop + +	.align	16 +C START MAIN LOOP +.Loop:	andn	%g2,%g5,%g2 +	and	u0,v0,%g3 +	ldx	[up+0],u0 +	fanop +C -- +	or	%g3,%g2,%g2 +	ldx	[vp+0],v0 +	add	up,32,up +	fanop +C -- +	srlx	%g2,63,cy +	add	u1,v1,%g1 +	stx	%g5,[rp+0] +	fanop +C -- +	add	%g1,cy,%g5 +	or	u1,v1,%g2 +	fmnop +	fanop +C -- +	andn	%g2,%g5,%g2 +	and	u1,v1,%g3 +	ldx	[up-24],u1 +	fanop +C -- +	or	%g3,%g2,%g2 +	ldx	[vp+8],v1 +	add	vp,32,vp +	fanop +C -- +	srlx	%g2,63,cy +	add	u2,v2,%g1 +	stx	%g5,[rp+8] +	fanop +C -- +	add	%g1,cy,%g5 +	or	u2,v2,%g2 +	fmnop +	fanop +C -- +	andn	%g2,%g5,%g2 +	and	u2,v2,%g3 +	ldx	[up-16],u2 +	fanop +C -- +	or	%g3,%g2,%g2 +	ldx	[vp-16],v2 +	add	rp,32,rp +	fanop +C -- +	srlx	%g2,63,cy +	add	u3,v3,%g1 +	stx	%g5,[rp-16] +	fanop +C -- +	add	%g1,cy,%g5 +	or	u3,v3,%g2 +	fmnop +	fanop +C -- +	andn	%g2,%g5,%g2 +	and	u3,v3,%g3 +	ldx	[up-8],u3 +	fanop +C -- +	or	%g3,%g2,%g2 +	subcc	n,4,n +	ldx	[vp-8],v3 +	fanop +C -- +	srlx	%g2,63,cy +	add	u0,v0,%g1 +	stx	%g5,[rp-8] +	fanop +C -- +	add	%g1,cy,%g5 +	or	u0,v0,%g2 +	bge,pt	%xcc,.Loop +	fanop +C END MAIN LOOP +.Lend4567: +	andn	%g2,%g5,%g2 +	and	u0,v0,%g3 +	or	%g3,%g2,%g2 +	srlx	%g2,63,cy +	add	u1,v1,%g1 +	stx	%g5,[rp+0] +	add	%g1,cy,%g5 +	or	u1,v1,%g2 +	andn	%g2,%g5,%g2 +	and	u1,v1,%g3 +	or	%g3,%g2,%g2 +	srlx	%g2,63,cy +	add	u2,v2,%g1 +	stx	%g5,[rp+8] +	add	%g1,cy,%g5 +	or	u2,v2,%g2 +	andn	%g2,%g5,%g2 +	and	u2,v2,%g3 +	or	%g3,%g2,%g2 +	add	rp,32,rp +	srlx	%g2,63,cy +	add	u3,v3,%g1 +	stx	%g5,[rp-16] +	add	%g1,cy,%g5 +	or	u3,v3,%g2 +	andn	%g2,%g5,%g2 +	and	u3,v3,%g3 +	or	%g3,%g2,%g2 +	srlx	%g2,63,cy +	stx	%g5,[rp-8] + +	addcc	n,4,n +	bz,pn	%xcc,.Lret +	fanop + +.Loop0:	ldx	[up],u0 +	add	up,8,up +	ldx	[vp],v0 +	add	vp,8,vp +	add	rp,8,rp +	subcc	n,1,n +	add	u0,v0,%g1 +	or	u0,v0,%g2 +	add	%g1,cy,%g5 +	and	u0,v0,%g3 +	andn	%g2,%g5,%g2 +	stx	%g5,[rp-8] +	or	%g3,%g2,%g2 +	bnz,pt	%xcc,.Loop0 +	srlx	%g2,63,cy + +.Lret:	mov	cy,%i0 +	ret +	restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm new file mode 100644 index 0000000..48a9414 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm @@ -0,0 +1,606 @@ +dnl  SPARC v9 64-bit 
mpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl  the result to a second limb vector. + +dnl  Copyright 1998, 2000-2004 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC 1&2:     14 +C UltraSPARC 3:	      17.5 + +C Algorithm: We use eight floating-point multiplies per limb product, with the +C invariant v operand split into four 16-bit pieces, and the up operand split +C into 32-bit pieces.  We sum pairs of 48-bit partial products using +C floating-point add, then convert the four 49-bit product-sums and transfer +C them to the integer unit. + +C Possible optimizations: +C   0. Rewrite to use algorithm of mpn_addmul_2. +C   1. Align the stack area where we transfer the four 49-bit product-sums +C      to a 32-byte boundary.  That would minimize the cache collision. +C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would +C      be to align the area to map to the area immediately before up?) +C   2. Sum the 4 49-bit quantities using 32-bit operations, as in the +C      develop mpn_addmul_2.  This would save many integer instructions. +C   3. Unrolling.  Questionable if it is worth the code expansion, given that +C      it could only save 1 cycle/limb. +C   4. Specialize for particular v values.  If its upper 32 bits are zero, we +C      could save many operations, in the FPU (fmuld), but more so in the IEU +C      since we'll be summing 48-bit quantities, which might be simpler. +C   5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and +C      the i00,i16,i32,i48 RAW less apart.  The latter apart-scheduling should +C      not be greater than needed for L2 cache latency, and also not so great +C      that i16 needs to be copied. +C   6. Avoid performing mem+fa+fm in the same cycle, at least not when we want +C      to get high IEU bandwidth.  (12 of the 14 cycles will be free for 2 IEU +C      ops.) + +C Instruction classification (as per UltraSPARC-1/2 functional units): +C    8 FM +C   10 FA +C   12 MEM +C   10 ISHIFT + 14 IADDLOG +C    1 BRANCH +C   55 insns totally (plus one mov insn that should be optimized out) + +C The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e we +C sustain the peak execution rate of 4 instructions/cycle. 
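
An integer-arithmetic model of the splitting scheme may help.  Each 32x16-bit partial product is below 2^48 and each pair-sum below 2^49, so every value is exact in a double's 53-bit mantissa; the two highest partials are deferred to the next limb, where their weights 2^64 and 2^80 reappear as 2^0 and 2^16 — this is what r64/r80 feeding a00/a16 in the code does.  A sketch with hypothetical names, not the FP code itself:

    typedef struct { mp_limb_t r64, r80; } defer_t;  /* carried between limbs */

    static void
    limb_product_sums (mp_limb_t u, mp_limb_t v, defer_t *d, mp_limb_t a[4])
    {
      mp_limb_t u00 = u & 0xFFFFFFFF,      u32 = u >> 32;
      mp_limb_t v00 = v & 0xFFFF,          v16 = (v >> 16) & 0xFFFF;
      mp_limb_t v32 = (v >> 32) & 0xFFFF,  v48 = v >> 48;

      a[0] = u00 * v00 + d->r64;       /* weight 2^0,  <= 49 bits */
      a[1] = u00 * v16 + d->r80;       /* weight 2^16, <= 49 bits */
      a[2] = u00 * v32 + u32 * v00;    /* weight 2^32, <= 49 bits */
      a[3] = u00 * v48 + u32 * v16;    /* weight 2^48, <= 49 bits */
      d->r64 = u32 * v32;              /* weight 2^64: next limb's 2^0  */
      d->r80 = u32 * v48;              /* weight 2^80: next limb's 2^16 */
    }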
+ +C INPUT PARAMETERS +C rp	i0 +C up	i1 +C n	i2 +C v	i3 + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) + +define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14') +define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22') +define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30') +define(`u00',`%f32') define(`u32', `%f34') +define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42') +define(`cy',`%g1') +define(`rlimb',`%g3') +define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3') +define(`xffffffff',`%l7') +define(`xffff',`%o0') + +PROLOGUE(mpn_addmul_1) + +C Initialization.  (1) Split v operand into four 16-bit chunks and store them +C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs +C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'. + +	save	%sp, -256, %sp +	mov	-1, %g4 +	srlx	%g4, 48, xffff		C store mask in register `xffff' +	and	%i3, xffff, %g2 +	stx	%g2, [%sp+2223+0] +	srlx	%i3, 16, %g3 +	and	%g3, xffff, %g3 +	stx	%g3, [%sp+2223+8] +	srlx	%i3, 32, %g2 +	and	%g2, xffff, %g2 +	stx	%g2, [%sp+2223+16] +	srlx	%i3, 48, %g3 +	stx	%g3, [%sp+2223+24] +	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff' + +	sllx	%i2, 3, %i2 +	mov	0, cy			C clear cy +	add	%i0, %i2, %i0 +	add	%i1, %i2, %i1 +	neg	%i2 +	add	%i1, 4, %i5 +	add	%i0, -32, %i4 +	add	%i0, -16, %i0 + +	ldd	[%sp+2223+0], v00 +	ldd	[%sp+2223+8], v16 +	ldd	[%sp+2223+16], v32 +	ldd	[%sp+2223+24], v48 +	ld	[%sp+2223+0],%f2	C zero f2 +	ld	[%sp+2223+0],%f4	C zero f4 +	ld	[%i5+%i2], %f3		C read low 32 bits of up[i] +	ld	[%i1+%i2], %f5		C read high 32 bits of up[i] +	fxtod	v00, v00 +	fxtod	v16, v16 +	fxtod	v32, v32 +	fxtod	v48, v48 + +C Start real work.  (We sneakingly read f3 and f5 above...) +C The software pipeline is very deep, requiring 4 feed-in stages. 
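
Once the four product-sums are back in integer registers (i00, i16, i32, i48), the shift/add sequence in the loop merges them into one result limb plus a carry.  A sketch of just that merge, assuming a compiler with unsigned __int128 (the real loop additionally folds rp[i] and the incoming carry into the same adds):

    static mp_limb_t
    combine (mp_limb_t i00, mp_limb_t i16, mp_limb_t i32, mp_limb_t i48,
             mp_limb_t *cy)
    {
      unsigned __int128 s = (unsigned __int128) i00
        + ((unsigned __int128) i16 << 16)
        + ((unsigned __int128) i32 << 32)
        + ((unsigned __int128) i48 << 48);
      *cy = (mp_limb_t) (s >> 64);     /* carry into the next limb */
      return (mp_limb_t) s;            /* this limb of the product */
    }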
+ +	fxtod	%f2, u00 +	fxtod	%f4, u32 +	fmuld	u00, v00, a00 +	fmuld	u00, v16, a16 +	fmuld	u00, v32, p32 +	fmuld	u32, v00, r32 +	fmuld	u00, v48, p48 +	addcc	%i2, 8, %i2 +	bnz,pt	%xcc, .L_two_or_more +	fmuld	u32, v16, r48 + +.L_one: +	fmuld	u32, v32, r64	C FIXME not urgent +	faddd	p32, r32, a32 +	fdtox	a00, a00 +	faddd	p48, r48, a48 +	fmuld	u32, v48, r80	C FIXME not urgent +	fdtox	a16, a16 +	fdtox	a32, a32 +	fdtox	a48, a48 +	std	a00, [%sp+2223+0] +	std	a16, [%sp+2223+8] +	std	a32, [%sp+2223+16] +	std	a48, [%sp+2223+24] +	add	%i2, 8, %i2 + +	fdtox	r64, a00 +	ldx	[%i0+%i2], rlimb	C read rp[i] +	fdtox	r80, a16 +	ldx	[%sp+2223+0], i00 +	ldx	[%sp+2223+8], i16 +	ldx	[%sp+2223+16], i32 +	ldx	[%sp+2223+24], i48 +	std	a00, [%sp+2223+0] +	std	a16, [%sp+2223+8] +	add	%i2, 8, %i2 + +	srlx	rlimb, 32, %g4		C HI(rlimb) +	and	rlimb, xffffffff, %g5	C LO(rlimb) +	add	i00, %g5, %g5		C i00+ now in g5 +	ldx	[%sp+2223+0], i00 +	srlx	i16, 48, %l4		C (i16 >> 48) +	mov	i16, %g2 +	ldx	[%sp+2223+8], i16 +	srlx	i48, 16, %l5		C (i48 >> 16) +	add	i32, %g4, %g4		C i32+ now in g4 +	sllx	i48, 32, %l6		C (i48 << 32) +	srlx	%g4, 32, %o3		C (i32 >> 32) +	add	%l5, %l4, %o1		C hi64- in %o1 +	std	a00, [%sp+2223+0] +	sllx	%g4, 16, %o2		C (i32 << 16) +	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT +	std	a16, [%sp+2223+8] +	sllx	%o1, 48, %o3		C (hi64 << 48) +	add	%g2, %o2, %o2		C mi64- in %o2 +	add	%l6, %o2, %o2		C mi64- in %o2 +	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT +	add	cy, %g5, %o4		C x = prev(i00) + cy +	b	.L_out_1 +	add	%i2, 8, %i2 + +.L_two_or_more: +	ld	[%i5+%i2], %f3		C read low 32 bits of up[i] +	fmuld	u32, v32, r64	C FIXME not urgent +	faddd	p32, r32, a32 +	ld	[%i1+%i2], %f5		C read high 32 bits of up[i] +	fdtox	a00, a00 +	faddd	p48, r48, a48 +	fmuld	u32, v48, r80	C FIXME not urgent +	fdtox	a16, a16 +	fdtox	a32, a32 +	fxtod	%f2, u00 +	fxtod	%f4, u32 +	fdtox	a48, a48 +	std	a00, [%sp+2223+0] +	fmuld	u00, v00, p00 +	std	a16, [%sp+2223+8] +	fmuld	u00, v16, p16 +	std	a32, [%sp+2223+16] +	fmuld	u00, v32, p32 +	std	a48, [%sp+2223+24] +	faddd	p00, r64, a00 +	fmuld	u32, v00, r32 +	faddd	p16, r80, a16 +	fmuld	u00, v48, p48 +	addcc	%i2, 8, %i2 +	bnz,pt	%xcc, .L_three_or_more +	fmuld	u32, v16, r48 + +.L_two: +	fmuld	u32, v32, r64	C FIXME not urgent +	faddd	p32, r32, a32 +	fdtox	a00, a00 +	ldx	[%i0+%i2], rlimb	C read rp[i] +	faddd	p48, r48, a48 +	fmuld	u32, v48, r80	C FIXME not urgent +	fdtox	a16, a16 +	ldx	[%sp+2223+0], i00 +	fdtox	a32, a32 +	ldx	[%sp+2223+8], i16 +	ldx	[%sp+2223+16], i32 +	ldx	[%sp+2223+24], i48 +	fdtox	a48, a48 +	std	a00, [%sp+2223+0] +	std	a16, [%sp+2223+8] +	std	a32, [%sp+2223+16] +	std	a48, [%sp+2223+24] +	add	%i2, 8, %i2 + +	fdtox	r64, a00 +	srlx	rlimb, 32, %g4		C HI(rlimb) +	and	rlimb, xffffffff, %g5	C LO(rlimb) +	ldx	[%i0+%i2], rlimb	C read rp[i] +	add	i00, %g5, %g5		C i00+ now in g5 +	fdtox	r80, a16 +	ldx	[%sp+2223+0], i00 +	srlx	i16, 48, %l4		C (i16 >> 48) +	mov	i16, %g2 +	ldx	[%sp+2223+8], i16 +	srlx	i48, 16, %l5		C (i48 >> 16) +	add	i32, %g4, %g4		C i32+ now in g4 +	ldx	[%sp+2223+16], i32 +	sllx	i48, 32, %l6		C (i48 << 32) +	ldx	[%sp+2223+24], i48 +	srlx	%g4, 32, %o3		C (i32 >> 32) +	add	%l5, %l4, %o1		C hi64- in %o1 +	std	a00, [%sp+2223+0] +	sllx	%g4, 16, %o2		C (i32 << 16) +	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT +	std	a16, [%sp+2223+8] +	sllx	%o1, 48, %o3		C (hi64 << 48) +	add	%g2, %o2, %o2		C mi64- in %o2 +	add	%l6, %o2, %o2		C mi64- in %o2 +	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT +	add	cy, %g5, %o4		C x = prev(i00) + cy +	b	.L_out_2 +	add	%i2, 8, %i2 + 
+.L_three_or_more: +	ld	[%i5+%i2], %f3		C read low 32 bits of up[i] +	fmuld	u32, v32, r64	C FIXME not urgent +	faddd	p32, r32, a32 +	ld	[%i1+%i2], %f5		C read high 32 bits of up[i] +	fdtox	a00, a00 +	ldx	[%i0+%i2], rlimb	C read rp[i] +	faddd	p48, r48, a48 +	fmuld	u32, v48, r80	C FIXME not urgent +	fdtox	a16, a16 +	ldx	[%sp+2223+0], i00 +	fdtox	a32, a32 +	ldx	[%sp+2223+8], i16 +	fxtod	%f2, u00 +	ldx	[%sp+2223+16], i32 +	fxtod	%f4, u32 +	ldx	[%sp+2223+24], i48 +	fdtox	a48, a48 +	std	a00, [%sp+2223+0] +	fmuld	u00, v00, p00 +	std	a16, [%sp+2223+8] +	fmuld	u00, v16, p16 +	std	a32, [%sp+2223+16] +	fmuld	u00, v32, p32 +	std	a48, [%sp+2223+24] +	faddd	p00, r64, a00 +	fmuld	u32, v00, r32 +	faddd	p16, r80, a16 +	fmuld	u00, v48, p48 +	addcc	%i2, 8, %i2 +	bnz,pt	%xcc, .L_four_or_more +	fmuld	u32, v16, r48 + +.L_three: +	fmuld	u32, v32, r64	C FIXME not urgent +	faddd	p32, r32, a32 +	fdtox	a00, a00 +	srlx	rlimb, 32, %g4		C HI(rlimb) +	and	rlimb, xffffffff, %g5	C LO(rlimb) +	ldx	[%i0+%i2], rlimb	C read rp[i] +	faddd	p48, r48, a48 +	add	i00, %g5, %g5		C i00+ now in g5 +	fmuld	u32, v48, r80	C FIXME not urgent +	fdtox	a16, a16 +	ldx	[%sp+2223+0], i00 +	fdtox	a32, a32 +	srlx	i16, 48, %l4		C (i16 >> 48) +	mov	i16, %g2 +	ldx	[%sp+2223+8], i16 +	srlx	i48, 16, %l5		C (i48 >> 16) +	add	i32, %g4, %g4		C i32+ now in g4 +	ldx	[%sp+2223+16], i32 +	sllx	i48, 32, %l6		C (i48 << 32) +	ldx	[%sp+2223+24], i48 +	fdtox	a48, a48 +	srlx	%g4, 32, %o3		C (i32 >> 32) +	add	%l5, %l4, %o1		C hi64- in %o1 +	std	a00, [%sp+2223+0] +	sllx	%g4, 16, %o2		C (i32 << 16) +	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT +	std	a16, [%sp+2223+8] +	sllx	%o1, 48, %o3		C (hi64 << 48) +	add	%g2, %o2, %o2		C mi64- in %o2 +	std	a32, [%sp+2223+16] +	add	%l6, %o2, %o2		C mi64- in %o2 +	std	a48, [%sp+2223+24] +	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT +	add	cy, %g5, %o4		C x = prev(i00) + cy +	b	.L_out_3 +	add	%i2, 8, %i2 + +.L_four_or_more: +	ld	[%i5+%i2], %f3		C read low 32 bits of up[i] +	fmuld	u32, v32, r64	C FIXME not urgent +	faddd	p32, r32, a32 +	ld	[%i1+%i2], %f5		C read high 32 bits of up[i] +	fdtox	a00, a00 +	srlx	rlimb, 32, %g4		C HI(rlimb) +	and	rlimb, xffffffff, %g5	C LO(rlimb) +	ldx	[%i0+%i2], rlimb	C read rp[i] +	faddd	p48, r48, a48 +	add	i00, %g5, %g5		C i00+ now in g5 +	fmuld	u32, v48, r80	C FIXME not urgent +	fdtox	a16, a16 +	ldx	[%sp+2223+0], i00 +	fdtox	a32, a32 +	srlx	i16, 48, %l4		C (i16 >> 48) +	mov	i16, %g2 +	ldx	[%sp+2223+8], i16 +	fxtod	%f2, u00 +	srlx	i48, 16, %l5		C (i48 >> 16) +	add	i32, %g4, %g4		C i32+ now in g4 +	ldx	[%sp+2223+16], i32 +	fxtod	%f4, u32 +	sllx	i48, 32, %l6		C (i48 << 32) +	ldx	[%sp+2223+24], i48 +	fdtox	a48, a48 +	srlx	%g4, 32, %o3		C (i32 >> 32) +	add	%l5, %l4, %o1		C hi64- in %o1 +	std	a00, [%sp+2223+0] +	fmuld	u00, v00, p00 +	sllx	%g4, 16, %o2		C (i32 << 16) +	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT +	std	a16, [%sp+2223+8] +	fmuld	u00, v16, p16 +	sllx	%o1, 48, %o3		C (hi64 << 48) +	add	%g2, %o2, %o2		C mi64- in %o2 +	std	a32, [%sp+2223+16] +	fmuld	u00, v32, p32 +	add	%l6, %o2, %o2		C mi64- in %o2 +	std	a48, [%sp+2223+24] +	faddd	p00, r64, a00 +	fmuld	u32, v00, r32 +	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT +	faddd	p16, r80, a16 +	fmuld	u00, v48, p48 +	add	cy, %g5, %o4		C x = prev(i00) + cy +	addcc	%i2, 8, %i2 +	bnz,pt	%xcc, .Loop +	fmuld	u32, v16, r48 + +.L_four: +	b,a	.L_out_4 + +C BEGIN MAIN LOOP +	.align	16 +.Loop: +C 00 +	srlx	%o4, 16, %o5		C (x >> 16) +	ld	[%i5+%i2], %f3		C read low 32 bits of up[i] +	fmuld	u32, v32, r64	C FIXME not urgent +	faddd	p32, r32, a32 +C 
01 +	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT +	and	%o4, xffff, %o5		C (x & 0xffff) +	ld	[%i1+%i2], %f5		C read high 32 bits of up[i] +	fdtox	a00, a00 +C 02 +	srlx	rlimb, 32, %g4		C HI(rlimb) +	and	rlimb, xffffffff, %g5	C LO(rlimb) +	ldx	[%i0+%i2], rlimb	C read rp[i] +	faddd	p48, r48, a48 +C 03 +	srlx	%o2, 48, %o7		C (mi64 >> 48) +	add	i00, %g5, %g5		C i00+ now in g5 +	fmuld	u32, v48, r80	C FIXME not urgent +	fdtox	a16, a16 +C 04 +	sllx	%o2, 16, %i3		C (mi64 << 16) +	add	%o7, %o1, cy		C new cy +	ldx	[%sp+2223+0], i00 +	fdtox	a32, a32 +C 05 +	srlx	i16, 48, %l4		C (i16 >> 48) +	mov	i16, %g2 +	ldx	[%sp+2223+8], i16 +	fxtod	%f2, u00 +C 06 +	srlx	i48, 16, %l5		C (i48 >> 16) +	add	i32, %g4, %g4		C i32+ now in g4 +	ldx	[%sp+2223+16], i32 +	fxtod	%f4, u32 +C 07 +	sllx	i48, 32, %l6		C (i48 << 32) +	or	%i3, %o5, %o5 +	ldx	[%sp+2223+24], i48 +	fdtox	a48, a48 +C 08 +	srlx	%g4, 32, %o3		C (i32 >> 32) +	add	%l5, %l4, %o1		C hi64- in %o1 +	std	a00, [%sp+2223+0] +	fmuld	u00, v00, p00 +C 09 +	sllx	%g4, 16, %o2		C (i32 << 16) +	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT +	std	a16, [%sp+2223+8] +	fmuld	u00, v16, p16 +C 10 +	sllx	%o1, 48, %o3		C (hi64 << 48) +	add	%g2, %o2, %o2		C mi64- in %o2 +	std	a32, [%sp+2223+16] +	fmuld	u00, v32, p32 +C 11 +	add	%l6, %o2, %o2		C mi64- in %o2 +	std	a48, [%sp+2223+24] +	faddd	p00, r64, a00 +	fmuld	u32, v00, r32 +C 12 +	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT +	stx	%o5, [%i4+%i2] +	faddd	p16, r80, a16 +	fmuld	u00, v48, p48 +C 13 +	add	cy, %g5, %o4		C x = prev(i00) + cy +	addcc	%i2, 8, %i2 +	bnz,pt	%xcc, .Loop +	fmuld	u32, v16, r48 +C END MAIN LOOP + +.L_out_4: +	srlx	%o4, 16, %o5		C (x >> 16) +	fmuld	u32, v32, r64	C FIXME not urgent +	faddd	p32, r32, a32 +	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT +	and	%o4, xffff, %o5		C (x & 0xffff) +	fdtox	a00, a00 +	srlx	rlimb, 32, %g4		C HI(rlimb) +	and	rlimb, xffffffff, %g5	C LO(rlimb) +	ldx	[%i0+%i2], rlimb	C read rp[i] +	faddd	p48, r48, a48 +	srlx	%o2, 48, %o7		C (mi64 >> 48) +	add	i00, %g5, %g5		C i00+ now in g5 +	fmuld	u32, v48, r80	C FIXME not urgent +	fdtox	a16, a16 +	sllx	%o2, 16, %i3		C (mi64 << 16) +	add	%o7, %o1, cy		C new cy +	ldx	[%sp+2223+0], i00 +	fdtox	a32, a32 +	srlx	i16, 48, %l4		C (i16 >> 48) +	mov	i16, %g2 +	ldx	[%sp+2223+8], i16 +	srlx	i48, 16, %l5		C (i48 >> 16) +	add	i32, %g4, %g4		C i32+ now in g4 +	ldx	[%sp+2223+16], i32 +	sllx	i48, 32, %l6		C (i48 << 32) +	or	%i3, %o5, %o5 +	ldx	[%sp+2223+24], i48 +	fdtox	a48, a48 +	srlx	%g4, 32, %o3		C (i32 >> 32) +	add	%l5, %l4, %o1		C hi64- in %o1 +	std	a00, [%sp+2223+0] +	sllx	%g4, 16, %o2		C (i32 << 16) +	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT +	std	a16, [%sp+2223+8] +	sllx	%o1, 48, %o3		C (hi64 << 48) +	add	%g2, %o2, %o2		C mi64- in %o2 +	std	a32, [%sp+2223+16] +	add	%l6, %o2, %o2		C mi64- in %o2 +	std	a48, [%sp+2223+24] +	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT +	stx	%o5, [%i4+%i2] +	add	cy, %g5, %o4		C x = prev(i00) + cy +	add	%i2, 8, %i2 +.L_out_3: +	srlx	%o4, 16, %o5		C (x >> 16) +	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT +	and	%o4, xffff, %o5		C (x & 0xffff) +	fdtox	r64, a00 +	srlx	rlimb, 32, %g4		C HI(rlimb) +	and	rlimb, xffffffff, %g5	C LO(rlimb) +	ldx	[%i0+%i2], rlimb	C read rp[i] +	srlx	%o2, 48, %o7		C (mi64 >> 48) +	add	i00, %g5, %g5		C i00+ now in g5 +	fdtox	r80, a16 +	sllx	%o2, 16, %i3		C (mi64 << 16) +	add	%o7, %o1, cy		C new cy +	ldx	[%sp+2223+0], i00 +	srlx	i16, 48, %l4		C (i16 >> 48) +	mov	i16, %g2 +	ldx	[%sp+2223+8], i16 +	srlx	i48, 16, %l5		C (i48 >> 16) +	add	i32, %g4, %g4		C i32+ now in 
g4 +	ldx	[%sp+2223+16], i32 +	sllx	i48, 32, %l6		C (i48 << 32) +	or	%i3, %o5, %o5 +	ldx	[%sp+2223+24], i48 +	srlx	%g4, 32, %o3		C (i32 >> 32) +	add	%l5, %l4, %o1		C hi64- in %o1 +	std	a00, [%sp+2223+0] +	sllx	%g4, 16, %o2		C (i32 << 16) +	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT +	std	a16, [%sp+2223+8] +	sllx	%o1, 48, %o3		C (hi64 << 48) +	add	%g2, %o2, %o2		C mi64- in %o2 +	add	%l6, %o2, %o2		C mi64- in %o2 +	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT +	stx	%o5, [%i4+%i2] +	add	cy, %g5, %o4		C x = prev(i00) + cy +	add	%i2, 8, %i2 +.L_out_2: +	srlx	%o4, 16, %o5		C (x >> 16) +	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT +	and	%o4, xffff, %o5		C (x & 0xffff) +	srlx	rlimb, 32, %g4		C HI(rlimb) +	and	rlimb, xffffffff, %g5	C LO(rlimb) +	srlx	%o2, 48, %o7		C (mi64 >> 48) +	add	i00, %g5, %g5		C i00+ now in g5 +	sllx	%o2, 16, %i3		C (mi64 << 16) +	add	%o7, %o1, cy		C new cy +	ldx	[%sp+2223+0], i00 +	srlx	i16, 48, %l4		C (i16 >> 48) +	mov	i16, %g2 +	ldx	[%sp+2223+8], i16 +	srlx	i48, 16, %l5		C (i48 >> 16) +	add	i32, %g4, %g4		C i32+ now in g4 +	sllx	i48, 32, %l6		C (i48 << 32) +	or	%i3, %o5, %o5 +	srlx	%g4, 32, %o3		C (i32 >> 32) +	add	%l5, %l4, %o1		C hi64- in %o1 +	sllx	%g4, 16, %o2		C (i32 << 16) +	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT +	sllx	%o1, 48, %o3		C (hi64 << 48) +	add	%g2, %o2, %o2		C mi64- in %o2 +	add	%l6, %o2, %o2		C mi64- in %o2 +	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT +	stx	%o5, [%i4+%i2] +	add	cy, %g5, %o4		C x = prev(i00) + cy +	add	%i2, 8, %i2 +.L_out_1: +	srlx	%o4, 16, %o5		C (x >> 16) +	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT +	and	%o4, xffff, %o5		C (x & 0xffff) +	srlx	%o2, 48, %o7		C (mi64 >> 48) +	sllx	%o2, 16, %i3		C (mi64 << 16) +	add	%o7, %o1, cy		C new cy +	or	%i3, %o5, %o5 +	stx	%o5, [%i4+%i2] + +	sllx	i00, 0, %g2 +	add	%g2, cy, cy +	sllx	i16, 16, %g3 +	add	%g3, cy, cy + +	return	%i7+8 +	mov	cy, %o0 +EPILOGUE(mpn_addmul_1) diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm new file mode 100644 index 0000000..37674d7 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm @@ -0,0 +1,551 @@ +dnl  SPARC v9 64-bit mpn_addmul_2 -- Multiply an n limb number with 2-limb +dnl  number and add the result to a n limb vector. + +dnl  Copyright 2002, 2003 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C                  cycles/limb +C UltraSPARC 1&2:      9 +C UltraSPARC 3:       10 + +C Algorithm: We use 16 floating-point multiplies per limb product, with the +C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand +C split into 32-bit pieces.  We sum four 48-bit partial products using +C floating-point add, then convert the resulting four 50-bit quantities and +C transfer them to the integer unit. + +C Possible optimizations: +C   1. Align the stack area where we transfer the four 50-bit product-sums +C      to a 32-byte boundary.  That would minimize the cache collision. +C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would +C      be to align the area to map to the area immediately before up?) +C   2. Perform two of the fp->int conversions with integer instructions.  We +C      can get almost ten free IEU slots, if we clean up bookkeeping and the +C      silly carry-limb code. +C   3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb +C      code. + +C OSP (Overlapping software pipeline) version of mpn_mul_basecase: +C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles. +C FI	= 20 +C L	=  9 x un * vn +C WDFI	= 10 x vn / 2 +C WD	= 4 + +C Instruction classification (as per UltraSPARC functional units). +C Assuming silly carry code is fixed.  Includes bookkeeping. +C +C               mpn_addmul_X     mpn_mul_X +C                1       2       1       2 +C               ==========      ========== +C      FM        8      16       8      16 +C      FA       10      18      10      18 +C     MEM       12      12      10      10 +C  ISHIFT        6       6       6       6 +C IADDLOG       11      11      10      10 +C  BRANCH        1       1       1       1 +C +C TOTAL IEU     17      17      16      16 +C TOTAL         48      64      45      61 +C +C IEU cycles     8.5     8.5     8       8 +C MEM cycles    12      12      10      10 +C ISSUE cycles  12      16      11.25   15.25 +C FPU cycles    10      18      10      18 +C cycles/loop   12      18      12      18 +C cycles/limb   12       9      12       9 + + +C INPUT PARAMETERS +C rp[n + 1]	i0 +C up[n]		i1 +C n		i2 +C vp[2]		i3 + + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) + +C Combine registers: +C u00_hi= u32_hi +C u00_lo= u32_lo +C a000  = out000 +C a016  = out016 +C Free: f52 f54 + + +define(`p000', `%f8')  define(`p016',`%f10') +define(`p032',`%f12')  define(`p048',`%f14') +define(`p064',`%f16')  define(`p080',`%f18') +define(`p096a',`%f20') define(`p112a',`%f22') +define(`p096b',`%f56') define(`p112b',`%f58') + +define(`out000',`%f0') define(`out016',`%f6') + +define(`v000',`%f24')  define(`v016',`%f26') +define(`v032',`%f28')  define(`v048',`%f30') +define(`v064',`%f44')  define(`v080',`%f46') +define(`v096',`%f48')  define(`v112',`%f50') + +define(`u00',`%f32')   define(`u32', `%f34') + +define(`a000',`%f36')  define(`a016',`%f38') +define(`a032',`%f40')  define(`a048',`%f42') +define(`a064',`%f60')  define(`a080',`%f62') + +define(`u00_hi',`%f2') define(`u32_hi',`%f4') +define(`u00_lo',`%f3') define(`u32_lo',`%f5') + +define(`cy',`%g1') +define(`rlimb',`%g3') +define(`i00',`%l0')    define(`i16',`%l1') +define(`r00',`%l2')    define(`r32',`%l3') +define(`xffffffff',`%l7') +define(`xffff',`%o0') + + +PROLOGUE(mpn_addmul_2) + +C Initialization.  (1) Split v operand into eight 16-bit chunks and store them +C as IEEE double in fp registers.  
(2) Clear upper 32 bits of fp register pairs +C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'. +C This code could be better scheduled. + +	save	%sp, -256, %sp + +ifdef(`HAVE_VIS', +`	mov	-1, %g4 +	wr	%g0, 0xD2, %asi +	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff' +	ldda	[%i3+6] %asi, v000 +	ldda	[%i3+4] %asi, v016 +	ldda	[%i3+2] %asi, v032 +	ldda	[%i3+0] %asi, v048 +	fxtod	v000, v000 +	ldda	[%i3+14] %asi, v064 +	fxtod	v016, v016 +	ldda	[%i3+12] %asi, v080 +	fxtod	v032, v032 +	ldda	[%i3+10] %asi, v096 +	fxtod	v048, v048 +	ldda	[%i3+8] %asi, v112 +	fxtod	v064, v064 +	fxtod	v080, v080 +	fxtod	v096, v096 +	fxtod	v112, v112 +	fzero	u00_hi +	fzero	u32_hi +', +`	mov	-1, %g4 +	ldx	[%i3+0], %l0		C vp[0] +	srlx	%g4, 48, xffff		C store mask in register `xffff' +	ldx	[%i3+8], %l1		C vp[1] + +	and	%l0, xffff, %g2 +	stx	%g2, [%sp+2223+0] +	srlx	%l0, 16, %g3 +	and	%g3, xffff, %g3 +	stx	%g3, [%sp+2223+8] +	srlx	%l0, 32, %g2 +	and	%g2, xffff, %g2 +	stx	%g2, [%sp+2223+16] +	srlx	%l0, 48, %g3 +	stx	%g3, [%sp+2223+24] +	and	%l1, xffff, %g2 +	stx	%g2, [%sp+2223+32] +	srlx	%l1, 16, %g3 +	and	%g3, xffff, %g3 +	stx	%g3, [%sp+2223+40] +	srlx	%l1, 32, %g2 +	and	%g2, xffff, %g2 +	stx	%g2, [%sp+2223+48] +	srlx	%l1, 48, %g3 +	stx	%g3, [%sp+2223+56] + +	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff' + +	ldd	[%sp+2223+0], v000 +	ldd	[%sp+2223+8], v016 +	ldd	[%sp+2223+16], v032 +	ldd	[%sp+2223+24], v048 +	fxtod	v000, v000 +	ldd	[%sp+2223+32], v064 +	fxtod	v016, v016 +	ldd	[%sp+2223+40], v080 +	fxtod	v032, v032 +	ldd	[%sp+2223+48], v096 +	fxtod	v048, v048 +	ldd	[%sp+2223+56], v112 +	fxtod	v064, v064 +	ld	[%sp+2223+0], u00_hi	C zero u00_hi +	fxtod	v080, v080 +	ld	[%sp+2223+0], u32_hi	C zero u32_hi +	fxtod	v096, v096 +	fxtod	v112, v112 +') +C Initialization done. +	mov	0, %g2 +	mov	0, rlimb +	mov	0, %g4 +	add	%i0, -8, %i0		C BOOKKEEPING + +C Start software pipeline. 
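
Before the pipeline itself, it may help to state what the routine computes: {rp,n+1} += {up,n} * {vp,2}, returning the limb that spills past rp[n].  A plain C reference model under that reading (ref_addmul_2 is a hypothetical name; assumes unsigned __int128; the asm interleaves both rows rather than making two passes):

    static mp_limb_t
    ref_addmul_2 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
                  const mp_limb_t *vp)
    {
      unsigned __int128 t;
      mp_limb_t  cy = 0, hi;
      mp_size_t  i;

      for (i = 0; i < n; i++)          /* row 0: up[] * vp[0] */
        {
          t = (unsigned __int128) up[i] * vp[0] + rp[i] + cy;
          rp[i] = (mp_limb_t) t;
          cy = (mp_limb_t) (t >> 64);
        }
      t = (unsigned __int128) rp[n] + cy;
      rp[n] = (mp_limb_t) t;
      hi = (mp_limb_t) (t >> 64);

      cy = 0;
      for (i = 0; i < n; i++)          /* row 1: up[] * vp[1], one limb up */
        {
          t = (unsigned __int128) up[i] * vp[1] + rp[i + 1] + cy;
          rp[i + 1] = (mp_limb_t) t;
          cy = (mp_limb_t) (t >> 64);
        }
      return hi + cy;
    }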
+ +	ld	[%i1+4], u00_lo		C read low 32 bits of up[i] +	fxtod	u00_hi, u00 +C mid +	ld	[%i1+0], u32_lo		C read high 32 bits of up[i] +	fmuld	u00, v000, a000 +	fmuld	u00, v016, a016 +	fmuld	u00, v032, a032 +	fmuld	u00, v048, a048 +	add	%i2, -1, %i2		C BOOKKEEPING +	fmuld	u00, v064, p064 +	add	%i1, 8, %i1		C BOOKKEEPING +	fxtod	u32_hi, u32 +	fmuld	u00, v080, p080 +	fmuld	u00, v096, p096a +	brnz,pt	%i2, .L_2_or_more +	 fmuld	u00, v112, p112a + +.L1:	fdtox	a000, out000 +	fmuld	u32, v000, p000 +	fdtox	a016, out016 +	fmuld	u32, v016, p016 +	fmovd	p064, a064 +	fmuld	u32, v032, p032 +	fmovd	p080, a080 +	fmuld	u32, v048, p048 +	std	out000, [%sp+2223+16] +	faddd	p000, a032, a000 +	fmuld	u32, v064, p064 +	std	out016, [%sp+2223+24] +	fxtod	u00_hi, u00 +	faddd	p016, a048, a016 +	fmuld	u32, v080, p080 +	faddd	p032, a064, a032 +	fmuld	u32, v096, p096b +	faddd	p048, a080, a048 +	fmuld	u32, v112, p112b +C mid +	fdtox	a000, out000 +	fdtox	a016, out016 +	faddd	p064, p096a, a064 +	faddd	p080, p112a, a080 +	std	out000, [%sp+2223+0] +	b	.L_wd2 +	 std	out016, [%sp+2223+8] + +.L_2_or_more: +	ld	[%i1+4], u00_lo		C read low 32 bits of up[i] +	fdtox	a000, out000 +	fmuld	u32, v000, p000 +	fdtox	a016, out016 +	fmuld	u32, v016, p016 +	fmovd	p064, a064 +	fmuld	u32, v032, p032 +	fmovd	p080, a080 +	fmuld	u32, v048, p048 +	std	out000, [%sp+2223+16] +	faddd	p000, a032, a000 +	fmuld	u32, v064, p064 +	std	out016, [%sp+2223+24] +	fxtod	u00_hi, u00 +	faddd	p016, a048, a016 +	fmuld	u32, v080, p080 +	faddd	p032, a064, a032 +	fmuld	u32, v096, p096b +	faddd	p048, a080, a048 +	fmuld	u32, v112, p112b +C mid +	ld	[%i1+0], u32_lo		C read high 32 bits of up[i] +	fdtox	a000, out000 +	fmuld	u00, v000, p000 +	fdtox	a016, out016 +	fmuld	u00, v016, p016 +	faddd	p064, p096a, a064 +	fmuld	u00, v032, p032 +	faddd	p080, p112a, a080 +	fmuld	u00, v048, p048 +	add	%i2, -1, %i2		C BOOKKEEPING +	std	out000, [%sp+2223+0] +	faddd	p000, a032, a000 +	fmuld	u00, v064, p064 +	add	%i1, 8, %i1		C BOOKKEEPING +	std	out016, [%sp+2223+8] +	fxtod	u32_hi, u32 +	faddd	p016, a048, a016 +	fmuld	u00, v080, p080 +	faddd	p032, a064, a032 +	fmuld	u00, v096, p096a +	faddd	p048, a080, a048 +	brnz,pt	%i2, .L_3_or_more +	 fmuld	u00, v112, p112a + +	b	.Lend +	 nop + +C  64      32       0 +C   .       .       . +C   .       |__rXXX_|	32 +C   .      |___cy___|	34 +C   .  |_______i00__|	50 +C  |_______i16__|   .	
50 + + +C BEGIN MAIN LOOP +	.align	16 +.L_3_or_more: +.Loop:	ld	[%i1+4], u00_lo		C read low 32 bits of up[i] +	and	%g2, xffffffff, %g2 +	fdtox	a000, out000 +	fmuld	u32, v000, p000 +C +	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i] +	add	%g2, rlimb, %l5 +	fdtox	a016, out016 +	fmuld	u32, v016, p016 +C +	srlx	%l5, 32, cy +	ldx	[%sp+2223+16], i00 +	faddd	p064, p096b, a064 +	fmuld	u32, v032, p032 +C +	add	%g4, cy, cy		C new cy +	ldx	[%sp+2223+24], i16 +	faddd	p080, p112b, a080 +	fmuld	u32, v048, p048 +C +	nop +	std	out000, [%sp+2223+16] +	faddd	p000, a032, a000 +	fmuld	u32, v064, p064 +C +	add	i00, r00, rlimb +	add	%i0, 8, %i0		C BOOKKEEPING +	std	out016, [%sp+2223+24] +	fxtod	u00_hi, u00 +C +	sllx	i16, 16, %g2 +	add	cy, rlimb, rlimb +	faddd	p016, a048, a016 +	fmuld	u32, v080, p080 +C +	srlx	i16, 16, %g4 +	add	%g2, rlimb, %l5 +	faddd	p032, a064, a032 +	fmuld	u32, v096, p096b +C +	stw	%l5, [%i0+4] +	nop +	faddd	p048, a080, a048 +	fmuld	u32, v112, p112b +C midloop +	ld	[%i1+0], u32_lo		C read high 32 bits of up[i] +	and	%g2, xffffffff, %g2 +	fdtox	a000, out000 +	fmuld	u00, v000, p000 +C +	lduw	[%i0+0], r32		C read high 32 bits of rp[i] +	add	%g2, rlimb, %l5 +	fdtox	a016, out016 +	fmuld	u00, v016, p016 +C +	srlx	%l5, 32, cy +	ldx	[%sp+2223+0], i00 +	faddd	p064, p096a, a064 +	fmuld	u00, v032, p032 +C +	add	%g4, cy, cy		C new cy +	ldx	[%sp+2223+8], i16 +	faddd	p080, p112a, a080 +	fmuld	u00, v048, p048 +C +	add	%i2, -1, %i2		C BOOKKEEPING +	std	out000, [%sp+2223+0] +	faddd	p000, a032, a000 +	fmuld	u00, v064, p064 +C +	add	i00, r32, rlimb +	add	%i1, 8, %i1		C BOOKKEEPING +	std	out016, [%sp+2223+8] +	fxtod	u32_hi, u32 +C +	sllx	i16, 16, %g2 +	add	cy, rlimb, rlimb +	faddd	p016, a048, a016 +	fmuld	u00, v080, p080 +C +	srlx	i16, 16, %g4 +	add	%g2, rlimb, %l5 +	faddd	p032, a064, a032 +	fmuld	u00, v096, p096a +C +	stw	%l5, [%i0+0] +	faddd	p048, a080, a048 +	brnz,pt	%i2, .Loop +	 fmuld	u00, v112, p112a +C END MAIN LOOP + +C WIND-DOWN PHASE 1 +.Lend:	and	%g2, xffffffff, %g2 +	fdtox	a000, out000 +	fmuld	u32, v000, p000 +	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i] +	add	%g2, rlimb, %l5 +	fdtox	a016, out016 +	fmuld	u32, v016, p016 +	srlx	%l5, 32, cy +	ldx	[%sp+2223+16], i00 +	faddd	p064, p096b, a064 +	fmuld	u32, v032, p032 +	add	%g4, cy, cy		C new cy +	ldx	[%sp+2223+24], i16 +	faddd	p080, p112b, a080 +	fmuld	u32, v048, p048 +	std	out000, [%sp+2223+16] +	faddd	p000, a032, a000 +	fmuld	u32, v064, p064 +	add	i00, r00, rlimb +	add	%i0, 8, %i0		C BOOKKEEPING +	std	out016, [%sp+2223+24] +	sllx	i16, 16, %g2 +	add	cy, rlimb, rlimb +	faddd	p016, a048, a016 +	fmuld	u32, v080, p080 +	srlx	i16, 16, %g4 +	add	%g2, rlimb, %l5 +	faddd	p032, a064, a032 +	fmuld	u32, v096, p096b +	stw	%l5, [%i0+4] +	faddd	p048, a080, a048 +	fmuld	u32, v112, p112b +C mid +	and	%g2, xffffffff, %g2 +	fdtox	a000, out000 +	lduw	[%i0+0], r32		C read high 32 bits of rp[i] +	add	%g2, rlimb, %l5 +	fdtox	a016, out016 +	srlx	%l5, 32, cy +	ldx	[%sp+2223+0], i00 +	faddd	p064, p096a, a064 +	add	%g4, cy, cy		C new cy +	ldx	[%sp+2223+8], i16 +	faddd	p080, p112a, a080 +	std	out000, [%sp+2223+0] +	add	i00, r32, rlimb +	std	out016, [%sp+2223+8] +	sllx	i16, 16, %g2 +	add	cy, rlimb, rlimb +	srlx	i16, 16, %g4 +	add	%g2, rlimb, %l5 +	stw	%l5, [%i0+0] + +C WIND-DOWN PHASE 2 +.L_wd2:	and	%g2, xffffffff, %g2 +	fdtox	a032, out000 +	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i] +	add	%g2, rlimb, %l5 +	fdtox	a048, out016 +	srlx	%l5, 32, cy +	ldx	[%sp+2223+16], i00 +	add	%g4, cy, cy		C new cy +	ldx	[%sp+2223+24], i16 +	std	out000, [%sp+2223+16] +	add	i00, r00, 
rlimb +	add	%i0, 8, %i0		C BOOKKEEPING +	std	out016, [%sp+2223+24] +	sllx	i16, 16, %g2 +	add	cy, rlimb, rlimb +	srlx	i16, 16, %g4 +	add	%g2, rlimb, %l5 +	stw	%l5, [%i0+4] +C mid +	and	%g2, xffffffff, %g2 +	fdtox	a064, out000 +	lduw	[%i0+0], r32		C read high 32 bits of rp[i] +	add	%g2, rlimb, %l5 +	fdtox	a080, out016 +	srlx	%l5, 32, cy +	ldx	[%sp+2223+0], i00 +	add	%g4, cy, cy		C new cy +	ldx	[%sp+2223+8], i16 +	std	out000, [%sp+2223+0] +	add	i00, r32, rlimb +	std	out016, [%sp+2223+8] +	sllx	i16, 16, %g2 +	add	cy, rlimb, rlimb +	srlx	i16, 16, %g4 +	add	%g2, rlimb, %l5 +	stw	%l5, [%i0+0] + +C WIND-DOWN PHASE 3 +.L_wd3:	and	%g2, xffffffff, %g2 +	fdtox	p096b, out000 +	add	%g2, rlimb, %l5 +	fdtox	p112b, out016 +	srlx	%l5, 32, cy +	ldx	[%sp+2223+16], rlimb +	add	%g4, cy, cy		C new cy +	ldx	[%sp+2223+24], i16 +	std	out000, [%sp+2223+16] +	add	%i0, 8, %i0		C BOOKKEEPING +	std	out016, [%sp+2223+24] +	sllx	i16, 16, %g2 +	add	cy, rlimb, rlimb +	srlx	i16, 16, %g4 +	add	%g2, rlimb, %l5 +	stw	%l5, [%i0+4] +C mid +	and	%g2, xffffffff, %g2 +	add	%g2, rlimb, %l5 +	srlx	%l5, 32, cy +	ldx	[%sp+2223+0], rlimb +	add	%g4, cy, cy		C new cy +	ldx	[%sp+2223+8], i16 +	sllx	i16, 16, %g2 +	add	cy, rlimb, rlimb +	srlx	i16, 16, %g4 +	add	%g2, rlimb, %l5 +	stw	%l5, [%i0+0] + +	and	%g2, xffffffff, %g2 +	add	%g2, rlimb, %l5 +	srlx	%l5, 32, cy +	ldx	[%sp+2223+16], i00 +	add	%g4, cy, cy		C new cy +	ldx	[%sp+2223+24], i16 + +	sllx	i16, 16, %g2 +	add	i00, cy, cy +	return	%i7+8 +	add	%g2, cy, %o0 +EPILOGUE(mpn_addmul_2) diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm new file mode 100644 index 0000000..47286d5 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm @@ -0,0 +1,165 @@ +dnl  SPARC v9 mpn_lshiftc + +dnl  Copyright 1996, 2000-2003, 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
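
mpn_lshiftc is mpn_lshift with a complemented result: {rp,n} gets the one's complement of {up,n} shifted left by cnt bits, while the return value is the (uncomplemented) bits shifted out of the high limb.  A C reference model (ref_lshiftc is a hypothetical name; assumes 64-bit limbs and 1 <= cnt <= 63):

    mp_limb_t
    ref_lshiftc (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, unsigned cnt)
    {
      unsigned   tnc = 64 - cnt;
      mp_limb_t  retval = up[n-1] >> tnc;   /* out-shifted high bits */
      mp_size_t  i;
      for (i = n - 1; i > 0; i--)
        rp[i] = ~((up[i] << cnt) | (up[i-1] >> tnc));
      rp[0] = ~(up[0] << cnt);
      return retval;
    }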
+ + +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC 1&2:     3 +C UltraSPARC 3:	      2.67 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n',  `%i2') +define(`cnt',`%i3') + +define(`u0', `%l0') +define(`u1', `%l2') +define(`u2', `%l4') +define(`u3', `%l6') + +define(`tnc',`%i4') + +define(`fanop',`fitod %f0,%f2')		dnl  A quasi nop running in the FA pipe + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_lshiftc) +	save	%sp,-160,%sp + +	sllx	n,3,%g1 +	sub	%g0,cnt,tnc		C negate shift count +	add	up,%g1,up		C make %o1 point at end of src +	add	rp,%g1,rp		C make %o0 point at end of res +	ldx	[up-8],u3		C load first limb +	subcc	n,5,n +	srlx	u3,tnc,%i5		C compute function result +	bl,pn	%xcc,.Lend1234 +	sllx	u3,cnt,%g3 + +	subcc	n,4,n +	ldx	[up-16],u0 +	ldx	[up-24],u1 +	add	up,-32,up +	ldx	[up-0],u2 +	ldx	[up-8],u3 +	srlx	u0,tnc,%g2 +	bl,pn	%xcc,.Lend5678 +	not	%g3, %g3 + +	b,a	.Loop +	ALIGN(16) +.Loop: +	sllx	u0,cnt,%g1 +	andn	%g3,%g2,%g3 +	ldx	[up-16],u0 +	fanop +C -- +	srlx	u1,tnc,%g2 +	subcc	n,4,n +	stx	%g3,[rp-8] +	not	%g1, %g1 +C -- +	sllx	u1,cnt,%g3 +	andn	%g1,%g2,%g1 +	ldx	[up-24],u1 +	fanop +C -- +	srlx	u2,tnc,%g2 +	stx	%g1,[rp-16] +	add	up,-32,up +	not	%g3, %g3 +C -- +	sllx	u2,cnt,%g1 +	andn	%g3,%g2,%g3 +	ldx	[up-0],u2 +	fanop +C -- +	srlx	u3,tnc,%g2 +	stx	%g3,[rp-24] +	add	rp,-32,rp +	not	%g1, %g1 +C -- +	sllx	u3,cnt,%g3 +	andn	%g1,%g2,%g1 +	ldx	[up-8],u3 +	fanop +C -- +	srlx	u0,tnc,%g2 +	stx	%g1,[rp-0] +	bge,pt	%xcc,.Loop +	not	%g3, %g3 +C -- +.Lend5678: +	sllx	u0,cnt,%g1 +	andn	%g3,%g2,%g3 +	srlx	u1,tnc,%g2 +	stx	%g3,[rp-8] +	not	%g1, %g1 +	sllx	u1,cnt,%g3 +	andn	%g1,%g2,%g1 +	srlx	u2,tnc,%g2 +	stx	%g1,[rp-16] +	not	%g3, %g3 +	sllx	u2,cnt,%g1 +	andn	%g3,%g2,%g3 +	srlx	u3,tnc,%g2 +	stx	%g3,[rp-24] +	add	rp,-32,rp +	not	%g1, %g1 +	sllx	u3,cnt,%g3		C carry... +	andn	%g1,%g2,%g1 +	stx	%g1,[rp-0] + +.Lend1234: +	addcc	n,4,n +	bz,pn	%xcc,.Lret +	fanop +.Loop0: +	add	rp,-8,rp +	subcc	n,1,n +	ldx	[up-16],u3 +	add	up,-8,up +	srlx	u3,tnc,%g2 +	not	%g3, %g3 +	andn	%g3,%g2,%g3 +	stx	%g3,[rp] +	sllx	u3,cnt,%g3 +	bnz,pt	%xcc,.Loop0 +	fanop +.Lret: +	not	%g3, %g3 +	stx	%g3,[rp-8] +	mov	%i5,%i0 +	ret +	restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm new file mode 100644 index 0000000..871d562 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm @@ -0,0 +1,580 @@ +dnl  SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl  the result in a second limb vector. + +dnl  Copyright 1998, 2000-2003 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. 
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC 1&2:     14
+C UltraSPARC 3:	      18.5
+
+C Algorithm: We use eight floating-point multiplies per limb product, with the
+C invariant v operand split into four 16-bit pieces, and the s1 operand split
+C into 32-bit pieces.  We sum pairs of 48-bit partial products using
+C floating-point add, then convert the four 49-bit product-sums and transfer
+C them to the integer unit.
+
+C Possible optimizations:
+C   1. Align the stack area where we transfer the four 49-bit product-sums
+C      to a 32-byte boundary.  That would minimize cache collisions.
+C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
+C      be to align the area to map to the area immediately before s1?)
+C   2. Sum the 4 49-bit quantities using 32-bit operations, as in the
+C      development version of mpn_addmul_2.  This would save many integer
+C      instructions.
+C   3. Unrolling.  Questionable if it is worth the code expansion, given that
+C      it could only save 1 cycle/limb.
+C   4. Specialize for particular v values.  If v's upper 32 bits are zero, we
+C      could save many operations, in the FPU (fmuld), but more so in the IEU
+C      since we'll be summing 48-bit quantities, which might be simpler.
+C   5. Ideally, we should schedule the f2/f3 and f4/f5 RAW dependencies
+C      further apart, and the i00,i16,i32,i48 RAW dependencies closer
+C      together.  The latter spacing should not be greater than needed for L2
+C      cache latency, and also not so great that i16 needs to be copied.
+C   6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
+C      to get high IEU bandwidth.  (12 of the 14 cycles will be free for 2 IEU
+C      ops.)
+
+C Instruction classification (as per UltraSPARC-1/2 functional units):
+C    8 FM
+C   10 FA
+C   11 MEM
+C    9 ISHIFT + 10? IADDLOG
+C    1 BRANCH
+C   49 insns in total (plus three mov insns that should be optimized out)
+
+C The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e. we
+C sustain 3.79 instructions/cycle.
+
+C INPUT PARAMETERS
+C rp	i0
+C up	i1
+C n	i2
+C v	i3
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+
+define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
+define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
+define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
+define(`u00',`%f32') define(`u32', `%f34')
+define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
+define(`cy',`%g1')
+define(`rlimb',`%g3')
+define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
+define(`xffffffff',`%l7')
+define(`xffff',`%o0')
+
+PROLOGUE(mpn_mul_1)
+
+C Initialization.  (1) Split v operand into four 16-bit chunks and store them
+C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
+C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
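The splitting described in the Algorithm comment can be written down in plain integer C, which makes the weights of the eight partial products explicit. The sketch below is a reference model, not the vendored code: it assumes 64-bit limbs and a compiler with unsigned __int128, and it sums all six weighted terms within one limb, whereas the assembly pairs the 2^64 and 2^80 products with the next limb's 2^0 and 2^16 products across the software pipeline (r64/r80 feeding a00/a16).

    typedef unsigned long long limb_t;

    /* v split into four 16-bit pieces, each u limb into two 32-bit halves.
       Every partial product is < 2^48, and each equal-weight pair sums to
       < 2^49, so it fits in a double's 53-bit mantissa; that is what lets
       the real code do these multiplies exactly in the FPU. */
    limb_t mul_1_ref (limb_t *rp, const limb_t *up, long n, limb_t v)
    {
      limb_t v00 = v & 0xffff,         v16 = (v >> 16) & 0xffff;
      limb_t v32 = (v >> 32) & 0xffff, v48 = v >> 48;
      limb_t cy = 0;
      for (long i = 0; i < n; i++)
        {
          limb_t u00 = up[i] & 0xffffffff, u32 = up[i] >> 32;
          limb_t a00 = u00 * v00;               /* weight 2^0  */
          limb_t a16 = u00 * v16;               /* weight 2^16 */
          limb_t a32 = u00 * v32 + u32 * v00;   /* weight 2^32 */
          limb_t a48 = u00 * v48 + u32 * v16;   /* weight 2^48 */
          limb_t a64 = u32 * v32;               /* weight 2^64 */
          limb_t a80 = u32 * v48;               /* weight 2^80 */
          unsigned __int128 acc = (unsigned __int128) a00
            + ((unsigned __int128) a16 << 16)
            + ((unsigned __int128) a32 << 32)
            + ((unsigned __int128) a48 << 48)
            + ((unsigned __int128) a64 << 64)
            + ((unsigned __int128) a80 << 80)
            + cy;
          rp[i] = (limb_t) acc;
          cy = (limb_t) (acc >> 64);
        }
      return cy;
    }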
+ +	save	%sp, -256, %sp +	mov	-1, %g4 +	srlx	%g4, 48, xffff		C store mask in register `xffff' +	and	%i3, xffff, %g2 +	stx	%g2, [%sp+2223+0] +	srlx	%i3, 16, %g3 +	and	%g3, xffff, %g3 +	stx	%g3, [%sp+2223+8] +	srlx	%i3, 32, %g2 +	and	%g2, xffff, %g2 +	stx	%g2, [%sp+2223+16] +	srlx	%i3, 48, %g3 +	stx	%g3, [%sp+2223+24] +	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff' + +	sllx	%i2, 3, %i2 +	mov	0, cy			C clear cy +	add	%i0, %i2, %i0 +	add	%i1, %i2, %i1 +	neg	%i2 +	add	%i1, 4, %i5 +	add	%i0, -32, %i4 +	add	%i0, -16, %i0 + +	ldd	[%sp+2223+0], v00 +	ldd	[%sp+2223+8], v16 +	ldd	[%sp+2223+16], v32 +	ldd	[%sp+2223+24], v48 +	ld	[%sp+2223+0],%f2	C zero f2 +	ld	[%sp+2223+0],%f4	C zero f4 +	ld	[%i5+%i2], %f3		C read low 32 bits of up[i] +	ld	[%i1+%i2], %f5		C read high 32 bits of up[i] +	fxtod	v00, v00 +	fxtod	v16, v16 +	fxtod	v32, v32 +	fxtod	v48, v48 + +C Start real work.  (We sneakingly read f3 and f5 above...) +C The software pipeline is very deep, requiring 4 feed-in stages. + +	fxtod	%f2, u00 +	fxtod	%f4, u32 +	fmuld	u00, v00, a00 +	fmuld	u00, v16, a16 +	fmuld	u00, v32, p32 +	fmuld	u32, v00, r32 +	fmuld	u00, v48, p48 +	addcc	%i2, 8, %i2 +	bnz,pt	%xcc, .L_two_or_more +	fmuld	u32, v16, r48 + +.L_one: +	fmuld	u32, v32, r64	C FIXME not urgent +	faddd	p32, r32, a32 +	fdtox	a00, a00 +	faddd	p48, r48, a48 +	fmuld	u32, v48, r80	C FIXME not urgent +	fdtox	a16, a16 +	fdtox	a32, a32 +	fdtox	a48, a48 +	std	a00, [%sp+2223+0] +	std	a16, [%sp+2223+8] +	std	a32, [%sp+2223+16] +	std	a48, [%sp+2223+24] +	add	%i2, 8, %i2 + +	fdtox	r64, a00 +	fdtox	r80, a16 +	ldx	[%sp+2223+0], i00 +	ldx	[%sp+2223+8], i16 +	ldx	[%sp+2223+16], i32 +	ldx	[%sp+2223+24], i48 +	std	a00, [%sp+2223+0] +	std	a16, [%sp+2223+8] +	add	%i2, 8, %i2 + +	mov	i00, %g5		C i00+ now in g5 +	ldx	[%sp+2223+0], i00 +	srlx	i16, 48, %l4		C (i16 >> 48) +	mov	i16, %g2 +	ldx	[%sp+2223+8], i16 +	srlx	i48, 16, %l5		C (i48 >> 16) +	mov	i32, %g4		C i32+ now in g4 +	sllx	i48, 32, %l6		C (i48 << 32) +	srlx	%g4, 32, %o3		C (i32 >> 32) +	add	%l5, %l4, %o1		C hi64- in %o1 +	std	a00, [%sp+2223+0] +	sllx	%g4, 16, %o2		C (i32 << 16) +	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT +	std	a16, [%sp+2223+8] +	sllx	%o1, 48, %o3		C (hi64 << 48) +	add	%g2, %o2, %o2		C mi64- in %o2 +	add	%l6, %o2, %o2		C mi64- in %o2 +	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT +	add	cy, %g5, %o4		C x = prev(i00) + cy +	b	.L_out_1 +	add	%i2, 8, %i2 + +.L_two_or_more: +	ld	[%i5+%i2], %f3		C read low 32 bits of up[i] +	fmuld	u32, v32, r64	C FIXME not urgent +	faddd	p32, r32, a32 +	ld	[%i1+%i2], %f5		C read high 32 bits of up[i] +	fdtox	a00, a00 +	faddd	p48, r48, a48 +	fmuld	u32, v48, r80	C FIXME not urgent +	fdtox	a16, a16 +	fdtox	a32, a32 +	fxtod	%f2, u00 +	fxtod	%f4, u32 +	fdtox	a48, a48 +	std	a00, [%sp+2223+0] +	fmuld	u00, v00, p00 +	std	a16, [%sp+2223+8] +	fmuld	u00, v16, p16 +	std	a32, [%sp+2223+16] +	fmuld	u00, v32, p32 +	std	a48, [%sp+2223+24] +	faddd	p00, r64, a00 +	fmuld	u32, v00, r32 +	faddd	p16, r80, a16 +	fmuld	u00, v48, p48 +	addcc	%i2, 8, %i2 +	bnz,pt	%xcc, .L_three_or_more +	fmuld	u32, v16, r48 + +.L_two: +	fmuld	u32, v32, r64	C FIXME not urgent +	faddd	p32, r32, a32 +	fdtox	a00, a00 +	faddd	p48, r48, a48 +	fmuld	u32, v48, r80	C FIXME not urgent +	fdtox	a16, a16 +	ldx	[%sp+2223+0], i00 +	fdtox	a32, a32 +	ldx	[%sp+2223+8], i16 +	ldx	[%sp+2223+16], i32 +	ldx	[%sp+2223+24], i48 +	fdtox	a48, a48 +	std	a00, [%sp+2223+0] +	std	a16, [%sp+2223+8] +	std	a32, [%sp+2223+16] +	std	a48, [%sp+2223+24] +	add	%i2, 8, %i2 + +	fdtox	r64, a00 +	mov	i00, %g5		C i00+ now in g5 +	
fdtox	r80, a16 +	ldx	[%sp+2223+0], i00 +	srlx	i16, 48, %l4		C (i16 >> 48) +	mov	i16, %g2 +	ldx	[%sp+2223+8], i16 +	srlx	i48, 16, %l5		C (i48 >> 16) +	mov	i32, %g4		C i32+ now in g4 +	ldx	[%sp+2223+16], i32 +	sllx	i48, 32, %l6		C (i48 << 32) +	ldx	[%sp+2223+24], i48 +	srlx	%g4, 32, %o3		C (i32 >> 32) +	add	%l5, %l4, %o1		C hi64- in %o1 +	std	a00, [%sp+2223+0] +	sllx	%g4, 16, %o2		C (i32 << 16) +	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT +	std	a16, [%sp+2223+8] +	sllx	%o1, 48, %o3		C (hi64 << 48) +	add	%g2, %o2, %o2		C mi64- in %o2 +	add	%l6, %o2, %o2		C mi64- in %o2 +	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT +	add	cy, %g5, %o4		C x = prev(i00) + cy +	b	.L_out_2 +	add	%i2, 8, %i2 + +.L_three_or_more: +	ld	[%i5+%i2], %f3		C read low 32 bits of up[i] +	fmuld	u32, v32, r64	C FIXME not urgent +	faddd	p32, r32, a32 +	ld	[%i1+%i2], %f5		C read high 32 bits of up[i] +	fdtox	a00, a00 +	faddd	p48, r48, a48 +	fmuld	u32, v48, r80	C FIXME not urgent +	fdtox	a16, a16 +	ldx	[%sp+2223+0], i00 +	fdtox	a32, a32 +	ldx	[%sp+2223+8], i16 +	fxtod	%f2, u00 +	ldx	[%sp+2223+16], i32 +	fxtod	%f4, u32 +	ldx	[%sp+2223+24], i48 +	fdtox	a48, a48 +	std	a00, [%sp+2223+0] +	fmuld	u00, v00, p00 +	std	a16, [%sp+2223+8] +	fmuld	u00, v16, p16 +	std	a32, [%sp+2223+16] +	fmuld	u00, v32, p32 +	std	a48, [%sp+2223+24] +	faddd	p00, r64, a00 +	fmuld	u32, v00, r32 +	faddd	p16, r80, a16 +	fmuld	u00, v48, p48 +	addcc	%i2, 8, %i2 +	bnz,pt	%xcc, .L_four_or_more +	fmuld	u32, v16, r48 + +.L_three: +	fmuld	u32, v32, r64	C FIXME not urgent +	faddd	p32, r32, a32 +	fdtox	a00, a00 +	faddd	p48, r48, a48 +	mov	i00, %g5		C i00+ now in g5 +	fmuld	u32, v48, r80	C FIXME not urgent +	fdtox	a16, a16 +	ldx	[%sp+2223+0], i00 +	fdtox	a32, a32 +	srlx	i16, 48, %l4		C (i16 >> 48) +	mov	i16, %g2 +	ldx	[%sp+2223+8], i16 +	srlx	i48, 16, %l5		C (i48 >> 16) +	mov	i32, %g4		C i32+ now in g4 +	ldx	[%sp+2223+16], i32 +	sllx	i48, 32, %l6		C (i48 << 32) +	ldx	[%sp+2223+24], i48 +	fdtox	a48, a48 +	srlx	%g4, 32, %o3		C (i32 >> 32) +	add	%l5, %l4, %o1		C hi64- in %o1 +	std	a00, [%sp+2223+0] +	sllx	%g4, 16, %o2		C (i32 << 16) +	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT +	std	a16, [%sp+2223+8] +	sllx	%o1, 48, %o3		C (hi64 << 48) +	add	%g2, %o2, %o2		C mi64- in %o2 +	std	a32, [%sp+2223+16] +	add	%l6, %o2, %o2		C mi64- in %o2 +	std	a48, [%sp+2223+24] +	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT +	add	cy, %g5, %o4		C x = prev(i00) + cy +	b	.L_out_3 +	add	%i2, 8, %i2 + +.L_four_or_more: +	ld	[%i5+%i2], %f3		C read low 32 bits of up[i] +	fmuld	u32, v32, r64	C FIXME not urgent +	faddd	p32, r32, a32 +	ld	[%i1+%i2], %f5		C read high 32 bits of up[i] +	fdtox	a00, a00 +	faddd	p48, r48, a48 +	mov	i00, %g5		C i00+ now in g5 +	fmuld	u32, v48, r80	C FIXME not urgent +	fdtox	a16, a16 +	ldx	[%sp+2223+0], i00 +	fdtox	a32, a32 +	srlx	i16, 48, %l4		C (i16 >> 48) +	mov	i16, %g2 +	ldx	[%sp+2223+8], i16 +	fxtod	%f2, u00 +	srlx	i48, 16, %l5		C (i48 >> 16) +	mov	i32, %g4		C i32+ now in g4 +	ldx	[%sp+2223+16], i32 +	fxtod	%f4, u32 +	sllx	i48, 32, %l6		C (i48 << 32) +	ldx	[%sp+2223+24], i48 +	fdtox	a48, a48 +	srlx	%g4, 32, %o3		C (i32 >> 32) +	add	%l5, %l4, %o1		C hi64- in %o1 +	std	a00, [%sp+2223+0] +	fmuld	u00, v00, p00 +	sllx	%g4, 16, %o2		C (i32 << 16) +	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT +	std	a16, [%sp+2223+8] +	fmuld	u00, v16, p16 +	sllx	%o1, 48, %o3		C (hi64 << 48) +	add	%g2, %o2, %o2		C mi64- in %o2 +	std	a32, [%sp+2223+16] +	fmuld	u00, v32, p32 +	add	%l6, %o2, %o2		C mi64- in %o2 +	std	a48, [%sp+2223+24] +	faddd	p00, r64, a00 +	fmuld	u32, v00, 
r32 +	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT +	faddd	p16, r80, a16 +	fmuld	u00, v48, p48 +	add	cy, %g5, %o4		C x = prev(i00) + cy +	addcc	%i2, 8, %i2 +	bnz,pt	%xcc, .Loop +	fmuld	u32, v16, r48 + +.L_four: +	b,a	.L_out_4 + +C BEGIN MAIN LOOP +	.align	16 +.Loop: +C 00 +	srlx	%o4, 16, %o5		C (x >> 16) +	ld	[%i5+%i2], %f3		C read low 32 bits of up[i] +	fmuld	u32, v32, r64	C FIXME not urgent +	faddd	p32, r32, a32 +C 01 +	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT +	and	%o4, xffff, %o5		C (x & 0xffff) +	ld	[%i1+%i2], %f5		C read high 32 bits of up[i] +	fdtox	a00, a00 +C 02 +	faddd	p48, r48, a48 +C 03 +	srlx	%o2, 48, %o7		C (mi64 >> 48) +	mov	i00, %g5		C i00+ now in g5 +	fmuld	u32, v48, r80	C FIXME not urgent +	fdtox	a16, a16 +C 04 +	sllx	%o2, 16, %i3		C (mi64 << 16) +	add	%o7, %o1, cy		C new cy +	ldx	[%sp+2223+0], i00 +	fdtox	a32, a32 +C 05 +	srlx	i16, 48, %l4		C (i16 >> 48) +	mov	i16, %g2 +	ldx	[%sp+2223+8], i16 +	fxtod	%f2, u00 +C 06 +	srlx	i48, 16, %l5		C (i48 >> 16) +	mov	i32, %g4		C i32+ now in g4 +	ldx	[%sp+2223+16], i32 +	fxtod	%f4, u32 +C 07 +	sllx	i48, 32, %l6		C (i48 << 32) +	or	%i3, %o5, %o5 +	ldx	[%sp+2223+24], i48 +	fdtox	a48, a48 +C 08 +	srlx	%g4, 32, %o3		C (i32 >> 32) +	add	%l5, %l4, %o1		C hi64- in %o1 +	std	a00, [%sp+2223+0] +	fmuld	u00, v00, p00 +C 09 +	sllx	%g4, 16, %o2		C (i32 << 16) +	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT +	std	a16, [%sp+2223+8] +	fmuld	u00, v16, p16 +C 10 +	sllx	%o1, 48, %o3		C (hi64 << 48) +	add	%g2, %o2, %o2		C mi64- in %o2 +	std	a32, [%sp+2223+16] +	fmuld	u00, v32, p32 +C 11 +	add	%l6, %o2, %o2		C mi64- in %o2 +	std	a48, [%sp+2223+24] +	faddd	p00, r64, a00 +	fmuld	u32, v00, r32 +C 12 +	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT +	stx	%o5, [%i4+%i2] +	faddd	p16, r80, a16 +	fmuld	u00, v48, p48 +C 13 +	add	cy, %g5, %o4		C x = prev(i00) + cy +	addcc	%i2, 8, %i2 +	bnz,pt	%xcc, .Loop +	fmuld	u32, v16, r48 +C END MAIN LOOP + +.L_out_4: +	srlx	%o4, 16, %o5		C (x >> 16) +	fmuld	u32, v32, r64	C FIXME not urgent +	faddd	p32, r32, a32 +	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT +	and	%o4, xffff, %o5		C (x & 0xffff) +	fdtox	a00, a00 +	faddd	p48, r48, a48 +	srlx	%o2, 48, %o7		C (mi64 >> 48) +	mov	i00, %g5		C i00+ now in g5 +	fmuld	u32, v48, r80	C FIXME not urgent +	fdtox	a16, a16 +	sllx	%o2, 16, %i3		C (mi64 << 16) +	add	%o7, %o1, cy		C new cy +	ldx	[%sp+2223+0], i00 +	fdtox	a32, a32 +	srlx	i16, 48, %l4		C (i16 >> 48) +	mov	i16, %g2 +	ldx	[%sp+2223+8], i16 +	srlx	i48, 16, %l5		C (i48 >> 16) +	mov	i32, %g4		C i32+ now in g4 +	ldx	[%sp+2223+16], i32 +	sllx	i48, 32, %l6		C (i48 << 32) +	or	%i3, %o5, %o5 +	ldx	[%sp+2223+24], i48 +	fdtox	a48, a48 +	srlx	%g4, 32, %o3		C (i32 >> 32) +	add	%l5, %l4, %o1		C hi64- in %o1 +	std	a00, [%sp+2223+0] +	sllx	%g4, 16, %o2		C (i32 << 16) +	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT +	std	a16, [%sp+2223+8] +	sllx	%o1, 48, %o3		C (hi64 << 48) +	add	%g2, %o2, %o2		C mi64- in %o2 +	std	a32, [%sp+2223+16] +	add	%l6, %o2, %o2		C mi64- in %o2 +	std	a48, [%sp+2223+24] +	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT +	stx	%o5, [%i4+%i2] +	add	cy, %g5, %o4		C x = prev(i00) + cy +	add	%i2, 8, %i2 +.L_out_3: +	srlx	%o4, 16, %o5		C (x >> 16) +	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT +	and	%o4, xffff, %o5		C (x & 0xffff) +	fdtox	r64, a00 +	srlx	%o2, 48, %o7		C (mi64 >> 48) +	mov	i00, %g5		C i00+ now in g5 +	fdtox	r80, a16 +	sllx	%o2, 16, %i3		C (mi64 << 16) +	add	%o7, %o1, cy		C new cy +	ldx	[%sp+2223+0], i00 +	srlx	i16, 48, %l4		C (i16 >> 48) +	mov	i16, %g2 +	ldx	[%sp+2223+8], i16 +	
srlx	i48, 16, %l5		C (i48 >> 16) +	mov	i32, %g4		C i32+ now in g4 +	ldx	[%sp+2223+16], i32 +	sllx	i48, 32, %l6		C (i48 << 32) +	or	%i3, %o5, %o5 +	ldx	[%sp+2223+24], i48 +	srlx	%g4, 32, %o3		C (i32 >> 32) +	add	%l5, %l4, %o1		C hi64- in %o1 +	std	a00, [%sp+2223+0] +	sllx	%g4, 16, %o2		C (i32 << 16) +	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT +	std	a16, [%sp+2223+8] +	sllx	%o1, 48, %o3		C (hi64 << 48) +	add	%g2, %o2, %o2		C mi64- in %o2 +	add	%l6, %o2, %o2		C mi64- in %o2 +	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT +	stx	%o5, [%i4+%i2] +	add	cy, %g5, %o4		C x = prev(i00) + cy +	add	%i2, 8, %i2 +.L_out_2: +	srlx	%o4, 16, %o5		C (x >> 16) +	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT +	and	%o4, xffff, %o5		C (x & 0xffff) +	srlx	%o2, 48, %o7		C (mi64 >> 48) +	mov	i00, %g5		C i00+ now in g5 +	sllx	%o2, 16, %i3		C (mi64 << 16) +	add	%o7, %o1, cy		C new cy +	ldx	[%sp+2223+0], i00 +	srlx	i16, 48, %l4		C (i16 >> 48) +	mov	i16, %g2 +	ldx	[%sp+2223+8], i16 +	srlx	i48, 16, %l5		C (i48 >> 16) +	mov	i32, %g4		C i32+ now in g4 +	sllx	i48, 32, %l6		C (i48 << 32) +	or	%i3, %o5, %o5 +	srlx	%g4, 32, %o3		C (i32 >> 32) +	add	%l5, %l4, %o1		C hi64- in %o1 +	sllx	%g4, 16, %o2		C (i32 << 16) +	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT +	sllx	%o1, 48, %o3		C (hi64 << 48) +	add	%g2, %o2, %o2		C mi64- in %o2 +	add	%l6, %o2, %o2		C mi64- in %o2 +	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT +	stx	%o5, [%i4+%i2] +	add	cy, %g5, %o4		C x = prev(i00) + cy +	add	%i2, 8, %i2 +.L_out_1: +	srlx	%o4, 16, %o5		C (x >> 16) +	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT +	and	%o4, xffff, %o5		C (x & 0xffff) +	srlx	%o2, 48, %o7		C (mi64 >> 48) +	sllx	%o2, 16, %i3		C (mi64 << 16) +	add	%o7, %o1, cy		C new cy +	or	%i3, %o5, %o5 +	stx	%o5, [%i4+%i2] + +	sllx	i00, 0, %g2 +	add	%g2, cy, cy +	sllx	i16, 16, %g3 +	add	%g3, cy, cy + +	return	%i7+8 +	mov	cy, %o0 +EPILOGUE(mpn_mul_1) diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm new file mode 100644 index 0000000..43c69d3 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm @@ -0,0 +1,342 @@ +dnl  SPARC v9 64-bit mpn_sqr_diagonal. + +dnl  Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC 1&2:     22 +C UltraSPARC 3:	      36 + +C This was generated by the Sun C compiler.  
It runs at 22 cycles/limb on the +C UltraSPARC-1/2, three cycles slower than theoretically possible for optimal +C code using the same algorithm.  For 1-3 limbs, a special loop was generated, +C which causes performance problems in particular for 2 and 3 limbs. +C Ultimately, this should be replaced by hand-written code in the same software +C pipeline style as e.g., addmul_1.asm. + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_sqr_diagonal) +	save	%sp, -240, %sp + +	sethi	%hi(0x1ffc00), %o0 +	sethi	%hi(0x3ffc00), %o1 +	add	%o0, 1023, %o7 +	cmp	%i2, 4 +	add	%o1, 1023, %o4 +	or	%g0, %i1, %g1 +	or	%g0, %i0, %o0 +	bl,pn	%xcc, .Lsmall +	or	%g0, 0, %g2 + +	ldx	[%i1], %o1 +	add	%i1, 24, %g1 +	or	%g0, 3, %g2 +	srlx	%o1, 42, %g3 +	stx	%g3, [%sp+2279] +	and	%o1, %o7, %o2 +	stx	%o2, [%sp+2263] +	srlx	%o1, 21, %o1 +	ldd	[%sp+2279], %f0 +	and	%o1, %o7, %o1 +	stx	%o1, [%sp+2271] +	ldx	[%i1+8], %o2 +	fxtod	%f0, %f12 +	srlx	%o2, 21, %o1 +	and	%o2, %o7, %g3 +	ldd	[%sp+2263], %f2 +	fmuld	%f12, %f12, %f10 +	srlx	%o2, 42, %o2 +	ldd	[%sp+2271], %f0 +	and	%o1, %o7, %o1 +	fxtod	%f2, %f8 +	stx	%o2, [%sp+2279] +	stx	%o1, [%sp+2271] +	fxtod	%f0, %f0 +	stx	%g3, [%sp+2263] +	fdtox	%f10, %f14 +	fmuld	%f12, %f8, %f6 +	ldx	[%i1+16], %o2 +	std	%f14, [%sp+2255] +	fmuld	%f0, %f0, %f2 +	fmuld	%f8, %f8, %f10 +	srlx	%o2, 42, %o1 +	faddd	%f6, %f6, %f6 +	fmuld	%f12, %f0, %f12 +	fmuld	%f0, %f8, %f8 +	ldd	[%sp+2279], %f0 +	ldd	[%sp+2263], %f4 +	fdtox	%f10, %f10 +	std	%f10, [%sp+2239] +	faddd	%f2, %f6, %f6 +	ldd	[%sp+2271], %f2 +	fdtox	%f12, %f12 +	std	%f12, [%sp+2247] +	fdtox	%f8, %f8 +	std	%f8, [%sp+2231] +	fdtox	%f6, %f6 +	std	%f6, [%sp+2223] + +.Loop:	srlx	%o2, 21, %g3 +	stx	%o1, [%sp+2279] +	add	%g2, 1, %g2 +	and	%g3, %o7, %o1 +	ldx	[%sp+2255], %g4 +	cmp	%g2, %i2 +	stx	%o1, [%sp+2271] +	add	%g1, 8, %g1 +	add	%o0, 16, %o0 +	ldx	[%sp+2239], %o1 +	fxtod	%f0, %f10 +	fxtod	%f4, %f14 +	ldx	[%sp+2231], %i0 +	ldx	[%sp+2223], %g5 +	ldx	[%sp+2247], %g3 +	and	%o2, %o7, %o2 +	fxtod	%f2, %f8 +	fmuld	%f10, %f10, %f0 +	stx	%o2, [%sp+2263] +	fmuld	%f10, %f14, %f6 +	ldx	[%g1-8], %o2 +	fmuld	%f10, %f8, %f12 +	fdtox	%f0, %f2 +	ldd	[%sp+2279], %f0 +	fmuld	%f8, %f8, %f4 +	faddd	%f6, %f6, %f6 +	fmuld	%f14, %f14, %f10 +	std	%f2, [%sp+2255] +	sllx	%g4, 20, %g4 +	ldd	[%sp+2271], %f2 +	fmuld	%f8, %f14, %f8 +	sllx	%i0, 22, %i1 +	fdtox	%f12, %f12 +	std	%f12, [%sp+2247] +	sllx	%g5, 42, %i0 +	add	%o1, %i1, %o1 +	faddd	%f4, %f6, %f6 +	ldd	[%sp+2263], %f4 +	add	%o1, %i0, %o1 +	add	%g3, %g4, %g3 +	fdtox	%f10, %f10 +	std	%f10, [%sp+2239] +	srlx	%o1, 42, %g4 +	and	%g5, %o4, %i0 +	fdtox	%f8, %f8 +	std	%f8, [%sp+2231] +	srlx	%g5, 22, %g5 +	sub	%g4, %i0, %g4 +	fdtox	%f6, %f6 +	std	%f6, [%sp+2223] +	srlx	%g4, 63, %g4 +	add	%g3, %g5, %g3 +	add	%g3, %g4, %g3 +	stx	%o1, [%o0-16] +	srlx	%o2, 42, %o1 +	bl,pt	%xcc, .Loop +	stx	%g3, [%o0-8] + +	stx	%o1, [%sp+2279] +	srlx	%o2, 21, %o1 +	fxtod	%f0, %f16 +	ldx	[%sp+2223], %g3 +	fxtod	%f4, %f6 +	and	%o2, %o7, %o3 +	stx	%o3, [%sp+2263] +	fxtod	%f2, %f4 +	and	%o1, %o7, %o1 +	ldx	[%sp+2231], %o2 +	sllx	%g3, 42, %g4 +	fmuld	%f16, %f16, %f14 +	stx	%o1, [%sp+2271] +	fmuld	%f16, %f6, %f8 +	add	%o0, 48, %o0 +	ldx	[%sp+2239], %o1 +	sllx	%o2, 22, %o2 +	fmuld	%f4, %f4, %f10 +	ldx	[%sp+2255], %o3 +	fdtox	%f14, %f14 +	fmuld	%f4, %f6, %f2 +	std	%f14, [%sp+2255] +	faddd	%f8, %f8, %f12 +	add	%o1, %o2, %o2 +	fmuld	%f16, %f4, %f4 +	ldd	[%sp+2279], %f0 +	sllx	%o3, 20, %g5 +	add	%o2, %g4, %o2 +	fmuld	%f6, %f6, %f6 +	srlx	%o2, 42, %o3 +	and	%g3, %o4, %g4 +	srlx	%g3, 22, %g3 +	faddd	%f10, %f12, %f16 +	ldd	
[%sp+2271], %f12 +	ldd	[%sp+2263], %f8 +	fxtod	%f0, %f0 +	sub	%o3, %g4, %o3 +	ldx	[%sp+2247], %o1 +	srlx	%o3, 63, %o3 +	fdtox	%f2, %f10 +	fxtod	%f8, %f8 +	std	%f10, [%sp+2231] +	fdtox	%f6, %f6 +	std	%f6, [%sp+2239] +	add	%o1, %g5, %o1 +	fmuld	%f0, %f0, %f2 +	fdtox	%f16, %f16 +	std	%f16, [%sp+2223] +	add	%o1, %g3, %o1 +	fdtox	%f4, %f4 +	std	%f4, [%sp+2247] +	fmuld	%f0, %f8, %f10 +	fxtod	%f12, %f12 +	add	%o1, %o3, %o1 +	stx	%o2, [%o0-48] +	fmuld	%f8, %f8, %f6 +	stx	%o1, [%o0-40] +	fdtox	%f2, %f2 +	ldx	[%sp+2231], %o2 +	faddd	%f10, %f10, %f10 +	ldx	[%sp+2223], %g3 +	fmuld	%f12, %f12, %f4 +	fdtox	%f6, %f6 +	ldx	[%sp+2239], %o1 +	sllx	%o2, 22, %o2 +	fmuld	%f12, %f8, %f8 +	sllx	%g3, 42, %g5 +	ldx	[%sp+2255], %o3 +	fmuld	%f0, %f12, %f0 +	add	%o1, %o2, %o2 +	faddd	%f4, %f10, %f4 +	ldx	[%sp+2247], %o1 +	add	%o2, %g5, %o2 +	and	%g3, %o4, %g4 +	fdtox	%f8, %f8 +	sllx	%o3, 20, %g5 +	std	%f8, [%sp+2231] +	fdtox	%f0, %f0 +	srlx	%o2, 42, %o3 +	add	%o1, %g5, %o1 +	fdtox	%f4, %f4 +	srlx	%g3, 22, %g3 +	sub	%o3, %g4, %o3 +	std	%f6, [%sp+2239] +	std	%f4, [%sp+2223] +	srlx	%o3, 63, %o3 +	add	%o1, %g3, %o1 +	std	%f2, [%sp+2255] +	add	%o1, %o3, %o1 +	std	%f0, [%sp+2247] +	stx	%o2, [%o0-32] +	stx	%o1, [%o0-24] +	ldx	[%sp+2231], %o2 +	ldx	[%sp+2223], %o3 +	ldx	[%sp+2239], %o1 +	sllx	%o2, 22, %o2 +	sllx	%o3, 42, %g5 +	ldx	[%sp+2255], %g4 +	and	%o3, %o4, %g3 +	add	%o1, %o2, %o2 +	ldx	[%sp+2247], %o1 +	add	%o2, %g5, %o2 +	stx	%o2, [%o0-16] +	sllx	%g4, 20, %g4 +	srlx	%o2, 42, %o2 +	add	%o1, %g4, %o1 +	srlx	%o3, 22, %o3 +	sub	%o2, %g3, %o2 +	srlx	%o2, 63, %o2 +	add	%o1, %o3, %o1 +	add	%o1, %o2, %o1 +	stx	%o1, [%o0-8] +	ret +	restore	%g0, %g0, %g0 +.Lsmall: +	ldx	[%g1], %o2 +.Loop0: +	and	%o2, %o7, %o1 +	stx	%o1, [%sp+2263] +	add	%g2, 1, %g2 +	srlx	%o2, 21, %o1 +	add	%g1, 8, %g1 +	srlx	%o2, 42, %o2 +	stx	%o2, [%sp+2279] +	and	%o1, %o7, %o1 +	ldd	[%sp+2263], %f0 +	cmp	%g2, %i2 +	stx	%o1, [%sp+2271] +	fxtod	%f0, %f6 +	ldd	[%sp+2279], %f0 +	ldd	[%sp+2271], %f4 +	fxtod	%f0, %f2 +	fmuld	%f6, %f6, %f0 +	fxtod	%f4, %f10 +	fmuld	%f2, %f6, %f4 +	fdtox	%f0, %f0 +	std	%f0, [%sp+2239] +	fmuld	%f10, %f6, %f8 +	fmuld	%f10, %f10, %f0 +	faddd	%f4, %f4, %f6 +	fmuld	%f2, %f2, %f4 +	fdtox	%f8, %f8 +	std	%f8, [%sp+2231] +	fmuld	%f2, %f10, %f2 +	faddd	%f0, %f6, %f0 +	fdtox	%f4, %f4 +	std	%f4, [%sp+2255] +	fdtox	%f2, %f2 +	std	%f2, [%sp+2247] +	fdtox	%f0, %f0 +	std	%f0, [%sp+2223] +	ldx	[%sp+2239], %o1 +	ldx	[%sp+2255], %g4 +	ldx	[%sp+2231], %o2 +	sllx	%g4, 20, %g4 +	ldx	[%sp+2223], %o3 +	sllx	%o2, 22, %o2 +	sllx	%o3, 42, %g5 +	add	%o1, %o2, %o2 +	ldx	[%sp+2247], %o1 +	add	%o2, %g5, %o2 +	stx	%o2, [%o0] +	and	%o3, %o4, %g3 +	srlx	%o2, 42, %o2 +	add	%o1, %g4, %o1 +	srlx	%o3, 22, %o3 +	sub	%o2, %g3, %o2 +	srlx	%o2, 63, %o2 +	add	%o1, %o3, %o1 +	add	%o1, %o2, %o1 +	stx	%o1, [%o0+8] +	add	%o0, 16, %o0 +	bl,a,pt	%xcc, .Loop0 +	ldx	[%g1], %o2 +	ret +	restore	%g0, %g0, %g0 +EPILOGUE(mpn_sqr_diagonal) diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm new file mode 100644 index 0000000..9fb7f70 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm @@ -0,0 +1,241 @@ +dnl  SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl  store difference in a third limb vector. + +dnl  Copyright 2001-2003, 2011 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. 
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC 1&2:     4
+C UltraSPARC 3:	      4.5
+
+C Compute carry-out from the most significant bits of u, v, and r, where
+C r = u - v - carry_in, using logic operations: the borrow out of bit 63 is
+C ((~u & v) | (r & (~u | v))) >> 63, which the orn/andn sequences below build.
+
+C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4-insn
+C recurrence, and on UltraSPARC 1 and 2 the IE units are 100% saturated.
+C Therefore, it seems futile to try to optimize this any further...
+
+C INPUT PARAMETERS
+define(`rp',`%i0')
+define(`up',`%i1')
+define(`vp',`%i2')
+define(`n',`%i3')
+
+define(`u0',`%l0')
+define(`u1',`%l2')
+define(`u2',`%l4')
+define(`u3',`%l6')
+define(`v0',`%l1')
+define(`v1',`%l3')
+define(`v2',`%l5')
+define(`v3',`%l7')
+
+define(`cy',`%i4')
+
+define(`fanop',`fitod %f0,%f2')		dnl  A quasi nop running in the FA pipe
+define(`fmnop',`fmuld %f0,%f0,%f4')	dnl  A quasi nop running in the FM pipe
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_sub_nc)
+	save	%sp,-160,%sp
+
+	fitod	%f0,%f0		C make sure f0 contains a small, quiet number
+	subcc	n,4,%g0
+	bl,pn	%xcc,.Loop0
+	nop
+	b,a	L(com)
+EPILOGUE()
+
+PROLOGUE(mpn_sub_n)
+	save	%sp,-160,%sp
+
+	fitod	%f0,%f0		C make sure f0 contains a small, quiet number
+	subcc	n,4,%g0
+	bl,pn	%xcc,.Loop0
+	mov	0,cy
+L(com):
+	ldx	[up+0],u0
+	ldx	[vp+0],v0
+	add	up,32,up
+	ldx	[up-24],u1
+	ldx	[vp+8],v1
+	add	vp,32,vp
+	ldx	[up-16],u2
+	ldx	[vp-16],v2
+	ldx	[up-8],u3
+	ldx	[vp-8],v3
+	subcc	n,8,n
+	sub	u0,v0,%g1	C main sub
+	sub	%g1,cy,%g5	C carry sub
+	orn	u0,v0,%g2
+	bl,pn	%xcc,.Lend4567
+	fanop
+	b,a	.Loop
+
+	.align	16
+C START MAIN LOOP
+.Loop:	orn	%g5,%g2,%g2
+	andn	u0,v0,%g3
+	ldx	[up+0],u0
+	fanop
+C --
+	andn	%g2,%g3,%g2
+	ldx	[vp+0],v0
+	add	up,32,up
+	fanop
+C --
+	srlx	%g2,63,cy
+	sub	u1,v1,%g1
+	stx	%g5,[rp+0]
+	fanop
+C --
+	sub	%g1,cy,%g5
+	orn	u1,v1,%g2
+	fmnop
+	fanop
+C --
+	orn	%g5,%g2,%g2
+	andn	u1,v1,%g3
+	ldx	[up-24],u1
+	fanop
+C --
+	andn	%g2,%g3,%g2
+	ldx	[vp+8],v1
+	add	vp,32,vp
+	fanop
+C --
+	srlx	%g2,63,cy
+	sub	u2,v2,%g1
+	stx	%g5,[rp+8]
+	fanop
+C --
+	sub	%g1,cy,%g5
+	orn	u2,v2,%g2
+	fmnop
+	fanop
+C --
+	orn	%g5,%g2,%g2
+	andn	u2,v2,%g3
+	ldx	[up-16],u2
+	fanop
+C --
+	andn	%g2,%g3,%g2
+	ldx	[vp-16],v2
+	add	rp,32,rp
+	fanop
+C --
+	srlx	%g2,63,cy
+	sub	u3,v3,%g1
+	stx	%g5,[rp-16]
+	fanop
+C --
+	sub	%g1,cy,%g5
+	orn	u3,v3,%g2
+	fmnop
+	fanop
+C --
+	orn	%g5,%g2,%g2
+	andn	u3,v3,%g3
+	ldx	[up-8],u3
+	fanop
+C --
+	andn	%g2,%g3,%g2
+	subcc	n,4,n
+	ldx	[vp-8],v3
+	fanop
+C --
+	srlx	%g2,63,cy
+	sub	u0,v0,%g1
+	stx
%g5,[rp-8] +	fanop +C -- +	sub	%g1,cy,%g5 +	orn	u0,v0,%g2 +	bge,pt	%xcc,.Loop +	fanop +C END MAIN LOOP +.Lend4567: +	orn	%g5,%g2,%g2 +	andn	u0,v0,%g3 +	andn	%g2,%g3,%g2 +	srlx	%g2,63,cy +	sub	u1,v1,%g1 +	stx	%g5,[rp+0] +	sub	%g1,cy,%g5 +	orn	u1,v1,%g2 +	orn	%g5,%g2,%g2 +	andn	u1,v1,%g3 +	andn	%g2,%g3,%g2 +	srlx	%g2,63,cy +	sub	u2,v2,%g1 +	stx	%g5,[rp+8] +	sub	%g1,cy,%g5 +	orn	u2,v2,%g2 +	orn	%g5,%g2,%g2 +	andn	u2,v2,%g3 +	andn	%g2,%g3,%g2 +	add	rp,32,rp +	srlx	%g2,63,cy +	sub	u3,v3,%g1 +	stx	%g5,[rp-16] +	sub	%g1,cy,%g5 +	orn	u3,v3,%g2 +	orn	%g5,%g2,%g2 +	andn	u3,v3,%g3 +	andn	%g2,%g3,%g2 +	srlx	%g2,63,cy +	stx	%g5,[rp-8] + +	addcc	n,4,n +	bz,pn	%xcc,.Lret +	fanop + +.Loop0:	ldx	[up],u0 +	add	up,8,up +	ldx	[vp],v0 +	add	vp,8,vp +	add	rp,8,rp +	subcc	n,1,n +	sub	u0,v0,%g1 +	orn	u0,v0,%g2 +	sub	%g1,cy,%g5 +	andn	u0,v0,%g3 +	orn	%g5,%g2,%g2 +	stx	%g5,[rp-8] +	andn	%g2,%g3,%g2 +	bnz,pt	%xcc,.Loop0 +	srlx	%g2,63,cy + +.Lret:	mov	cy,%i0 +	ret +	restore +EPILOGUE(mpn_sub_n) diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm new file mode 100644 index 0000000..0bdb566 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm @@ -0,0 +1,68 @@ +dnl  SPARC v9 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and +dnl  subtract the result from a second limb vector. + +dnl  Copyright 2001-2003 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC 1&2:     18 +C UltraSPARC 3:	      23 + +C INPUT PARAMETERS +C rp	i0 +C up	i1 +C n	i2 +C v	i3 + +ASM_START() +	REGISTER(%g2,#scratch) + +PROLOGUE(mpn_submul_1) +	save	%sp,-176,%sp + +	sllx	%i2, 3, %g2 +	or	%g0, %i1, %o1 +	add	%g2, 15, %o0 +	or	%g0, %i2, %o2 +	and	%o0, -16, %o0 +	sub	%sp, %o0, %sp +	add	%sp, 2223, %o0 +	or	%g0, %o0, %l0 +	call	mpn_mul_1 +	or	%g0, %i3, %o3 +	or	%g0, %o0, %l1		C preserve carry value from mpn_mul_1 +	or	%g0, %i0, %o0 +	or	%g0, %i0, %o1 +	or	%g0, %l0, %o2 +	call	mpn_sub_n +	or	%g0, %i2, %o3 +	ret +	restore	%l1, %o0, %o0		C sum carry values +EPILOGUE(mpn_submul_1) diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc34/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc34/gmp-mparam.h new file mode 100644 index 0000000..c88e680 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparc34/gmp-mparam.h @@ -0,0 +1,222 @@ +/* ultrasparc3/4 gmp-mparam.h -- Compiler/machine parameter header file. 
+ +Copyright 1991, 1993, 1994, 1999-2002, 2004, 2006, 2008-2010, 2014, 2015 Free +Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + +  * the GNU Lesser General Public License as published by the Free +    Software Foundation; either version 3 of the License, or (at your +    option) any later version. + +or + +  * the GNU General Public License as published by the Free Software +    Foundation; either version 2 of the License, or (at your option) any +    later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library.  If not, +see https://www.gnu.org/licenses/.  */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1593 MHz ultrasparc3 running Solaris 10 (swift.nada.kth.se) */ +/* FFT tuning limit = 100 M */ +/* Generated by tuneup.c, 2015-10-09, gcc 3.4 */ + +#define DIVREM_1_NORM_THRESHOLD              0  /* always */ +#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */ +#define MOD_1_1P_METHOD                      2 +#define MOD_1_NORM_THRESHOLD                 0  /* always */ +#define MOD_1_UNNORM_THRESHOLD               0  /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD          7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD          5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD        22 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     29 +#define USE_PREINV_DIVREM_1                  1 +#define DIV_QR_1N_PI1_METHOD                 1 +#define DIV_QR_1_NORM_THRESHOLD              2 +#define DIV_QR_1_UNNORM_THRESHOLD            1 +#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */ +#define DIVEXACT_1_THRESHOLD                 0  /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */ + +#define MUL_TOOM22_THRESHOLD                28 +#define MUL_TOOM33_THRESHOLD                93 +#define MUL_TOOM44_THRESHOLD               142 +#define MUL_TOOM6H_THRESHOLD               165 +#define MUL_TOOM8H_THRESHOLD               278 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD      93 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD     114 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD      88 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD      50 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD      67 + +#define SQR_BASECASE_THRESHOLD               7 +#define SQR_TOOM2_THRESHOLD                 70 +#define SQR_TOOM3_THRESHOLD                101 +#define SQR_TOOM4_THRESHOLD                184 +#define SQR_TOOM6_THRESHOLD                  0  /* always */ +#define SQR_TOOM8_THRESHOLD                339 + +#define MULMID_TOOM42_THRESHOLD             40 + +#define MULMOD_BNM1_THRESHOLD               14 +#define SQRMOD_BNM1_THRESHOLD                9 + +#define MUL_FFT_MODF_THRESHOLD             212  /* k = 5 */ +#define MUL_FFT_TABLE3                                      \ +  { {    212, 5}, {     13, 6}, {     17, 7}, {      9, 6}, \ +    {     19, 7}, {     17, 8}, {      9, 7}, {     20, 8}, \ +    {     11, 7}, {     23, 8}, {     13, 9}, {      7, 8}, \ +    {     19, 9}, {     11, 8}, {     25,10}, {      7, 9}, \ + 
   {     15, 8}, {     33, 9}, {     19, 8}, {     39, 9}, \ +    {     23, 8}, {     47, 9}, {     27,10}, {     15, 9}, \ +    {     39,10}, {     23, 9}, {     47,11}, {     15,10}, \ +    {     31, 9}, {     63, 8}, {    127, 7}, {    255, 9}, \ +    {     67,10}, {     39, 9}, {     79, 8}, {    159, 7}, \ +    {    319, 9}, {     83,10}, {     47, 9}, {     95, 8}, \ +    {    191, 7}, {    383,10}, {     55,11}, {     31,10}, \ +    {     63, 9}, {    127, 8}, {    255, 7}, {    511,10}, \ +    {     71, 9}, {    143, 8}, {    287,10}, {     79, 9}, \ +    {    159, 8}, {    319, 9}, {    175, 8}, {    351,11}, \ +    {     47,10}, {     95, 9}, {    191, 8}, {    383, 7}, \ +    {    767,10}, {    103,12}, {     31,11}, {     63,10}, \ +    {    127, 9}, {    287,11}, {     79,10}, {    159, 9}, \ +    {    319, 8}, {    639,10}, {    175, 9}, {    351, 8}, \ +    {    703,11}, {     95,10}, {    207, 9}, {    415,11}, \ +    {    111,10}, {    223, 9}, {    479,12}, {     63,11}, \ +    {    127,10}, {    255,11}, {    143,10}, {    287, 9}, \ +    {    575,10}, {    319, 9}, {    639,11}, {    175,10}, \ +    {    351,11}, {    191,10}, {    383,11}, {    207,10}, \ +    {    415,11}, {    223,10}, {    447,13}, {     63,12}, \ +    {    127,11}, {    287,10}, {    575,11}, {    319,10}, \ +    {    703,12}, {    191,11}, {    383,12}, {    223,11}, \ +    {    447,13}, {    127,12}, {    287,11}, {    575,12}, \ +    {    351,13}, {    191,12}, {    479,14}, {    127,13}, \ +    {    255,12}, {    575,13}, {    319,12}, {    703,13}, \ +    {    383,12}, {    767,13}, {    447,12}, {    895,14}, \ +    {    255,13}, {    511,12}, {   1023,13}, {    575,12}, \ +    {   1151,13}, {    703,14}, {    383,13}, {    831,12}, \ +    {   1663,13}, {    895,15}, {    255,14}, {    511,13}, \ +    {   1151,14}, {    639,13}, {   1407,12}, {   2815,14}, \ +    {    767,13}, {   1663,14}, {    895,13}, {   1791,15}, \ +    {    511,14}, {   1023,13}, {   2047,14}, {   1151,13}, \ +    {   2303,14}, {   1407,13}, {   2815,15}, {    767,14}, \ +    {   1791,16}, {    511,15}, {   1023,14}, {   2303,15}, \ +    {   1279,14}, {   2815,15}, {   1535,14}, {   3199,15}, \ +    {   1791,16}, {   1023,15}, {   2047,14}, {   4223,15}, \ +    {   2303,14}, {   4863,15}, {   2815,16}, {  65536,17}, \ +    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ +    {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 171 +#define MUL_FFT_THRESHOLD                 2240 + +#define SQR_FFT_MODF_THRESHOLD             244  /* k = 5 */ +#define SQR_FFT_TABLE3                                      \ +  { {    244, 5}, {      8, 4}, {     17, 5}, {     15, 6}, \ +    {      8, 5}, {     17, 6}, {     17, 7}, {      9, 6}, \ +    {     19, 7}, {     17, 8}, {      9, 7}, {     20, 8}, \ +    {     11, 7}, {     23, 8}, {     13, 9}, {      7, 8}, \ +    {     19, 9}, {     11, 8}, {     25,10}, {      7, 9}, \ +    {     15, 8}, {     31, 9}, {     19, 8}, {     39, 9}, \ +    {     27,10}, {     15, 9}, {     39,10}, {     23, 9}, \ +    {     47,11}, {     15,10}, {     31, 9}, {     67,10}, \ +    {     39, 9}, {     79, 8}, {    159,10}, {     47, 9}, \ +    {     95,10}, {     55,11}, {     31,10}, {     63, 9}, \ +    {    127, 8}, {    255,10}, {     71, 9}, {    143, 8}, \ +    {    287, 7}, {    575,10}, {     79, 9}, {    159,11}, \ +    {     47, 9}, {    191, 8}, {    383, 7}, {    767, 9}, \ +    {    207,12}, {     31,11}, {     63,10}, {    127, 9}, \ +    {    255, 8}, {    
511,10}, {    135, 9}, {    271,10}, \ +    {    143, 9}, {    287,11}, {     79,10}, {    159, 9}, \ +    {    319, 8}, {    639,10}, {    175, 9}, {    351, 8}, \ +    {    703, 7}, {   1407,11}, {     95,10}, {    191, 9}, \ +    {    383, 8}, {    767,10}, {    207, 9}, {    415,10}, \ +    {    223, 9}, {    447,12}, {     63,11}, {    127,10}, \ +    {    271, 9}, {    543,10}, {    287, 9}, {    575, 8}, \ +    {   1151,11}, {    159,10}, {    319, 9}, {    639,10}, \ +    {    351, 9}, {    703, 8}, {   1407, 7}, {   2815,11}, \ +    {    207,10}, {    415, 9}, {    831,11}, {    223,10}, \ +    {    447, 9}, {    895,13}, {     63,11}, {    271,10}, \ +    {    543,11}, {    287,12}, {    159,11}, {    351,10}, \ +    {    703,12}, {    191,11}, {    415,10}, {    831,12}, \ +    {    223,13}, {    127,12}, {    255,11}, {    511,10}, \ +    {   1023,11}, {    543,12}, {    287,11}, {    607,12}, \ +    {    319,11}, {    639,12}, {    415,11}, {    895,12}, \ +    {    479,14}, {    127,13}, {    255,12}, {    543,11}, \ +    {   1087,12}, {    575,11}, {   1151,13}, {    319,12}, \ +    {    639,11}, {   1279,12}, {    703,10}, {   2815,12}, \ +    {    831,11}, {   1663,13}, {    447,12}, {    959,14}, \ +    {    255,13}, {    511,12}, {   1087,13}, {    703,12}, \ +    {   1407,14}, {    383,13}, {    767,12}, {   1535,13}, \ +    {    895,15}, {    255,14}, {    511,13}, {   1215,14}, \ +    {    639,13}, {   1279,14}, {    767,13}, {   1663,14}, \ +    {    895,13}, {   1919,15}, {    511,14}, {   1023,13}, \ +    {   2175,14}, {   1151,13}, {   2431,14}, {   1407,15}, \ +    {    767,14}, {   1791,16}, {    511,15}, {   1023,14}, \ +    {   2303,15}, {   1279,14}, {   2815,15}, {   1535,14}, \ +    {   3199,15}, {   1791,16}, {   1023,15}, {   2047,14}, \ +    {   4351,15}, {   2303,14}, {   4863,15}, {   2815,16}, \ +    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ +    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 184 +#define SQR_FFT_THRESHOLD                 1728 + +#define MULLO_BASECASE_THRESHOLD             0  /* always */ +#define MULLO_DC_THRESHOLD                  29 +#define MULLO_MUL_N_THRESHOLD             4392 +#define SQRLO_BASECASE_THRESHOLD             2 +#define SQRLO_DC_THRESHOLD                  63 +#define SQRLO_SQR_THRESHOLD               3176 + +#define DC_DIV_QR_THRESHOLD                 16 +#define DC_DIVAPPR_Q_THRESHOLD              64 +#define DC_BDIV_QR_THRESHOLD                30 +#define DC_BDIV_Q_THRESHOLD                 86 + +#define INV_MULMOD_BNM1_THRESHOLD           58 +#define INV_NEWTON_THRESHOLD                17 +#define INV_APPR_THRESHOLD                  15 + +#define BINV_NEWTON_THRESHOLD              109 +#define REDC_1_TO_REDC_2_THRESHOLD           0  /* always */ +#define REDC_2_TO_REDC_N_THRESHOLD         117 + +#define MU_DIV_QR_THRESHOLD                618 +#define MU_DIVAPPR_Q_THRESHOLD             618 +#define MUPI_DIV_QR_THRESHOLD                0  /* always */ +#define MU_BDIV_QR_THRESHOLD               680 +#define MU_BDIV_Q_THRESHOLD                807 + +#define POWM_SEC_TABLE  3,22,102,579,1555 + +#define GET_STR_DC_THRESHOLD                20 +#define GET_STR_PRECOMPUTE_THRESHOLD        28 +#define SET_STR_DC_THRESHOLD               381 +#define SET_STR_PRECOMPUTE_THRESHOLD      1042 + +#define FAC_DSC_THRESHOLD                  462 +#define FAC_ODD_THRESHOLD                    0  /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD         12 +#define 
HGCD_THRESHOLD                      45 +#define HGCD_APPR_THRESHOLD                 50 +#define HGCD_REDUCE_THRESHOLD             1094 +#define GCD_DC_THRESHOLD                   126 +#define GCDEXT_DC_THRESHOLD                132 +#define JACOBI_BASE_METHOD                   4 diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/add_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/add_n.asm new file mode 100644 index 0000000..954c7f6 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/add_n.asm @@ -0,0 +1,68 @@ +dnl  SPARC v9 mpn_add_n for T1/T2. + +dnl  Copyright 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC T1:	 ? +C UltraSPARC T2:	 ? + +C INPUT PARAMETERS +define(`rp', `%o0') +define(`up', `%o1') +define(`vp', `%o2') +define(`n',  `%o3') +define(`cy', `%o4') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_add_nc) +	b,a	L(ent) +EPILOGUE() +PROLOGUE(mpn_add_n) +	mov	0, cy +L(ent):	cmp	%g0, cy +L(top):	ldx	[up+0], %o4 +	add	up, 8, up +	ldx	[vp+0], %o5 +	add	vp, 8, vp +	add	rp, 8, rp +	add	n, -1, n +	srlx	%o4, 32, %g1 +	srlx	%o5, 32, %g2 +	addccc	%o4, %o5, %g3 +	addccc	%g1, %g2, %g0 +	brgz	n, L(top) +	 stx	%g3, [rp-8] + +	retl +	addc	%g0, %g0, %o0 +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh1_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh1_n.asm new file mode 100644 index 0000000..3134797 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh1_n.asm @@ -0,0 +1,41 @@ +dnl  SPARC v9 mpn_addlsh1_n for T1/T2. + +dnl  Copyright 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  
See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH,             1) +define(RSH,             63) + +define(func, mpn_addlsh1_n) + +MULFUNC_PROLOGUE(mpn_addlsh1_n) + +include_mpn(`sparc64/ultrasparct1/addlshC_n.asm') diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh2_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh2_n.asm new file mode 100644 index 0000000..ee1afd0 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh2_n.asm @@ -0,0 +1,41 @@ +dnl  SPARC v9 mpn_addlsh2_n for T1/T2. + +dnl  Copyright 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH,             2) +define(RSH,             62) + +define(func, mpn_addlsh2_n) + +MULFUNC_PROLOGUE(mpn_addlsh2_n) + +include_mpn(`sparc64/ultrasparct1/addlshC_n.asm') diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlshC_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlshC_n.asm new file mode 100644 index 0000000..5be9a0d --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlshC_n.asm @@ -0,0 +1,69 @@ +dnl  SPARC v9 mpn_addlshC_n for T1/T2. + +dnl  Copyright 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
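A note on the technique in the addlshC_n loop below: each iteration shifts a v limb left by LSH, ORs in the bits saved from the previous limb, and adds with carry; the paired addccc instructions re-derive the 64-bit carry from the 32-bit halves, since SPARC v9's carry-consuming adds use the 32-bit (icc) carry bit. A C sketch of the limb recurrence under that reading; addlshC_ref and its parameter names are illustrative, not part of the vendored code:

    typedef unsigned long long limb_t;

    /* rp[] = up[] + ({vp,n} << lsh); returns the carry, i.e. the top limb's
       out-shifted bits plus the final add carry (cf. the closing addc). */
    limb_t addlshC_ref (limb_t *rp, const limb_t *up, const limb_t *vp,
                        long n, unsigned lsh)    /* lsh is 1 or 2 here */
    {
      unsigned rsh = 64 - lsh;
      limb_t cy = 0, vbits = 0;        /* bits carried between v limbs */
      for (long i = 0; i < n; i++)
        {
          limb_t s = (vp[i] << lsh) | vbits;   /* low part of 2^lsh * v */
          limb_t r, c;
          vbits = vp[i] >> rsh;                /* high bits, for next limb */
          r = s + up[i];
          c = r < s;                           /* carry out of the add */
          rp[i] = r + cy;
          cy = c + (rp[i] < r);
        }
      return vbits + cy;
    }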
+ +C		   cycles/limb +C UltraSPARC T1:	21 +C UltraSPARC T2:	 ? + +C INPUT PARAMETERS +define(`rp', `%o0') +define(`up', `%o1') +define(`vp', `%o2') +define(`n',  `%o3') +define(`cy', `%o4') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(func) +	mov	0, cy +	mov	0, %g5 +	cmp	%g0, cy +L(top):	ldx	[up+0], %o4 +	add	up, 8, up +	ldx	[vp+0], %o5 +	add	vp, 8, vp +	add	rp, 8, rp + +	sllx	%o5, LSH, %g4 +	add	n, -1, n +	or	%g5, %g4, %g4 +	srlx	%o5, RSH, %g5 + +	srlx	%o4, 32, %g1 +	srlx	%g4, 32, %g2 +	addccc	%o4, %g4, %g3 +	addccc	%g1, %g2, %g0 +	brgz	n, L(top) +	 stx	%g3, [rp-8] + +	retl +	addc	%g5, %g0, %o0 +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/addmul_1.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/addmul_1.asm new file mode 100644 index 0000000..29dba96 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/addmul_1.asm @@ -0,0 +1,86 @@ +dnl  SPARC v9 mpn_addmul_1 for T1/T2. + +dnl  Copyright 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC T1:	74 +C UltraSPARC T2:	 ? + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n',  `%i2') +define(`v0', `%i3') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_addmul_1) +	save	%sp, -176, %sp +	mov	1, %o2 +	mov	%i0, %g2 +	srlx	%i3, 32, %o4 +	sllx	%o2, 32, %o2 +	srl	%i3, 0, %i3 +	mov	0, %g3 +	mov	0, %i0 + +L(top):	ldx	[%i1+%g3], %g1 +	srl	%g1, 0, %g4 +	mulx	%g4, %i3, %o5 +	srlx	%g1, 32, %g1 +	mulx	%g1, %i3, %g5 +	mulx	%g4, %o4, %g4 +	mulx	%g1, %o4, %g1 +	srlx	%o5, 32, %o1 +	add	%g5, %o1, %o1 +	addcc	%o1, %g4, %g4 +	srl	%o5, 0, %o0 +	ldx	[%g2+%g3], %o5 +	sllx	%g4, 32, %o1 +	add	%g1, %o2, %l1 +	movlu	%xcc, %l1, %g1 +	add	%o1, %o0, %l0 +	addcc	%l0, %i0, %g5 +	srlx	%g4, 32, %i0 +	add	%i0, 1, %g4 +	movlu	%xcc, %g4, %i0 +	addcc	%o5, %g5, %g5 +	stx	%g5, [%g2+%g3] +	add	%i0, 1, %g4 +	movlu	%xcc, %g4, %i0 +	add	%i2, -1, %i2 +	add	%i0, %g1, %i0 +	brnz,pt	%i2, L(top) +	 add	%g3, 8, %g3 +	return	%i7+8 +	 nop +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/gmp-mparam.h new file mode 100644 index 0000000..99db78a --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/gmp-mparam.h @@ -0,0 +1,154 @@ +/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004, 2006, 2008-2010 Free Software +Foundation, Inc. 
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + +  * the GNU Lesser General Public License as published by the Free +    Software Foundation; either version 3 of the License, or (at your +    option) any later version. + +or + +  * the GNU General Public License as published by the Free Software +    Foundation; either version 2 of the License, or (at your option) any +    later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library.  If not, +see https://www.gnu.org/licenses/.  */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1000 MHz ultrasparc t1 running GNU/Linux */ + +#define DIVREM_1_NORM_THRESHOLD              0  /* always */ +#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */ +#define MOD_1_1P_METHOD                      2 +#define MOD_1_NORM_THRESHOLD                 0  /* always */ +#define MOD_1_UNNORM_THRESHOLD               0  /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD         13 +#define MOD_1U_TO_MOD_1_1_THRESHOLD      MP_SIZE_T_MAX +#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     34 +#define USE_PREINV_DIVREM_1                  1 +#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */ +#define DIVEXACT_1_THRESHOLD                 0  /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */ + +#define MUL_TOOM22_THRESHOLD                 8 +#define MUL_TOOM33_THRESHOLD                50 +#define MUL_TOOM44_THRESHOLD                99 +#define MUL_TOOM6H_THRESHOLD               125 +#define MUL_TOOM8H_THRESHOLD               187 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD      65 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD      77 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD      65 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD      50 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD      34 + +#define SQR_BASECASE_THRESHOLD               0  /* always */ +#define SQR_TOOM2_THRESHOLD                 14 +#define SQR_TOOM3_THRESHOLD                 57 +#define SQR_TOOM4_THRESHOLD                133 +#define SQR_TOOM6_THRESHOLD                156 +#define SQR_TOOM8_THRESHOLD                260 + +#define MULMID_TOOM42_THRESHOLD             12 + +#define MULMOD_BNM1_THRESHOLD                7 +#define SQRMOD_BNM1_THRESHOLD                7 + +#define MUL_FFT_MODF_THRESHOLD             176  /* k = 5 */ +#define MUL_FFT_TABLE3                                      \ +  { {    176, 5}, {      7, 6}, {      4, 5}, {      9, 6}, \ +    {      5, 5}, {     11, 6}, {     11, 7}, {      6, 6}, \ +    {     13, 7}, {      7, 6}, {     15, 7}, {      9, 8}, \ +    {      5, 7}, {     13, 8}, {      7, 7}, {     15, 6}, \ +    {     32, 7}, {     24, 8}, {     21, 9}, {     11, 8}, \ +    {     23,10}, {      7, 9}, {     15, 8}, {     33, 9}, \ +    {     19, 8}, {     39, 9}, {     23,10}, {     15, 9}, \ +    {     43,10}, {     23,11}, {     15,10}, {     31, 9}, \ +    {     63, 8}, {    127, 9}, {     67,10}, {     39, 9}, \ +    
{     79, 8}, {    159,10}, {     47, 9}, {     95,11}, \ +    {   2048,12}, {   4096,13}, {   8192,14}, {  16384,15}, \ +    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \ +    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ +    {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 53 +#define MUL_FFT_THRESHOLD                 1728 + + +#define SQR_FFT_MODF_THRESHOLD             148  /* k = 5 */ +#define SQR_FFT_TABLE3                                      \ +  { {    148, 5}, {      7, 6}, {      4, 5}, {      9, 6}, \ +    {      5, 5}, {     11, 6}, {     11, 7}, {      6, 6}, \ +    {     13, 7}, {      7, 6}, {     15, 7}, {     13, 8}, \ +    {      7, 7}, {     16, 8}, {      9, 6}, {     38, 7}, \ +    {     20, 8}, {     11, 7}, {     24, 8}, {     13, 9}, \ +    {      7, 7}, {     30, 8}, {     19, 9}, {     11, 8}, \ +    {     25,10}, {      7, 9}, {     15, 8}, {     31, 9}, \ +    {     19, 8}, {     39, 9}, {     27,10}, {     15, 9}, \ +    {     39,10}, {     23, 9}, {     47, 8}, {     95, 9}, \ +    {     51,11}, {     15,10}, {     31, 8}, {    127,10}, \ +    {     39, 9}, {     79, 8}, {    159,10}, {     47, 9}, \ +    {     95,11}, {   2048,12}, {   4096,13}, {   8192,14}, \ +    {  16384,15}, {  32768,16}, {  65536,17}, { 131072,18}, \ +    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ +    {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 58 +#define SQR_FFT_THRESHOLD                 1344 + +#define MULLO_BASECASE_THRESHOLD             0  /* always */ +#define MULLO_DC_THRESHOLD                  28 +#define MULLO_MUL_N_THRESHOLD             3176 + +#define DC_DIV_QR_THRESHOLD                 27 +#define DC_DIVAPPR_Q_THRESHOLD             106 +#define DC_BDIV_QR_THRESHOLD                27 +#define DC_BDIV_Q_THRESHOLD                 62 + +#define INV_MULMOD_BNM1_THRESHOLD           14 +#define INV_NEWTON_THRESHOLD               163 +#define INV_APPR_THRESHOLD                 117 + +#define BINV_NEWTON_THRESHOLD              166 +#define REDC_1_TO_REDC_N_THRESHOLD          31 + +#define MU_DIV_QR_THRESHOLD                734 +#define MU_DIVAPPR_Q_THRESHOLD             748 +#define MUPI_DIV_QR_THRESHOLD               67 +#define MU_BDIV_QR_THRESHOLD               562 +#define MU_BDIV_Q_THRESHOLD                734 + +#define POWM_SEC_TABLE  4,29,188,643,2741 + +#define MATRIX22_STRASSEN_THRESHOLD         11 +#define HGCD_THRESHOLD                      58 +#define HGCD_APPR_THRESHOLD                 55 +#define HGCD_REDUCE_THRESHOLD              637 +#define GCD_DC_THRESHOLD                   186 +#define GCDEXT_DC_THRESHOLD                140 +#define JACOBI_BASE_METHOD                   3 + +#define GET_STR_DC_THRESHOLD                20 +#define GET_STR_PRECOMPUTE_THRESHOLD        33 +#define SET_STR_DC_THRESHOLD               268 +#define SET_STR_PRECOMPUTE_THRESHOLD       960 + +#define FAC_DSC_THRESHOLD                  268 +#define FAC_ODD_THRESHOLD                    0  /* always */ diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/mul_1.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/mul_1.asm new file mode 100644 index 0000000..1fea2a1 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/mul_1.asm @@ -0,0 +1,82 @@ +dnl  SPARC v9 mpn_mul_1 for T1/T2. + +dnl  Copyright 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. 
+dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC T1:	68 +C UltraSPARC T2:	 ? + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n',  `%i2') +define(`v0', `%i3') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_mul_1) +	save	%sp, -176, %sp +	mov	1, %o2 +	mov	%i0, %g2 +	srlx	%i3, 32, %o4 +	sllx	%o2, 32, %o2 +	srl	%i3, 0, %i3 +	mov	0, %g3 +	mov	0, %i0 + +L(top):	ldx	[%i1+%g3], %g1 +	srl	%g1, 0, %g4 +	mulx	%g4, %i3, %o5 +	srlx	%g1, 32, %g1 +	mulx	%g1, %i3, %g5 +	mulx	%g4, %o4, %g4 +	mulx	%g1, %o4, %g1 +	srlx	%o5, 32, %o1 +	add	%g5, %o1, %o1 +	addcc	%o1, %g4, %g4 +	srl	%o5, 0, %o0 +	sllx	%g4, 32, %o1 +	add	%g1, %o2, %l1 +	movlu	%xcc, %l1, %g1 +	add	%o1, %o0, %l0 +	addcc	%l0, %i0, %g5 +	srlx	%g4, 32, %i0 +	add	%i0, 1, %g4 +	movlu	%xcc, %g4, %i0 +	stx	%g5, [%g2+%g3] +	add	%i2, -1, %i2 +	add	%i0, %g1, %i0 +	brnz,pt	%i2, L(top) +	 add	%g3, 8, %g3 +	return	%i7+8 +	 nop +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh1_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh1_n.asm new file mode 100644 index 0000000..51bd4ab --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh1_n.asm @@ -0,0 +1,41 @@ +dnl  SPARC v9 mpn_rsblsh1_n for T1/T2. + +dnl  Copyright 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +define(LSH,             1) +define(RSH,             63) + +define(func, mpn_rsblsh1_n) + +MULFUNC_PROLOGUE(mpn_rsblsh1_n) + +include_mpn(`sparc64/ultrasparct1/rsblshC_n.asm') diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh2_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh2_n.asm new file mode 100644 index 0000000..f0d208e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh2_n.asm @@ -0,0 +1,41 @@ +dnl  SPARC v9 mpn_rsblsh2_n for T1/T2. + +dnl  Copyright 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH,             2) +define(RSH,             62) + +define(func, mpn_rsblsh2_n) + +MULFUNC_PROLOGUE(mpn_rsblsh2_n) + +include_mpn(`sparc64/ultrasparct1/rsblshC_n.asm') diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblshC_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblshC_n.asm new file mode 100644 index 0000000..7c03e9f --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblshC_n.asm @@ -0,0 +1,69 @@ +dnl  SPARC v9 mpn_rsblshC_n for T1/T2. + +dnl  Copyright 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +C		   cycles/limb +C UltraSPARC T1:	21 +C UltraSPARC T2:	 ? 
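For reference, the shared template below (included by the rsblsh1_n and rsblsh2_n wrappers, which pin LSH to 1 or 2 and RSH to 64-LSH at m4 time) computes {rp,n} = 2^LSH * {vp,n} - {up,n}. A minimal C sketch of that contract, assuming 64-bit limbs; the function and variable names are illustrative, not from the GMP sources:

#include <stdint.h>

/* rp[] = (vp[] << lsh) - up[], n > 0, 1 <= lsh <= 63.
   Returns the high limb, i.e. the bits shifted out of vp minus the final
   borrow, matching the asm's closing "subc %g5, %g0, %o0".  */
uint64_t
ref_rsblshC_n (uint64_t *rp, const uint64_t *up, const uint64_t *vp,
               long n, unsigned lsh)
{
  uint64_t shift_in = 0, borrow = 0;
  for (long i = 0; i < n; i++)
    {
      uint64_t s = (vp[i] << lsh) | shift_in;   /* merged shifted limb */
      shift_in = vp[i] >> (64 - lsh);
      uint64_t d = s - up[i];
      uint64_t b = s < up[i];
      rp[i] = d - borrow;                       /* uses the incoming borrow */
      borrow = b + (d < borrow);                /* at most one term is 1 */
    }
  return shift_in - borrow;
}

The loop mirrors the asm's sllx/or merge of adjacent shifted limbs. The extra 32-bit srlx/subccc steps in the real code regenerate the 64-bit borrow through the 32-bit carry flag, apparently because pre-VIS3 SPARC has no subtract-with-carry that consumes the 64-bit condition codes.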
+ +C INPUT PARAMETERS +define(`rp', `%o0') +define(`up', `%o1') +define(`vp', `%o2') +define(`n',  `%o3') +define(`cy', `%o4') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(func) +	mov	0, cy +	mov	0, %g5 +	cmp	%g0, cy +L(top):	ldx	[up+0], %o4 +	add	up, 8, up +	ldx	[vp+0], %o5 +	add	vp, 8, vp +	add	rp, 8, rp + +	sllx	%o5, LSH, %g4 +	add	n, -1, n +	or	%g5, %g4, %g4 +	srlx	%o5, RSH, %g5 + +	srlx	%o4, 32, %g1 +	srlx	%g4, 32, %g2 +	subccc	%g4, %o4, %g3 +	subccc	%g2, %g1, %g0 +	brgz	n, L(top) +	 stx	%g3, [rp-8] + +	retl +	subc	%g5, %g0, %o0 +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/sub_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/sub_n.asm new file mode 100644 index 0000000..c2af89f --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/sub_n.asm @@ -0,0 +1,68 @@ +dnl  SPARC v9 mpn_sub_n for T1/T2. + +dnl  Copyright 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC T1:	 ? +C UltraSPARC T2:	 ? + +C INPUT PARAMETERS +define(`rp', `%o0') +define(`up', `%o1') +define(`vp', `%o2') +define(`n',  `%o3') +define(`cy', `%o4') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_sub_nc) +	b,a	L(ent) +EPILOGUE() +PROLOGUE(mpn_sub_n) +	mov	0, cy +L(ent):	cmp	%g0, cy +L(top):	ldx	[up+0], %o4 +	add	up, 8, up +	ldx	[vp+0], %o5 +	add	vp, 8, vp +	add	rp, 8, rp +	add	n, -1, n +	srlx	%o4, 32, %g1 +	srlx	%o5, 32, %g2 +	subccc	%o4, %o5, %g3 +	subccc	%g1, %g2, %g0 +	brgz	n, L(top) +	 stx	%g3, [rp-8] + +	retl +	addc	%g0, %g0, %o0 +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh1_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh1_n.asm new file mode 100644 index 0000000..8c8fa80 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh1_n.asm @@ -0,0 +1,41 @@ +dnl  SPARC v9 mpn_sublsh1_n for T1/T2. + +dnl  Copyright 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. 
+dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH,             1) +define(RSH,             63) + +define(func, mpn_sublsh1_n) + +MULFUNC_PROLOGUE(mpn_sublsh1_n) + +include_mpn(`sparc64/ultrasparct1/sublshC_n.asm') diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh2_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh2_n.asm new file mode 100644 index 0000000..2fd5eee --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh2_n.asm @@ -0,0 +1,41 @@ +dnl  SPARC v9 mpn_sublsh2_n for T1/T2. + +dnl  Copyright 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH,             2) +define(RSH,             62) + +define(func, mpn_sublsh2_n) + +MULFUNC_PROLOGUE(mpn_sublsh2_n) + +include_mpn(`sparc64/ultrasparct1/sublshC_n.asm') diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublshC_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublshC_n.asm new file mode 100644 index 0000000..01eafef --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublshC_n.asm @@ -0,0 +1,69 @@ +dnl  SPARC v9 mpn_sublshC_n for T1/T2. + +dnl  Copyright 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. 
+dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +C		   cycles/limb +C UltraSPARC T1:	21 +C UltraSPARC T2:	 ? + +C INPUT PARAMETERS +define(`rp', `%o0') +define(`up', `%o1') +define(`vp', `%o2') +define(`n',  `%o3') +define(`cy', `%o4') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(func) +	mov	0, cy +	mov	0, %g5 +	cmp	%g0, cy +L(top):	ldx	[up+0], %o4 +	add	up, 8, up +	ldx	[vp+0], %o5 +	add	vp, 8, vp +	add	rp, 8, rp + +	sllx	%o5, LSH, %g4 +	add	n, -1, n +	or	%g5, %g4, %g4 +	srlx	%o5, RSH, %g5 + +	srlx	%o4, 32, %g1 +	srlx	%g4, 32, %g2 +	subccc	%o4, %g4, %g3 +	subccc	%g1, %g2, %g0 +	brgz	n, L(top) +	 stx	%g3, [rp-8] + +	retl +	addc	%g5, %g0, %o0 +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/submul_1.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/submul_1.asm new file mode 100644 index 0000000..4f553a8 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct1/submul_1.asm @@ -0,0 +1,86 @@ +dnl  SPARC v9 mpn_submul_1 for T1/T2. + +dnl  Copyright 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC T1:	74 +C UltraSPARC T2:	 ? 
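The loop below assembles each 64x64-bit product from four 32x32->64 mulx multiplies (nothing before VIS3 exposes the high half of a 64-bit product) and subtracts the result from rp[] with movlu-based carry corrections. A hedged C sketch of the same decomposition and of the overall {rp,n} -= {up,n} * v0 operation; the helper name and the plain-C carry bookkeeping are mine, standing in for the asm's conditional moves:

#include <stdint.h>

/* hi:lo = u * v via four 32x32->64 products, the same decomposition the
   T1 loop performs with mulx.  */
static void
umul_ppmm_sketch (uint64_t *hi, uint64_t *lo, uint64_t u, uint64_t v)
{
  uint64_t ul = (uint32_t) u, uh = u >> 32;
  uint64_t vl = (uint32_t) v, vh = v >> 32;
  uint64_t ll = ul * vl;
  uint64_t lh = ul * vh;
  uint64_t hl = uh * vl;
  uint64_t hh = uh * vh;
  uint64_t mid = lh + (ll >> 32);      /* cannot overflow */
  uint64_t mid2 = mid + hl;            /* may wrap ...               */
  uint64_t cy = mid2 < mid;            /* ... so catch the carry     */
  *lo = (mid2 << 32) | (uint32_t) ll;
  *hi = hh + (mid2 >> 32) + (cy << 32);
}

/* {rp,n} -= {up,n} * v0; returns the high limb of the subtracted product. */
uint64_t
ref_submul_1 (uint64_t *rp, const uint64_t *up, long n, uint64_t v0)
{
  uint64_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      uint64_t hi, lo;
      umul_ppmm_sketch (&hi, &lo, up[i], v0);
      lo += cy;
      hi += lo < cy;          /* hi < 2^64-1 here, so this cannot wrap */
      uint64_t r = rp[i];
      rp[i] = r - lo;
      cy = hi + (r < lo);     /* the borrow joins the next carry limb */
    }
  return cy;
}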
+ +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n',  `%i2') +define(`v0', `%i3') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_submul_1) +	save	%sp, -176, %sp +	mov	1, %o2 +	mov	%i0, %g2 +	srlx	%i3, 32, %o4 +	sllx	%o2, 32, %o2 +	srl	%i3, 0, %i3 +	mov	0, %g3 +	mov	0, %i0 + +L(top):	ldx	[%i1+%g3], %g1 +	srl	%g1, 0, %g4 +	mulx	%g4, %i3, %o5 +	srlx	%g1, 32, %g1 +	mulx	%g1, %i3, %g5 +	mulx	%g4, %o4, %g4 +	mulx	%g1, %o4, %g1 +	srlx	%o5, 32, %o1 +	add	%g5, %o1, %o1 +	addcc	%o1, %g4, %g4 +	srl	%o5, 0, %o0 +	ldx	[%g2+%g3], %o5 +	sllx	%g4, 32, %o1 +	add	%g1, %o2, %l1 +	movlu	%xcc, %l1, %g1 +	add	%o1, %o0, %l0 +	addcc	%l0, %i0, %g5 +	srlx	%g4, 32, %i0 +	add	%i0, 1, %g4 +	movlu	%xcc, %g4, %i0 +	subcc	%o5, %g5, %g5 +	stx	%g5, [%g2+%g3] +	add	%i0, 1, %g4 +	movlu	%xcc, %g4, %i0 +	add	%i2, -1, %i2 +	add	%i0, %g1, %i0 +	brnz,pt	%i2, L(top) +	 add	%g3, 8, %g3 +	return	%i7+8 +	 nop +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/add_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/add_n.asm new file mode 100644 index 0000000..0170746 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/add_n.asm @@ -0,0 +1,126 @@ +dnl  SPARC v9 mpn_add_n for T3/T4. + +dnl  Contributed to the GNU project by David Miller. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
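The adder defined below keeps the carry resident in the condition codes across a two-way-unrolled loop (addxccc is VIS3's add-with-carry on %xcc) and indexes the operands negatively off precomputed end pointers (u0_off, u1_off), so both unrolled lanes share one induction variable. Functionally it is plain limb-wise addition; a C model of the mpn_add_n/mpn_add_nc contract, with illustrative names:

#include <stdint.h>

/* {rp,n} = {up,n} + {vp,n} + cy, with cy in {0,1}; returns the carry-out. */
uint64_t
ref_add_nc (uint64_t *rp, const uint64_t *up, const uint64_t *vp,
            long n, uint64_t cy)
{
  for (long i = 0; i < n; i++)
    {
      uint64_t s = up[i] + cy;
      uint64_t c1 = s < cy;        /* wrap on the carry-in add */
      uint64_t r = s + vp[i];
      cy = c1 + (r < s);           /* the two wraps cannot both occur */
      rp[i] = r;
    }
  return cy;
}

mpn_add_n is the carry-in-zero entry point; mpn_add_nc enters the same loop with a caller-supplied carry, matching the L(ent) fall-through in the source.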
+ +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC T3:	 8 +C UltraSPARC T4:	 3 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`vp', `%i2') +define(`n',  `%i3') +define(`cy', `%i4') + +define(`u0_off', `%l2') +define(`u1_off', `%l3') +define(`loop_n', `%l6') +define(`tmp', `%l7') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_add_nc) +	save	%sp, -176, %sp +	b,a	L(ent) +EPILOGUE() +PROLOGUE(mpn_add_n) +	save	%sp, -176, %sp + +	mov	0, cy +L(ent): +	subcc	n, 1, n +	be	L(final_one) +	 cmp	%g0, cy + +	ldx	[up + 0], %o4 +	sllx	n, 3, tmp + +	ldx	[vp + 0], %o5 +	add	up, tmp, u0_off + +	ldx	[up + 8], %g5 +	neg	tmp, loop_n + +	ldx	[vp + 8], %g1 +	add	u0_off, 8, u1_off + +	sub	loop_n, -(2 * 8), loop_n + +	brgez,pn loop_n, L(loop_tail) +	 add	vp, (2 * 8), vp + +	b,a	L(top) +	ALIGN(16) +L(top): +	addxccc(%o4, %o5, tmp) +	ldx	[vp + 0], %o5 + +	add	rp, (2 * 8), rp +	ldx	[loop_n + u0_off], %o4 + +	add	vp, (2 * 8), vp +	stx	tmp, [rp - 16] + +	addxccc(%g1, %g5, tmp) +	ldx	[vp - 8], %g1 + +	ldx	[loop_n + u1_off], %g5 +	sub	loop_n, -(2 * 8), loop_n + +	brlz	loop_n, L(top) +	 stx	tmp, [rp - 8] + +L(loop_tail): +	addxccc(%o4, %o5, %g3) +	add	loop_n, u0_off, up + +	addxccc(%g1, %g5, %g5) +	stx	%g3, [rp + 0] + +	brgz,pt	loop_n, L(done) +	 stx	%g5, [rp + 8] + +	add	rp, (2 * 8), rp +L(final_one): +	ldx	[up+0], %o4 +	ldx	[vp+0], %o5 +	addxccc(%o4, %o5, %g3) +	stx	%g3, [rp+0] + +L(done): +	addxc(%g0, %g0, %i0) +	ret +	 restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/addmul_1.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/addmul_1.asm new file mode 100644 index 0000000..939811e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/addmul_1.asm @@ -0,0 +1,182 @@ +dnl  SPARC v9 mpn_addmul_1 for T3/T4/T5. + +dnl  Contributed to the GNU project by David Miller and Torbjörn Granlund. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
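The routine below peels n mod 4 limbs up front (the L(b01)/L(b10)/L(b11) paths) before entering a 4-way-unrolled core that pairs mulx with umulxhi for full 128-bit products. Its contract is the usual one; a compact C model, using a 128-bit temporary where the asm uses the instruction pair (names illustrative):

#include <stdint.h>

/* {rp,n} += {up,n} * v0; returns the carry limb (the asm's final addxc). */
uint64_t
ref_addmul_1 (uint64_t *rp, const uint64_t *up, long n, uint64_t v0)
{
  uint64_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
      rp[i] = (uint64_t) t;        /* low limb back to memory */
      cy = (uint64_t) (t >> 64);   /* high limb rides into the next step */
    }
  return cy;
}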
+ +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC T3:	26 +C UltraSPARC T4:	4.5 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n',  `%i2') +define(`v0', `%i3') + +define(`u0',  `%l0') +define(`u1',  `%l1') +define(`u2',  `%l2') +define(`u3',  `%l3') +define(`r0',  `%l4') +define(`r1',  `%l5') +define(`r2',  `%l6') +define(`r3',  `%l7') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_addmul_1) +	save	%sp, -176, %sp +	ldx	[up+0], %g1 + +	and	n, 3, %g3 +	brz	%g3, L(b0) +	 addcc	%g0, %g0, %g5			C clear carry limb, flag +	cmp	%g3, 2 +	bcs	%xcc, L(b01) +	 nop +	be	%xcc, L(b10) +	 ldx	[up+8], %g5 + +L(b11):	ldx	[up+16], u3 +	mulx	%g1, v0, %o2 +	umulxhi(%g1, v0, %o3) +	ldx	[rp+0], r1 +	mulx	%g5, v0, %o4 +	ldx	[rp+8], r2 +	umulxhi(%g5, v0, %o5) +	ldx	[rp+16], r3 +	mulx	u3, v0, %g4 +	umulxhi(u3, v0, %g5) +	addcc	%o3, %o4, %o4 +	addxccc(%o5, %g4, %g4) +	addxc(	%g0, %g5, %g5) +	addcc	r1, %o2, r1 +	stx	r1, [rp+0] +	addxccc(r2, %o4, r2) +	stx	r2, [rp+8] +	addxccc(r3, %g4, r3) +	stx	r3, [rp+16] +	add	n, -3, n +	add	up, 24, up +	brz	n, L(xit) +	 add	rp, 24, rp +	b	L(com) +	 nop + +L(b10):	mulx	%g1, v0, %o4 +	ldx	[rp+0], r2 +	umulxhi(%g1, v0, %o5) +	ldx	[rp+8], r3 +	mulx	%g5, v0, %g4 +	umulxhi(%g5, v0, %g5) +	addcc	%o5, %g4, %g4 +	addxc(	%g0, %g5, %g5) +	addcc	r2, %o4, r2 +	stx	r2, [rp+0] +	addxccc(r3, %g4, r3) +	stx	r3, [rp+8] +	add	n, -2, n +	add	up, 16, up +	brz	n, L(xit) +	 add	rp, 16, rp +	b	L(com) +	 nop + +L(b01):	ldx	[rp+0], r3 +	mulx	%g1, v0, %g4 +	umulxhi(%g1, v0, %g5) +	addcc	r3, %g4, r3 +	stx	r3, [rp+0] +	add	n, -1, n +	add	up, 8, up +	brz	n, L(xit) +	 add	rp, 8, rp + +L(com):	ldx	[up+0], %g1 +L(b0):	ldx	[up+8], u1 +	ldx	[up+16], u2 +	ldx	[up+24], u3 +	mulx	%g1, v0, %o0 +	umulxhi(%g1, v0, %o1) +	b	L(lo0) +	 nop + +	ALIGN(16) +L(top):	ldx	[up+0], u0 +	addxc(	%g0, %g5, %g5)		C propagate carry into carry limb +	ldx	[up+8], u1 +	addcc	r0, %o0, r0 +	ldx	[up+16], u2 +	addxccc(r1, %o2, r1) +	ldx	[up+24], u3 +	addxccc(r2, %o4, r2) +	stx	r0, [rp-32] +	addxccc(r3, %g4, r3) +	stx	r1, [rp-24] +	mulx	u0, v0, %o0 +	stx	r2, [rp-16] +	umulxhi(u0, v0, %o1) +	stx	r3, [rp-8] +L(lo0):	mulx	u1, v0, %o2 +	ldx	[rp+0], r0 +	umulxhi(u1, v0, %o3) +	ldx	[rp+8], r1 +	mulx	u2, v0, %o4 +	ldx	[rp+16], r2 +	umulxhi(u2, v0, %o5) +	ldx	[rp+24], r3 +	mulx	u3, v0, %g4 +	addxccc(%g5, %o0, %o0) +	umulxhi(u3, v0, %g5) +	add	up, 32, up +	addxccc(%o1, %o2, %o2) +	add	rp, 32, rp +	addxccc(%o3, %o4, %o4) +	add	n, -4, n +	addxccc(%o5, %g4, %g4) +	brgz	n, L(top) +	 nop + +	addxc(	%g0, %g5, %g5) +	addcc	r0, %o0, r0 +	stx	r0, [rp-32] +	addxccc(r1, %o2, r1) +	stx	r1, [rp-24] +	addxccc(r2, %o4, r2) +	stx	r2, [rp-16] +	addxccc(r3, %g4, r3) +	stx	r3, [rp-8] +L(xit):	addxc(	%g0, %g5, %i0) +	ret +	 restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_2.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_2.asm new file mode 100644 index 0000000..ccc6a44 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_2.asm @@ -0,0 +1,228 @@ +dnl  SPARC v9 mpn_mul_2 and mpn_addmul_2 for T3/T4/T5. + +dnl  Contributed to the GNU project by Torbjörn Granlund. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. 
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		    cycles/limb      cycles/limb
+C		       mul_2           addmul_2
+C UltraSPARC T3:	22.5		 23.5
+C UltraSPARC T4:	 3.25		 3.75
+
+
+C The code is reasonably scheduled but also relies on OoO.  There was hope that
+C this could run at around 3.0 and 3.5 c/l, respectively, on T4.  Two cycles
+C per iteration need to be removed.
+C
+C We could almost use 2-way unrolling, but currently the wN registers live too
+C long.  By changing add x,w1,w1 to add x,w1,w0, i.e. migrating the values
+C downwards, 2-way unrolling should become possible.  With n-indexed addressing
+C it should run no slower.
+C
+C The rp loads to g1/g3 are very much over-scheduled.  Presumably, they could
+C be postponed by a full iteration, and then just one register could be used.
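The wN registers mentioned above form a sliding window of partial-product accumulators. Functionally, mpn_addmul_2 adds the full product {up,n} * {vp,2} into {rp,n+1} and returns the top limb. A hedged C model that runs the two product rows sequentially (the asm interleaves both rows per up-limb, which is what the window buys), with illustrative names:

#include <stdint.h>

/* {rp,n+1} += {up,n} * {vp,2}; returns the most significant limb of the sum. */
uint64_t
ref_addmul_2 (uint64_t *rp, const uint64_t *up, long n, const uint64_t *vp)
{
  unsigned __int128 t;
  uint64_t cy = 0;
  for (long i = 0; i < n; i++)       /* row 0: up[] * vp[0] into rp[i] */
    {
      t = (unsigned __int128) up[i] * vp[0] + rp[i] + cy;
      rp[i] = (uint64_t) t;
      cy = (uint64_t) (t >> 64);
    }
  t = (unsigned __int128) rp[n] + cy;
  rp[n] = (uint64_t) t;
  uint64_t hi = (uint64_t) (t >> 64);
  cy = 0;
  for (long i = 0; i < n; i++)       /* row 1: up[] * vp[1] into rp[i+1] */
    {
      t = (unsigned __int128) up[i] * vp[1] + rp[i + 1] + cy;
      rp[i + 1] = (uint64_t) t;
      cy = (uint64_t) (t >> 64);
    }
  return hi + cy;                    /* cannot wrap: the true top limb fits */
}

mpn_mul_2 is the same operation with the destination reads dropped; in the source, the AM2/ADDX m4 macros splice those reads in or out per build.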
+ +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n',  `%i2') +define(`vp', `%i3') + +define(`v0', `%o0') +define(`v1', `%o1') + +define(`w0', `%o2') +define(`w1', `%o3') +define(`w2', `%o4') +define(`w3', `%o5') + +ifdef(`OPERATION_mul_2',` +      define(`AM2',      `') +      define(`ADDX',	 `addcc`'$1') +      define(`func',     `mpn_mul_2') +') +ifdef(`OPERATION_addmul_2',` +      define(`AM2',      `$1') +      define(`ADDX',	 `addxccc($1,$2,$3)') +      define(`func',     `mpn_addmul_2') +') + + +MULFUNC_PROLOGUE(mpn_mul_2 mpn_addmul_2) + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(func) +	save	%sp, -176, %sp + +	ldx	[vp+0], v0		C load v0 +	and	n, 3, %g5 +	ldx	[vp+8], v1		C load v1 +	add	n, -6, n +	ldx	[up+0], %g4 +	brz	%g5, L(b0) +	 cmp	%g5, 2 +	bcs	L(b1) +	 nop +	be	L(b2) +	 nop + +L(b3): +AM2(`	ldx	[rp+0], %g1') +	mulx	%g4, v0, w2 +	umulxhi(%g4, v0, w3) +	ldx	[up+8], %i5 +	mulx	%g4, v1, %l3 +	umulxhi(%g4, v1, %l7) +AM2(`	ldx	[rp+8], %g3') +	add	up, -8, up +	add	rp, -8, rp +	b	L(lo3) +	 mov	0, w0 + +L(b2): +AM2(`	ldx	[rp+0], %g3') +	mulx	%g4, v0, w3 +	umulxhi(%g4, v0, w0) +	ldx	[up+8], %i4 +	mulx	%g4, v1, %l1 +	umulxhi(%g4, v1, %l5) +AM2(`	ldx	[rp+8], %g1') +	add	rp, 16, rp +	brlz	n, L(end) +	 mov	0, w1 +	ba	L(top) +	 add	up, 16, up + +L(b1): +AM2(`	ldx	[rp+0], %g1') +	mulx	%g4, v0, w0 +	umulxhi(%g4, v0, w1) +	ldx	[up+8], %i5 +	mulx	%g4, v1, %l3 +	umulxhi(%g4, v1, %l7) +AM2(`	ldx	[rp+8], %g3') +	add	up, 8, up +	add	rp, 8, rp +	b	L(lo1) +	 mov	0, w2 + +L(b0): +AM2(`	ldx	[rp+0], %g3') +	mulx	%g4, v0, w1 +	umulxhi(%g4, v0, w2) +	ldx	[up+8], %i4 +	mulx	%g4, v1, %l1 +	umulxhi(%g4, v1, %l5) +AM2(`	ldx	[rp+8], %g1') +	b	L(lo0) +	 mov	0, w3 + +	ALIGN(16)			C cycle +L(top):	mulx	%i4, v0, %l2		C 0->5 +	umulxhi(%i4, v0, %l6)		C 0->5 +	ldx	[up+0], %i5		C 1->6 +AM2(`	addcc	w3, %g3, w3')		C 1 +	stx	w3, [rp-16]		C 2 +	ADDX(`	%l1, w0, w0')		C 2 +	addxccc(%l5, w1, w1)		C 3 +	mulx	%i4, v1, %l3		C 3->9 +	umulxhi(%i4, v1, %l7)		C 4->9 +AM2(`	ldx	[rp+0], %g3')		C 4 +	addcc	%l2, w0, w0		C 5 +	addxccc(%l6, w1, w1)		C 5 +	addxc(	%g0, %g0, w2)		C 6 +L(lo1):	mulx	%i5, v0, %l0		C 6 +	umulxhi(%i5, v0, %l4)		C 7 +	ldx	[up+8], %i4		C 7 +AM2(`	addcc	w0, %g1, w0')		C 8 +	stx	w0, [rp-8]		C 8 +	ADDX(`	%l3, w1, w1')		C 9 +	addxccc(%l7, w2, w2)		C 9 +	mulx	%i5, v1, %l1		C 10 +	umulxhi(%i5, v1, %l5)		C 10 +AM2(`	ldx	[rp+8], %g1')		C 11 +	addcc	%l0, w1, w1		C 11 +	addxccc(%l4, w2, w2)		C 12 +	addxc(	%g0, %g0, w3)		C 12 +L(lo0):	mulx	%i4, v0, %l2		C 13 +	umulxhi(%i4, v0, %l6)		C 13 +	ldx	[up+16], %i5		C 14 +AM2(`	addcc	w1, %g3, w1')		C 14 +	stx	w1, [rp+0]		C 15 +	ADDX(`	%l1, w2, w2')		C 15 +	addxccc(%l5, w3, w3)		C 16 +	mulx	%i4, v1, %l3		C 16 +	umulxhi(%i4, v1, %l7)		C 17 +AM2(`	ldx	[rp+16], %g3')		C 17 +	addcc	%l2, w2, w2		C 18 +	addxccc(%l6, w3, w3)		C 18 +	addxc(	%g0, %g0, w0)		C 19 +L(lo3):	mulx	%i5, v0, %l0		C 19 +	umulxhi(%i5, v0, %l4)		C 20 +	ldx	[up+24], %i4		C 20 +AM2(`	addcc	w2, %g1, w2')		C 21 +	stx	w2, [rp+8]		C 21 +	ADDX(`	%l3, w3, w3')		C 22 +	addxccc(%l7, w0, w0)		C 22 +	mulx	%i5, v1, %l1		C 23 +	umulxhi(%i5, v1, %l5)		C 23 +AM2(`	ldx	[rp+24], %g1')		C 24 +	addcc	%l0, w3, w3		C 24 +	addxccc(%l4, w0, w0)		C 25 +	addxc(	%g0, %g0, w1)		C 25 +	add	up, 32, up +	add	rp, 32, rp +	brgz	n, L(top) +	 add	n, -4, n + +L(end):	mulx	%i4, v0, %l2 +	umulxhi(%i4, v0, %l6) +AM2(`	addcc	w3, %g3, w3') +	stx	w3, [rp-16] +	ADDX(`	%l1, w0, w0') +	addxccc(%l5, w1, w1) +	mulx	%i4, v1, %l3 +	umulxhi(%i4, v1, %l7) +	addcc	%l2, w0, w0 +	addxccc(%l6, w1, w1) +	addxc(	%g0, %g0, w2) 
+AM2(`	addcc	w0, %g1, w0') +	stx	w0, [rp-8] +	ADDX(`	%l3, w1, w1') +	stx	w1, [rp+0] +	addxc(%l7, w2, %i0) + +	ret +	 restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_4.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_4.asm new file mode 100644 index 0000000..845f6d6 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_4.asm @@ -0,0 +1,219 @@ +dnl  SPARC v9 mpn_mul_4 and mpn_addmul_4 for T3/T4/T5. + +dnl  Contributed to the GNU project by Torbjörn Granlund. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C		    cycles/limb      cycles/limb +C		       mul_4           addmul_4 +C UltraSPARC T3:	21.5		22.0 +C UltraSPARC T4:	 2.625		 2.75 + + +C The code is well-scheduled and relies on OoO very little.  There is hope that +C this will run at around 2.5 and 2.75 c/l respectively, on T4. + +define(`rp', `%i0') +define(`up', `%i1') +define(`n',  `%i2') +define(`vp', `%i3') + +define(`v0', `%g1') +define(`v1', `%o7') +define(`v2', `%g2') +define(`v3', `%i3') + +define(`w0', `%o0') +define(`w1', `%o1') +define(`w2', `%o2') +define(`w3', `%o3') +define(`w4', `%o4') + +define(`r0', `%o5') + +define(`u0', `%i4') +define(`u1', `%i5') + +define(`rp0', `rp') +define(`rp1', `%g3') +define(`rp2', `%g4') +define(`up0', `up') +define(`up1', `%g5') + +ifdef(`OPERATION_mul_4',` +      define(`AM4',      `') +      define(`ADDX',	 `addcc`'$1') +      define(`func',     `mpn_mul_4') +') +ifdef(`OPERATION_addmul_4',` +      define(`AM4',      `$1') +      define(`ADDX',	 `addxccc($1,$2,$3)') +      define(`func',     `mpn_addmul_4') +') + + +MULFUNC_PROLOGUE(mpn_mul_4 mpn_addmul_4) + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(func) +	save	%sp, -176, %sp + +	ldx	[up + 0], u1		C load up[0] early +	andcc	n, 1, %g0		C is n odd? 
+	ldx	[vp + 0], v0 +	sllx	n, 3, n +	ldx	[vp + 8], v1 +	add	n, -28, n +	ldx	[vp + 16], v2 +	add	rp, -16, rp +	ldx	[vp + 24], v3 +	add	up, n, up0 +	add	rp, n, rp0 +	add	up0, 8, up1 +	add	rp0, 8, rp1 +	add	rp0, 16, rp2 +	mulx	u1, v0, %l0 +	mov	0, w0 +	mulx	u1, v1, %l1 +	mov	0, w1 +	mulx	u1, v2, %l2 +	mov	0, w2 +	mulx	u1, v3, %l3 +	mov	0, w3 + +	be	L(evn) +	 neg	n, n + +L(odd):	mov	u1, u0 +	ldx	[up1 + n], u1 +AM4(`	ldx	[rp2 + n], r0') +	umulxhi(u0, v0, %l4) +	umulxhi(u0, v1, %l5) +	umulxhi(u0, v2, %l6) +	umulxhi(u0, v3, %l7) +	b	L(mid) +	 add	n, 8, n + +L(evn):	ldx	[up1 + n], u0 +AM4(`	ldx	[rp2 + n], r0') +	umulxhi(u1, v0, %l4) +	umulxhi(u1, v1, %l5) +	umulxhi(u1, v2, %l6) +	umulxhi(u1, v3, %l7) +	add	n, 16, n + +	ALIGN(16) +L(top):	addcc	%l0, w0, w0 +	mulx	u0, v0, %l0	C w 0 +	addxccc(%l1, w1, w1) +	mulx	u0, v1, %l1	C w 1 +	addxccc(%l2, w2, w2) +	mulx	u0, v2, %l2	C w 2 +	addxccc(%l3, w3, w3) +	mulx	u0, v3, %l3	C w 3 +	ldx	[up0 + n], u1 +	addxc(	%g0, %g0, w4) +AM4(`	addcc	r0, w0, w0') +	stx	w0, [rp0 + n] +	ADDX(`	%l4, w1, w0') +	umulxhi(u0, v0, %l4)	C w 1 +AM4(`	ldx	[rp1 + n], r0') +	addxccc(%l5, w2, w1) +	umulxhi(u0, v1, %l5)	C w 2 +	addxccc(%l6, w3, w2) +	umulxhi(u0, v2, %l6)	C w 3 +	addxc(	%l7, w4, w3) +	umulxhi(u0, v3, %l7)	C w 4 +L(mid):	addcc	%l0, w0, w0 +	mulx	u1, v0, %l0	C w 1 +	addxccc(%l1, w1, w1) +	mulx	u1, v1, %l1	C w 2 +	addxccc(%l2, w2, w2) +	mulx	u1, v2, %l2	C w 3 +	addxccc(%l3, w3, w3) +	mulx	u1, v3, %l3	C w 4 +	ldx	[up1 + n], u0 +	addxc(	%g0, %g0, w4) +AM4(`	addcc	r0, w0, w0') +	stx	w0, [rp1 + n] +	ADDX(`	%l4, w1, w0') +	umulxhi(u1, v0, %l4)	C w 2 +AM4(`	ldx	[rp2 + n], r0') +	addxccc(%l5, w2, w1) +	umulxhi(u1, v1, %l5)	C w 3 +	addxccc(%l6, w3, w2) +	umulxhi(u1, v2, %l6)	C w 4 +	addxc(	%l7, w4, w3) +	umulxhi(u1, v3, %l7)	C w 5 +	brlz	n, L(top) +	 add	n, 16, n + +L(end):	addcc	%l0, w0, w0 +	mulx	u0, v0, %l0 +	addxccc(%l1, w1, w1) +	mulx	u0, v1, %l1 +	addxccc(%l2, w2, w2) +	mulx	u0, v2, %l2 +	addxccc(%l3, w3, w3) +	mulx	u0, v3, %l3 +	addxc(	%g0, %g0, w4) +AM4(`	addcc	r0, w0, w0') +	stx	w0, [rp0 + n] +	ADDX(`	%l4, w1, w0') +	umulxhi(u0, v0, %l4) +AM4(`	ldx	[rp1 + n], r0') +	addxccc(%l5, w2, w1) +	umulxhi(u0, v1, %l5) +	addxccc(%l6, w3, w2) +	umulxhi(u0, v2, %l6) +	addxc(	%l7, w4, w3) +	umulxhi(u0, v3, %l7) +	addcc	%l0, w0, w0 +	addxccc(%l1, w1, w1) +	addxccc(%l2, w2, w2) +	addxccc(%l3, w3, w3) +	addxc(	%g0, %g0, w4) +AM4(`	addcc	r0, w0, w0') +	stx	w0, [rp1 + n] +	ADDX(`	%l4, w1, w0') +	addxccc(%l5, w2, w1) +	addxccc(%l6, w3, w2) +	stx	w0, [rp2 + n] +	add	n, 16, n +	stx	w1, [rp1 + n] +	stx	w2, [rp2 + n] +	addxc(	%l7, w4, %i0) +	ret +	 restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/aorslsh_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/aorslsh_n.asm new file mode 100644 index 0000000..1014b1b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/aorslsh_n.asm @@ -0,0 +1,147 @@ +dnl  SPARC v9 mpn_addlsh_n and mpn_sublsh_n for T3/T4/T5. + +dnl  Contributed to the GNU project by Torbjörn Granlund. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. 
+dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC T3:	11 +C UltraSPARC T4:	 4 + +C For sublsh_n we combine the two shifted limbs using xnor, using the identity +C (a xor not b) = (not (a xor b)) which equals (not (a or b)) when (a and b) = +C 0 as it is in our usage.  This gives us the ones complement for free. +C Unfortunately, the same trick will not work for rsblsh_n, which will instead +C require a separate negation. +C +C FIXME: Add rsblsh_n to this file. + +define(`rp', `%i0') +define(`up', `%i1') +define(`vp', `%i2') +define(`n',  `%i3') +define(`cnt',`%i4') + +define(`tnc',`%o5') + +ifdef(`OPERATION_addlsh_n',` +  define(`INITCY', `subcc	%g0, 0, %g0') +  define(`MERGE',  `or') +  define(`func',   `mpn_addlsh_n') +') +ifdef(`OPERATION_sublsh_n',` +  define(`INITCY', `subcc	%g0, 1, %g0') +  define(`MERGE',  `xnor') +  define(`func',   `mpn_sublsh_n') +') + +define(`rp0',  `rp') +define(`rp1',  `%o2') +define(`up0',  `up') +define(`up1',  `%o3') +define(`vp0',  `vp') +define(`vp1',  `%o4') + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_sublsh_n) +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(func) +	save	%sp, -176, %sp +	mov	64, tnc +	sub	tnc, cnt, tnc + +	andcc	n, 1, %g0 +	sllx	n, 3, n +	add	n, -16, n +	add	up, n, up0 +	add	vp, n, vp0 +	add	rp, n, rp0 +	add	up0, 8, up1 +	add	vp0, 8, vp1 +	add	rp0, -8, rp1 +	add	rp0, -16, rp0 +	neg	n, n +	be	L(evn) +	 INITCY + +L(odd):	ldx	[vp0 + n], %l1 +	mov	0, %l2 +	ldx	[up0 + n], %l5 +	sllx	%l1, cnt, %g3 +	brgez	n, L(wd1) +	 add	n, 8, n +	ldx	[vp0 + n], %l0 +	b	L(lo1) +	 sllx	%l1, cnt, %g3 + +L(evn):	ldx	[vp0 + n], %l0 +	mov	0, %l3 +	ldx	[up0 + n], %l4 +	ldx	[vp1 + n], %l1 +	b	L(lo0) +	 sllx	%l0, cnt, %g1 + +L(top):	addxccc(%l6, %l4, %o0) +	ldx	[vp0 + n], %l0 +	sllx	%l1, cnt, %g3 +	stx	%o0, [rp0 + n] +L(lo1):	srlx	%l1, tnc, %l3 +	MERGE	%l2, %g3, %l7 +	ldx	[up0 + n], %l4 +	addxccc(%l7, %l5, %o1) +	ldx	[vp1 + n], %l1 +	sllx	%l0, cnt, %g1 +	stx	%o1, [rp1 + n] +L(lo0):	srlx	%l0, tnc, %l2 +	MERGE	%l3, %g1, %l6 +	ldx	[up1 + n], %l5 +	brlz,pt	n, L(top) +	 add	n, 16, n + +	addxccc(%l6, %l4, %o0) +	sllx	%l1, cnt, %g3 +	stx	%o0, [rp0 + n] +L(wd1):	srlx	%l1, tnc, %l3 +	MERGE	%l2, %g3, %l7 +	addxccc(%l7, %l5, %o1) +	stx	%o1, [rp1 + n] + +ifdef(`OPERATION_addlsh_n', +`	addxc(	%l3, %g0, %i0)') +ifdef(`OPERATION_sublsh_n', +`	addxc(	%g0, %g0, %g1) +	add	%g1, -1, %g1 +	sub	%l3, %g1, %i0') + +	ret +	 restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm new file mode 100644 index 0000000..550860d --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm @@ -0,0 +1,147 @@ +dnl  SPARC T3/T4/T5 mpn_bdiv_dbm1c. + +dnl  Contributed to the GNU project by Torbjörn Granlund. + +dnl  Copyright 2013 Free Software Foundation, Inc. 
+ +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C                  cycles/limb +C UltraSPARC T3:	25 +C UltraSPARC T4/T5:	 4 + +C INPUT PARAMETERS +define(`qp',  `%i0') +define(`ap',  `%i1') +define(`n',   `%i2') +define(`bd',  `%i3') +define(`h',   `%i4') + +define(`plo0',`%g4')  define(`plo1',`%g5') +define(`phi0',`%l0')  define(`phi1',`%l1') +define(`a0',  `%g1')  define(`a1',  `%g3') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_bdiv_dbm1c) +	save	%sp, -176, %sp + +	and	n, 3, %g5 +	ldx	[ap + 0], %g2 +	add	n, -5, n +	brz	%g5, L(b0) +	 cmp	%g5, 2 +	bcs	%xcc, L(b1) +	 nop +	be	%xcc, L(b2) +	 nop + +L(b3):	ldx	[ap + 8], a0 +	mulx	bd, %g2, plo1 +	umulxhi(bd, %g2, phi1) +	ldx	[ap + 16], a1 +	add	qp, -24, qp +	b	L(lo3) +	 add	ap, -8, ap + +L(b2):	ldx	[ap + 8], a1 +	mulx	bd, %g2, plo0 +	umulxhi(bd, %g2, phi0) +	brlz,pt n, L(wd2) +	 nop +L(gt2):	ldx	[ap + 16], a0 +	add	ap, 16, ap +	b	L(lo2) +	 add	n, -1, n + +L(b1):	mulx	bd, %g2, plo1 +	 umulxhi(bd, %g2, phi1) +	brlz,pn	n, L(wd1) +	 add	qp, -8, qp +L(gt1):	ldx	[ap + 8], a0 +	ldx	[ap + 16], a1 +	b	L(lo1) +	 add	ap, 8, ap + +L(b0):	ldx	[ap + 8], a1 +	mulx	bd, %g2, plo0 +	umulxhi(bd, %g2, phi0) +	ldx	[ap + 16], a0 +	b	L(lo0) +	 add	qp, -16, qp + +L(top):	ldx	[ap + 0], a0 +	sub	h, phi1, h +L(lo2):	mulx	bd, a1, plo1 +	umulxhi(bd, a1, phi1) +	subcc	h, plo0, h +	addxc(	phi0, %g0, phi0) +	stx	h, [qp + 0] +	ldx	[ap + 8], a1 +	sub	h, phi0, h +L(lo1):	mulx	bd, a0, plo0 +	umulxhi(bd, a0, phi0) +	subcc	h, plo1, h +	addxc(	phi1, %g0, phi1) +	stx	h, [qp + 8] +	ldx	[ap + 16], a0 +	sub	h, phi1, h +L(lo0):	mulx	bd, a1, plo1 +	umulxhi(bd, a1, phi1) +	subcc	h, plo0, h +	addxc(	phi0, %g0, phi0) +	stx	h, [qp + 16] +	ldx	[ap + 24], a1 +	sub	h, phi0, h +L(lo3):	mulx	bd, a0, plo0 +	umulxhi(bd, a0, phi0) +	subcc	h, plo1, h +	addxc(	phi1, %g0, phi1) +	stx	h, [qp + 24] +	add	ap, 32, ap +	add	qp, 32, qp +	brgz,pt	n, L(top) +	 add	n, -4, n + +L(end):	sub	h, phi1, h +L(wd2):	mulx	bd, a1, plo1 +	umulxhi(bd, a1, phi1) +	subcc	h, plo0, h +	addxc(	phi0, %g0, phi0) +	stx	h, [qp + 0] +	sub	h, phi0, h +L(wd1):	subcc	h, plo1, h +	addxc(	phi1, %g0, phi1) +	stx	h, [qp + 8] +	sub	h, phi1, %i0 + +	ret +	 restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_q_1.asm new file mode 100644 index 0000000..9847047 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_q_1.asm @@ -0,0 +1,137 @@ +dnl  SPARC T3/T4/T5 mpn_bdiv_q_1. 
+ +dnl  Contributed to the GNU project by Torbjörn Granlund. + +dnl  Copyright 2013, 2017 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C                  cycles/limb +C UltraSPARC T3:	31 +C UltraSPARC T4/T5:	20-26  hits 20 early, then sharply drops + +C INPUT PARAMETERS +define(`qp',  `%i0') +define(`ap',  `%i1') +define(`n',   `%i2') +define(`d',   `%i3') +define(`dinv',`%i4') +define(`cnt', `%i5') + +define(`tnc', `%o2') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_bdiv_q_1) +	save	%sp, -176, %sp +	ldx	[ap], %o5 +	add	d, -1, %g1 +	andn	%g1, d, %g1 +	popc	%g1, cnt + +	srlx	d, cnt, d +	srlx	d, 1, %g1 +	and	%g1, 127, %g1 +	LEA64(binvert_limb_table, g2, g4) +	ldub	[%g2+%g1], %g1 +	add	%g1, %g1, %g2 +	mulx	%g1, %g1, %g1 +	mulx	%g1, d, %g1 +	sub	%g2, %g1, %g2 +	add	%g2, %g2, %g1 +	mulx	%g2, %g2, %g2 +	mulx	%g2, d, %g2 +	sub	%g1, %g2, %g1 +	add	%g1, %g1, %o7 +	mulx	%g1, %g1, %g1 +	mulx	%g1, d, %g1 +	add	n, -2, n +	brz,pt	cnt, L(norm) +	 sub	%o7, %g1, dinv + +	brlz,pt	n, L(edu) +	 srlx	%o5, cnt, %o5 +	b	L(eee) +	 mov	0, %g4 +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) +	save	%sp, -176, %sp +	ldx	[ap], %o5 + +	brz,pt	cnt, L(norm) +	 add	n, -2, n + +L(unorm): +	brlz,pt	n, L(edu) +	 srlx	%o5, cnt, %o5 +	mov	0, %g4 +L(eee):	sub	%g0, cnt, tnc + +L(tpu):	ldx	[ap+8], %g3 +	add	ap, 8, ap +	sllx	%g3, tnc, %g5 +	or	%g5, %o5, %g5 +	srlx	%g3, cnt, %o5 +	subcc	%g5, %g4, %g4 +	mulx	%g4, dinv, %g1 +	stx	%g1, [qp] +	add	qp, 8, qp +	umulxhi(d, %g1, %g1) +	addxc(	%g1, %g0, %g4) +	brgz,pt	n, L(tpu) +	 add	n, -1, n + +	sub	%o5, %g4, %o5 +L(edu):	mulx	%o5, dinv, %g1 +	return	%i7+8 +	 stx	%g1, [%o0] + +L(norm): +	mulx	dinv, %o5, %g1 +	brlz,pt	n, L(edn) +	 stx	%g1, [qp] +	add	qp, 8, qp +	addcc	%g0, 0, %g4 + +L(tpn):	umulxhi(d, %g1, %g1) +	ldx	[ap+8], %g5 +	add	ap, 8, ap +	addxc(	%g1, %g0, %g1) +	subcc	%g5, %g1, %g1 +	mulx	%g1, dinv, %g1 +	stx	%g1, [qp] +	add	qp, 8, qp +	brgz,pt	n, L(tpn) +	 add	n, -1, n + +L(edn):	return	%i7+8 +	 nop +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/cnd_aors_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/cnd_aors_n.asm new file mode 100644 index 0000000..49ccaec --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/cnd_aors_n.asm @@ -0,0 +1,145 @@ +dnl  SPARC v9 mpn_cnd_add_n and mpn_cnd_sub_n for T3/T4/T5. + +dnl  Contributed to the GNU project by David Miller and Torbjörn Granlund. + +dnl  Copyright 2013, 2017 Free Software Foundation, Inc. 
+ +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC T3:	 8.5 +C UltraSPARC T4:	 3 + +C We use a double-pointer trick to allow indexed addressing.  Its setup +C cost might be a problem in these functions, since we don't expect huge n +C arguments. +C +C For sub we need ~(a & mask) = (~a | ~mask) but by complementing mask we can +C instead do ~(a & ~mask) = (~a | mask), allowing us to use the orn insn. + +C INPUT PARAMETERS +define(`cnd', `%i0') +define(`rp',  `%i1') +define(`up',  `%i2') +define(`vp',  `%i3') +define(`n',   `%i4') + +define(`mask',   `cnd') +define(`up0', `%l0')  define(`up1', `%l1') +define(`vp0', `%l2')  define(`vp1', `%l3') +define(`rp0', `%g4')  define(`rp1', `%g5') +define(`u0',  `%l4')  define(`u1',  `%l5') +define(`v0',  `%l6')  define(`v1',  `%l7') +define(`x0',  `%g1')  define(`x1',  `%g3') +define(`w0',  `%g1')  define(`w1',  `%g3') + +ifdef(`OPERATION_cnd_add_n',` +  define(`LOGOP',   `and	$1, $2, $3') +  define(`MAKEMASK',`cmp	%g0, $1 +		     addxc(	%g0, %g0, $2) +		     neg	$2, $2') +  define(`INITCY',  `addcc	%g0, 0, %g0') +  define(`RETVAL',  `addxc(	%g0, %g0, %i0)') +  define(`func',    `mpn_cnd_add_n') +') +ifdef(`OPERATION_cnd_sub_n',` +  define(`LOGOP',   `orn	$2, $1, $3') +  define(`MAKEMASK',`cmp	$1, 1 +		     addxc(	%g0, %g0, $2) +		     neg	$2, $2') +  define(`INITCY',  `subcc	%g0, 1, %g0') +  define(`RETVAL',  `addxc(	%g0, %g0, %i0) +		     xor	%i0, 1, %i0') +  define(`func',    `mpn_cnd_sub_n') +') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(func) +	save	%sp, -176, %sp + +	MAKEMASK(cnd,mask) + +	andcc	n, 1, %g0 +	sllx	n, 3, n +	add	n, -16, n +	add	vp, n, vp0 +	add	up, n, up0 +	add	rp, n, rp0 +	neg	n, n +	be	L(evn) +	 INITCY + +L(odd):	ldx	[vp0 + n], v1 +	ldx	[up0 + n], u1 +	LOGOP(	v1, mask, x1) +	addxccc(u1, x1, w1) +	stx	w1, [rp0 + n] +	add	n, 8, n +	brgz	n, L(rtn) +	 nop + +L(evn):	add	vp0, 8, vp1 +	add	up0, 8, up1 +	add	rp0, -24, rp1 +	ldx	[vp0 + n], v0 +	ldx	[vp1 + n], v1 +	ldx	[up0 + n], u0 +	ldx	[up1 + n], u1 +	add	n, 16, n +	brgz	n, L(end) +	 add	rp0, -16, rp0 + +L(top):	LOGOP(	v0, mask, x0) +	ldx	[vp0 + n], v0 +	LOGOP(	v1, mask, x1) +	ldx	[vp1 + n], v1 +	addxccc(u0, x0, w0) +	ldx	[up0 + n], u0 +	addxccc(u1, x1, w1) +	ldx	[up1 + n], u1 +	stx	w0, [rp0 + n] +	add	n, 16, n +	brlez	n, L(top) +	 stx	w1, [rp1 + n] + +L(end):	LOGOP(	v0, mask, x0) +	LOGOP(	v1, mask, x1) +	addxccc(u0, x0, w0) +	addxccc(u1, 
x1, w1) +	stx	w0, [rp0 + n] +	stx	w1, [rp1 + 32] + +L(rtn):	RETVAL +	ret +	 restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/dive_1.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/dive_1.asm new file mode 100644 index 0000000..d7dbdf9 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/dive_1.asm @@ -0,0 +1,129 @@ +dnl  SPARC T3/T4/T5 mpn_divexact_1. + +dnl  Contributed to the GNU project by Torbjörn Granlund. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C                  cycles/limb +C UltraSPARC T3:	31 +C UltraSPARC T4/T5:	20-26  hits 20 early, then sharply drops + +C INPUT PARAMETERS +define(`qp',  `%i0') +define(`ap',  `%i1') +define(`n',   `%i2') +define(`d',   `%i3') + +define(`dinv',`%o4') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_divexact_1) +	save	%sp, -176, %sp +	cmp	n, 1 +	bne,pt	%xcc, L(gt1) +	 ldx	[ap], %o5 +	udivx	%o5, d, %g1 +	stx	%g1, [qp] +	return	%i7+8 +	 nop + +L(gt1):	add	d, -1, %g1 +	andn	%g1, d, %g1 +	popc	%g1, %i4		C i4 = count_trailing_zeros(d) + +	srlx	d, %i4, d +	srlx	d, 1, %g1 +	and	%g1, 127, %g1 + +	LEA64(binvert_limb_table, g2, g4) +	ldub	[%g2+%g1], %g1 +	add	%g1, %g1, %g2 +	mulx	%g1, %g1, %g1 +	mulx	%g1, d, %g1 +	sub	%g2, %g1, %g2 +	add	%g2, %g2, %g1 +	mulx	%g2, %g2, %g2 +	mulx	%g2, d, %g2 +	sub	%g1, %g2, %g1 +	add	%g1, %g1, %o7 +	mulx	%g1, %g1, %g1 +	mulx	%g1, d, %g1 +	add	n, -2, n +	brz,pt	%i4, L(norm) +	 sub	%o7, %g1, dinv + +L(unnorm): +	mov	0, %g4 +	sub	%g0, %i4, %o2 +	srlx	%o5, %i4, %o5 +L(top_unnorm): +	ldx	[ap+8], %g3 +	add	ap, 8, ap +	sllx	%g3, %o2, %g5 +	or	%g5, %o5, %g5 +	srlx	%g3, %i4, %o5 +	subcc	%g5, %g4, %g4 +	mulx	%g4, dinv, %g1 +	stx	%g1, [qp] +	add	qp, 8, qp +	umulxhi(d, %g1, %g1) +	addxc(	%g1, %g0, %g4) +	brgz,pt	n, L(top_unnorm) +	 add	n, -1, n + +	sub	%o5, %g4, %g4 +	mulx	%g4, dinv, %g1 +	stx	%g1, [qp] +	return	%i7+8 +	 nop + +L(norm): +	mulx	dinv, %o5, %g1 +	stx	%g1, [qp] +	add	qp, 8, qp +	addcc	%g0, 0, %g4 +L(top_norm): +	umulxhi(d, %g1, %g1) +	ldx	[ap+8], %g5 +	add	ap, 8, ap +	addxc(	%g1, %g0, %g1) +	subcc	%g5, %g1, %g1 +	mulx	%g1, dinv, %g1 +	stx	%g1, [qp] +	add	qp, 8, qp +	brgz,pt	n, L(top_norm) +	 add	n, -1, n + +	return	%i7+8 +	 nop +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/hamdist.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/hamdist.asm new file mode 100644 index 0000000..20ed8bf --- /dev/null +++ 
b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/hamdist.asm @@ -0,0 +1,78 @@ +dnl  SPARC v9 mpn_hamdist for T3/T4. + +dnl  Contributed to the GNU project by David Miller. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC T3:	18 +C UltraSPARC T4:	 3.5 + +C INPUT PARAMETERS +define(`up',   `%o0') +define(`vp',   `%o1') +define(`n',    `%o2') +define(`pcnt', `%o5') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_hamdist) +	subcc	n, 1, n +	be	L(final_one) +	 clr	pcnt +L(top): +	ldx	[up + 0], %g1 +	ldx	[vp + 0], %g2 +	ldx	[up + 8], %o4 +	ldx	[vp + 8], %g3 +	sub	n, 2, n +	xor	%g1, %g2, %g1 +	add	up, 16, up +	popc	%g1, %g2 +	add	vp, 16, vp +	xor	%o4, %g3, %o4 +	add	pcnt, %g2, pcnt +	popc	%o4, %g3 +	brgz	n, L(top) +	 add	pcnt, %g3, pcnt +	brlz,pt	n, L(done) +	 nop +L(final_one): +	ldx	[up + 0], %g1 +	ldx	[vp + 0], %g2 +	xor	%g1,%g2, %g1 +	popc	%g1, %g2 +	add	pcnt, %g2, pcnt +L(done): +	retl +	 mov	pcnt, %o0 +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/invert_limb.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/invert_limb.asm new file mode 100644 index 0000000..4da49cf --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/invert_limb.asm @@ -0,0 +1,92 @@ +dnl  SPARC T3/T4/T5 mpn_invert_limb. + +dnl  Contributed to the GNU project by Torbjörn Granlund. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  
If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C                  cycles/limb +C UltraSPARC T3:	 ? +C UltraSPARC T4/T5:	 ? + +C INPUT PARAMETERS +define(`d',  `%o0') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_invert_limb) +	srlx	d, 54, %g1 +	LEA64(approx_tab, g2, g3) +	and	%g1, 0x1fe, %g1 +	srlx	d, 24, %g4 +	lduh	[%g2+%g1], %g3 +	add	%g4, 1, %g4 +	sllx	%g3, 11, %g2 +	add	%g2, -1, %g2 +	mulx	%g3, %g3, %g3 +	mulx	%g3, %g4, %g3 +	srlx	%g3, 40, %g3 +	sub	%g2, %g3, %g2 +	sllx	%g2, 60, %g1 +	mulx	%g2, %g2, %g3 +	mulx	%g3, %g4, %g4 +	sub	%g1, %g4, %g1 +	srlx	%g1, 47, %g1 +	sllx	%g2, 13, %g2 +	add	%g1, %g2, %g1 +	and	d, 1, %g2 +	srlx	%g1, 1, %g4 +	sub	%g0, %g2, %g3 +	and	%g4, %g3, %g3 +	srlx	d, 1, %g4 +	add	%g4, %g2, %g2 +	mulx	%g1, %g2, %g2 +	sub	%g3, %g2, %g2 +	umulxhi(%g1, %g2, %g2) +	srlx	%g2, 1, %g2 +	sllx	%g1, 31, %g1 +	add	%g2, %g1, %g1 +	mulx	%g1, d, %g3 +	umulxhi(d, %g1, %g4) +	addcc	%g3, d, %g0 +	addxc(	%g4, d, %o0) +	jmp	%o7+8 +	 sub	%g1, %o0, %o0 +EPILOGUE() + +	RODATA +	ALIGN(2) +	TYPE(	approx_tab, object) +	SIZE(	approx_tab, 512) +approx_tab: +forloop(i,256,512-1,dnl +`	.half	eval(0x7fd00/i) +')dnl diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.asm new file mode 100644 index 0000000..c79032d --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.asm @@ -0,0 +1,77 @@ +dnl  SPARC v9-2011 simulation support. + +dnl  Contributed to the GNU project by Torbjörn Granlund. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
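The __gmpn_umulh simulation that follows computes the high half of a 64x64-bit product out of 32-bit pieces, since the simulated target lacks umulxhi. A portable C sketch of the same decomposition (the function name is illustrative, uint64_t limbs assumed):

#include <stdint.h>

/* Sketch: high 64 bits of a*b via four 32x32->64 partial products,
   mirroring the srl/mulx/movcs sequence in __gmpn_umulh below.  */
static uint64_t umulh (uint64_t a, uint64_t b)
{
  uint64_t al = (uint32_t) a, ah = a >> 32;
  uint64_t bl = (uint32_t) b, bh = b >> 32;
  uint64_t lo  = al * bl;
  uint64_t m1  = ah * bl;
  uint64_t m2  = al * bh;
  uint64_t hi  = ah * bh;
  uint64_t mid = (lo >> 32) + m1;   /* cannot wrap: fits in 64 bits */
  mid += m2;                        /* may wrap ...                 */
  if (mid < m2)
    hi += (uint64_t) 1 << 32;       /* ... wrap at weight 2^32 is
                                       2^32 in the high half        */
  return hi + (mid >> 32);
}

On T3 and later this entire routine is a single umulxhi instruction; the simulation only matters when running v9-2011 code on older hardware.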
+ +include(`../config.m4') + +ASM_START() +PROLOGUE(__gmpn_umulh) +	save	%sp, -176, %sp +	ldx	[%sp+2047+176+256], %o0 +	ldx	[%sp+2047+176+256+8], %o1 +	rd	%ccr, %o4 +	srl	%o0, 0, %l4 +	srl	%o1, 0, %l1 +	srlx	%o1, 32, %o1 +	mulx	%o1, %l4, %l2 +	srlx	%o0, 32, %o0 +	mulx	%o0, %l1, %l3 +	mulx	%l1, %l4, %l1 +	srlx	%l1, 32, %l1 +	add	%l2, %l1, %l2 +	addcc	%l2, %l3, %l2 +	mulx	%o1, %o0, %o1 +	mov	0, %l1 +	movcs	%xcc, 1, %l1 +	sllx	%l1, 32, %l1 +	add	%o1, %l1, %o1 +	srlx	%l2, 32, %o0 +	add	%o1, %o0, %o0 +	stx	%o0, [%sp+2047+176+256] +	wr	%o4, 0, %ccr +	ret +	 restore +EPILOGUE() + +PROLOGUE(__gmpn_lzcnt) +	save	%sp, -176, %sp +	ldx	[%sp+2047+176+256], %o0 +	brz,a	%o0, 2f +	 mov	64, %o1 +	brlz	%o0, 2f +	 mov	0, %o1 +1:	sllx	%o0, 1, %o0 +	brgz	%o0, 1b +	 add	%o1, 1, %o1 +	stx	%o1, [%sp+2047+176+256] +2:	ret +	 restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.m4 b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.m4 new file mode 100644 index 0000000..e5d6d8e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.m4 @@ -0,0 +1,88 @@ +dnl  SPARC v9-2011 simulation support. + +dnl  Contributed to the GNU project by Torbjörn Granlund. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
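The m4 macros that follow emulate the v9-2011 carry-chain instructions on older hardware, spilling scratch state through the stack. Their intended semantics, as a C model (the simulated flag cf and the function names are illustrative, not part of any real API):

#include <stdint.h>

static unsigned cf;                 /* stands in for the %xcc.c bit */

/* addxccc: 64-bit add with carry-in and carry-out (sketch) */
static uint64_t addxccc (uint64_t a, uint64_t b)
{
  uint64_t s = a + b + cf;
  cf = (s < a) || (cf && s == a);   /* carry out of bit 63 */
  return s;
}

/* addxc: 64-bit add with carry-in only; the flag is left
   unspecified afterwards, matching the bcc/add sequence below */
static uint64_t addxc (uint64_t a, uint64_t b)
{
  return a + b + cf;
}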
+ + +dnl Usage addxccc(r1,r2,r3, t1) +dnl  64-bit add with carry-in and carry-out +dnl  FIXME: Register g2 must not be destination + +define(`addxccc',`dnl +	add	%sp, -512, %sp +	stx	%g2, [%sp+2047+256+16] +	mov	0, %g2 +	movcs	%xcc, -1, %g2 +	addcc	%g2, 1, %g0 +	addccc	$1, $2, $3 +	ldx	[%sp+2047+256+16], %g2 +	sub	%sp, -512, %sp +') + + +dnl Usage addxc(r1,r2,r3, t1,t2) +dnl  64-bit add with carry-in + +define(`addxc',`dnl +	bcc	%xcc, 1f +	 add	$1, $2, $3 +	add	$3, 1, $3 +1: +') + + +dnl Usage umulxhi(r1,r2,r3) +dnl  64-bit multiply returning upper 64 bits +dnl  Calls __gmpn_umulh using a non-standard calling convention + +define(`umulxhi',`dnl +	add	%sp, -512, %sp +	stx	$1, [%sp+2047+256] +	stx	$2, [%sp+2047+256+8] +	stx	%o7, [%sp+2047+256+16] +	call	__gmpn_umulh +	 nop +	ldx	[%sp+2047+256+16], %o7 +	ldx	[%sp+2047+256], $3 +	sub	%sp, -512, %sp +') +dnl Usage lzcnt(r1,r2) +dnl  Plain count leading zeros +dnl  Calls __gmpn_lzcnt using a non-standard calling convention + +define(`lzcnt',`dnl +	add	%sp, -512, %sp +	stx	%o7, [%sp+2047+256+16] +	call	__gmpn_lzcnt +	 stx	$1, [%sp+2047+256] +	ldx	[%sp+2047+256+16], %o7 +	ldx	[%sp+2047+256], $2 +	sub	%sp, -512, %sp +') diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_1_4.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_1_4.asm new file mode 100644 index 0000000..08facbd --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_1_4.asm @@ -0,0 +1,233 @@ +dnl  SPARC T3/T4/T5 mpn_mod_1s_4p. + +dnl  Contributed to the GNU project by Torbjörn Granlund. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
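mpn_mod_1s_4p folds four limbs per iteration using precomputed constants for B^1 through B^5 mod d (the five values loaded from the cps array at offsets 16..48 below), so the multiplies are mutually independent and can fill the pipeline instead of serializing on a division per limb. A correctness-first C sketch of the folding idea, not GMP's exact pre-shifted, unreduced-accumulator discipline (assumes d below 2^62, the small-divisor regime this variant targets, 4 | n, and GCC's __int128; names illustrative):

#include <stdint.h>
#include <stddef.h>
typedef unsigned __int128 u128;

/* bk[k] = B^k mod d for k = 1..4, B = 2^64; bk[0] unused.  The '%'
   per block stands in for the preinverse-based reduction the real
   code performs with the stored dinv.  */
static uint64_t
mod_1_4_idea (const uint64_t *a, size_t n, uint64_t d, const uint64_t bk[5])
{
  uint64_t r = 0;                     /* remainder so far, < d */
  while (n != 0)                      /* high block first */
    {
      n -= 4;
      u128 s = (u128) a[n]            /* weight 1   */
             + (u128) a[n+1] * bk[1]  /* weight B   */
             + (u128) a[n+2] * bk[2]  /* weight B^2 */
             + (u128) a[n+3] * bk[3]  /* weight B^3 */
             + (u128) r      * bk[4]; /* carry-in at weight B^4 */
      r = (uint64_t) (s % d);
    }
  return r;
}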
+ +include(`../config.m4') + +C                  cycles/limb +C UltraSPARC T3:	30 +C UltraSPARC T4/T5:	 4 + +C INPUT PARAMETERS +define(`ap',  `%o0') +define(`n',   `%o1') +define(`d',   `%o2') +define(`cps', `%o3') + + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_mod_1s_4p) +	save	%sp, -176, %sp +	ldx	[%i3+16], %o4 +	ldx	[%i3+24], %o3 +	ldx	[%i3+32], %o2 +	ldx	[%i3+40], %o1 +	ldx	[%i3+48], %o0 + +	and	%i1, 3, %g3 +	sllx	%i1, 3, %g1 +	add	%i0, %g1, %i0 +	brz	%g3, L(b00) +	 cmp	%g3, 2 +	bcs	%xcc, L(b01) +	 nop +	be	%xcc, L(b10) +	 nop + +L(b11):	ldx	[%i0-16], %g2 +	mulx	%g2, %o4, %g5 +	umulxhi(%g2, %o4, %g3) +	ldx	[%i0-24], %g4 +	addcc	%g5, %g4, %g5 +	addxc(	%g3, %g0, %g4) +	ldx	[%i0-8], %g2 +	mulx	%g2, %o3, %g1 +	umulxhi(%g2, %o3, %g3) +	addcc	%g1, %g5, %g1 +	addxc(	%g3, %g4, %g2) +	ba,pt	%xcc, .L8 +	 add	%i0, -32, %i0 + +L(b00):	ldx	[%i0-24], %g3 +	mulx	%g3, %o4, %g2 +	umulxhi(%g3, %o4, %g5) +	ldx	[%i0-32], %g4 +	addcc	%g2, %g4, %g2 +	addxc(	%g5, %g0, %g3) +	ldx	[%i0-16], %g4 +	mulx	%g4, %o3, %g5 +	umulxhi(%g4, %o3, %i5) +	addcc	%g2, %g5, %g5 +	addxc(	%g3, %i5, %g4) +	ldx	[%i0-8], %g2 +	mulx	%g2, %o2, %g1 +	umulxhi(%g2, %o2, %g3) +	addcc	%g1, %g5, %g1 +	addxc(	%g3, %g4, %g2) +	ba,pt	%xcc, .L8 +	 add	%i0, -40, %i0 + +L(b01):	ldx	[%i0-8], %g1 +	mov	0, %g2 +	ba,pt	%xcc, .L8 +	 add	%i0, -16, %i0 + +L(b10):	ldx	[%i0-8], %g2 +	ldx	[%i0-16], %g1 +	add	%i0, -24, %i0 + +.L8:	add	%i1, -5, %g3 +	brlz,pn	%g3, L(end) +	 nop + +L(top):	ldx	[%i0-16], %i4 +	mulx	%i4, %o4, %o5 +	umulxhi(%i4, %o4, %i1) +	ldx	[%i0-24], %i5 +	addcc	%o5, %i5, %o5 +	addxc(	%i1, %g0, %i4) +	ldx	[%i0-8], %i5 +	mulx	%i5, %o3, %o7 +	umulxhi(%i5, %o3, %i1) +	addcc	%o5, %o7, %o7 +	addxc(	%i4, %i1, %i5) +	ldx	[%i0+0], %g4 +	mulx	%g4, %o2, %i1 +	umulxhi(%g4, %o2, %i4) +	addcc	%o7, %i1, %i1 +	addxc(	%i5, %i4, %g4) +	mulx	%g1, %o1, %i5 +	umulxhi(%g1, %o1, %i4) +	addcc	%i1, %i5, %i5 +	addxc(	%g4, %i4, %g5) +	mulx	%g2, %o0, %g1 +	umulxhi(%g2, %o0, %g4) +	addcc	%g1, %i5, %g1 +	addxc(	%g4, %g5, %g2) +	add	%g3, -4, %g3 +	brgez,pt %g3, L(top) +	 add	%i0, -32, %i0 + +L(end):	mulx	%g2, %o4, %g5 +	umulxhi(%g2, %o4, %g3) +	addcc	%g1, %g5, %g5 +	addxc(	%g3, %g0, %g2) +	ldx	[%i3+8], %i0 +	ldx	[%i3], %g4 +	sub	%g0, %i0, %i5 +	srlx	%g5, %i5, %i5 +	sllx	%g2, %i0, %g2 +	or	%i5, %g2, %g1 +	mulx	%g1, %g4, %l7 +	umulxhi(%g1, %g4, %g3) +	sllx	%g5, %i0, %g2 +	add	%g1, 1, %g1 +	addcc	%l7, %g2, %g5 +	addxc(	%g3, %g1, %g1) +	mulx	%g1, %i2, %g1 +	sub	%g2, %g1, %g2 +	cmp	%g2, %g5 +	add	%i2, %g2, %g1 +	movlu	%xcc, %g2, %g1 +	subcc	%g1, %i2, %g2 +	movgeu	%xcc, %g2, %g1 +	return	%i7+8 +	 srlx	%g1, %o0, %o0 +EPILOGUE() + +PROLOGUE(mpn_mod_1s_4p_cps) +	save	%sp, -176, %sp +	lzcnt(	%i1, %i5) +	sllx	%i1, %i5, %i1 +	call	mpn_invert_limb, 0 +	 mov	%i1, %o0 +	stx	%o0, [%i0] +	sra	%i5, 0, %g1 +	stx	%g1, [%i0+8] +	sub	%g0, %i5, %g2 +	srlx	%o0, %g2, %g2 +	mov	1, %g1 +	sllx	%g1, %i5, %g1 +	or	%g2, %g1, %g2 +	sub	%g0, %i1, %g1 +	mulx	%g2, %g1, %g2 +	srlx	%g2, %i5, %g1 +	stx	%g1, [%i0+16] + +	umulxhi(%o0, %g2, %g3) +	add	%g2, %g3, %g3 +	xnor	%g0, %g3, %g3 +	mulx	%g3, %i1, %g3 +	mulx	%g2, %o0, %g2 +	cmp	%g2, %g3 +	add	%i1, %g3, %g1 +	movgeu	%xcc, %g3, %g1 +	srlx	%g1, %i5, %g2 +	stx	%g2, [%i0+24] + +	umulxhi(%o0, %g1, %g3) +	add	%g1, %g3, %g3 +	xnor	%g0, %g3, %g3 +	mulx	%g3, %i1, %g3 +	mulx	%g1, %o0, %g1 +	cmp	%g1, %g3 +	add	%i1, %g3, %g2 +	movgeu	%xcc, %g3, %g2 +	srlx	%g2, %i5, %g1 +	stx	%g1, [%i0+32] + +	umulxhi(%o0, %g2, %g3) +	add	%g2, %g3, %g3 +	xnor	%g0, %g3, %g3 +	mulx	%g3, %i1, %g3 +	mulx	%g2, %o0, %g2 +	cmp	%g2, %g3 +	add	%i1, %g3, %g1 +	
movgeu	%xcc, %g3, %g1 +	srlx	%g1, %i5, %g2 +	stx	%g2, [%i0+40] + +	umulxhi(%o0, %g1, %g2) +	add	%g1, %g2, %g2 +	xnor	%g0, %g2, %g2 +	mulx	%g2, %i1, %g2 +	mulx	%g1, %o0, %o0 +	cmp	%o0, %g2 +	add	%i1, %g2, %g3 +	movgeu	%xcc, %g2, %g3 +	srlx	%g3, %i5, %i5 +	stx	%i5, [%i0+48] + +	return	%i7+8 +	 nop +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_34lsub1.asm new file mode 100644 index 0000000..8744280 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_34lsub1.asm @@ -0,0 +1,117 @@ +dnl  SPARC v9 mpn_mod_34lsub1 for T3/T4/T5. + +dnl  Copyright 2005, 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C		    cycles/limb +C UltraSPARC T1:	 - +C UltraSPARC T3:	 5 +C UltraSPARC T4:	 1.57 + +C This is based on the powerpc64/mode64 code. 
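mpn_mod_34lsub1 returns a value congruent to {up,n} modulo 2^48-1, i.e. modulo 2^(3/4 of the limb size) minus 1. Since 2^64 = 2^16 * 2^48 and 2^48 == 1 (mod 2^48-1), limb i carries weight 2^(16*(i mod 3)), which is why the code below keeps three rotating accumulators and finally splits everything into 48-bit fields (the %l0/%l3 comments). A C model (illustrative name; the library routine returns a congruent value, not necessarily fully reduced, and the __int128 accumulator here assumes n stays below 2^32):

#include <stdint.h>
#include <stddef.h>
typedef unsigned __int128 u128;

static uint64_t
mod_34lsub1_idea (const uint64_t *up, size_t n)
{
  const uint64_t M = ((uint64_t) 1 << 48) - 1;
  u128 acc = 0;
  unsigned w = 0;                  /* weight rotates 1, 2^16, 2^32 */
  size_t i;
  for (i = 0; i < n; i++)
    {
      acc += (u128) up[i] << w;
      w = (w + 16) % 48;
    }
  while (acc >> 48)                /* fold 48-bit fields: 2^48 == 1 */
    acc = (acc & M) + (acc >> 48);
  return (uint64_t) acc;           /* congruent mod M, may equal M */
}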
+ +C INPUT PARAMETERS +define(`up', `%i0') +define(`n',  `%i1') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_mod_34lsub1) +	save	%sp, -176, %sp + +	mov	0, %g1 +	mov	0, %g3 +	mov	0, %g4 +	addcc	%g0, 0, %g5 + +	add	n, -3, n +	brlz	n, L(lt3) +	 nop + +	add	n, -3, n +	ldx	[up+0], %l5 +	ldx	[up+8], %l6 +	ldx	[up+16], %l7 +	brlz	n, L(end) +	 add	up, 24, up + +	ALIGN(16) +L(top):	addxccc(%g1, %l5, %g1) +	ldx	[up+0], %l5 +	addxccc(%g3, %l6, %g3) +	ldx	[up+8], %l6 +	addxccc(%g4, %l7, %g4) +	ldx	[up+16], %l7 +	add	n, -3, n +	brgez	n, L(top) +	 add	up, 24, up + +L(end):	addxccc(	%g1, %l5, %g1) +	addxccc(%g3, %l6, %g3) +	addxccc(%g4, %l7, %g4) +	addxc(	%g5, %g0, %g5) + +L(lt3):	cmp	n, -2 +	blt	L(2) +	 nop + +	ldx	[up+0], %l5 +	mov	0, %l6 +	beq	L(1) +	 addcc	%g1, %l5, %g1 + +	ldx	[up+8], %l6 +L(1):	addxccc(%g3, %l6, %g3) +	addxccc(%g4, %g0, %g4) +	addxc(	%g5, %g0, %g5) + +L(2):	sllx	%g1, 16, %l0 +	srlx	%l0, 16, %l0		C %l0 = %g1 mod 2^48 +	srlx	%g1, 48, %l3		C %l3 = %g1 div 2^48 +	srl	%g3, 0, %g1 +	sllx	%g1, 16, %l4		C %l4 = (%g3 mod 2^32) << 16 +	srlx	%g3, 32, %l5		C %l5 = %g3 div 2^32 +	sethi	%hi(0xffff0000), %g1 +	andn	%g4, %g1, %g1 +	sllx	%g1, 32, %l6		C %l6 = (%g4 mod 2^16) << 32 +	srlx	%g4, 16, %l7		C %l7 = %g4 div 2^16 + +	add	%l0, %l3, %l0 +	add	%l4, %l5, %l4 +	add	%l6, %l7, %l6 + +	add	%l0, %l4, %l0 +	add	%l6, %g5, %l6 + +	add	%l0, %l6, %i0 +	ret +	 restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/mode1o.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/mode1o.asm new file mode 100644 index 0000000..494e1d3 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/mode1o.asm @@ -0,0 +1,82 @@ +dnl  SPARC T3/T4/T5 mpn_modexact_1c_odd. + +dnl  Contributed to the GNU project by Torbjörn Granlund. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
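mpn_modexact_1c_odd first builds dinv = d^-1 mod 2^64 from an 8-bit entry in binvert_limb_table refined by three Newton steps (x' = 2x - d*x^2, doubling the precision 8 -> 16 -> 32 -> 64 bits), then runs a Hensel-style remainder loop. The loop in C (a sketch assuming __int128; the function name is illustrative):

#include <stdint.h>
#include <stddef.h>
typedef unsigned __int128 u128;

/* With d odd and dinv = d^-1 mod 2^64, each step turns the
   borrow-adjusted limb into a quotient limb q = t*dinv, and the next
   carry is the high half of q*d plus the borrow: exactly the
   subcc/mulx/umulxhi/addxc chain in L(top) below.  */
static uint64_t
modexact_1c_odd_idea (const uint64_t *ap, size_t n, uint64_t d,
                      uint64_t dinv, uint64_t cy)
{
  size_t i;
  for (i = 0; i < n; i++)
    {
      uint64_t t = ap[i] - cy;
      unsigned borrow = ap[i] < cy;
      uint64_t q = t * dinv;                      /* q*d == t mod 2^64 */
      cy = (uint64_t) (((u128) q * d) >> 64) + borrow;
    }
  return cy;
}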
+ +include(`../config.m4') + +C                  cycles/limb +C UltraSPARC T3:	30 +C UltraSPARC T4/T5:	26 + +C INPUT PARAMETERS +define(`ap',  `%o0') +define(`n',   `%o1') +define(`d',   `%o2') +define(`cy',  `%o3') + +define(`dinv',`%o5') +define(`a0',  `%g1') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_modexact_1c_odd) +	srlx	d, 1, %g1 +	and	%g1, 127, %g1 + +	LEA64(binvert_limb_table, g2, g4) +	ldub	[%g2+%g1], %g1 +	add	%g1, %g1, %g2 +	mulx	%g1, %g1, %g1 +	mulx	%g1, d, %g1 +	sub	%g2, %g1, %g2 +	add	%g2, %g2, %g1 +	mulx	%g2, %g2, %g2 +	mulx	%g2, d, %g2 +	sub	%g1, %g2, %g1 +	add	%g1, %g1, %o5 +	mulx	%g1, %g1, %g1 +	mulx	%g1, d, %g1 +	sub	%o5, %g1, dinv +	add	n, -1, n + +L(top):	ldx	[ap], a0 +	add	ap, 8, ap +	subcc	a0, cy, %g3 +	mulx	%g3, dinv, %g5 +	umulxhi(d, %g5, %g5) +	addxc(	%g5, %g0, cy) +	brnz,pt	n, L(top) +	 add	n, -1, n + +	retl +	 mov	cy, %o0 +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/mul_1.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/mul_1.asm new file mode 100644 index 0000000..af05d62 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/mul_1.asm @@ -0,0 +1,174 @@ +dnl  SPARC v9 mpn_mul_1 for T3/T4/T5. + +dnl  Contributed to the GNU project by David Miller and Torbjörn Granlund. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
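The four-way unrolled mulx/umulxhi pipeline in mpn_mul_1 below computes nothing more than this reference loop (C sketch assuming __int128):

#include <stdint.h>
#include <stddef.h>
typedef unsigned __int128 u128;

/* rp[] = up[] * v0, returning the high limb.  */
static uint64_t
mul_1_ref (uint64_t *rp, const uint64_t *up, size_t n, uint64_t v0)
{
  uint64_t cy = 0;
  size_t i;
  for (i = 0; i < n; i++)
    {
      u128 p = (u128) up[i] * v0 + cy;  /* mulx + umulxhi + carry */
      rp[i] = (uint64_t) p;
      cy = (uint64_t) (p >> 64);
    }
  return cy;
}

The unrolling and the cycle-numbered instruction comments exist only to hide the multiply latency; the recurrence itself is this add-with-carry.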
+ +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC T3:	23 +C UltraSPARC T4:	 3 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n',  `%i2') +define(`v0', `%i3') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_mul_1) +	save	%sp, -176, %sp + +	and	n, 3, %g5 +	add	n, -4, n +	brz	%g5, L(b0) +	 cmp	%g5, 2 +	bcs	%xcc, L(b1) +	 nop +	be	%xcc, L(b2) +	 nop + +L(b3):	addcc	%g0, %g0, %i5 +	ldx	[up+0], %l0 +	ldx	[up+8], %l1 +	ldx	[up+16], %l2 +	mulx	%l0, v0, %o0 +	umulxhi(%l0, v0, %o1) +	brgz	n, L(gt3) +	 add	rp, -8, rp +	mulx	%l1, v0, %o2 +	umulxhi(%l1, v0, %o3) +	b	L(wd3) +	 nop +L(gt3):	ldx	[up+24], %l3 +	mulx	%l1, v0, %o2 +	umulxhi(%l1, v0, %o3) +	add	up, 24, up +	b	L(lo3) +	 add	n, -3, n + +L(b2):	addcc	%g0, %g0, %o1 +	ldx	[up+0], %l1 +	ldx	[up+8], %l2 +	brgz	n, L(gt2) +	 add	rp, -16, rp +	mulx	%l1, v0, %o2 +	umulxhi(%l1, v0, %o3) +	mulx	%l2, v0, %o4 +	umulxhi(%l2, v0, %o5) +	b	L(wd2) +	 nop +L(gt2):	ldx	[up+16], %l3 +	mulx	%l1, v0, %o2 +	umulxhi(%l1, v0, %o3) +	ldx	[up+24], %l0 +	mulx	%l2, v0, %o4 +	umulxhi(%l2, v0, %o5) +	add	up, 16, up +	b	L(lo2) +	 add	n, -2, n + +L(b1):	addcc	%g0, %g0, %o3 +	ldx	[up+0], %l2 +	brgz	n, L(gt1) +	nop +	mulx	%l2, v0, %o4 +	stx	%o4, [rp+0] +	umulxhi(%l2, v0, %i0) +	ret +	 restore +L(gt1):	ldx	[up+8], %l3 +	ldx	[up+16], %l0 +	mulx	%l2, v0, %o4 +	umulxhi(%l2, v0, %o5) +	ldx	[up+24], %l1 +	mulx	%l3, v0, %i4 +	umulxhi(%l3, v0, %i5) +	add	rp, -24, rp +	add	up, 8, up +	b	L(lo1) +	 add	n, -1, n + +L(b0):	addcc	%g0, %g0, %o5 +	ldx	[up+0], %l3 +	ldx	[up+8], %l0 +	ldx	[up+16], %l1 +	mulx	%l3, v0, %i4 +	umulxhi(%l3, v0, %i5) +	ldx	[up+24], %l2 +	mulx	%l0, v0, %o0 +	umulxhi(%l0, v0, %o1) +	b	L(lo0) +	 nop + +	ALIGN(16) +L(top):	ldx	[up+0], %l3	C 0 +	addxccc(%i4, %o5, %i4)	C 0 +	mulx	%l1, v0, %o2	C 1 +	stx	%i4, [rp+0]	C 1 +	umulxhi(%l1, v0, %o3)	C 2 +L(lo3):	ldx	[up+8], %l0	C 2 +	addxccc(%o0, %i5, %o0)	C 3 +	mulx	%l2, v0, %o4	C 3 +	stx	%o0, [rp+8]	C 4 +	umulxhi(%l2, v0, %o5)	C 4 +L(lo2):	ldx	[up+16], %l1	C 5 +	addxccc(%o2, %o1, %o2)	C 5 +	mulx	%l3, v0, %i4	C 6 +	stx	%o2, [rp+16]	C 6 +	umulxhi(%l3, v0, %i5)	C 7 +L(lo1):	ldx	[up+24], %l2	C 7 +	addxccc(%o4, %o3, %o4)	C 8 +	mulx	%l0, v0, %o0	C 8 +	stx	%o4, [rp+24]	C 9 +	umulxhi(%l0, v0, %o1)	C 9 +	add	rp, 32, rp	C 10 +L(lo0):	add	up, 32, up	C 10 +	brgz	n, L(top)	C 11 +	 add	n, -4, n	C 11 + +L(end):	addxccc(%i4, %o5, %i4) +	mulx	%l1, v0, %o2 +	stx	%i4, [rp+0] +	umulxhi(%l1, v0, %o3) +	addxccc(%o0, %i5, %o0) +L(wd3):	mulx	%l2, v0, %o4 +	stx	%o0, [rp+8] +	umulxhi(%l2, v0, %o5) +	addxccc(%o2, %o1, %o2) +L(wd2):	stx	%o2, [rp+16] +	addxccc(%o4, %o3, %o4) +	stx	%o4, [rp+24] +	addxc(	%g0, %o5, %i0) +	ret +	 restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/popcount.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/popcount.asm new file mode 100644 index 0000000..de80f3c --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/popcount.asm @@ -0,0 +1,70 @@ +dnl  SPARC v9 mpn_popcount for T3/T4. + +dnl  Contributed to the GNU project by David Miller. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. 
+dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC T3:	15 +C UltraSPARC T4:	 2.5 + +C INPUT PARAMETERS +define(`up',   `%o0') +define(`n',    `%o1') +define(`pcnt', `%o5') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_popcount) +	subcc	n, 1, n +	be	L(final_one) +	 clr	pcnt +L(top): +	ldx	[up + 0], %g1 +	sub	n, 2, n +	ldx	[up + 8], %o4 +	add	up, 16, up +	popc	%g1, %g2 +	popc	%o4, %g3 +	add	pcnt, %g2, pcnt +	brgz	n, L(top) +	 add	pcnt, %g3, pcnt +	brlz,pt	n, L(done) +	 nop +L(final_one): +	ldx	[up + 0], %g1 +	popc	%g1, %g2 +	add	pcnt, %g2, pcnt +L(done): +	retl +	 mov	pcnt, %o0 +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm new file mode 100644 index 0000000..d46499f --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm @@ -0,0 +1,93 @@ +dnl  SPARC v9 mpn_sqr_diag_addlsh1 for T3/T4/T5. + +dnl  Contributed to the GNU project by Torbjörn Granlund. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC T3:	? 
+C UltraSPARC T4:	>= 4.5 + + +define(`rp', `%i0') +define(`tp', `%i1') +define(`up', `%i2') +define(`n',  `%i3') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_sqr_diag_addlsh1) +	save	%sp, -176, %sp + +	ldx	[up+0], %g1 +	mulx	%g1, %g1, %o0 +	umulxhi(%g1, %g1, %g2) +	stx	%o0, [rp+0] + +	ldx	[up+8], %g1 +	ldx	[tp+0], %g4 +	ldx	[tp+8], %g5 +	mulx	%g1, %g1, %o0 +	orcc	%g0, %g0, %o5 +	b	L(dm) +	 add	n, -2, n + +	ALIGN(16) +L(top):	ldx	[up+8], %g1 +	addcc	%g4, %o2, %o2 +	addxccc(%g5, %o0, %g3) +	ldx	[tp+16], %g4 +	ldx	[tp+24], %g5 +	mulx	%g1, %g1, %o0 +	stx	%o2, [rp+8] +	stx	%g3, [rp+16] +	add	rp, 16, rp +	add	tp, 16, tp +L(dm):	add	%g2, %o5, %o2 +	umulxhi(%g1, %g1, %g2) +	addxccc(%g4, %g4, %g4) +	addxccc(%g5, %g5, %g5) +	add	up, 8, up +	addxc(	%g0, %g0, %o5) +	brnz	n, L(top) +	 add	n, -1, n + +	addcc	%o2, %g4, %g4 +	addxccc(%o0, %g5, %g5) +	stx	%g4, [rp+8] +	stx	%g5, [rp+16] +	addxc(	%o5, %g2, %g2) +	stx	%g2, [rp+24] + +	ret +	 restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/sub_n.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/sub_n.asm new file mode 100644 index 0000000..0e4bc93 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/sub_n.asm @@ -0,0 +1,144 @@ +dnl  SPARC v9 mpn_sub_n for T3/T4. + +dnl  Contributed to the GNU project by David Miller. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
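mpn_sub_n below implements subtraction as addition of the complement: each vp limb is xnor'd and the incoming borrow is inverted (hence mov 1, cy for the borrow-free entry point), so one live carry chain serves the whole loop. Reference semantics in C (sketch assuming __int128):

#include <stdint.h>
#include <stddef.h>
typedef unsigned __int128 u128;

/* rp[] = up[] - vp[] - cy, returning the borrow, via
   a - b - cy = a + ~b + (1 - cy).  */
static uint64_t
sub_nc_ref (uint64_t *rp, const uint64_t *up, const uint64_t *vp,
            size_t n, uint64_t cy)
{
  uint64_t carry = 1 - cy;            /* borrow -> carry convention */
  size_t i;
  for (i = 0; i < n; i++)
    {
      u128 s = (u128) up[i] + (uint64_t) ~vp[i] + carry;
      rp[i] = (uint64_t) s;
      carry = (uint64_t) (s >> 64);
    }
  return 1 - carry;                   /* carry -> borrow */
}

The final movcc in the assembly performs the same carry-to-borrow inversion on the return value.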
+ +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC T3:	 8 +C UltraSPARC T4:	 3 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`vp', `%i2') +define(`n',  `%i3') +define(`cy', `%i4') + +define(`u0_off', `%l0') +define(`u1_off', `%l1') +define(`v0_off', `%l2') +define(`v1_off', `%l3') +define(`r0_off', `%l4') +define(`r1_off', `%l5') +define(`loop_n', `%l6') +define(`tmp', `%l7') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_sub_nc) +	save	%sp, -176, %sp +	ba,pt	%xcc, L(ent) +	 xor	cy, 1, cy +EPILOGUE() +PROLOGUE(mpn_sub_n) +	save	%sp, -176, %sp +	mov	1, cy +L(ent): +	subcc	n, 1, n +	be	L(final_one) +	 cmp	%g0, cy + +	ldx	[up + 0], %o4 +	sllx	n, 3, tmp + +	ldx	[vp + 0], %o5 +	add	up, tmp, u0_off + +	ldx	[up + 8], %g5 +	add	vp, tmp, v0_off + +	ldx	[vp + 8], %g1 +	add	rp, tmp, r0_off + +	neg	tmp, loop_n +	add	u0_off, 8, u1_off + +	add	v0_off, 8, v1_off +	sub	loop_n, -(2 * 8), loop_n + +	sub	r0_off, 16, r0_off +	brgez,pn loop_n, L(loop_tail) +	 sub	r0_off, 8, r1_off + +	b,a	L(top) +	ALIGN(16) +L(top): +	xnor	%o5, 0, tmp +	ldx	[loop_n + v0_off], %o5 + +	addxccc(%o4, tmp, %g3) +	ldx	[loop_n + u0_off], %o4 + +	xnor	%g1, 0, %g1 +	stx	%g3, [loop_n + r0_off] + +	addxccc(%g5, %g1, tmp) +	ldx	[loop_n + v1_off], %g1 + +	ldx	[loop_n + u1_off], %g5 +	sub	loop_n, -(2 * 8), loop_n + +	brlz	loop_n, L(top) +	 stx	tmp, [loop_n + r1_off] + +L(loop_tail): +	xnor	%o5, 0, tmp +	xnor	%g1, 0, %g1 + +	addxccc(%o4, tmp, %g3) +	add	loop_n, u0_off, up + +	addxccc(%g5, %g1, %g5) +	add	loop_n, r0_off, rp + +	stx	%g3, [rp + 0] +	add	loop_n, v0_off, vp + +	brgz,pt	loop_n, L(done) +	 stx	%g5, [rp + 8] + +	add	rp, (2 * 8), rp + +L(final_one): +	ldx	[up+0], %o4 +	ldx	[vp+0], %o5 +	xnor	%o5, %g0, %o5 +	addxccc(%o4, %o5, %g3) +	stx	%g3, [rp+0] + +L(done): +	clr	%i0 +	movcc	%xcc, 1, %i0 +	ret +	 restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/submul_1.asm b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/submul_1.asm new file mode 100644 index 0000000..5635d1b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct3/submul_1.asm @@ -0,0 +1,170 @@ +dnl  SPARC v9 mpn_submul_1 for T3/T4/T5. + +dnl  Contributed to the GNU project by David Miller and Torbjörn Granlund. + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
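mpn_submul_1 below combines a mul_1-style product pipeline with an in-place subtraction from rp; the per-step borrow is folded into the running carry alongside the product's high half. Reference semantics in C (sketch assuming __int128):

#include <stdint.h>
#include <stddef.h>
typedef unsigned __int128 u128;

/* rp[] -= up[] * v0, returning the final carry limb (high limb of the
   last product plus accumulated borrows).  */
static uint64_t
submul_1_ref (uint64_t *rp, const uint64_t *up, size_t n, uint64_t v0)
{
  uint64_t cy = 0;
  size_t i;
  for (i = 0; i < n; i++)
    {
      u128 p = (u128) up[i] * v0 + cy;   /* multiply, add carry-in */
      uint64_t plo = (uint64_t) p;
      uint64_t r = rp[i];
      rp[i] = r - plo;
      cy = (uint64_t) (p >> 64) + (r < plo);  /* borrow joins carry */
    }
  return cy;
}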
+ +include(`../config.m4') + +C		   cycles/limb +C UltraSPARC T3:	26 +C UltraSPARC T4:	 4.5 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n',  `%i2') +define(`v0', `%i3') + +ASM_START() +	REGISTER(%g2,#scratch) +	REGISTER(%g3,#scratch) +PROLOGUE(mpn_submul_1) +	save	%sp, -176, %sp +	ldx	[up+0], %g1 + +	and	n, 3, %g5 +	add	n, -4, n +	brz	%g5, L(b00) +	 cmp	%g5, 2 +	bcs	%xcc, L(b01) +	 nop +	bne	%xcc, L(b11) +	 ldx	[up+8], %g4 + +L(b10):	add	up, 16, up +	addcc	%g0, 0, %g3 +	mulx	%g1, v0, %l4 +	umulxhi(%g1, v0, %l5) +	ldx	[rp+0], %o2 +	mulx	%g4, v0, %l6 +	umulxhi(%g4, v0, %l7) +	brlz	n, L(wd2) +	 nop +L(gt2):	ldx	[up+0], %o0 +	b	L(lo2) +	 nop + +L(b00):	add	rp, -16, rp +	addcc	%g0, 0, %g3 +	ldx	[up+8], %o1 +	mulx	%g1, v0, %l0 +	umulxhi(%g1, v0, %l1) +	ldx	[up+16], %o0 +	ldx	[rp+16], %o2 +	mulx	%o1, v0, %l2 +	umulxhi(%o1, v0, %l3) +	b	     L(lo0) +	 nop + +L(b01):	add	up, 8, up +	add	rp, -8, rp +	addcc	%g0, 0, %g3 +	ldx	[rp+8], %o3 +	mulx	%g1, v0, %l6 +	umulxhi(%g1, v0, %l7) +	brlz	n, L(wd1) +	 nop +	ldx	[up+0], %o0 +	ldx	[up+8], %o1 +	mulx	%o0, v0, %l0 +	umulxhi(%o0, v0, %l1) +	b	L(lo1) +	 nop + +L(b11):	add	up, 24, up +	add	rp, 8, rp +	addcc	%g0, 0, %g3 +	mulx	%g1, v0, %l2 +	umulxhi(%g1, v0, %l3) +	ldx	[up-8], %o1 +	ldx	[rp-8], %o3 +	mulx	%g4, v0, %l4 +	umulxhi(%g4, v0, %l5) +	brlz	n, L(end) +	 nop + +	ALIGN(16) +L(top):	ldx	[up+0], %o0 +	addxccc(%g3, %l2, %g1) +	ldx	[rp+0], %o2 +	addxc(	%g0, %l3, %g3) +	mulx	%o1, v0, %l6 +	subcc	%o3, %g1, %g4 +	umulxhi(%o1, v0, %l7) +	stx	%g4, [rp-8] +L(lo2):	ldx	[up+8], %o1 +	addxccc(%g3, %l4, %g1) +	ldx	[rp+8], %o3 +	addxc(	%g0, %l5, %g3) +	mulx	%o0, v0, %l0 +	subcc	%o2, %g1, %g4 +	umulxhi(%o0, v0, %l1) +	stx	%g4, [rp+0] +L(lo1):	ldx	[up+16], %o0 +	addxccc(%g3, %l6, %g1) +	ldx	[rp+16], %o2 +	addxc(	%g0, %l7, %g3) +	mulx	%o1, v0, %l2 +	subcc	%o3, %g1, %g4 +	umulxhi(%o1, v0, %l3) +	stx	%g4, [rp+8] +L(lo0):	ldx	[up+24], %o1 +	addxccc(%g3, %l0, %g1) +	ldx	[rp+24], %o3 +	addxc(	%g0, %l1, %g3) +	mulx	%o0, v0, %l4 +	subcc	%o2, %g1, %g4 +	umulxhi(%o0, v0, %l5) +	stx	%g4, [rp+16] +	add	n, -4, n +	add	up, 32, up +	brgez	n, L(top) +	 add	rp, 32, rp + +L(end):	addxccc(%g3, %l2, %g1) +	ldx	[rp+0], %o2 +	addxc(	%g0, %l3, %g3) +	mulx	%o1, v0, %l6 +	subcc	%o3, %g1, %g4 +	umulxhi(%o1, v0, %l7) +	stx	%g4, [rp-8] +L(wd2):	addxccc(%g3, %l4, %g1) +	ldx	[rp+8], %o3 +	addxc(	%g0, %l5, %g3) +	subcc	%o2, %g1, %g4 +	stx	%g4, [rp+0] +L(wd1):	addxccc(%g3, %l6, %g1) +	addxc(	%g0, %l7, %g3) +	subcc	%o3, %g1, %g4 +	stx	%g4, [rp+8] +	addxc(	%g0, %g3, %i0) +	ret +	 restore +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct45/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct45/gmp-mparam.h new file mode 100644 index 0000000..c10fd0d --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/sparc64/ultrasparct45/gmp-mparam.h @@ -0,0 +1,174 @@ +/* Sparc64 T4-T5 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + +  * the GNU Lesser General Public License as published by the Free +    Software Foundation; either version 3 of the License, or (at your +    option) any later version. + +or + +  * the GNU General Public License as published by the Free Software +    Foundation; either version 2 of the License, or (at your option) any +    later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library.  If not, +see https://www.gnu.org/licenses/.  */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3600 MHz ultrasparct5 running GNU/Linux */ +/* FFT tuning limit = 0.5 M */ +/* Generated by tuneup.c, 2019-10-01, gcc 7.4 */ + +#define DIVREM_1_NORM_THRESHOLD              3 +#define DIVREM_1_UNNORM_THRESHOLD            3 +#define MOD_1_1P_METHOD                      2  /* 0.34% faster than 1 */ +#define MOD_1_NORM_THRESHOLD                 0  /* always */ +#define MOD_1_UNNORM_THRESHOLD               3 +#define MOD_1N_TO_MOD_1_1_THRESHOLD          6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD          6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD         8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     13 +#define USE_PREINV_DIVREM_1                  1 +/* From gcc105.fsffrance.org, 2023-07-25 */ +#define DIV_QR_1N_PI1_METHOD                 4  /* 7.06% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD              3 +#define DIV_QR_1_UNNORM_THRESHOLD            2 +#define DIV_QR_2_PI2_THRESHOLD               5 +#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD           19 + +#define DIV_1_VS_MUL_1_PERCENT             654 + +#define MUL_TOOM22_THRESHOLD                40 +#define MUL_TOOM33_THRESHOLD               129 +#define MUL_TOOM44_THRESHOLD               372 +#define MUL_TOOM6H_THRESHOLD               494 +#define MUL_TOOM8H_THRESHOLD               656 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD     126 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD     247 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD     225 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD     219 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD     188 + +#define SQR_BASECASE_THRESHOLD              20 +#define SQR_TOOM2_THRESHOLD                 59 +#define SQR_TOOM3_THRESHOLD                107 +#define SQR_TOOM4_THRESHOLD                298 +#define SQR_TOOM6_THRESHOLD                399 +#define SQR_TOOM8_THRESHOLD                562 + +#define MULMID_TOOM42_THRESHOLD             48 + +#define MULMOD_BNM1_THRESHOLD               25 +#define SQRMOD_BNM1_THRESHOLD               23 + +#define MUL_FFT_MODF_THRESHOLD             555  /* k = 5 */ +#define MUL_FFT_TABLE3                                      \ +  { {    555, 5}, {     29, 6}, {     31, 7}, {     31, 8}, \ +    {     17, 7}, {     36, 8}, {     19, 7}, {     39, 8}, \ +    {     21, 7}, {     43, 8}, {     29, 9}, {     15, 8}, \ +    {     31, 7}, {     63, 8}, {     35, 9}, {     19, 8}, \ +    {     43, 9}, {     23, 8}, {     51, 9}, {     27, 8}, \ +    {     57,10}, {     15, 8}, {     61, 9}, {     31, 8}, \ +    {     67, 9}, {     35, 8}, {     71, 9}, {     39, 8}, \ +    {     81, 9}, {     43,10}, {     23, 9}, {     59,11}, \ +    {     15,10}, {     31, 9}, {     71,10}, {     39, 9}, \ +    {     87,10}, {     47, 9}, {     99,10}, {     55, 9}, \ +    {    115,11}, {     31,10}, {     63, 9}, {    131,10}, \ +    {     87,11}, {     47,10}, {    111, 9}, {    223,12}, \ +    {     31,11}, {     63,10}, {    135,11}, {     79,10}, \ +    {    159,11}, {     95,10}, 
{    191,11}, {    111,12}, \ +    {     63,11}, {    143,10}, {    287,11}, {    159,12}, \ +    {     95,11}, {    191,10}, {    383, 9}, {    767,13}, \ +    {   8192,14}, {  16384,15}, {  32768,16}, {  65536,17}, \ +    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ +    {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 75 +#define MUL_FFT_THRESHOLD                 5760 + +#define SQR_FFT_MODF_THRESHOLD             372  /* k = 5 */ +#define SQR_FFT_TABLE3                                      \ +  { {    372, 5}, {     23, 6}, {     12, 5}, {     25, 6}, \ +    {     25, 7}, {     13, 6}, {     27, 7}, {     25, 8}, \ +    {     13, 7}, {     28, 8}, {     15, 7}, {     31, 8}, \ +    {     27, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \ +    {     41, 9}, {     23, 8}, {     47, 9}, {     27,10}, \ +    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \ +    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \ +    {     79,10}, {     47, 9}, {     95,10}, {     55,11}, \ +    {     31,10}, {     79,11}, {     47,10}, {     95,12}, \ +    {     31,11}, {     63,10}, {    135,11}, {     79,10}, \ +    {    159, 9}, {    319,11}, {     95,10}, {    191, 9}, \ +    {    383,11}, {    111,12}, {     63,11}, {    127,10}, \ +    {    255, 9}, {    511,10}, {    271,11}, {    143,10}, \ +    {    287, 9}, {    575,10}, {    303, 9}, {    607,11}, \ +    {    159,10}, {    319, 9}, {    639,12}, {     95,11}, \ +    {    191,10}, {    383, 9}, {    767,11}, {    207,13}, \ +    {   8192,14}, {  16384,15}, {  32768,16}, {  65536,17}, \ +    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ +    {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 75 +#define SQR_FFT_THRESHOLD                 3776 + +#define MULLO_BASECASE_THRESHOLD             0  /* always */ +#define MULLO_DC_THRESHOLD                  35 +#define MULLO_MUL_N_THRESHOLD            11278 +#define SQRLO_BASECASE_THRESHOLD             0  /* always */ +#define SQRLO_DC_THRESHOLD                 168 +#define SQRLO_SQR_THRESHOLD               7511 + +#define DC_DIV_QR_THRESHOLD                 36 +#define DC_DIVAPPR_Q_THRESHOLD             103 +#define DC_BDIV_QR_THRESHOLD                28 +#define DC_BDIV_Q_THRESHOLD                 88 + +#define INV_MULMOD_BNM1_THRESHOLD           78 +#define INV_NEWTON_THRESHOLD               181 +#define INV_APPR_THRESHOLD                 118 + +#define BINV_NEWTON_THRESHOLD              296 +#define REDC_1_TO_REDC_2_THRESHOLD           4 +#define REDC_2_TO_REDC_N_THRESHOLD          79 + +#define MU_DIV_QR_THRESHOLD               1970 +#define MU_DIVAPPR_Q_THRESHOLD            1970 +#define MUPI_DIV_QR_THRESHOLD               82 +#define MU_BDIV_QR_THRESHOLD              1528 +#define MU_BDIV_Q_THRESHOLD               1970 + +#define POWM_SEC_TABLE  1,58,102,1509 + +#define GET_STR_DC_THRESHOLD                15 +#define GET_STR_PRECOMPUTE_THRESHOLD        29 +#define SET_STR_DC_THRESHOLD               686 +#define SET_STR_PRECOMPUTE_THRESHOLD      2717 + +#define FAC_DSC_THRESHOLD                  336 +#define FAC_ODD_THRESHOLD                   24 + +#define MATRIX22_STRASSEN_THRESHOLD         32 +#define HGCD2_DIV1_METHOD                    1  /* 0.66% faster than 3 */ +#define HGCD_THRESHOLD                      57 +#define HGCD_APPR_THRESHOLD                 50 +#define HGCD_REDUCE_THRESHOLD             3389 +#define GCD_DC_THRESHOLD                   386 +#define GCDEXT_DC_THRESHOLD                288 +#define 
JACOBI_BASE_METHOD                   4  /* 2.50% faster than 3 */ |