From a89a14ef5da44684a16b204e7a70460cc8c4922a Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Fri, 21 Jun 2024 23:36:36 +0200 Subject: Basic constant folding implementation --- vendor/gmp-6.3.0/mpn/cray/README | 121 ++++++++++++ vendor/gmp-6.3.0/mpn/cray/add_n.c | 90 +++++++++ vendor/gmp-6.3.0/mpn/cray/cfp/addmul_1.c | 48 +++++ vendor/gmp-6.3.0/mpn/cray/cfp/mul_1.c | 47 +++++ vendor/gmp-6.3.0/mpn/cray/cfp/mulwwc90.s | 254 ++++++++++++++++++++++++++ vendor/gmp-6.3.0/mpn/cray/cfp/mulwwj90.s | 253 +++++++++++++++++++++++++ vendor/gmp-6.3.0/mpn/cray/cfp/submul_1.c | 48 +++++ vendor/gmp-6.3.0/mpn/cray/gmp-mparam.h | 74 ++++++++ vendor/gmp-6.3.0/mpn/cray/hamdist.c | 42 +++++ vendor/gmp-6.3.0/mpn/cray/ieee/addmul_1.c | 111 +++++++++++ vendor/gmp-6.3.0/mpn/cray/ieee/gmp-mparam.h | 73 ++++++++ vendor/gmp-6.3.0/mpn/cray/ieee/invert_limb.c | 127 +++++++++++++ vendor/gmp-6.3.0/mpn/cray/ieee/mul_1.c | 103 +++++++++++ vendor/gmp-6.3.0/mpn/cray/ieee/mul_basecase.c | 107 +++++++++++ vendor/gmp-6.3.0/mpn/cray/ieee/sqr_basecase.c | 105 +++++++++++ vendor/gmp-6.3.0/mpn/cray/ieee/submul_1.c | 111 +++++++++++ vendor/gmp-6.3.0/mpn/cray/lshift.c | 58 ++++++ vendor/gmp-6.3.0/mpn/cray/mulww.f | 63 +++++++ vendor/gmp-6.3.0/mpn/cray/popcount.c | 42 +++++ vendor/gmp-6.3.0/mpn/cray/rshift.c | 58 ++++++ vendor/gmp-6.3.0/mpn/cray/sub_n.c | 90 +++++++++ 21 files changed, 2025 insertions(+) create mode 100644 vendor/gmp-6.3.0/mpn/cray/README create mode 100644 vendor/gmp-6.3.0/mpn/cray/add_n.c create mode 100644 vendor/gmp-6.3.0/mpn/cray/cfp/addmul_1.c create mode 100644 vendor/gmp-6.3.0/mpn/cray/cfp/mul_1.c create mode 100644 vendor/gmp-6.3.0/mpn/cray/cfp/mulwwc90.s create mode 100644 vendor/gmp-6.3.0/mpn/cray/cfp/mulwwj90.s create mode 100644 vendor/gmp-6.3.0/mpn/cray/cfp/submul_1.c create mode 100644 vendor/gmp-6.3.0/mpn/cray/gmp-mparam.h create mode 100644 vendor/gmp-6.3.0/mpn/cray/hamdist.c create mode 100644 vendor/gmp-6.3.0/mpn/cray/ieee/addmul_1.c create mode 100644 vendor/gmp-6.3.0/mpn/cray/ieee/gmp-mparam.h create mode 100644 vendor/gmp-6.3.0/mpn/cray/ieee/invert_limb.c create mode 100644 vendor/gmp-6.3.0/mpn/cray/ieee/mul_1.c create mode 100644 vendor/gmp-6.3.0/mpn/cray/ieee/mul_basecase.c create mode 100644 vendor/gmp-6.3.0/mpn/cray/ieee/sqr_basecase.c create mode 100644 vendor/gmp-6.3.0/mpn/cray/ieee/submul_1.c create mode 100644 vendor/gmp-6.3.0/mpn/cray/lshift.c create mode 100644 vendor/gmp-6.3.0/mpn/cray/mulww.f create mode 100644 vendor/gmp-6.3.0/mpn/cray/popcount.c create mode 100644 vendor/gmp-6.3.0/mpn/cray/rshift.c create mode 100644 vendor/gmp-6.3.0/mpn/cray/sub_n.c (limited to 'vendor/gmp-6.3.0/mpn/cray') diff --git a/vendor/gmp-6.3.0/mpn/cray/README b/vendor/gmp-6.3.0/mpn/cray/README new file mode 100644 index 0000000..3a347d2 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/cray/README @@ -0,0 +1,121 @@ +Copyright 2000-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + + +The code in this directory works for Cray vector systems such as C90, +J90, T90 (both the CFP variant and the IEEE variant) and SV1. (For +the T3E and T3D systems, see the `alpha' subdirectory at the same +level as the directory containing this file.) + +The cfp subdirectory is for systems utilizing the traditional Cray +floating-point format, and the ieee subdirectory is for the newer +systems that use the IEEE floating-point format. + +There are several issues that reduces speed on Cray systems. For +systems with cfp floating point, the main obstacle is the forming of +128-bit products. For IEEE systems, adding, and in particular +computing carry is the main issue. There are no vectorizing +unsigned-less-than instructions, and the sequence that implement that +operation is very long. + +Shifting is the only operation that is simple to make fast. All Cray +systems have a bitblt instructions (Vi Vj,VjAk) that +should be really useful. + +For best speed for cfp systems, we need a mul_basecase, since that +reduces the need for carry propagation to a minimum. Depending on the +size (vn) of the smaller of the two operands (V), we should split U and V +in different chunk sizes: + +U split in 2 32-bit parts +V split according to the table: +parts 4 5 6 7 8 +bits/part 16 13 11 10 8 +max allowed vn 1 8 32 64 256 +number of multiplies 8 10 12 14 16 +peak cycles/limb 4 5 6 7 8 + +U split in 3 22-bit parts +V split according to the table: +parts 3 4 5 +bits/part 22 16 13 +max allowed vn 16 1024 8192 +number of multiplies 9 12 15 +peak cycles/limb 4.5 6 7.5 + +U split in 4 16-bit parts +V split according to the table: +parts 4 +bits/part 16 +max allowed vn 65536 +number of multiplies 16 +peak cycles/limb 8 + +(A T90 CPU can accumulate two products per cycle.) + +IDEA: +* Rewrite mpn_add_n: + short cy[n + 1]; + #pragma _CRI ivdep + for (i = 0; i < n; i++) + { s = up[i] + vp[i]; + rp[i] = s; + cy[i + 1] = s < up[i]; } + more_carries = 0; + #pragma _CRI ivdep + for (i = 1; i < n; i++) + { s = rp[i] + cy[i]; + rp[i] = s; + more_carries += s < cy[i]; } + cys = 0; + if (more_carries) + { + cys = rp[1] < cy[1]; + for (i = 2; i < n; i++) + { rp[i] += cys; + cys = rp[i] < cys; } + } + return cys + cy[n]; + +* Write mpn_add3_n for adding three operands. First add operands 1 + and 2, and generate cy[]. Then add operand 3 to the partial result, + and accumulate carry into cy[]. Finally propagate carry just like + in the new mpn_add_n. + +IDEA: + +Store fewer bits, perhaps 62, per limb. That brings mpn_add_n time +down to 2.5 cycles/limb and mpn_addmul_1 times to 4 cycles/limb. By +storing even fewer bits per limb, perhaps 56, it would be possible to +write a mul_mul_basecase that would run at effectively 1 cycle/limb. +(Use VM here to better handle the romb-shaped multiply area, perhaps +rounding operand sizes up to the next power of 2.) diff --git a/vendor/gmp-6.3.0/mpn/cray/add_n.c b/vendor/gmp-6.3.0/mpn/cray/add_n.c new file mode 100644 index 0000000..af49159 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/cray/add_n.c @@ -0,0 +1,90 @@ +/* Cray PVP mpn_add_n -- add two limb vectors and store their sum in a third + limb vector. + +Copyright 1996, 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* This code runs at 4 cycles/limb. It may be possible to bring it down + to 3 cycles/limb. */ + +#include "gmp-impl.h" + +mp_limb_t +mpn_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_limb_t cy[n]; + mp_limb_t a, b, r, s0, c0, c1; + mp_size_t i; + int more_carries; + + /* Main add loop. Generate a raw output sum in rp[] and a carry vector + in cy[]. */ +#pragma _CRI ivdep + for (i = 0; i < n; i++) + { + a = up[i]; + b = vp[i]; + s0 = a + b; + rp[i] = s0; + c0 = ((a & b) | ((a | b) & ~s0)) >> 63; + cy[i] = c0; + } + /* Carry add loop. Add the carry vector cy[] to the raw sum rp[] and + store the new sum back to rp[0]. If this generates further carry, set + more_carries. */ + more_carries = 0; +#pragma _CRI ivdep + for (i = 1; i < n; i++) + { + r = rp[i]; + c0 = cy[i - 1]; + s0 = r + c0; + rp[i] = s0; + c0 = (r & ~s0) >> 63; + more_carries += c0; + } + /* If that second loop generated carry, handle that in scalar loop. */ + if (more_carries) + { + mp_limb_t cyrec = 0; + /* Look for places where rp[k] is zero and cy[k-1] is non-zero. + These are where we got a recurrency carry. */ + for (i = 1; i < n; i++) + { + r = rp[i]; + c0 = (r == 0 && cy[i - 1] != 0); + s0 = r + cyrec; + rp[i] = s0; + c1 = (r & ~s0) >> 63; + cyrec = c0 | c1; + } + return cyrec | cy[n - 1]; + } + + return cy[n - 1]; +} diff --git a/vendor/gmp-6.3.0/mpn/cray/cfp/addmul_1.c b/vendor/gmp-6.3.0/mpn/cray/cfp/addmul_1.c new file mode 100644 index 0000000..9c7f383 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/cray/cfp/addmul_1.c @@ -0,0 +1,48 @@ +/* mpn_addmul_1 for Cray PVP. + +Copyright 1996, 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +mp_limb_t +mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb) +{ + mp_limb_t p0[n], p1[n], tp[n]; + mp_limb_t cy_limb; + + GMPN_MULWW (p1, p0, up, &n, &limb); + cy_limb = mpn_add_n (tp, rp, p0, n); + rp[0] = tp[0]; + if (n != 1) + cy_limb += mpn_add_n (rp + 1, tp + 1, p1, n - 1); + cy_limb += p1[n - 1]; + + return cy_limb; +} diff --git a/vendor/gmp-6.3.0/mpn/cray/cfp/mul_1.c b/vendor/gmp-6.3.0/mpn/cray/cfp/mul_1.c new file mode 100644 index 0000000..33a6a05 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/cray/cfp/mul_1.c @@ -0,0 +1,47 @@ +/* mpn_mul_1 for Cray PVP. + +Copyright 1996, 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +mp_limb_t +mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb) +{ + mp_limb_t p0[n], p1[n]; + mp_limb_t cy_limb; + + GMPN_MULWW (p1, p0, up, &n, &limb); + rp[0] = p0[0]; + cy_limb = p1[n - 1]; + if (n != 1) + cy_limb += mpn_add_n (rp + 1, p0 + 1, p1, n - 1); + + return cy_limb; +} diff --git a/vendor/gmp-6.3.0/mpn/cray/cfp/mulwwc90.s b/vendor/gmp-6.3.0/mpn/cray/cfp/mulwwc90.s new file mode 100644 index 0000000..71d2285 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/cray/cfp/mulwwc90.s @@ -0,0 +1,254 @@ +* Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP. + +* Copyright 1996, 2000 Free Software Foundation, Inc. +* This file is generated from mulww.f in this same directory. + +* This file is part of the GNU MP Library. +* +* The GNU MP Library is free software; you can redistribute it and/or modify +* it under the terms of either: +* +* * the GNU Lesser General Public License as published by the Free +* Software Foundation; either version 3 of the License, or (at your +* option) any later version. +* +* or +* +* * the GNU General Public License as published by the Free Software +* Foundation; either version 2 of the License, or (at your option) any +* later version. +* +* or both in parallel, as here. +* +* The GNU MP Library is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +* for more details. +* +* You should have received copies of the GNU General Public License and the +* GNU Lesser General Public License along with the GNU MP Library. If not, +* see https://www.gnu.org/licenses/. + + IDENT GMPN_MULWW +********************************************** +* Assemble with Cal Version 2.0 * +* * +* Generated by CFT77 6.0.4.19 * +* on 06/27/00 at 04:34:13 * +* * +********************************************** +* ALLOW UNDERSCORES IN IDENTIFIERS + EDIT OFF + FORMAT NEW +@DATA SECTION DATA,CM +@DATA = W.* + CON O'0000000000040000000000 + CON O'0435152404713723252514 + CON O'0535270000000000000000 + CON O'0000000000000001200012 + VWD 32/0,32/P.GMPN_MULWW + CON O'0014003000000000001416 + CON O'0000000000000000000011 + CON O'0000000000000000000215 + BSSZ 1 +@CODE SECTION CODE +@CODE = P.* +L3 = P.* + A0 A6 + A5 6 + B03,A5 0,A0 + A0 A1+A2 + A5 1 + 0,A0 T00,A5 + B02 A2 + B66 A3 + B01 A6 + A7 P.L4 + B00 A7 + A6 @DATA + J $STKOFEN +GMPN_MULWW = P.* + A0 @DATA+3 + B77 A0 + A1 13 + A0 B66 + A2 B66 + A4 B67 + 0,A0 B77,A1 + A7 782 + A3 A2+A7 + A0 A4-A3 + JAM L3 + A0 A6 + A5 6 + B03,A5 0,A0 + A0 A1+A2 + A5 1 + 0,A0 T00,A5 + B02 A2 + B66 A3 + B01 A6 +L4 = P.* + A7 B07 + S7 0,A7 + A6 B10 + S6 0,A6 + S5 1 + S4 <22 + S7 S7-S5 + S5 #S7 + T00 S6 + S6 S6>22 + S7 T00 + S7 S7>44 + S3 T00 + S3 S3&S4 + S6 S6&S4 + S7 S7&S4 + S3 S3<24 + S6 S6<24 + S7 S7<24 + S0 S5 + S4 S5 + S1 S6 + S2 S3 + S3 S7 + JSP L5 +L6 = P.* + S7 -S4 + A2 S7 + VL A2 + A3 B06 + A5 B05 + A4 B04 + A1 VL + A2 S4 +L7 = P.* + A0 A3 + VL A1 + V7 ,A0,1 + B11 A5 + A7 22 + B12 A4 + V6 V7>A7 + B13 A3 + S7 <22 + A3 B02 + V5 S7&V6 + A6 24 + V4 V5A5 + V2 S1*FV1 + V3 S7&V5 + A0 14 + B77 A0 + A4 B77 + A0 A4+A3 + ,A0,1 V2 + V0 V3A7 + V2 S2*FV0 + V3 V6+V2 + S7 <20 + V1 S7&V3 + A4 270 + A0 A4+A3 + ,A0,1 V0 + A4 14 + A0 A4+A3 + V7 ,A0,1 + V6 V1A5 + V0 S1*FV4 + A5 654 + A0 A5+A3 + ,A0,1 V1 + V6 V7+V0 + A5 2 + V2 V6A6 + A5 654 + CPW + A0 A5+A3 + V1 ,A0,1 + A5 398 + A0 A5+A3 + V3 ,A0,1 + V6 V4+V1 + V2 V3>A6 + V5 V6+V2 + A6 B12 + V4 V322 + S7 T00 + S7 S7>44 + S3 T00 + S3 S3&S4 + S6 S6&S4 + S7 S7&S4 + S3 S3<24 + S6 S6<24 + S7 S7<24 + S0 S5 + S4 S5 + S1 S6 + S2 S3 + S3 S7 + JSP L5 +L6 = P.* + S7 -S4 + A2 S7 + VL A2 + A3 B06 + A5 B05 + A4 B04 + A1 VL + A2 S4 +L7 = P.* + A0 A3 + VL A1 + V7 ,A0,1 + B11 A5 + A7 22 + B12 A4 + V6 V7>A7 + B13 A3 + S7 <22 + A3 B02 + V5 S7&V6 + A6 24 + V4 V5A5 + V2 S1*FV1 + V3 S7&V5 + A0 14 + B77 A0 + A4 B77 + A0 A4+A3 + ,A0,1 V2 + V0 V3A7 + V2 S2*FV0 + V3 V6+V2 + S7 <20 + V1 S7&V3 + A4 270 + A0 A4+A3 + ,A0,1 V0 + A4 14 + A0 A4+A3 + V7 ,A0,1 + V6 V1A5 + V0 S1*FV4 + A5 654 + A0 A5+A3 + ,A0,1 V1 + V6 V7+V0 + A5 2 + V2 V6A6 + A5 654 + A0 A5+A3 + V1 ,A0,1 + A5 398 + A0 A5+A3 + V3 ,A0,1 + V6 V4+V1 + V2 V3>A6 + V5 V6+V2 + A6 B12 + V4 V3 +#include "gmp-impl.h" + +unsigned long int +mpn_hamdist (mp_srcptr p1, mp_srcptr p2, mp_size_t n) +{ + unsigned long int result = 0; + mp_size_t i; + for (i = 0; i < n; i++) + result += _popcnt (p1[i] ^ p2[i]); + return result; +} diff --git a/vendor/gmp-6.3.0/mpn/cray/ieee/addmul_1.c b/vendor/gmp-6.3.0/mpn/cray/ieee/addmul_1.c new file mode 100644 index 0000000..ce7dfbb --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/cray/ieee/addmul_1.c @@ -0,0 +1,111 @@ +/* Cray PVP/IEEE mpn_addmul_1 -- multiply a limb vector with a limb and add the + result to a second limb vector. + +Copyright 2000-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* This code runs at just under 9 cycles/limb on a T90. That is not perfect, + mainly due to vector register shortage in the main loop. Assembly code + should bring it down to perhaps 7 cycles/limb. */ + +#include +#include "gmp-impl.h" + +mp_limb_t +mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl) +{ + mp_limb_t cy[n]; + mp_limb_t a, b, r, s0, s1, c0, c1; + mp_size_t i; + int more_carries; + + if (up == rp) + { + /* The algorithm used below cannot handle overlap. Handle it here by + making a temporary copy of the source vector, then call ourselves. */ + mp_limb_t xp[n]; + MPN_COPY (xp, up, n); + return mpn_addmul_1 (rp, xp, n, vl); + } + + a = up[0] * vl; + r = rp[0]; + s0 = a + r; + rp[0] = s0; + c0 = ((a & r) | ((a | r) & ~s0)) >> 63; + cy[0] = c0; + + /* Main multiply loop. Generate a raw accumulated output product in rp[] + and a carry vector in cy[]. */ +#pragma _CRI ivdep + for (i = 1; i < n; i++) + { + a = up[i] * vl; + b = _int_mult_upper (up[i - 1], vl); + s0 = a + b; + c0 = ((a & b) | ((a | b) & ~s0)) >> 63; + r = rp[i]; + s1 = s0 + r; + rp[i] = s1; + c1 = ((s0 & r) | ((s0 | r) & ~s1)) >> 63; + cy[i] = c0 + c1; + } + /* Carry add loop. Add the carry vector cy[] to the raw result rp[] and + store the new result back to rp[]. */ + more_carries = 0; +#pragma _CRI ivdep + for (i = 1; i < n; i++) + { + r = rp[i]; + c0 = cy[i - 1]; + s0 = r + c0; + rp[i] = s0; + c0 = (r & ~s0) >> 63; + more_carries += c0; + } + /* If that second loop generated carry, handle that in scalar loop. */ + if (more_carries) + { + mp_limb_t cyrec = 0; + /* Look for places where rp[k] == 0 and cy[k-1] == 1 or + rp[k] == 1 and cy[k-1] == 2. + These are where we got a recurrency carry. */ + for (i = 1; i < n; i++) + { + r = rp[i]; + c0 = r < cy[i - 1]; + s0 = r + cyrec; + rp[i] = s0; + c1 = (r & ~s0) >> 63; + cyrec = c0 | c1; + } + return _int_mult_upper (up[n - 1], vl) + cyrec + cy[n - 1]; + } + + return _int_mult_upper (up[n - 1], vl) + cy[n - 1]; +} diff --git a/vendor/gmp-6.3.0/mpn/cray/ieee/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/cray/ieee/gmp-mparam.h new file mode 100644 index 0000000..1fdc286 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/cray/ieee/gmp-mparam.h @@ -0,0 +1,73 @@ +/* Cray T90 IEEE gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1996, 2000-2002, 2004 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Generated by tuneup.c, 2004-02-07, system compiler */ + +#define MUL_TOOM22_THRESHOLD 130 +#define MUL_TOOM33_THRESHOLD 260 + +#define SQR_BASECASE_THRESHOLD 9 /* karatsuba */ +#define SQR_TOOM2_THRESHOLD 0 /* never sqr_basecase */ +#define SQR_TOOM3_THRESHOLD 34 + +#define DIV_SB_PREINV_THRESHOLD 0 /* preinv always */ +#define DIV_DC_THRESHOLD 390 +#define POWM_THRESHOLD 656 + +#define HGCD_THRESHOLD 964 +#define GCD_ACCEL_THRESHOLD 3 +#define GCD_DC_THRESHOLD 964 +#define JACOBI_BASE_METHOD 2 + +#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define USE_PREINV_DIVREM_1 1 /* preinv always */ +#define USE_PREINV_MOD_1 1 /* preinv always */ +#define DIVREM_2_THRESHOLD 0 /* preinv always */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ + +#define GET_STR_DC_THRESHOLD 45 +#define GET_STR_PRECOMPUTE_THRESHOLD 77 +#define SET_STR_THRESHOLD 145756 + +#define MUL_FFT_TABLE { 1104, 2208, 4416, 8960, 19456, 45056, 0 } +#define MUL_FFT_MODF_THRESHOLD 1168 +#define MUL_FFT_THRESHOLD 6528 + +#define SQR_FFT_TABLE { 368, 736, 1600, 2816, 7168, 12288, 0 } +#define SQR_FFT_MODF_THRESHOLD 296 +#define SQR_FFT_THRESHOLD 1312 diff --git a/vendor/gmp-6.3.0/mpn/cray/ieee/invert_limb.c b/vendor/gmp-6.3.0/mpn/cray/ieee/invert_limb.c new file mode 100644 index 0000000..774a27b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/cray/ieee/invert_limb.c @@ -0,0 +1,127 @@ +/* mpn_invert_limb -- Invert a normalized limb. + +Copyright 1991, 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* + This is needed to make configure define HAVE_NATIVE_mpn_invert_limb: + PROLOGUE(mpn_invert_limb) +*/ + +static const unsigned short int approx_tab[0x100] = +{ + /* 0x400, */ + 0x3ff, + 0x3fc, 0x3f8, 0x3f4, 0x3f0, 0x3ec, 0x3e8, 0x3e4, + 0x3e0, 0x3dd, 0x3d9, 0x3d5, 0x3d2, 0x3ce, 0x3ca, 0x3c7, + 0x3c3, 0x3c0, 0x3bc, 0x3b9, 0x3b5, 0x3b2, 0x3ae, 0x3ab, + 0x3a8, 0x3a4, 0x3a1, 0x39e, 0x39b, 0x397, 0x394, 0x391, + 0x38e, 0x38b, 0x387, 0x384, 0x381, 0x37e, 0x37b, 0x378, + 0x375, 0x372, 0x36f, 0x36c, 0x369, 0x366, 0x364, 0x361, + 0x35e, 0x35b, 0x358, 0x355, 0x353, 0x350, 0x34d, 0x34a, + 0x348, 0x345, 0x342, 0x340, 0x33d, 0x33a, 0x338, 0x335, + 0x333, 0x330, 0x32e, 0x32b, 0x329, 0x326, 0x324, 0x321, + 0x31f, 0x31c, 0x31a, 0x317, 0x315, 0x313, 0x310, 0x30e, + 0x30c, 0x309, 0x307, 0x305, 0x303, 0x300, 0x2fe, 0x2fc, + 0x2fa, 0x2f7, 0x2f5, 0x2f3, 0x2f1, 0x2ef, 0x2ec, 0x2ea, + 0x2e8, 0x2e6, 0x2e4, 0x2e2, 0x2e0, 0x2de, 0x2dc, 0x2da, + 0x2d8, 0x2d6, 0x2d4, 0x2d2, 0x2d0, 0x2ce, 0x2cc, 0x2ca, + 0x2c8, 0x2c6, 0x2c4, 0x2c2, 0x2c0, 0x2be, 0x2bc, 0x2bb, + 0x2b9, 0x2b7, 0x2b5, 0x2b3, 0x2b1, 0x2b0, 0x2ae, 0x2ac, + 0x2aa, 0x2a8, 0x2a7, 0x2a5, 0x2a3, 0x2a1, 0x2a0, 0x29e, + 0x29c, 0x29b, 0x299, 0x297, 0x295, 0x294, 0x292, 0x291, + 0x28f, 0x28d, 0x28c, 0x28a, 0x288, 0x287, 0x285, 0x284, + 0x282, 0x280, 0x27f, 0x27d, 0x27c, 0x27a, 0x279, 0x277, + 0x276, 0x274, 0x273, 0x271, 0x270, 0x26e, 0x26d, 0x26b, + 0x26a, 0x268, 0x267, 0x265, 0x264, 0x263, 0x261, 0x260, + 0x25e, 0x25d, 0x25c, 0x25a, 0x259, 0x257, 0x256, 0x255, + 0x253, 0x252, 0x251, 0x24f, 0x24e, 0x24d, 0x24b, 0x24a, + 0x249, 0x247, 0x246, 0x245, 0x243, 0x242, 0x241, 0x240, + 0x23e, 0x23d, 0x23c, 0x23b, 0x239, 0x238, 0x237, 0x236, + 0x234, 0x233, 0x232, 0x231, 0x230, 0x22e, 0x22d, 0x22c, + 0x22b, 0x22a, 0x229, 0x227, 0x226, 0x225, 0x224, 0x223, + 0x222, 0x220, 0x21f, 0x21e, 0x21d, 0x21c, 0x21b, 0x21a, + 0x219, 0x218, 0x216, 0x215, 0x214, 0x213, 0x212, 0x211, + 0x210, 0x20f, 0x20e, 0x20d, 0x20c, 0x20b, 0x20a, 0x209, + 0x208, 0x207, 0x206, 0x205, 0x204, 0x203, 0x202, 0x201, +}; + +/* iteration: z = 2z-(z**2)d */ + +mp_limb_t +mpn_invert_limb (mp_limb_t d) +{ + mp_limb_t z, z2l, z2h, tl, th; + mp_limb_t xh, xl; + mp_limb_t zh, zl; + +#if GMP_LIMB_BITS == 32 + z = approx_tab[(d >> 23) - 0x100] << 6; /* z < 2^16 */ + + z2l = z * z; /* z2l < 2^32 */ + umul_ppmm (th, tl, z2l, d); + z = (z << 17) - (th << 1); +#endif +#if GMP_LIMB_BITS == 64 + z = approx_tab[(d >> 55) - 0x100] << 6; /* z < 2^16 */ + + z2l = z * z; /* z2l < 2^32 */ + th = z2l * (d >> 32); /* th < 2^64 */ + z = (z << 17) - (th >> 31); /* z < 2^32 */ + + z2l = z * z; + umul_ppmm (th, tl, z2l, d); + z = (z << 33) - (th << 1); +#endif + + umul_ppmm (z2h, z2l, z, z); + umul_ppmm (th, tl, z2h, d); + umul_ppmm (xh, xl, z2l, d); + tl += xh; + th += tl < xh; + th = (th << 2) | (tl >> GMP_LIMB_BITS - 2); + tl = tl << 2; + sub_ddmmss (zh, zl, z << 2, 0, th, tl); + + umul_ppmm (xh, xl, d, zh); + xh += d; /* add_ssaaaa (xh, xl, xh, xl, d, 0); */ + if (~xh != 0) + { + add_ssaaaa (xh, xl, xh, xl, 0, d); + zh++; + } + + add_ssaaaa (xh, xl, xh, xl, 0, d); + if (xh != 0) + zh++; + + return zh; +} diff --git a/vendor/gmp-6.3.0/mpn/cray/ieee/mul_1.c b/vendor/gmp-6.3.0/mpn/cray/ieee/mul_1.c new file mode 100644 index 0000000..40139fb --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/cray/ieee/mul_1.c @@ -0,0 +1,103 @@ +/* Cray PVP/IEEE mpn_mul_1 -- multiply a limb vector with a limb and store the + result in a second limb vector. + +Copyright 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* This code runs at 5 cycles/limb on a T90. That would probably + be hard to improve upon, even with assembly code. */ + +#include +#include "gmp-impl.h" + +mp_limb_t +mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl) +{ + mp_limb_t cy[n]; + mp_limb_t a, b, r, s0, s1, c0, c1; + mp_size_t i; + int more_carries; + + if (up == rp) + { + /* The algorithm used below cannot handle overlap. Handle it here by + making a temporary copy of the source vector, then call ourselves. */ + mp_limb_t xp[n]; + MPN_COPY (xp, up, n); + return mpn_mul_1 (rp, xp, n, vl); + } + + a = up[0] * vl; + rp[0] = a; + cy[0] = 0; + + /* Main multiply loop. Generate a raw accumulated output product in rp[] + and a carry vector in cy[]. */ +#pragma _CRI ivdep + for (i = 1; i < n; i++) + { + a = up[i] * vl; + b = _int_mult_upper (up[i - 1], vl); + s0 = a + b; + c0 = ((a & b) | ((a | b) & ~s0)) >> 63; + rp[i] = s0; + cy[i] = c0; + } + /* Carry add loop. Add the carry vector cy[] to the raw sum rp[] and + store the new sum back to rp[0]. */ + more_carries = 0; +#pragma _CRI ivdep + for (i = 2; i < n; i++) + { + r = rp[i]; + c0 = cy[i - 1]; + s0 = r + c0; + rp[i] = s0; + c0 = (r & ~s0) >> 63; + more_carries += c0; + } + /* If that second loop generated carry, handle that in scalar loop. */ + if (more_carries) + { + mp_limb_t cyrec = 0; + /* Look for places where rp[k] is zero and cy[k-1] is non-zero. + These are where we got a recurrency carry. */ + for (i = 2; i < n; i++) + { + r = rp[i]; + c0 = (r == 0 && cy[i - 1] != 0); + s0 = r + cyrec; + rp[i] = s0; + c1 = (r & ~s0) >> 63; + cyrec = c0 | c1; + } + return _int_mult_upper (up[n - 1], vl) + cyrec + cy[n - 1]; + } + + return _int_mult_upper (up[n - 1], vl) + cy[n - 1]; +} diff --git a/vendor/gmp-6.3.0/mpn/cray/ieee/mul_basecase.c b/vendor/gmp-6.3.0/mpn/cray/ieee/mul_basecase.c new file mode 100644 index 0000000..72628f7 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/cray/ieee/mul_basecase.c @@ -0,0 +1,107 @@ +/* Cray PVP/IEEE mpn_mul_basecase. + +Copyright 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* The most critical loop of this code runs at about 5 cycles/limb on a T90. + That is not perfect, mainly due to vector register shortage. */ + +#include +#include "gmp-impl.h" + +void +mpn_mul_basecase (mp_ptr rp, + mp_srcptr up, mp_size_t un, + mp_srcptr vp, mp_size_t vn) +{ + mp_limb_t cy[un + vn]; + mp_limb_t vl; + mp_limb_t a, b, r, s0, s1, c0, c1; + mp_size_t i, j; + int more_carries; + + for (i = 0; i < un + vn; i++) + { + rp[i] = 0; + cy[i] = 0; + } + +#pragma _CRI novector + for (j = 0; j < vn; j++) + { + vl = vp[j]; + + a = up[0] * vl; + r = rp[j]; + s0 = a + r; + rp[j] = s0; + c0 = ((a & r) | ((a | r) & ~s0)) >> 63; + cy[j] += c0; + +#pragma _CRI ivdep + for (i = 1; i < un; i++) + { + a = up[i] * vl; + b = _int_mult_upper (up[i - 1], vl); + s0 = a + b; + c0 = ((a & b) | ((a | b) & ~s0)) >> 63; + r = rp[j + i]; + s1 = s0 + r; + rp[j + i] = s1; + c1 = ((s0 & r) | ((s0 | r) & ~s1)) >> 63; + cy[j + i] += c0 + c1; + } + rp[j + un] = _int_mult_upper (up[un - 1], vl); + } + + more_carries = 0; +#pragma _CRI ivdep + for (i = 1; i < un + vn; i++) + { + r = rp[i]; + c0 = cy[i - 1]; + s0 = r + c0; + rp[i] = s0; + c0 = (r & ~s0) >> 63; + more_carries += c0; + } + /* If that second loop generated carry, handle that in scalar loop. */ + if (more_carries) + { + mp_limb_t cyrec = 0; + for (i = 1; i < un + vn; i++) + { + r = rp[i]; + c0 = (r < cy[i - 1]); + s0 = r + cyrec; + rp[i] = s0; + c1 = (r & ~s0) >> 63; + cyrec = c0 | c1; + } + } +} diff --git a/vendor/gmp-6.3.0/mpn/cray/ieee/sqr_basecase.c b/vendor/gmp-6.3.0/mpn/cray/ieee/sqr_basecase.c new file mode 100644 index 0000000..5bd4e56 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/cray/ieee/sqr_basecase.c @@ -0,0 +1,105 @@ +/* Cray PVP/IEEE mpn_sqr_basecase. + +Copyright 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* This is just mpn_mul_basecase with trivial modifications. */ + +#include +#include "gmp-impl.h" + +void +mpn_sqr_basecase (mp_ptr rp, + mp_srcptr up, mp_size_t un) +{ + mp_limb_t cy[un + un]; + mp_limb_t ul; + mp_limb_t a, b, r, s0, s1, c0, c1; + mp_size_t i, j; + int more_carries; + + for (i = 0; i < un + un; i++) + { + rp[i] = 0; + cy[i] = 0; + } + +#pragma _CRI novector + for (j = 0; j < un; j++) + { + ul = up[j]; + + a = up[0] * ul; + r = rp[j]; + s0 = a + r; + rp[j] = s0; + c0 = ((a & r) | ((a | r) & ~s0)) >> 63; + cy[j] += c0; + +#pragma _CRI ivdep + for (i = 1; i < un; i++) + { + a = up[i] * ul; + b = _int_mult_upper (up[i - 1], ul); + s0 = a + b; + c0 = ((a & b) | ((a | b) & ~s0)) >> 63; + r = rp[j + i]; + s1 = s0 + r; + rp[j + i] = s1; + c1 = ((s0 & r) | ((s0 | r) & ~s1)) >> 63; + cy[j + i] += c0 + c1; + } + rp[j + un] = _int_mult_upper (up[un - 1], ul); + } + + more_carries = 0; +#pragma _CRI ivdep + for (i = 1; i < un + un; i++) + { + r = rp[i]; + c0 = cy[i - 1]; + s0 = r + c0; + rp[i] = s0; + c0 = (r & ~s0) >> 63; + more_carries += c0; + } + /* If that second loop generated carry, handle that in scalar loop. */ + if (more_carries) + { + mp_limb_t cyrec = 0; + for (i = 1; i < un + un; i++) + { + r = rp[i]; + c0 = (r < cy[i - 1]); + s0 = r + cyrec; + rp[i] = s0; + c1 = (r & ~s0) >> 63; + cyrec = c0 | c1; + } + } +} diff --git a/vendor/gmp-6.3.0/mpn/cray/ieee/submul_1.c b/vendor/gmp-6.3.0/mpn/cray/ieee/submul_1.c new file mode 100644 index 0000000..2b3ca21 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/cray/ieee/submul_1.c @@ -0,0 +1,111 @@ +/* Cray PVP/IEEE mpn_submul_1 -- multiply a limb vector with a limb and + subtract the result from a second limb vector. + +Copyright 2000-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* This code runs at just under 9 cycles/limb on a T90. That is not perfect, + mainly due to vector register shortage in the main loop. Assembly code + should bring it down to perhaps 7 cycles/limb. */ + +#include +#include "gmp-impl.h" + +mp_limb_t +mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl) +{ + mp_limb_t cy[n]; + mp_limb_t a, b, r, s0, s1, c0, c1; + mp_size_t i; + int more_carries; + + if (up == rp) + { + /* The algorithm used below cannot handle overlap. Handle it here by + making a temporary copy of the source vector, then call ourselves. */ + mp_limb_t xp[n]; + MPN_COPY (xp, up, n); + return mpn_submul_1 (rp, xp, n, vl); + } + + a = up[0] * vl; + r = rp[0]; + s0 = r - a; + rp[0] = s0; + c1 = ((s0 & a) | ((s0 | a) & ~r)) >> 63; + cy[0] = c1; + + /* Main multiply loop. Generate a raw accumulated output product in rp[] + and a carry vector in cy[]. */ +#pragma _CRI ivdep + for (i = 1; i < n; i++) + { + a = up[i] * vl; + b = _int_mult_upper (up[i - 1], vl); + s0 = a + b; + c0 = ((a & b) | ((a | b) & ~s0)) >> 63; + r = rp[i]; + s1 = r - s0; + rp[i] = s1; + c1 = ((s1 & s0) | ((s1 | s0) & ~r)) >> 63; + cy[i] = c0 + c1; + } + /* Carry subtract loop. Subtract the carry vector cy[] from the raw result + rp[] and store the new result back to rp[]. */ + more_carries = 0; +#pragma _CRI ivdep + for (i = 1; i < n; i++) + { + r = rp[i]; + c0 = cy[i - 1]; + s0 = r - c0; + rp[i] = s0; + c0 = (s0 & ~r) >> 63; + more_carries += c0; + } + /* If that second loop generated carry, handle that in scalar loop. */ + if (more_carries) + { + mp_limb_t cyrec = 0; + /* Look for places where rp[k] == ~0 and cy[k-1] == 1 or + rp[k] == ~1 and cy[k-1] == 2. + These are where we got a recurrency carry. */ + for (i = 1; i < n; i++) + { + r = rp[i]; + c0 = ~r < cy[i - 1]; + s0 = r - cyrec; + rp[i] = s0; + c1 = (s0 & ~r) >> 63; + cyrec = c0 | c1; + } + return _int_mult_upper (up[n - 1], vl) + cyrec + cy[n - 1]; + } + + return _int_mult_upper (up[n - 1], vl) + cy[n - 1]; +} diff --git a/vendor/gmp-6.3.0/mpn/cray/lshift.c b/vendor/gmp-6.3.0/mpn/cray/lshift.c new file mode 100644 index 0000000..8534e93 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/cray/lshift.c @@ -0,0 +1,58 @@ +/* mpn_lshift -- Shift left low level for Cray vector processors. + +Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include +#include "gmp-impl.h" + +mp_limb_t +mpn_lshift (mp_ptr wp, mp_srcptr up, mp_size_t n, unsigned int cnt) +{ + unsigned sh_1, sh_2; + mp_size_t i; + mp_limb_t retval; + + sh_1 = cnt; + sh_2 = GMP_LIMB_BITS - sh_1; + retval = up[n - 1] >> sh_2; + +#pragma _CRI ivdep + for (i = n - 1; i > 0; i--) + { +#if 1 + wp[i] = (up[i] << sh_1) | (up[i - 1] >> sh_2); +#else + /* This is the recommended way, but at least on SV1 it is slower. */ + wp[i] = _dshiftl (up[i], up[i - 1], sh_1); +#endif + } + + wp[0] = up[0] << sh_1; + return retval; +} diff --git a/vendor/gmp-6.3.0/mpn/cray/mulww.f b/vendor/gmp-6.3.0/mpn/cray/mulww.f new file mode 100644 index 0000000..6885dfc --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/cray/mulww.f @@ -0,0 +1,63 @@ +c Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP. + +c Copyright 1996, 2000 Free Software Foundation, Inc. + +c This file is part of the GNU MP Library. +c +c The GNU MP Library is free software; you can redistribute it and/or modify +c it under the terms of either: +c +c * the GNU Lesser General Public License as published by the Free +c Software Foundation; either version 3 of the License, or (at your +c option) any later version. +c +c or +c +c * the GNU General Public License as published by the Free Software +c Foundation; either version 2 of the License, or (at your option) any +c later version. +c +c or both in parallel, as here. +c +c The GNU MP Library is distributed in the hope that it will be useful, but +c WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +c or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +c for more details. +c +c You should have received copies of the GNU General Public License and the +c GNU Lesser General Public License along with the GNU MP Library. If not, +c see https://www.gnu.org/licenses/. + +c p1[] = hi(a[]*s); the upper limbs of each product +c p0[] = low(a[]*s); the corresponding lower limbs +c n is number of limbs in the vectors + + subroutine gmpn_mulww(p1,p0,a,n,s) + integer*8 p1(0:*),p0(0:*),a(0:*),s + integer n + + integer*8 a0,a1,a2,s0,s1,s2,c + integer*8 ai,t0,t1,t2,t3,t4 + + s0 = shiftl(and(s,4194303),24) + s1 = shiftl(and(shiftr(s,22),4194303),24) + s2 = shiftl(and(shiftr(s,44),4194303),24) + + do i = 0,n-1 + ai = a(i) + a0 = shiftl(and(ai,4194303),24) + a1 = shiftl(and(shiftr(ai,22),4194303),24) + a2 = shiftl(and(shiftr(ai,44),4194303),24) + + t0 = i24mult(a0,s0) + t1 = i24mult(a0,s1)+i24mult(a1,s0) + t2 = i24mult(a0,s2)+i24mult(a1,s1)+i24mult(a2,s0) + t3 = i24mult(a1,s2)+i24mult(a2,s1) + t4 = i24mult(a2,s2) + + p0(i)=shiftl(t2,44)+shiftl(t1,22)+t0 + c=shiftr(shiftr(t0,22)+and(t1,4398046511103)+ + $ shiftl(and(t2,1048575),22),42) + p1(i)=shiftl(t4,24)+shiftl(t3,2)+shiftr(t2,20)+shiftr(t1,42)+c + end do + end diff --git a/vendor/gmp-6.3.0/mpn/cray/popcount.c b/vendor/gmp-6.3.0/mpn/cray/popcount.c new file mode 100644 index 0000000..a79211f --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/cray/popcount.c @@ -0,0 +1,42 @@ +/* Cray mpn_popcount -- population count. + +Copyright 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include +#include "gmp-impl.h" + +unsigned long int +mpn_popcount (mp_srcptr p, mp_size_t n) +{ + unsigned long int result = 0; + mp_size_t i; + for (i = 0; i < n; i++) + result += _popcnt (p[i]); + return result; +} diff --git a/vendor/gmp-6.3.0/mpn/cray/rshift.c b/vendor/gmp-6.3.0/mpn/cray/rshift.c new file mode 100644 index 0000000..9c4aa22 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/cray/rshift.c @@ -0,0 +1,58 @@ +/* mpn_rshift -- Shift right low level for Cray vector processors. + +Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include +#include "gmp-impl.h" + +mp_limb_t +mpn_rshift (mp_ptr wp, mp_srcptr up, mp_size_t n, unsigned int cnt) +{ + unsigned sh_1, sh_2; + mp_size_t i; + mp_limb_t retval; + + sh_1 = cnt; + sh_2 = GMP_LIMB_BITS - sh_1; + retval = up[0] << sh_2; + +#pragma _CRI ivdep + for (i = 0; i < n - 1; i++) + { +#if 1 + wp[i] = (up[i] >> sh_1) | (up[i + 1] << sh_2); +#else + /* This is the recommended way, but at least on SV1 it is slower. */ + wp[i] = _dshiftr (up[i + 1], up[i], sh_1); +#endif + } + + wp[n - 1] = up[n - 1] >> sh_1; + return retval; +} diff --git a/vendor/gmp-6.3.0/mpn/cray/sub_n.c b/vendor/gmp-6.3.0/mpn/cray/sub_n.c new file mode 100644 index 0000000..f518764 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/cray/sub_n.c @@ -0,0 +1,90 @@ +/* Cray PVP mpn_sub_n -- subtract two limb vectors and store their difference + in a third limb vector. + +Copyright 1996, 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* This code runs at 4 cycles/limb. It may be possible to bring it down + to 3 cycles/limb. */ + +#include "gmp-impl.h" + +mp_limb_t +mpn_sub_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_limb_t cy[n]; + mp_limb_t a, b, r, s0, c0, c1; + mp_size_t i; + int more_carries; + + /* Main subtract loop. Generate a raw output difference in rp[] and a + borrow vector in cy[]. */ +#pragma _CRI ivdep + for (i = 0; i < n; i++) + { + a = up[i]; + b = vp[i]; + s0 = a - b; /* a = s0 + b */ + rp[i] = s0; + c0 = ((s0 & b) | ((s0 | b) & ~a)) >> 63; + cy[i] = c0; + } + /* Borrow subtract loop. Subtract the borrow vector cy[] from the raw + difference rp[] and store the new difference back to rp[0]. If this + generates further borrow, set more_carries. */ + more_carries = 0; +#pragma _CRI ivdep + for (i = 1; i < n; i++) + { + r = rp[i]; + c0 = cy[i - 1]; + s0 = r - c0; /* r = s0 + c0 */ + rp[i] = s0; + c0 = (s0 & ~r) >> 63; + more_carries += c0; + } + /* If that second loop generated borrow, handle that in scalar loop. */ + if (more_carries) + { + mp_limb_t cyrec = 0; + /* Look for places where rp[k] contains just ones and cy[k-1] is + non-zero. These are where we got a recurrency borrow. */ + for (i = 1; i < n; i++) + { + r = rp[i]; + c0 = (~r == 0 && cy[i - 1] != 0); + s0 = r - cyrec; + rp[i] = s0; + c1 = (s0 & ~r) >> 63; + cyrec = c0 | c1; + } + return cyrec | cy[n - 1]; + } + + return cy[n - 1]; +} -- cgit v1.2.3