diff options
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/generic/toom42_mulmid.c')
-rw-r--r-- | vendor/gmp-6.3.0/mpn/generic/toom42_mulmid.c | 237 |
1 files changed, 237 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/generic/toom42_mulmid.c b/vendor/gmp-6.3.0/mpn/generic/toom42_mulmid.c new file mode 100644 index 0000000..f581b10 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/generic/toom42_mulmid.c @@ -0,0 +1,237 @@ +/* mpn_toom42_mulmid -- toom42 middle product + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + + + +/* + Middle product of {ap,2n-1} and {bp,n}, output written to {rp,n+2}. + + Neither ap nor bp may overlap rp. + + Must have n >= 4. + + Amount of scratch space required is given by mpn_toom42_mulmid_itch(). + + FIXME: this code assumes that n is small compared to GMP_NUMB_MAX. The exact + requirements should be clarified. +*/ +void +mpn_toom42_mulmid (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, + mp_ptr scratch) +{ + mp_limb_t cy, e[12], zh, zl; + mp_size_t m; + int neg; + + ASSERT (n >= 4); + ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1)); + ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n)); + + ap += n & 1; /* handle odd row and diagonal later */ + m = n / 2; + + /* (e0h:e0l) etc are correction terms, in 2's complement */ +#define e0l (e[0]) +#define e0h (e[1]) +#define e1l (e[2]) +#define e1h (e[3]) +#define e2l (e[4]) +#define e2h (e[5]) +#define e3l (e[6]) +#define e3h (e[7]) +#define e4l (e[8]) +#define e4h (e[9]) +#define e5l (e[10]) +#define e5h (e[11]) + +#define s (scratch + 2) +#define t (rp + m + 2) +#define p0 rp +#define p1 scratch +#define p2 (rp + m) +#define next_scratch (scratch + 3*m + 1) + + /* + rp scratch + |---------|-----------| |---------|---------|----------| + 0 m 2m+2 0 m 2m 3m+1 + <----p2----> <-------------s-------------> + <----p0----><---t----> <----p1----> + */ + + /* compute {s,3m-1} = {a,3m-1} + {a+m,3m-1} and error terms e0, e1, e2, e3 */ + cy = mpn_add_err1_n (s, ap, ap + m, &e0l, bp + m, m - 1, 0); + cy = mpn_add_err2_n (s + m - 1, ap + m - 1, ap + 2*m - 1, &e1l, + bp + m, bp, m, cy); + mpn_add_err1_n (s + 2*m - 1, ap + 2*m - 1, ap + 3*m - 1, &e3l, bp, m, cy); + + /* compute t = (-1)^neg * ({b,m} - {b+m,m}) and error terms e4, e5 */ + if (mpn_cmp (bp + m, bp, m) < 0) + { + ASSERT_NOCARRY (mpn_sub_err2_n (t, bp, bp + m, &e4l, + ap + m - 1, ap + 2*m - 1, m, 0)); + neg = 1; + } + else + { + ASSERT_NOCARRY (mpn_sub_err2_n (t, bp + m, bp, &e4l, + ap + m - 1, ap + 2*m - 1, m, 0)); + neg = 0; + } + + /* recursive middle products. The picture is: + + b[2m-1] A A A B B B - - - - - + ... - A A A B B B - - - - + b[m] - - A A A B B B - - - + b[m-1] - - - C C C D D D - - + ... - - - - C C C D D D - + b[0] - - - - - C C C D D D + a[0] ... a[m] ... a[2m] ... a[4m-2] + */ + + if (m < MULMID_TOOM42_THRESHOLD) + { + /* A + B */ + mpn_mulmid_basecase (p0, s, 2*m - 1, bp + m, m); + /* accumulate high limbs of p0 into e1 */ + ADDC_LIMB (cy, e1l, e1l, p0[m]); + e1h += p0[m + 1] + cy; + /* (-1)^neg * (B - C) (overwrites first m limbs of s) */ + mpn_mulmid_basecase (p1, ap + m, 2*m - 1, t, m); + /* C + D (overwrites t) */ + mpn_mulmid_basecase (p2, s + m, 2*m - 1, bp, m); + } + else + { + /* as above, but use toom42 instead */ + mpn_toom42_mulmid (p0, s, bp + m, m, next_scratch); + ADDC_LIMB (cy, e1l, e1l, p0[m]); + e1h += p0[m + 1] + cy; + mpn_toom42_mulmid (p1, ap + m, t, m, next_scratch); + mpn_toom42_mulmid (p2, s + m, bp, m, next_scratch); + } + + /* apply error terms */ + + /* -e0 at rp[0] */ + SUBC_LIMB (cy, rp[0], rp[0], e0l); + SUBC_LIMB (cy, rp[1], rp[1], e0h + cy); + if (UNLIKELY (cy)) + { + cy = (m > 2) ? mpn_sub_1 (rp + 2, rp + 2, m - 2, 1) : 1; + SUBC_LIMB (cy, e1l, e1l, cy); + e1h -= cy; + } + + /* z = e1 - e2 + high(p0) */ + SUBC_LIMB (cy, zl, e1l, e2l); + zh = e1h - e2h - cy; + + /* z at rp[m] */ + ADDC_LIMB (cy, rp[m], rp[m], zl); + zh = (zh + cy) & GMP_NUMB_MASK; + ADDC_LIMB (cy, rp[m + 1], rp[m + 1], zh); + cy -= (zh >> (GMP_NUMB_BITS - 1)); + if (UNLIKELY (cy)) + { + if (cy == 1) + mpn_add_1 (rp + m + 2, rp + m + 2, m, 1); + else /* cy == -1 */ + mpn_sub_1 (rp + m + 2, rp + m + 2, m, 1); + } + + /* e3 at rp[2*m] */ + ADDC_LIMB (cy, rp[2*m], rp[2*m], e3l); + rp[2*m + 1] = (rp[2*m + 1] + e3h + cy) & GMP_NUMB_MASK; + + /* e4 at p1[0] */ + ADDC_LIMB (cy, p1[0], p1[0], e4l); + ADDC_LIMB (cy, p1[1], p1[1], e4h + cy); + if (UNLIKELY (cy)) + mpn_add_1 (p1 + 2, p1 + 2, m, 1); + + /* -e5 at p1[m] */ + SUBC_LIMB (cy, p1[m], p1[m], e5l); + p1[m + 1] = (p1[m + 1] - e5h - cy) & GMP_NUMB_MASK; + + /* adjustment if p1 ends up negative */ + cy = (p1[m + 1] >> (GMP_NUMB_BITS - 1)); + + /* add (-1)^neg * (p1 - B^m * p1) to output */ + if (neg) + { + mpn_sub_1 (rp + m + 2, rp + m + 2, m, cy); + mpn_add (rp, rp, 2*m + 2, p1, m + 2); /* A + C */ + mpn_sub_n (rp + m, rp + m, p1, m + 2); /* B + D */ + } + else + { + mpn_add_1 (rp + m + 2, rp + m + 2, m, cy); + mpn_sub (rp, rp, 2*m + 2, p1, m + 2); /* A + C */ + mpn_add_n (rp + m, rp + m, p1, m + 2); /* B + D */ + } + + /* odd row and diagonal */ + if (n & 1) + { + /* + Products marked E are already done. We need to do products marked O. + + OOOOO---- + -EEEEO--- + --EEEEO-- + ---EEEEO- + ----EEEEO + */ + + /* first row of O's */ + cy = mpn_addmul_1 (rp, ap - 1, n, bp[n - 1]); + ADDC_LIMB (rp[n + 1], rp[n], rp[n], cy); + + /* O's on diagonal */ + /* FIXME: should probably define an interface "mpn_mulmid_diag_1" + that can handle the sum below. Currently we're relying on + mulmid_basecase being pretty fast for a diagonal sum like this, + which is true at least for the K8 asm version, but surely false + for the generic version. */ + mpn_mulmid_basecase (e, ap + n - 1, n - 1, bp, n - 1); + mpn_add_n (rp + n - 1, rp + n - 1, e, 3); + } +} |