| author | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:36:36 +0200 | 
|---|---|---|
| committer | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:42:26 +0200 | 
| commit | a89a14ef5da44684a16b204e7a70460cc8c4922a (patch) | |
| tree | b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/generic/mu_divappr_q.c | |
| parent | 1db63fcedab0b288820d66e100b1877b1a5a8851 (diff) | |
Basic constant folding implementation
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/generic/mu_divappr_q.c')
| -rw-r--r-- | vendor/gmp-6.3.0/mpn/generic/mu_divappr_q.c | 368 |
|---|---|---|

1 files changed, 368 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/generic/mu_divappr_q.c b/vendor/gmp-6.3.0/mpn/generic/mu_divappr_q.c
new file mode 100644
index 0000000..0ef7e03
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/generic/mu_divappr_q.c
@@ -0,0 +1,368 @@
+/* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q.
+
+   Compute Q = floor(N / D) + e.  N is nn limbs, D is dn limbs and must be
+   normalized, and Q must be nn-dn limbs, 0 <= e <= 4.  The requirement that Q
+   is nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us
+   to let N be unmodified during the operation.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/*
+   The idea of the algorithm used herein is to compute a smaller inverted value
+   than used in the standard Barrett algorithm, and thus save time in the
+   Newton iterations, and pay just a small price when using the inverted value
+   for developing quotient bits.  This algorithm was presented at ICMS 2006.
+*/
+
+/* CAUTION: This code and the code in mu_div_qr.c should be edited in sync.
+
+ Things to work on:
+
+  * The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
+    demonstrated by the fact that the mpn_invertappr function's scratch needs
+    mean that we need to keep a large allocation long after it is needed.
+    Things are worse as mpn_mul_fft does not accept any scratch parameter,
+    which means we'll have a large memory hole while in mpn_mul_fft.  In
+    general, a peak scratch need in the beginning of a function isn't
+    well-handled by the itch/scratch scheme.
+*/
+
+#ifdef STAT
+#undef STAT
+#define STAT(x) x
+#else
+#define STAT(x)
+#endif
+
+#include <stdlib.h>		/* for NULL */
+#include "gmp-impl.h"
+
+static mp_limb_t mpn_preinv_mu_divappr_q (mp_ptr, mp_srcptr, mp_size_t,
+			 mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr);
+static mp_size_t mpn_mu_divappr_q_choose_in (mp_size_t, mp_size_t, int);
+
+mp_limb_t
+mpn_mu_divappr_q (mp_ptr qp,
+		  mp_srcptr np,
+		  mp_size_t nn,
+		  mp_srcptr dp,
+		  mp_size_t dn,
+		  mp_ptr scratch)
+{
+  mp_size_t qn, in;
+  mp_limb_t cy, qh;
+  mp_ptr ip, tp;
+
+  ASSERT (dn > 1);
+
+  qn = nn - dn;
+
+  /* If Q is smaller than D, truncate operands.  */
+  if (qn + 1 < dn)
+    {
+      np += dn - (qn + 1);
+      nn -= dn - (qn + 1);
+      dp += dn - (qn + 1);
+      dn = qn + 1;
+    }
+
+  /* Compute the inverse size.  */
+  in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
+  ASSERT (in <= dn);
+
+#if 1
+  /* This alternative inverse computation method gets slightly more accurate
+     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
+     not adapted (3) mpn_invertappr scratch needs not met.  */
+  ip = scratch;
+  tp = scratch + in + 1;
+
+  /* compute an approximate inverse on (in+1) limbs */
+  if (dn == in)
+    {
+      MPN_COPY (tp + 1, dp, in);
+      tp[0] = 1;
+      mpn_invertappr (ip, tp, in + 1, tp + in + 1);
+      MPN_COPY_INCR (ip, ip + 1, in);
+    }
+  else
+    {
+      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
+      if (UNLIKELY (cy != 0))
+	MPN_ZERO (ip, in);
+      else
+	{
+	  mpn_invertappr (ip, tp, in + 1, tp + in + 1);
+	  MPN_COPY_INCR (ip, ip + 1, in);
+	}
+    }
+#else
+  /* This older inverse computation method gets slightly worse results than the
+     one above.  */
+  ip = scratch;
+  tp = scratch + in;
+
+  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
+     inversion function should do this automatically.  */
+  if (dn == in)
+    {
+      tp[in + 1] = 0;
+      MPN_COPY (tp + in + 2, dp, in);
+      mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
+    }
+  else
+    {
+      mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
+    }
+  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
+  if (UNLIKELY (cy != 0))
+    MPN_ZERO (tp + 1, in);
+  MPN_COPY (ip, tp + 1, in);
+#endif
+
+  qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);
+
+  return qh;
+}
+
+static mp_limb_t
+mpn_preinv_mu_divappr_q (mp_ptr qp,
+			 mp_srcptr np,
+			 mp_size_t nn,
+			 mp_srcptr dp,
+			 mp_size_t dn,
+			 mp_srcptr ip,
+			 mp_size_t in,
+			 mp_ptr scratch)
+{
+  mp_size_t qn;
+  mp_limb_t cy, cx, qh;
+  mp_limb_t r;
+  mp_size_t tn, wn;
+
+#define rp           scratch
+#define tp           (scratch + dn)
+#define scratch_out  (scratch + dn + tn)
+
+  qn = nn - dn;
+
+  np += qn;
+  qp += qn;
+
+  qh = mpn_cmp (np, dp, dn) >= 0;
+  if (qh != 0)
+    mpn_sub_n (rp, np, dp, dn);
+  else
+    MPN_COPY (rp, np, dn);
+
+  if (UNLIKELY (qn == 0))
+    return qh;			/* Degenerate use.  Should we allow this? */
+
+  for (;;) /* The exit condition (qn == 0) is verified in the loop. */
+    {
+      if (qn < in)
+	{
+	  ip += in - qn;
+	  in = qn;
+	}
+      np -= in;
+      qp -= in;
+
+      /* Compute the next block of quotient limbs by multiplying the inverse I
+	 by the upper part of the partial remainder R.  */
+      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi  */
+      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
+      ASSERT_ALWAYS (cy == 0);
+
+      qn -= in;
+      if (qn == 0)
+	break;
+
+      /* Compute the product of the quotient block and the divisor D, to be
+	 subtracted from the partial remainder combined with new limbs from the
+	 dividend N.  We only really need the low dn limbs.  */
+
+      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
+	mpn_mul (tp, dp, dn, qp, in);		/* dn+in limbs, high 'in' cancels */
+      else
+	{
+	  tn = mpn_mulmod_bnm1_next_size (dn + 1);
+	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
+	  wn = dn + in - tn;			/* number of wrapped limbs */
+	  if (wn > 0)
+	    {
+	      cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
+	      cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
+	      cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
+	      ASSERT_ALWAYS (cx >= cy);
+	      mpn_incr_u (tp, cx - cy);
+	    }
+	}
+
+      r = rp[dn - in] - tp[dn];
+
+      /* Subtract the product from the partial remainder combined with new
+	 limbs from the dividend N, generating a new partial remainder R.  */
+      if (dn != in)
+	{
+	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
+	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
+	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
+	}
+      else
+	{
+	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
+	}
+
+      STAT (int i; int err = 0;
+	    static int errarr[5]; static int err_rec; static int tot);
+
+      /* Check the remainder R and adjust the quotient as needed.  */
+      r -= cy;
+      while (r != 0)
+	{
+	  /* We loop 0 times with about 69% probability, 1 time with about 31%
+	     probability, 2 times with about 0.6% probability, if inverse is
+	     computed as recommended.  */
+	  mpn_incr_u (qp, 1);
+	  cy = mpn_sub_n (rp, rp, dp, dn);
+	  r -= cy;
+	  STAT (err++);
+	}
+      if (mpn_cmp (rp, dp, dn) >= 0)
+	{
+	  /* This is executed with about 76% probability.  */
+	  mpn_incr_u (qp, 1);
+	  cy = mpn_sub_n (rp, rp, dp, dn);
+	  STAT (err++);
+	}
+
+      STAT (
+	    tot++;
+	    errarr[err]++;
+	    if (err > err_rec)
+	      err_rec = err;
+	    if (tot % 0x10000 == 0)
+	      {
+		for (i = 0; i <= err_rec; i++)
+		  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
+		printf ("\n");
+	      }
+	    );
+    }
+
+  /* FIXME: We should perhaps be somewhat more elegant in our rounding of the
+     quotient.  For now, just make sure the returned quotient is >= the real
+     quotient; add 3 with saturating arithmetic.  */
+  qn = nn - dn;
+  cy += mpn_add_1 (qp, qp, qn, 3);
+  if (cy != 0)
+    {
+      if (qh != 0)
+	{
+	  /* Return a quotient of just 1-bits, with qh set.  */
+	  mp_size_t i;
+	  for (i = 0; i < qn; i++)
+	    qp[i] = GMP_NUMB_MAX;
+	}
+      else
+	{
+	  /* Propagate carry into qh.  */
+	  qh = 1;
+	}
+    }
+
+  return qh;
+}
+
+/* In case k=0 (automatic choice), we distinguish 3 cases:
+   (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
+   (b) dn/3 < qn <= dn: in = ceil(qn / 2)
+   (c) qn < dn/3:       in = qn
+   In all cases we have in <= dn.
+ */
+static mp_size_t
+mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
+{
+  mp_size_t in;
+
+  if (k == 0)
+    {
+      mp_size_t b;
+      if (qn > dn)
+	{
+	  /* Compute an inverse size that is a nice partition of the quotient.  */
+	  b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
+	  in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
+	}
+      else if (3 * qn > dn)
+	{
+	  in = (qn - 1) / 2 + 1;	/* b = 2 */
+	}
+      else
+	{
+	  in = (qn - 1) / 1 + 1;	/* b = 1 */
+	}
+    }
+  else
+    {
+      mp_size_t xn;
+      xn = MIN (dn, qn);
+      in = (xn - 1) / k + 1;
+    }
+
+  return in;
+}
+
+mp_size_t
+mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
+{
+  mp_size_t qn, in, itch_local, itch_out, itch_invapp;
+
+  qn = nn - dn;
+  if (qn + 1 < dn)
+    {
+      dn = qn + 1;
+    }
+  in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k);
+
+  itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
+  itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
+  itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */
+
+  ASSERT (dn + itch_local + itch_out >= itch_invapp);
+  return in + MAX (dn + itch_local + itch_out, itch_invapp);
+}
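The header comment of the added file specifies the contract Q = floor(N / D) + e with 0 <= e <= 4: the returned quotient may overshoot the exact one by a small bounded amount, and the caller is expected to correct it. A minimal single-word sketch of that caller-side correction, not part of this commit; `divappr_toy` is a hypothetical stand-in that only mimics the bounded overshoot:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Toy model of the documented interface: a division that may return the
   exact quotient plus an excess e, 0 <= e <= 4, and the fix-up a caller
   applies.  This is NOT the GMP code path.  */
static uint64_t
divappr_toy (uint64_t n, uint64_t d, unsigned e)
{
  return n / d + e;		/* exact quotient plus a bounded excess */
}

int
main (void)
{
  uint64_t n = 1000003, d = 97;
  for (unsigned e = 0; e <= 4; e++)
    {
      uint64_t q = divappr_toy (n, d, e);
      while (q * d > n)		/* step the over-estimate back down */
	q--;
      assert (q == n / d);	/* exact quotient recovered */
    }
  puts ("over-estimates of at most 4 corrected");
  return 0;
}
```

The one-sided error is what lets mpn_preinv_mu_divappr_q saturate upward ("add 3 with saturating arithmetic") instead of tracking the remainder exactly.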
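For context on the comment contrasting this routine with "the standard Barrett algorithm": Barrett division precomputes a truncated reciprocal once and then obtains each quotient from a multiply, a shift, and a tiny correction. A scalar sketch under the assumption of a 32-bit dividend, illustration only; the GMP routine works limb-wise and deliberately uses a shorter inverse than this:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar Barrett sketch: inv = floor(2^32 / d) is computed once, then each
   quotient needs one multiply, one shift and at most one fix-up.  */
int
main (void)
{
  uint32_t d = 12345;				/* divisor, d >= 1 */
  uint64_t inv = ((uint64_t) 1 << 32) / d;	/* truncated reciprocal */

  for (uint32_t n = 0; n < 3000000; n += 997)
    {
      uint64_t q = ((uint64_t) n * inv) >> 32;	/* floor(n/d) or one less */
      if (n - q * d >= d)			/* at most one correction */
	q++;
      assert (q == n / d);
    }
  puts ("Barrett sketch: all quotients exact");
  return 0;
}
```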
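The k = 0 case analysis in mpn_mu_divappr_q_choose_in can be checked with plain integer arithmetic. A standalone sketch of the same selection logic, using the ceil(x/y) = (x - 1)/y + 1 rewriting from the source; the values in main are arbitrary sample sizes:

```c
#include <stdio.h>

/* Restatement of the k == 0 branch of mpn_mu_divappr_q_choose_in.  */
static long
choose_in (long qn, long dn)
{
  if (qn > dn)
    {
      long b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
      return (qn - 1) / b + 1;		/* ceil(qn/b) */
    }
  else if (3 * qn > dn)
    return (qn - 1) / 2 + 1;		/* two blocks */
  else
    return qn;				/* one block */
}

int
main (void)
{
  printf ("%ld\n", choose_in (1000, 300));	/* b = 4, in = ceil(1000/4) = 250 */
  printf ("%ld\n", choose_in (150, 300));	/* in = ceil(150/2) = 75 */
  printf ("%ld\n", choose_in (50, 300));	/* in = qn = 50 */
  return 0;
}
```

In case (a) the block count b is at least qn/dn, so in = ceil(qn/b) <= dn; in cases (b) and (c) qn <= dn already, which is the "in <= dn" invariant the ASSERT in mpn_mu_divappr_q relies on.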