From a89a14ef5da44684a16b204e7a70460cc8c4922a Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Fri, 21 Jun 2024 23:36:36 +0200 Subject: Basic constant folding implementation --- vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h | 163 ++++++++ vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm | 40 ++ vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm | 463 +++++++++++++++++++++ vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm | 371 +++++++++++++++++ vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm | 468 ++++++++++++++++++++++ 5 files changed, 1505 insertions(+) create mode 100644 vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h create mode 100644 vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm create mode 100644 vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm create mode 100644 vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm create mode 100644 vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm (limited to 'vendor/gmp-6.3.0/mpn/x86/pentium/mmx') diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h new file mode 100644 index 0000000..02a0def --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h @@ -0,0 +1,163 @@ +/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004, 2009, 2010 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* For mpn/x86/pentium/mod_1.asm */ +#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB + + +/* 233MHz P55 */ + +#define MOD_1_NORM_THRESHOLD 5 +#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1U_TO_MOD_1_1_THRESHOLD 12 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 11 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 63 +#define USE_PREINV_DIVREM_1 0 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 51 + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 53 +#define MUL_TOOM44_THRESHOLD 128 +#define MUL_TOOM6H_THRESHOLD 189 +#define MUL_TOOM8H_THRESHOLD 260 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 90 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 20 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 178 +#define SQR_TOOM6_THRESHOLD 210 +#define SQR_TOOM8_THRESHOLD 375 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 12 + +#define MUL_FFT_MODF_THRESHOLD 364 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 364, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \ + { 47,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159, 8}, { 319, 9}, { 167,10}, \ + { 95, 9}, { 191, 8}, { 383,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 271, 9}, \ + { 543,10}, { 287,11}, { 159,10}, { 351,11}, \ + { 191,10}, { 415,11}, { 223,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 287,10}, { 575,11}, \ + { 351,12}, { 191,11}, { 415,13}, { 127,12}, \ + { 255,11}, { 575,12}, { 319,11}, { 703,12}, \ + { 383,11}, { 831,12}, { 447,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 90 +#define MUL_FFT_THRESHOLD 3520 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 17, 7}, { 9, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 29, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 27, 7}, { 55, 9}, \ + { 15, 8}, { 31, 7}, { 65, 8}, { 43, 9}, \ + { 23, 8}, { 47,10}, { 15, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \ + { 95,10}, { 31, 9}, { 63, 8}, { 127, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255, 9}, { 135,10}, \ + { 79, 9}, { 159, 8}, { 319,10}, { 95, 9}, \ + { 191,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575, 9}, { 303,10}, { 159, 9}, { 319,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207,12}, \ + { 63,11}, { 127,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \ + { 351,11}, { 191,10}, { 415,11}, { 223,10}, \ + { 447,12}, { 127,11}, { 255,10}, { 543,11}, \ + { 287,10}, { 607,11}, { 351,12}, { 191,11}, \ + { 479,13}, { 127,12}, { 255,11}, { 575,12}, \ + { 319,11}, { 703,12}, { 383,11}, { 767,12}, \ + { 447,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 96 +#define SQR_FFT_THRESHOLD 5504 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 48 +#define MULLO_MUL_N_THRESHOLD 6633 + +#define DC_DIV_QR_THRESHOLD 43 +#define DC_DIVAPPR_Q_THRESHOLD 170 +#define DC_BDIV_QR_THRESHOLD 43 +#define DC_BDIV_Q_THRESHOLD 110 + +#define INV_MULMOD_BNM1_THRESHOLD 30 +#define INV_NEWTON_THRESHOLD 177 +#define INV_APPR_THRESHOLD 171 + +#define BINV_NEWTON_THRESHOLD 194 +#define REDC_1_TO_REDC_N_THRESHOLD 50 + +#define MU_DIV_QR_THRESHOLD 1142 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 90 +#define MU_BDIV_QR_THRESHOLD 942 +#define MU_BDIV_Q_THRESHOLD 1017 + +#define MATRIX22_STRASSEN_THRESHOLD 13 +#define HGCD_THRESHOLD 92 +#define GCD_DC_THRESHOLD 283 +#define GCDEXT_DC_THRESHOLD 221 +#define JACOBI_BASE_METHOD 2 + +#define GET_STR_DC_THRESHOLD 18 +#define GET_STR_PRECOMPUTE_THRESHOLD 31 +#define SET_STR_DC_THRESHOLD 490 +#define SET_STR_PRECOMPUTE_THRESHOLD 994 diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm new file mode 100644 index 0000000..72e3196 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm @@ -0,0 +1,40 @@ +dnl Intel P55 mpn_hamdist -- mpn hamming distance. + +dnl Copyright 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P55: hamdist 12.0 cycles/limb + +C For reference, this code runs at 11.5 cycles/limb for popcount, which is +C slower than the plain integer mpn/x86/pentium/popcount.asm. + +MULFUNC_PROLOGUE(mpn_hamdist) +include_mpn(`x86/k6/mmx/popham.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm new file mode 100644 index 0000000..04b0ddc --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm @@ -0,0 +1,463 @@ +dnl Intel P5 mpn_lshift -- mpn left shift. + +dnl Copyright 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 1.75 cycles/limb. + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size left by shift many bits and store the result in dst,size. +C Zeros are shifted in at the right. Return the bits shifted out at the +C left. +C +C The comments in mpn_rshift apply here too. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl minimum 5, because the unrolled loop can't handle less +deflit(UNROLL_THRESHOLD, 5) + + TEXT + ALIGN(8) + +PROLOGUE(mpn_lshift) + + pushl %ebx + pushl %edi +deflit(`FRAME',8) + + movl PARAM_SIZE, %eax + movl PARAM_DST, %edx + + movl PARAM_SRC, %ebx + movl PARAM_SHIFT, %ecx + + cmp $UNROLL_THRESHOLD, %eax + jae L(unroll) + + movl -4(%ebx,%eax,4), %edi C src high limb + decl %eax + + jnz L(simple) + + shldl( %cl, %edi, %eax) C eax was decremented to zero + + shll %cl, %edi + + movl %edi, (%edx) C dst low limb + popl %edi C risk of data cache bank clash + + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- +L(simple): + C eax size-1 + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx,%eax,4), %mm5 C src high limb + + movd %ecx, %mm6 C lshift + negl %ecx + + psllq %mm6, %mm5 + addl $32, %ecx + + movd %ecx, %mm7 + psrlq $32, %mm5 C retval + + +L(simple_top): + C eax counter, limbs, negative + C ebx src + C ecx + C edx dst + C esi + C edi + C + C mm0 scratch + C mm5 return value + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + C + + movd %mm0, 4(%edx,%eax,4) + jnz L(simple_top) + + + movd (%ebx), %mm0 + + movd %mm5, %eax + psllq %mm6, %mm0 + + popl %edi + popl %ebx + + movd %mm0, (%edx) + + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(unroll): + C eax size + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd -4(%ebx,%eax,4), %mm5 C src high limb + leal (%ebx,%eax,4), %edi + + movd %ecx, %mm6 C lshift + andl $4, %edi + + psllq %mm6, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process high limb separately (marked xxx) to + C make it so. + C + C source -8(ebx,%eax,4) + C | + C +-------+-------+-------+-- + C | | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + C + C dest + C -4(edx,%eax,4) + C | + C +-------+-------+-- + C | xxx | | + C +-------+-------+-- + + movq -8(%ebx,%eax,4), %mm0 C unaligned load + + psllq %mm6, %mm0 + decl %eax + + psrlq $32, %mm0 + + C + + movd %mm0, (%edx,%eax,4) +L(start_src_aligned): + + movq -8(%ebx,%eax,4), %mm1 C src high qword + leal (%edx,%eax,4), %edi + + andl $4, %edi + psrlq $32, %mm5 C return value + + movq -16(%ebx,%eax,4), %mm3 C src second highest qword + jz L(start_dst_aligned) + + C dst isn't aligned, subtract 4 to make it so, and pretend the shift + C is 32 bits extra. High limb of dst (marked xxx) handled here + C separately. + C + C source -8(ebx,%eax,4) + C | + C +-------+-------+-- + C | mm1 | + C +-------+-------+-- + C 0mod8 4mod8 + C + C dest + C -4(edx,%eax,4) + C | + C +-------+-------+-------+-- + C | xxx | | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + + movq %mm1, %mm0 + addl $32, %ecx C new shift + + psllq %mm6, %mm0 + + movd %ecx, %mm6 + psrlq $32, %mm0 + + C wasted cycle here waiting for %mm0 + + movd %mm0, -4(%edx,%eax,4) + subl $4, %edx +L(start_dst_aligned): + + + psllq %mm6, %mm1 + negl %ecx C -shift + + addl $64, %ecx C 64-shift + movq %mm3, %mm2 + + movd %ecx, %mm7 + subl $8, %eax C size-8 + + psrlq %mm7, %mm3 + + por %mm1, %mm3 C mm3 ready to store + jc L(finish) + + + C The comments in mpn_rshift apply here too. + + ALIGN(8) +L(unroll_loop): + C eax counter, limbs + C ebx src + C ecx + C edx dst + C esi + C edi + C + C mm0 + C mm1 + C mm2 src qword from 16(%ebx,%eax,4) + C mm3 dst qword ready to store to 24(%edx,%eax,4) + C + C mm5 return value + C mm6 lshift + C mm7 rshift + + movq 8(%ebx,%eax,4), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + movq %mm3, 24(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq (%ebx,%eax,4), %mm3 C + psllq %mm6, %mm1 C + + movq %mm0, 16(%edx,%eax,4) + movq %mm3, %mm2 C + + psrlq %mm7, %mm3 C + subl $4, %eax + + por %mm1, %mm3 C + jnc L(unroll_loop) + + + +L(finish): + C eax -4 to -1 representing respectively 0 to 3 limbs remaining + + testb $2, %al + + jz L(finish_no_two) + + movq 8(%ebx,%eax,4), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + movq %mm3, 24(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq %mm1, %mm2 + movq %mm0, %mm3 + + subl $2, %eax +L(finish_no_two): + + + C eax -4 or -3 representing respectively 0 or 1 limbs remaining + C + C mm2 src prev qword, from 16(%ebx,%eax,4) + C mm3 dst qword, for 24(%edx,%eax,4) + + testb $1, %al + movd %mm5, %eax C retval + + popl %edi + jz L(finish_zero) + + + C One extra src limb, destination was aligned. + C + C source ebx + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edx+12 edx+4 edx + C --+---------------+---------------+-------+ + C | mm3 | | | + C --+---------------+---------------+-------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C One extra src limb, destination was unaligned. + C + C source ebx + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edx+12 edx+4 + C --+---------------+---------------+ + C | mm3 | | + C --+---------------+---------------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at 4(%edx), and in the aligned case + C there's an extra limb of dst to be formed from that extra src limb + C left shifted. + + + movd (%ebx), %mm0 + psllq %mm6, %mm2 + + movq %mm3, 12(%edx) + psllq $32, %mm0 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm2, %mm0 + psllq %mm6, %mm1 + + movq %mm0, 4(%edx) + psrlq $32, %mm1 + + andl $32, %ecx + popl %ebx + + jz L(finish_one_unaligned) + + movd %mm1, (%edx) +L(finish_one_unaligned): + + emms + + ret + + +L(finish_zero): + + C No extra src limbs, destination was aligned. + C + C source ebx + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edx+8 edx + C --+---------------+---------------+ + C | mm3 | | + C --+---------------+---------------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C No extra src limbs, destination was unaligned. + C + C source ebx + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edx+8 edx+4 + C --+---------------+-------+ + C | mm3 | | + C --+---------------+-------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C The movd for the unaligned case writes the same data to 4(%edx) + C that the movq does for the aligned case. + + + movq %mm3, 8(%edx) + andl $32, %ecx + + psllq %mm6, %mm2 + jz L(finish_zero_unaligned) + + movq %mm2, (%edx) +L(finish_zero_unaligned): + + psrlq $32, %mm2 + popl %ebx + + movd %mm5, %eax C retval + + movd %mm2, 4(%edx) + + emms + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm new file mode 100644 index 0000000..4ced577 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm @@ -0,0 +1,371 @@ +dnl Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication. + +dnl Copyright 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5: 12.0 for 32-bit multiplier +C 7.0 for 16-bit multiplier + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); +C +C When the multiplier is 16 bits some special case MMX code is used. Small +C multipliers might arise reasonably often from mpz_mul_ui etc. If the size +C is odd there's roughly a 5 cycle penalty, so times for say size==7 and +C size==8 end up being quite close. If src isn't aligned to an 8 byte +C boundary then one limb is processed separately with roughly a 5 cycle +C penalty, so in that case it's say size==8 and size==9 which are close. +C +C Alternatives: +C +C MMX is not believed to be of any use for 32-bit multipliers, since for +C instance the current method would just have to be more or less duplicated +C for the high and low halves of the multiplier, and would probably +C therefore run at about 14 cycles, which is slower than the plain integer +C at 12. +C +C Adding the high and low MMX products using integer code seems best. An +C attempt at using paddd and carry bit propagation with pcmpgtd didn't give +C any joy. Perhaps something could be done keeping the values signed and +C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or +C perhaps not. +C +C Future: +C +C An mpn_mul_1c entrypoint would need a double carry out of the low result +C limb in the 16-bit code, unless it could be assumed the carry fits in 16 +C bits, possibly as carry