diff options
author | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:36:36 +0200 |
---|---|---|
committer | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:42:26 +0200 |
commit | a89a14ef5da44684a16b204e7a70460cc8c4922a (patch) | |
tree | b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/x86/pentium/mmx | |
parent | 1db63fcedab0b288820d66e100b1877b1a5a8851 (diff) |
Basic constant folding implementation
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/x86/pentium/mmx')
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h | 163 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm | 40 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm | 463 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm | 371 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm | 468 |
5 files changed, 1505 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h new file mode 100644 index 0000000..02a0def --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h @@ -0,0 +1,163 @@ +/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004, 2009, 2010 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* For mpn/x86/pentium/mod_1.asm */ +#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB + + +/* 233MHz P55 */ + +#define MOD_1_NORM_THRESHOLD 5 +#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1U_TO_MOD_1_1_THRESHOLD 12 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 11 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 63 +#define USE_PREINV_DIVREM_1 0 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 51 + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 53 +#define MUL_TOOM44_THRESHOLD 128 +#define MUL_TOOM6H_THRESHOLD 189 +#define MUL_TOOM8H_THRESHOLD 260 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 90 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 20 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 178 +#define SQR_TOOM6_THRESHOLD 210 +#define SQR_TOOM8_THRESHOLD 375 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 12 + +#define MUL_FFT_MODF_THRESHOLD 364 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 364, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \ + { 47,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159, 8}, { 319, 9}, { 167,10}, \ + { 95, 9}, { 191, 8}, { 383,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 
383,12}, { 63,11}, { 127,10}, { 271, 9}, \ + { 543,10}, { 287,11}, { 159,10}, { 351,11}, \ + { 191,10}, { 415,11}, { 223,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 287,10}, { 575,11}, \ + { 351,12}, { 191,11}, { 415,13}, { 127,12}, \ + { 255,11}, { 575,12}, { 319,11}, { 703,12}, \ + { 383,11}, { 831,12}, { 447,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 90 +#define MUL_FFT_THRESHOLD 3520 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 17, 7}, { 9, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 29, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 27, 7}, { 55, 9}, \ + { 15, 8}, { 31, 7}, { 65, 8}, { 43, 9}, \ + { 23, 8}, { 47,10}, { 15, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \ + { 95,10}, { 31, 9}, { 63, 8}, { 127, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255, 9}, { 135,10}, \ + { 79, 9}, { 159, 8}, { 319,10}, { 95, 9}, \ + { 191,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575, 9}, { 303,10}, { 159, 9}, { 319,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207,12}, \ + { 63,11}, { 127,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \ + { 351,11}, { 191,10}, { 415,11}, { 223,10}, \ + { 447,12}, { 127,11}, { 255,10}, { 543,11}, \ + { 287,10}, { 607,11}, { 351,12}, { 191,11}, \ + { 479,13}, { 127,12}, { 255,11}, { 575,12}, \ + { 319,11}, { 703,12}, { 383,11}, { 767,12}, \ + { 447,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 96 +#define SQR_FFT_THRESHOLD 5504 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 48 +#define MULLO_MUL_N_THRESHOLD 6633 + +#define DC_DIV_QR_THRESHOLD 43 +#define DC_DIVAPPR_Q_THRESHOLD 170 +#define DC_BDIV_QR_THRESHOLD 43 +#define DC_BDIV_Q_THRESHOLD 110 + +#define INV_MULMOD_BNM1_THRESHOLD 30 +#define 
INV_NEWTON_THRESHOLD 177 +#define INV_APPR_THRESHOLD 171 + +#define BINV_NEWTON_THRESHOLD 194 +#define REDC_1_TO_REDC_N_THRESHOLD 50 + +#define MU_DIV_QR_THRESHOLD 1142 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 90 +#define MU_BDIV_QR_THRESHOLD 942 +#define MU_BDIV_Q_THRESHOLD 1017 + +#define MATRIX22_STRASSEN_THRESHOLD 13 +#define HGCD_THRESHOLD 92 +#define GCD_DC_THRESHOLD 283 +#define GCDEXT_DC_THRESHOLD 221 +#define JACOBI_BASE_METHOD 2 + +#define GET_STR_DC_THRESHOLD 18 +#define GET_STR_PRECOMPUTE_THRESHOLD 31 +#define SET_STR_DC_THRESHOLD 490 +#define SET_STR_PRECOMPUTE_THRESHOLD 994 diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm new file mode 100644 index 0000000..72e3196 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm @@ -0,0 +1,40 @@ +dnl Intel P55 mpn_hamdist -- mpn hamming distance. + +dnl Copyright 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P55: hamdist 12.0 cycles/limb + +C For reference, this code runs at 11.5 cycles/limb for popcount, which is +C slower than the plain integer mpn/x86/pentium/popcount.asm. + +MULFUNC_PROLOGUE(mpn_hamdist) +include_mpn(`x86/k6/mmx/popham.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm new file mode 100644 index 0000000..04b0ddc --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm @@ -0,0 +1,463 @@ +dnl Intel P5 mpn_lshift -- mpn left shift. + +dnl Copyright 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 1.75 cycles/limb. + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size left by shift many bits and store the result in dst,size. +C Zeros are shifted in at the right. 
Return the bits shifted out at the +C left. +C +C The comments in mpn_rshift apply here too. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl minimum 5, because the unrolled loop can't handle less +deflit(UNROLL_THRESHOLD, 5) + + TEXT + ALIGN(8) + +PROLOGUE(mpn_lshift) + + pushl %ebx + pushl %edi +deflit(`FRAME',8) + + movl PARAM_SIZE, %eax + movl PARAM_DST, %edx + + movl PARAM_SRC, %ebx + movl PARAM_SHIFT, %ecx + + cmp $UNROLL_THRESHOLD, %eax + jae L(unroll) + + movl -4(%ebx,%eax,4), %edi C src high limb + decl %eax + + jnz L(simple) + + shldl( %cl, %edi, %eax) C eax was decremented to zero + + shll %cl, %edi + + movl %edi, (%edx) C dst low limb + popl %edi C risk of data cache bank clash + + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- +L(simple): + C eax size-1 + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx,%eax,4), %mm5 C src high limb + + movd %ecx, %mm6 C lshift + negl %ecx + + psllq %mm6, %mm5 + addl $32, %ecx + + movd %ecx, %mm7 + psrlq $32, %mm5 C retval + + +L(simple_top): + C eax counter, limbs, counts down to zero + C ebx src + C ecx + C edx dst + C esi + C edi + C + C mm0 scratch + C mm5 return value + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + C + + movd %mm0, 4(%edx,%eax,4) + jnz L(simple_top) + + + movd (%ebx), %mm0 + + movd %mm5, %eax + psllq %mm6, %mm0 + + popl %edi + popl %ebx + + movd %mm0, (%edx) + + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(unroll): + C eax size + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd -4(%ebx,%eax,4), %mm5 C src high limb + leal (%ebx,%eax,4), %edi + + movd %ecx, %mm6 C lshift + andl $4, %edi + + psllq %mm6, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process high limb 
separately (marked xxx) to + C make it so. + C + C source -8(ebx,%eax,4) + C | + C +-------+-------+-------+-- + C | | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + C + C dest + C -4(edx,%eax,4) + C | + C +-------+-------+-- + C | xxx | | + C +-------+-------+-- + + movq -8(%ebx,%eax,4), %mm0 C unaligned load + + psllq %mm6, %mm0 + decl %eax + + psrlq $32, %mm0 + + C + + movd %mm0, (%edx,%eax,4) +L(start_src_aligned): + + movq -8(%ebx,%eax,4), %mm1 C src high qword + leal (%edx,%eax,4), %edi + + andl $4, %edi + psrlq $32, %mm5 C return value + + movq -16(%ebx,%eax,4), %mm3 C src second highest qword + jz L(start_dst_aligned) + + C dst isn't aligned, subtract 4 to make it so, and pretend the shift + C is 32 bits extra. High limb of dst (marked xxx) handled here + C separately. + C + C source -8(ebx,%eax,4) + C | + C +-------+-------+-- + C | mm1 | + C +-------+-------+-- + C 0mod8 4mod8 + C + C dest + C -4(edx,%eax,4) + C | + C +-------+-------+-------+-- + C | xxx | | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + + movq %mm1, %mm0 + addl $32, %ecx C new shift + + psllq %mm6, %mm0 + + movd %ecx, %mm6 + psrlq $32, %mm0 + + C wasted cycle here waiting for %mm0 + + movd %mm0, -4(%edx,%eax,4) + subl $4, %edx +L(start_dst_aligned): + + + psllq %mm6, %mm1 + negl %ecx C -shift + + addl $64, %ecx C 64-shift + movq %mm3, %mm2 + + movd %ecx, %mm7 + subl $8, %eax C size-8 + + psrlq %mm7, %mm3 + + por %mm1, %mm3 C mm3 ready to store + jc L(finish) + + + C The comments in mpn_rshift apply here too. 
+ + ALIGN(8) +L(unroll_loop): + C eax counter, limbs + C ebx src + C ecx + C edx dst + C esi + C edi + C + C mm0 + C mm1 + C mm2 src qword from 16(%ebx,%eax,4) + C mm3 dst qword ready to store to 24(%edx,%eax,4) + C + C mm5 return value + C mm6 lshift + C mm7 rshift + + movq 8(%ebx,%eax,4), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + movq %mm3, 24(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq (%ebx,%eax,4), %mm3 C + psllq %mm6, %mm1 C + + movq %mm0, 16(%edx,%eax,4) + movq %mm3, %mm2 C + + psrlq %mm7, %mm3 C + subl $4, %eax + + por %mm1, %mm3 C + jnc L(unroll_loop) + + + +L(finish): + C eax -4 to -1 representing respectively 0 to 3 limbs remaining + + testb $2, %al + + jz L(finish_no_two) + + movq 8(%ebx,%eax,4), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + movq %mm3, 24(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq %mm1, %mm2 + movq %mm0, %mm3 + + subl $2, %eax +L(finish_no_two): + + + C eax -4 or -3 representing respectively 0 or 1 limbs remaining + C + C mm2 src prev qword, from 16(%ebx,%eax,4) + C mm3 dst qword, for 24(%edx,%eax,4) + + testb $1, %al + movd %mm5, %eax C retval + + popl %edi + jz L(finish_zero) + + + C One extra src limb, destination was aligned. + C + C source ebx + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edx+12 edx+4 edx + C --+---------------+---------------+-------+ + C | mm3 | | | + C --+---------------+---------------+-------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C One extra src limb, destination was unaligned. 
+ C + C source ebx + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edx+12 edx+4 + C --+---------------+---------------+ + C | mm3 | | + C --+---------------+---------------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at 4(%edx), and in the aligned case + C there's an extra limb of dst to be formed from that extra src limb + C left shifted. + + + movd (%ebx), %mm0 + psllq %mm6, %mm2 + + movq %mm3, 12(%edx) + psllq $32, %mm0 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm2, %mm0 + psllq %mm6, %mm1 + + movq %mm0, 4(%edx) + psrlq $32, %mm1 + + andl $32, %ecx + popl %ebx + + jz L(finish_one_unaligned) + + movd %mm1, (%edx) +L(finish_one_unaligned): + + emms + + ret + + +L(finish_zero): + + C No extra src limbs, destination was aligned. + C + C source ebx + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edx+8 edx + C --+---------------+---------------+ + C | mm3 | | + C --+---------------+---------------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C No extra src limbs, destination was unaligned. + C + C source ebx + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edx+8 edx+4 + C --+---------------+-------+ + C | mm3 | | + C --+---------------+-------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C The movd for the unaligned case writes the same data to 4(%edx) + C that the movq does for the aligned case. 
+ + + movq %mm3, 8(%edx) + andl $32, %ecx + + psllq %mm6, %mm2 + jz L(finish_zero_unaligned) + + movq %mm2, (%edx) +L(finish_zero_unaligned): + + psrlq $32, %mm2 + popl %ebx + + movd %mm5, %eax C retval + + movd %mm2, 4(%edx) + + emms + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm new file mode 100644 index 0000000..4ced577 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm @@ -0,0 +1,371 @@ +dnl Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication. + +dnl Copyright 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5: 12.0 for 32-bit multiplier +C 7.0 for 16-bit multiplier + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); +C +C When the multiplier is 16 bits some special case MMX code is used. 
Small +C multipliers might arise reasonably often from mpz_mul_ui etc. If the size +C is odd there's roughly a 5 cycle penalty, so times for say size==7 and +C size==8 end up being quite close. If src isn't aligned to an 8 byte +C boundary then one limb is processed separately with roughly a 5 cycle +C penalty, so in that case it's say size==8 and size==9 which are close. +C +C Alternatives: +C +C MMX is not believed to be of any use for 32-bit multipliers, since for +C instance the current method would just have to be more or less duplicated +C for the high and low halves of the multiplier, and would probably +C therefore run at about 14 cycles, which is slower than the plain integer +C at 12. +C +C Adding the high and low MMX products using integer code seems best. An +C attempt at using paddd and carry bit propagation with pcmpgtd didn't give +C any joy. Perhaps something could be done keeping the values signed and +C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or +C perhaps not. +C +C Future: +C +C An mpn_mul_1c entrypoint would need a double carry out of the low result +C limb in the 16-bit code, unless it could be assumed the carry fits in 16 +C bits, possibly as carry<multiplier, this being true of a big calculation +C done piece by piece. But let's worry about that if/when mul_1c is +C actually used. 
+ +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + + ALIGN(8) +PROLOGUE(mpn_mul_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %edx + + cmpl $1, %ecx + jne L(two_or_more) + + C one limb only + + movl PARAM_MULTIPLIER, %eax + movl PARAM_DST, %ecx + + mull (%edx) + + movl %eax, (%ecx) + movl %edx, %eax + + ret + + +L(two_or_more): + C eax + C ebx + C ecx size + C edx src + C esi + C edi + C ebp + + pushl %esi FRAME_pushl() + pushl %edi FRAME_pushl() + + movl %edx, %esi C src + movl PARAM_DST, %edi + + movl PARAM_MULTIPLIER, %eax + pushl %ebx FRAME_pushl() + + leal (%esi,%ecx,4), %esi C src end + leal (%edi,%ecx,4), %edi C dst end + + negl %ecx C -size + + pushl %ebp FRAME_pushl() + cmpl $65536, %eax + + jb L(small) + + +L(big): + xorl %ebx, %ebx C carry limb + sarl %ecx C -size/2 + + jnc L(top) C with carry flag clear + + + C size was odd, process one limb separately + + mull 4(%esi,%ecx,8) C m * src[0] + + movl %eax, 4(%edi,%ecx,8) + incl %ecx + + orl %edx, %ebx C carry limb, and clear carry flag + + +L(top): + C eax + C ebx carry + C ecx counter, negative + C edx + C esi src end + C edi dst end + C ebp (scratch carry) + + adcl $0, %ebx + movl (%esi,%ecx,8), %eax + + mull PARAM_MULTIPLIER + + movl %edx, %ebp + addl %eax, %ebx + + adcl $0, %ebp + movl 4(%esi,%ecx,8), %eax + + mull PARAM_MULTIPLIER + + movl %ebx, (%edi,%ecx,8) + addl %ebp, %eax + + movl %eax, 4(%edi,%ecx,8) + incl %ecx + + movl %edx, %ebx + jnz L(top) + + + adcl $0, %ebx + popl %ebp + + movl %ebx, %eax + popl %ebx + + popl %edi + popl %esi + + ret + + +L(small): + C Special case for 16-bit multiplier. + C + C eax multiplier + C ebx + C ecx -size + C edx src + C esi src end + C edi dst end + C ebp + + C size<3 not supported here. At size==3 we're already a couple of + C cycles faster, so there's no threshold as such, just use the MMX + C as soon as possible. 
+ + cmpl $-3, %ecx + ja L(big) + + movd %eax, %mm7 C m + pxor %mm6, %mm6 C initial carry word + + punpcklwd %mm7, %mm7 C m replicated 2 times + addl $2, %ecx C -size+2 + + punpckldq %mm7, %mm7 C m replicated 4 times + andl $4, %edx C test alignment, clear carry flag + + movq %mm7, %mm0 C m + jz L(small_entry) + + + C Source is unaligned, process one limb separately. + C + C Plain integer code is used here, since it's smaller and is about + C the same 13 cycles as an mmx block would be. + C + C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence + C the use of separate incl and orl. + + mull -8(%esi,%ecx,4) C m * src[0] + + movl %eax, -8(%edi,%ecx,4) C dst[0] + incl %ecx C one limb processed + + movd %edx, %mm6 C initial carry + + orl %eax, %eax C clear carry flag + jmp L(small_entry) + + +C The scheduling here is quite tricky, since so many instructions have +C pairing restrictions. In particular the js won't pair with a movd, and +C can't be paired with an adc since it wants flags from the inc, so +C instructions are rotated to the top of the loop to find somewhere useful +C for it. +C +C Trouble has been taken to avoid overlapping successive loop iterations, +C since that would greatly increase the size of the startup and finishup +C code. Actually there's probably not much advantage to be had from +C overlapping anyway, since the difficulties are mostly with pairing, not +C with latencies as such. +C +C In the comments x represents the src data and m the multiplier (16 +C bits, but replicated 4 times). +C +C The m signs calculated in %mm3 are a loop invariant and could be held in +C say %mm5, but that would save only one instruction and hence be no faster. 
+ +L(small_top): + C eax l.low, then l.high + C ebx (h.low) + C ecx counter, -size+2 to 0 or 1 + C edx (h.high) + C esi &src[size] + C edi &dst[size] + C ebp + C + C %mm0 (high products) + C %mm1 (low products) + C %mm2 (adjust for m using x signs) + C %mm3 (adjust for x using m signs) + C %mm4 + C %mm5 + C %mm6 h.low, then carry + C %mm7 m replicated 4 times + + movd %mm6, %ebx C h.low + psrlq $32, %mm1 C l.high + + movd %mm0, %edx C h.high + movq %mm0, %mm6 C new c + + adcl %eax, %ebx + incl %ecx + + movd %mm1, %eax C l.high + movq %mm7, %mm0 + + adcl %eax, %edx + movl %ebx, -16(%edi,%ecx,4) + + movl %edx, -12(%edi,%ecx,4) + psrlq $32, %mm6 C c + +L(small_entry): + pmulhw -8(%esi,%ecx,4), %mm0 C h = (x*m).high + movq %mm7, %mm1 + + pmullw -8(%esi,%ecx,4), %mm1 C l = (x*m).low + movq %mm7, %mm3 + + movq -8(%esi,%ecx,4), %mm2 C x + psraw $15, %mm3 C m signs + + pand -8(%esi,%ecx,4), %mm3 C x selected by m signs + psraw $15, %mm2 C x signs + + paddw %mm3, %mm0 C add x to h if m neg + pand %mm7, %mm2 C m selected by x signs + + paddw %mm2, %mm0 C add m to h if x neg + incl %ecx + + movd %mm1, %eax C l.low + punpcklwd %mm0, %mm6 C c + h.low << 16 + + psrlq $16, %mm0 C h.high + js L(small_top) + + + + + movd %mm6, %ebx C h.low + psrlq $32, %mm1 C l.high + + adcl %eax, %ebx + popl %ebp FRAME_popl() + + movd %mm0, %edx C h.high + psrlq $32, %mm0 C l.high + + movd %mm1, %eax C l.high + + adcl %eax, %edx + movl %ebx, -12(%edi,%ecx,4) + + movd %mm0, %eax C c + + adcl $0, %eax + movl %edx, -8(%edi,%ecx,4) + + orl %ecx, %ecx + jnz L(small_done) C final %ecx==1 means even, ==0 odd + + + C Size odd, one extra limb to process. + C Plain integer code is used here, since it's smaller and is about + C the same speed as another mmx block would be. 
+ + movl %eax, %ecx + movl PARAM_MULTIPLIER, %eax + + mull -4(%esi) + + addl %ecx, %eax + + adcl $0, %edx + movl %eax, -4(%edi) + + movl %edx, %eax +L(small_done): + popl %ebx + + popl %edi + popl %esi + + emms + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm new file mode 100644 index 0000000..e3b274b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm @@ -0,0 +1,468 @@ +dnl Intel P5 mpn_rshift -- mpn right shift. + +dnl Copyright 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 1.75 cycles/limb. + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size right by shift many bits and store the result in dst,size. +C Zeros are shifted in at the left. Return the bits shifted out at the +C right. 
+C +C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb, +C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l. +C +C Full speed depends on source and destination being aligned. Unaligned mmx +C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy +C setups and finish-ups are done to ensure alignment for the loop. +C +C MMX shifts work out a bit faster even for the simple loop. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl Minimum 5, because the unrolled loop can't handle less. +deflit(UNROLL_THRESHOLD, 5) + + TEXT + ALIGN(8) + +PROLOGUE(mpn_rshift) + + pushl %ebx + pushl %edi +deflit(`FRAME',8) + + movl PARAM_SIZE, %eax + movl PARAM_DST, %edx + + movl PARAM_SRC, %ebx + movl PARAM_SHIFT, %ecx + + cmp $UNROLL_THRESHOLD, %eax + jae L(unroll) + + decl %eax + movl (%ebx), %edi C src low limb + + jnz L(simple) + + shrdl( %cl, %edi, %eax) C eax was decremented to zero + + shrl %cl, %edi + + movl %edi, (%edx) C dst low limb + popl %edi C risk of data cache bank clash + + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(simple): + C eax size-1 + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx), %mm5 C src[0] + leal (%ebx,%eax,4), %ebx C &src[size-1] + + movd %ecx, %mm6 C rshift + leal -4(%edx,%eax,4), %edx C &dst[size-2] + + psllq $32, %mm5 + negl %eax + + +C This loop is 5 or 8 cycles, with every second load unaligned and a wasted +C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4 +C cycles and would be 8 in a simple loop. Using mmx helps the return value +C and last limb calculations too. 
+ +L(simple_top): + C eax counter, limbs, negative + C ebx &src[size-1] + C ecx shift + C edx &dst[size-2] + C + C mm0 scratch + C mm5 return value + C mm6 shift + + movq (%ebx,%eax,4), %mm0 + incl %eax + + psrlq %mm6, %mm0 + + movd %mm0, (%edx,%eax,4) + jnz L(simple_top) + + + movd (%ebx), %mm0 + psrlq %mm6, %mm5 C return value + + psrlq %mm6, %mm0 + popl %edi + + movd %mm5, %eax + popl %ebx + + movd %mm0, 4(%edx) + + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(unroll): + C eax size + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx), %mm5 C src[0] + movl $4, %edi + + movd %ecx, %mm6 C rshift + testl %edi, %ebx + + psllq $32, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process low limb separately (marked xxx) and + C step src and dst by one limb, making src aligned. + C + C source ebx + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + C + C dest edx + C --+-------+-------+ + C | | xxx | + C --+-------+-------+ + + movq (%ebx), %mm0 C unaligned load + + psrlq %mm6, %mm0 + addl $4, %ebx + + decl %eax + + movd %mm0, (%edx) + addl $4, %edx +L(start_src_aligned): + + + movq (%ebx), %mm1 + testl %edi, %edx + + psrlq %mm6, %mm5 C retval + jz L(start_dst_aligned) + + C dst isn't aligned, add 4 to make it so, and pretend the shift is + C 32 bits extra. Low limb of dst (marked xxx) handled here + C separately. 
+ C + C source ebx + C --+-------+-------+ + C | mm1 | + C --+-------+-------+ + C 4mod8 0mod8 + C + C dest edx + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + + movq %mm1, %mm0 + addl $32, %ecx C new shift + + psrlq %mm6, %mm0 + + movd %ecx, %mm6 + + movd %mm0, (%edx) + addl $4, %edx +L(start_dst_aligned): + + + movq 8(%ebx), %mm3 + negl %ecx + + movq %mm3, %mm2 C mm2 src qword + addl $64, %ecx + + movd %ecx, %mm7 + psrlq %mm6, %mm1 + + leal -12(%ebx,%eax,4), %ebx + leal -20(%edx,%eax,4), %edx + + psllq %mm7, %mm3 + subl $7, %eax C size-7 + + por %mm1, %mm3 C mm3 ready to store + negl %eax C -(size-7) + + jns L(finish) + + + C This loop is the important bit, the rest is just support. Careful + C instruction scheduling achieves the claimed 1.75 c/l. The + C relevant parts of the pairing rules are: + C + C - mmx loads and stores execute only in the U pipe + C - only one mmx shift in a pair + C - wait one cycle before storing an mmx register result + C - the usual address generation interlock + C + C Two qword calculations are slightly interleaved. The instructions + C marked "C" belong to the second qword, and the "C prev" one is for + C the second qword from the previous iteration. 
+ + ALIGN(8) +L(unroll_loop): + C eax counter, limbs, negative + C ebx &src[size-12] + C ecx + C edx &dst[size-12] + C esi + C edi + C + C mm0 + C mm1 + C mm2 src qword from -8(%ebx,%eax,4) + C mm3 dst qword ready to store to -8(%edx,%eax,4) + C + C mm5 return value + C mm6 rshift + C mm7 lshift + + movq (%ebx,%eax,4), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, -8(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq 8(%ebx,%eax,4), %mm3 C + psrlq %mm6, %mm1 C + + movq %mm0, (%edx,%eax,4) + movq %mm3, %mm2 C + + psllq %mm7, %mm3 C + addl $4, %eax + + por %mm1, %mm3 C + js L(unroll_loop) + + +L(finish): + C eax 0 to 3 representing respectively 3 to 0 limbs remaining + + testb $2, %al + + jnz L(finish_no_two) + + movq (%ebx,%eax,4), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, -8(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq %mm1, %mm2 + movq %mm0, %mm3 + + addl $2, %eax +L(finish_no_two): + + + C eax 2 or 3 representing respectively 1 or 0 limbs remaining + C + C mm2 src prev qword, from -8(%ebx,%eax,4) + C mm3 dst qword, for -8(%edx,%eax,4) + + testb $1, %al + popl %edi + + movd %mm5, %eax C retval + jnz L(finish_zero) + + + C One extra limb, destination was aligned. + C + C source ebx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edx + C +-------+---------------+---------------+-- + C | | | mm3 | + C +-------+---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C One extra limb, destination was unaligned. 
+ C + C source ebx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edx + C +---------------+---------------+-- + C | | mm3 | + C +---------------+---------------+-- + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at 8(%edx), and in the aligned case + C there's a further extra limb of dst to be formed. + + + movd 8(%ebx), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, (%edx) + por %mm2, %mm0 + + psrlq %mm6, %mm1 + andl $32, %ecx + + popl %ebx + jz L(finish_one_unaligned) + + C dst was aligned, must store one extra limb + movd %mm1, 16(%edx) +L(finish_one_unaligned): + + movq %mm0, 8(%edx) + + emms + + ret + + +L(finish_zero): + + C No extra limbs, destination was aligned. + C + C source ebx + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edx+4 + C +---------------+---------------+-- + C | | mm3 | + C +---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C No extra limbs, destination was unaligned. + C + C source ebx + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edx+4 + C +-------+---------------+-- + C | | mm3 | + C +-------+---------------+-- + C + C mm6 = shift+32 + C mm7 = 64-(shift+32) + + + C The movd for the unaligned case is clearly the same data as the + C movq for the aligned case, it's just a choice between whether one + C or two limbs should be written. + + + movq %mm3, 4(%edx) + psrlq %mm6, %mm2 + + movd %mm2, 12(%edx) + andl $32, %ecx + + popl %ebx + jz L(finish_zero_unaligned) + + movq %mm2, 12(%edx) +L(finish_zero_unaligned): + + emms + + ret + +EPILOGUE() |