diff options
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/x86/pentium')
25 files changed, 5560 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/README b/vendor/gmp-6.3.0/mpn/x86/pentium/README new file mode 100644 index 0000000..305936b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/README @@ -0,0 +1,181 @@ +Copyright 1996, 1999-2001, 2003 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + + INTEL PENTIUM P5 MPN SUBROUTINES + + +This directory contains mpn functions optimized for Intel Pentium (P5,P54) +processors. The mmx subdirectory has additional code for Pentium with MMX +(P55). 
+ + +STATUS + + cycles/limb + + mpn_add_n/sub_n 2.375 + + mpn_mul_1 12.0 + mpn_add/submul_1 14.0 + + mpn_mul_basecase 14.2 cycles/crossproduct (approx) + + mpn_sqr_basecase 8 cycles/crossproduct (approx) + or 15.5 cycles/triangleproduct (approx) + + mpn_l/rshift 5.375 normal (6.0 on P54) + 1.875 special shift by 1 bit + + mpn_divrem_1 44.0 + mpn_mod_1 28.0 + mpn_divexact_by3 15.0 + + mpn_copyi/copyd 1.0 + +Pentium MMX gets the following improvements + + mpn_l/rshift 1.75 + + mpn_mul_1 12.0 normal, 7.0 for 16-bit multiplier + + +mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb. Due to loop +overhead and other delays (cache refill?), they run at or near 2.5 +cycles/limb. + +mpn_mul_1, mpn_addmul_1, mpn_submul_1 all run 1 cycle faster than they +should. Intel documentation says a mul instruction is 10 cycles, but it +measures 9 and the routines using it run as 9. + + + +P55 MMX AND X87 + +The cost of switching between MMX and x87 floating point on P55 is about 100 +cycles (fld1/por/emms for instance). In order to avoid that the two aren't +mixed and currently that means using MMX and not x87. + +MMX offers a big speedup for lshift and rshift, and a nice speedup for +16-bit multipliers in mpn_mul_1. If fast code using x87 is found then +perhaps the preference for MMX will be reversed. + + + + +P54 SHLDL + +mpn_lshift and mpn_rshift run at about 6 cycles/limb on P5 and P54, but the +documentation indicates that they should take only 43/8 = 5.375 cycles/limb, +or 5 cycles/limb asymptotically. The P55 runs them at the expected speed. + +It seems that on P54 a shldl or shrdl allows pairing in one following cycle, +but not two. 
For example, back to back repetitions of the following + + shldl( %cl, %eax, %ebx) + xorl %edx, %edx + xorl %esi, %esi + +run at 5 cycles, as expected, but repetitions of the following run at 7 +cycles, whereas 6 would be expected (and is achieved on P55), + + shldl( %cl, %eax, %ebx) + xorl %edx, %edx + xorl %esi, %esi + xorl %edi, %edi + xorl %ebp, %ebp + +Three xorls run at 7 cycles too, so it doesn't seem to be just that pairing +inhibited is only in the second following cycle (or something like that). + +Avoiding this problem would bring P54 shifts down from 6.0 c/l to 5.5 with a +pattern of shift, 2 loads, shift, 2 stores, shift, etc. A start has been +made on something like that, but it's not yet complete. + + + + +OTHER NOTES + +Prefetching Destinations + + Pentium doesn't allocate cache lines on writes, unlike most other modern + processors. Since the functions in the mpn class do array writes, we + have to handle allocating the destination cache lines by reading a word + from it in the loops, to achieve the best performance. + +Prefetching Sources + + Prefetching of sources is pointless since there's no out-of-order loads. + Any load instruction blocks until the line is brought to L1, so it may + as well be the load that wants the data which blocks. + +Data Cache Bank Clashes + + Pairing of memory operations requires that the two issued operations + refer to different cache banks (ie. different addresses modulo 32 + bytes). The simplest way to ensure this is to read/write two words from + the same object. If we make operations on different objects, they might + or might not be to the same cache bank. + +PIC %eip Fetching + + A simple call $+5 and popl can be used to get %eip, there's no need to + balance calls and returns since P5 doesn't have any return stack branch + prediction. + +Float Multiplies + + fmul is pairable and can be issued every 2 cycles (with a 4 cycle + latency for data ready to use). 
This is a lot better than integer mull + or imull at 9 cycles non-pairing. Unfortunately the advantage is + quickly eaten away by needing to throw data through memory back to the + integer registers to adjust for fild and fist being signed, and to do + things like propagating carry bits. + + + + + +REFERENCES + +"Intel Architecture Optimization Manual", 1997, order number 242816. This +is mostly about P5, the parts about P6 aren't relevant. Available on-line: + + http://download.intel.com/design/PentiumII/manuals/242816.htm + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/aors_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/aors_n.asm new file mode 100644 index 0000000..01ebfb9 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/aors_n.asm @@ -0,0 +1,203 @@ +dnl Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction. + +dnl Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 2.375 cycles/limb + + +ifdef(`OPERATION_add_n',` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + +',`ifdef(`OPERATION_sub_n',` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(M4_function_nc) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%ebp + movl PARAM_SIZE,%ecx + + movl (%ebp),%ebx + + decl %ecx + movl %ecx,%edx + shrl $3,%ecx + andl $7,%edx + testl %ecx,%ecx C zero carry flag + jz L(endgo) + + pushl %edx +FRAME_pushl() + movl PARAM_CARRY,%eax + shrl %eax C shift bit 0 into carry + jmp L(oop) + +L(endgo): +deflit(`FRAME',16) + movl PARAM_CARRY,%eax + shrl %eax C shift bit 0 into carry + jmp L(end) + +EPILOGUE() + + + ALIGN(8) +PROLOGUE(M4_function_n) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%ebp + movl PARAM_SIZE,%ecx + + movl (%ebp),%ebx + + decl %ecx + movl %ecx,%edx + shrl $3,%ecx + andl $7,%edx + testl %ecx,%ecx C zero carry flag + jz L(end) + pushl %edx +FRAME_pushl() + + ALIGN(8) +L(oop): movl 28(%edi),%eax C fetch destination cache line + leal 32(%edi),%edi + +L(1): movl (%esi),%eax + movl 4(%esi),%edx + M4_inst %ebx,%eax + movl 4(%ebp),%ebx + M4_inst %ebx,%edx + movl 8(%ebp),%ebx + movl %eax,-32(%edi) + 
movl %edx,-28(%edi) + +L(2): movl 8(%esi),%eax + movl 12(%esi),%edx + M4_inst %ebx,%eax + movl 12(%ebp),%ebx + M4_inst %ebx,%edx + movl 16(%ebp),%ebx + movl %eax,-24(%edi) + movl %edx,-20(%edi) + +L(3): movl 16(%esi),%eax + movl 20(%esi),%edx + M4_inst %ebx,%eax + movl 20(%ebp),%ebx + M4_inst %ebx,%edx + movl 24(%ebp),%ebx + movl %eax,-16(%edi) + movl %edx,-12(%edi) + +L(4): movl 24(%esi),%eax + movl 28(%esi),%edx + M4_inst %ebx,%eax + movl 28(%ebp),%ebx + M4_inst %ebx,%edx + movl 32(%ebp),%ebx + movl %eax,-8(%edi) + movl %edx,-4(%edi) + + leal 32(%esi),%esi + leal 32(%ebp),%ebp + decl %ecx + jnz L(oop) + + popl %edx +FRAME_popl() +L(end): + decl %edx C test %edx w/o clobbering carry + js L(end2) + incl %edx +L(oop2): + leal 4(%edi),%edi + movl (%esi),%eax + M4_inst %ebx,%eax + movl 4(%ebp),%ebx + movl %eax,-4(%edi) + leal 4(%esi),%esi + leal 4(%ebp),%ebp + decl %edx + jnz L(oop2) +L(end2): + movl (%esi),%eax + M4_inst %ebx,%eax + movl %eax,(%edi) + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm new file mode 100644 index 0000000..d83cc45 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm @@ -0,0 +1,144 @@ +dnl Intel Pentium mpn_addmul_1 -- mpn by limb multiplication. + +dnl Copyright 1992, 1994, 1996, 1999, 2000, 2002 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 14.0 cycles/limb + + +ifdef(`OPERATION_addmul_1', ` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + define(M4_function_1c, mpn_addmul_1c) + +',`ifdef(`OPERATION_submul_1', ` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + define(M4_function_1c, mpn_submul_1c) + +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + + +C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C +C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + + ALIGN(8) +PROLOGUE(M4_function_1c) +deflit(`FRAME',0) + + movl PARAM_CARRY, %ecx + pushl %esi FRAME_pushl() + + jmp L(start_1c) + +EPILOGUE() + + + ALIGN(8) +PROLOGUE(M4_function_1) +deflit(`FRAME',0) + + xorl %ecx, %ecx + pushl %esi FRAME_pushl() + +L(start_1c): + movl PARAM_SRC, %esi + movl PARAM_SIZE, %eax + + pushl %edi FRAME_pushl() + pushl %ebx FRAME_pushl() + + movl PARAM_DST, %edi + leal -1(%eax), %ebx C size-1 + + leal (%esi,%eax,4), %esi + xorl $-1, 
%ebx C -size, and clear carry + + leal (%edi,%eax,4), %edi + +L(top): + C eax + C ebx counter, negative + C ecx carry + C edx + C esi src end + C edi dst end + C ebp + + adcl $0, %ecx + movl (%esi,%ebx,4), %eax + + mull PARAM_MULTIPLIER + + addl %ecx, %eax + movl (%edi,%ebx,4), %ecx + + adcl $0, %edx + M4_inst %eax, %ecx + + movl %ecx, (%edi,%ebx,4) + incl %ebx + + movl %edx, %ecx + jnz L(top) + + + adcl $0, %ecx + popl %ebx + + movl %ecx, %eax + popl %edi + + popl %esi + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm new file mode 100644 index 0000000..c2c4f58 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm @@ -0,0 +1,266 @@ +dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division. + +dnl Rearranged from mpn/x86/pentium/dive_1.asm by Marco Bodrato. + +dnl Copyright 2001, 2002, 2011, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C divisor +C odd even +C P54: 24.5 30.5 cycles/limb +C P55: 23.0 28.0 + +MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) + +C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as +C expected. On P54 in the even case the shrdl pairing nonsense (see +C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a +C further 1.5 slowdown for both odd and even. + +defframe(PARAM_SHIFT, 24) +defframe(PARAM_INVERSE,20) +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(VAR_INVERSE,`PARAM_DST') + + TEXT + + ALIGN(32) +C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C +PROLOGUE(mpn_bdiv_q_1) +deflit(`FRAME',0) + + movl $-1, %ecx + movl PARAM_DIVISOR, %eax + +L(strip_twos): + ASSERT(nz, `orl %eax, %eax') + shrl %eax + incl %ecx C shift count + + jnc L(strip_twos) + + leal 1(%eax,%eax), %edx C d + andl $127, %eax C d/2, 7 bits + + pushl %ebx FRAME_pushl() + pushl %ebp FRAME_pushl() + +ifdef(`PIC',` +ifdef(`DARWIN',` + LEA( binvert_limb_table, %ebp) + movzbl (%eax,%ebp), %eax +',` + call L(here) +L(here): + popl %ebp C eip + + addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp + C AGI + movl binvert_limb_table@GOT(%ebp), %ebp + C AGI + movzbl (%eax,%ebp), %eax +') +',` + +dnl non-PIC + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + + movl %eax, %ebp C inv + addl %eax, %eax C 2*inv + + imull %ebp, %ebp C inv*inv + + imull %edx, %ebp C inv*inv*d + + subl %ebp, %eax C inv = 2*inv - inv*inv*d + movl PARAM_SIZE, %ebx + + movl %eax, %ebp + addl %eax, %eax C 2*inv + + imull %ebp, %ebp C inv*inv + + imull %edx, %ebp C inv*inv*d + + subl %ebp, %eax C inv = 2*inv - inv*inv*d + movl %edx, PARAM_DIVISOR C d without twos + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + imull PARAM_DIVISOR, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + jmp L(common) 
+EPILOGUE() + +C mp_limb_t +C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t inverse, int shift) + ALIGN(32) +PROLOGUE(mpn_pi1_bdiv_q_1) +deflit(`FRAME',0) + + movl PARAM_SHIFT, %ecx + + pushl %ebx FRAME_pushl() + pushl %ebp FRAME_pushl() + + movl PARAM_SIZE, %ebx + movl PARAM_INVERSE, %eax + +L(common): + pushl %esi FRAME_pushl() + push %edi FRAME_pushl() + + movl PARAM_SRC, %esi + movl PARAM_DST, %edi + movl %eax, VAR_INVERSE + + leal (%esi,%ebx,4), %esi C src end + leal (%edi,%ebx,4), %edi C dst end + + negl %ebx C -size + + xorl %ebp, %ebp C initial carry bit + + orl %ecx, %ecx C shift + movl (%esi,%ebx,4), %eax C src low limb + jz L(odd_entry) + + xorl %edx, %edx C initial carry limb (for even, if one) + incl %ebx + jz L(one) + + movl (%esi,%ebx,4), %edx C src second limb (for even) + shrdl( %cl, %edx, %eax) + + jmp L(even_entry) + + + ALIGN(8) +L(odd_top): + C eax scratch + C ebx counter, limbs, negative + C ecx + C edx + C esi src end + C edi dst end + C ebp carry bit, 0 or -1 + + mull PARAM_DIVISOR + + movl (%esi,%ebx,4), %eax + subl %ebp, %edx + + subl %edx, %eax + + sbbl %ebp, %ebp + +L(odd_entry): + imull VAR_INVERSE, %eax + + movl %eax, (%edi,%ebx,4) + + incl %ebx + jnz L(odd_top) + + popl %edi + popl %esi + + popl %ebp + popl %ebx + + ret + +L(even_top): + C eax scratch + C ebx counter, limbs, negative + C ecx twos + C edx + C esi src end + C edi dst end + C ebp carry bit, 0 or -1 + + mull PARAM_DIVISOR + + subl %ebp, %edx C carry bit + movl -4(%esi,%ebx,4), %eax C src limb + + movl (%esi,%ebx,4), %ebp C and one above it + + shrdl( %cl, %ebp, %eax) + + subl %edx, %eax C carry limb + + sbbl %ebp, %ebp + +L(even_entry): + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi,%ebx,4) + incl %ebx + + jnz L(even_top) + + mull PARAM_DIVISOR + + movl -4(%esi), %eax C src high limb + subl %ebp, %edx + +L(one): + shrl %cl, %eax + + subl %edx, %eax C no carry if division is exact + + imull VAR_INVERSE, %eax + + movl %eax, 
-4(%edi) C dst high limb + nop C protect against cache bank clash + + popl %edi + popl %esi + + popl %ebp + popl %ebx + + ret + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/com.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/com.asm new file mode 100644 index 0000000..b080545 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/com.asm @@ -0,0 +1,181 @@ +dnl Intel Pentium mpn_com -- mpn ones complement. + +dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 1.75 cycles/limb + + +NAILS_SUPPORT(0-31) + + +C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C This code is similar to mpn_copyi, basically there's just some "xorl +C $GMP_NUMB_MASK"s inserted. 
+C +C Alternatives: +C +C On P55 some MMX code could be 1.25 c/l (8 limb unrolled) if src and dst +C are the same alignment mod 8, but it doesn't seem worth the trouble for +C just that case (there'd need to be some plain integer available too for +C the unaligned case). + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_com) +deflit(`FRAME',0) + + movl PARAM_SRC, %eax + movl PARAM_SIZE, %ecx + + pushl %esi FRAME_pushl() + pushl %edi FRAME_pushl() + + leal (%eax,%ecx,4), %eax + xorl $-1, %ecx C -size-1 + + movl PARAM_DST, %edx + addl $8, %ecx C -size+7 + + jns L(end) + + movl (%edx), %esi C fetch destination cache line + nop + +L(top): + C eax &src[size] + C ebx + C ecx counter, limbs, negative + C edx dst, incrementing + C esi scratch + C edi scratch + C ebp + + movl 28(%edx), %esi C destination prefetch + addl $32, %edx + + movl -28(%eax,%ecx,4), %esi + movl -24(%eax,%ecx,4), %edi + xorl $GMP_NUMB_MASK, %esi + xorl $GMP_NUMB_MASK, %edi + movl %esi, -32(%edx) + movl %edi, -28(%edx) + + movl -20(%eax,%ecx,4), %esi + movl -16(%eax,%ecx,4), %edi + xorl $GMP_NUMB_MASK, %esi + xorl $GMP_NUMB_MASK, %edi + movl %esi, -24(%edx) + movl %edi, -20(%edx) + + movl -12(%eax,%ecx,4), %esi + movl -8(%eax,%ecx,4), %edi + xorl $GMP_NUMB_MASK, %esi + xorl $GMP_NUMB_MASK, %edi + movl %esi, -16(%edx) + movl %edi, -12(%edx) + + movl -4(%eax,%ecx,4), %esi + movl (%eax,%ecx,4), %edi + xorl $GMP_NUMB_MASK, %esi + xorl $GMP_NUMB_MASK, %edi + movl %esi, -8(%edx) + movl %edi, -4(%edx) + + addl $8, %ecx + js L(top) + + +L(end): + C eax &src[size] + C ecx 0 to 7, representing respectively 7 to 0 limbs remaining + C edx dst, next location to store + + subl $4, %ecx + nop + + jns L(no4) + + movl -12(%eax,%ecx,4), %esi + movl -8(%eax,%ecx,4), %edi + xorl $GMP_NUMB_MASK, %esi + xorl $GMP_NUMB_MASK, %edi + movl %esi, (%edx) + movl %edi, 4(%edx) + + movl -4(%eax,%ecx,4), %esi + movl (%eax,%ecx,4), %edi + xorl $GMP_NUMB_MASK, %esi + xorl 
$GMP_NUMB_MASK, %edi + movl %esi, 8(%edx) + movl %edi, 12(%edx) + + addl $16, %edx + addl $4, %ecx +L(no4): + + subl $2, %ecx + nop + + jns L(no2) + + movl -4(%eax,%ecx,4), %esi + movl (%eax,%ecx,4), %edi + xorl $GMP_NUMB_MASK, %esi + xorl $GMP_NUMB_MASK, %edi + movl %esi, (%edx) + movl %edi, 4(%edx) + + addl $8, %edx + addl $2, %ecx +L(no2): + + popl %edi + jnz L(done) + + movl -4(%eax), %ecx + + xorl $GMP_NUMB_MASK, %ecx + popl %esi + + movl %ecx, (%edx) + ret + +L(done): + popl %esi + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/copyd.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/copyd.asm new file mode 100644 index 0000000..72a543b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/copyd.asm @@ -0,0 +1,146 @@ +dnl Intel Pentium mpn_copyd -- copy limb vector, decrementing. + +dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C P5: 1.25 cycles/limb + + +C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C See comments in copyi.asm. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_copyd) +deflit(`FRAME',0) + + movl PARAM_SRC, %eax + movl PARAM_SIZE, %ecx + + pushl %esi FRAME_pushl() + pushl %edi FRAME_pushl() + + leal -4(%eax,%ecx,4), %eax C &src[size-1] + movl PARAM_DST, %edx + + subl $7, %ecx C size-7 + jle L(end) + + movl 28-4(%edx,%ecx,4), %esi C prefetch cache, dst[size-1] + nop + +L(top): + C eax src, decrementing + C ebx + C ecx counter, limbs + C edx dst + C esi scratch + C edi scratch + C ebp + + movl 28-32(%edx,%ecx,4), %esi C prefetch dst cache line + subl $8, %ecx + + movl (%eax), %esi C read words pairwise + movl -4(%eax), %edi + movl %esi, 56(%edx,%ecx,4) C store words pairwise + movl %edi, 52(%edx,%ecx,4) + + movl -8(%eax), %esi + movl -12(%eax), %edi + movl %esi, 48(%edx,%ecx,4) + movl %edi, 44(%edx,%ecx,4) + + movl -16(%eax), %esi + movl -20(%eax), %edi + movl %esi, 40(%edx,%ecx,4) + movl %edi, 36(%edx,%ecx,4) + + movl -24(%eax), %esi + movl -28(%eax), %edi + movl %esi, 32(%edx,%ecx,4) + movl %edi, 28(%edx,%ecx,4) + + leal -32(%eax), %eax + jg L(top) + + +L(end): + C ecx -7 to 0, representing respectively 0 to 7 limbs remaining + C eax src end + C edx dst, next location to store + + addl $4, %ecx + jle L(no4) + + movl (%eax), %esi + movl -4(%eax), %edi + movl %esi, 8(%edx,%ecx,4) + movl %edi, 4(%edx,%ecx,4) + + movl -8(%eax), %esi + movl -12(%eax), %edi + movl %esi, (%edx,%ecx,4) + movl %edi, -4(%edx,%ecx,4) + + subl $16, %eax + subl $4, %ecx +L(no4): + + addl $2, %ecx + jle L(no2) + + movl (%eax), %esi + movl -4(%eax), %edi + movl %esi, (%edx,%ecx,4) + movl %edi, -4(%edx,%ecx,4) + + subl $8, %eax + subl $2, %ecx +L(no2): + + jnz L(done) + + movl (%eax), %ecx + movl %ecx, (%edx) C risk of cache bank clash here + +L(done): + popl %edi + popl %esi + + ret + 
+EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/copyi.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/copyi.asm new file mode 100644 index 0000000..d983d6b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/copyi.asm @@ -0,0 +1,164 @@ +dnl Intel Pentium mpn_copyi -- copy limb vector, incrementing. + +dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 1.25 cycles/limb + + +C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Destination prefetching is done to avoid repeated write-throughs on lines +C not already in L1. +C +C At least one of the src or dst pointer needs to be incremented rather than +C using indexing, so that there's somewhere to put the loop control without +C an AGI. Incrementing one and not two lets us keep loop overhead to 2 +C cycles. Making it the src pointer incremented avoids an AGI on the %ecx +C subtracts in the finishup code. 
+C +C The block of finishup code is almost as big as the main loop itself, which +C is unfortunate, but it's faster that way than with say rep movsl, by about +C 10 cycles for instance on P55. +C +C There's nothing to be gained from MMX on P55, since it can do only one +C movq load (or store) per cycle, so the throughput would be the same as the +C code here (and even then only if src and dst have the same alignment mod +C 8). + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_copyi) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_DST, %edx + + pushl %ebx FRAME_pushl() + pushl %esi FRAME_pushl() + + leal (%edx,%ecx,4), %edx C &dst[size-1] + xorl $-1, %ecx C -size-1 + + movl PARAM_SRC, %esi + addl $8, %ecx C -size+7 + + jns L(end) + + movl -28(%edx,%ecx,4), %eax C fetch destination cache line, dst[0] + nop + +L(top): + C eax scratch + C ebx scratch + C ecx counter, limbs, negative + C edx &dst[size-1] + C esi src, incrementing + C edi + C ebp + + movl (%edx,%ecx,4), %eax C fetch destination cache line + addl $8, %ecx + + movl (%esi), %eax C read words pairwise + movl 4(%esi), %ebx + movl %eax, -60(%edx,%ecx,4) C store words pairwise + movl %ebx, -56(%edx,%ecx,4) + + movl 8(%esi), %eax + movl 12(%esi), %ebx + movl %eax, -52(%edx,%ecx,4) + movl %ebx, -48(%edx,%ecx,4) + + movl 16(%esi), %eax + movl 20(%esi), %ebx + movl %eax, -44(%edx,%ecx,4) + movl %ebx, -40(%edx,%ecx,4) + + movl 24(%esi), %eax + movl 28(%esi), %ebx + movl %eax, -36(%edx,%ecx,4) + movl %ebx, -32(%edx,%ecx,4) + + leal 32(%esi), %esi + js L(top) + + +L(end): + C ecx 0 to 7, representing respectively 7 to 0 limbs remaining + C esi src end + C edx dst, next location to store + + subl $4, %ecx + jns L(no4) + + movl (%esi), %eax + movl 4(%esi), %ebx + movl %eax, -12(%edx,%ecx,4) + movl %ebx, -8(%edx,%ecx,4) + + movl 8(%esi), %eax + movl 12(%esi), %ebx + movl %eax, -4(%edx,%ecx,4) + movl %ebx, (%edx,%ecx,4) + + addl $16, %esi + addl $4, %ecx 
+L(no4): + + subl $2, %ecx + jns L(no2) + + movl (%esi), %eax + movl 4(%esi), %ebx + movl %eax, -4(%edx,%ecx,4) + movl %ebx, (%edx,%ecx,4) + + addl $8, %esi + addl $2, %ecx +L(no2): + + jnz L(done) + + movl (%esi), %eax + movl %eax, -4(%edx,%ecx,4) C risk of cache bank clash here + +L(done): + popl %esi + popl %ebx + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/dive_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/dive_1.asm new file mode 100644 index 0000000..21b5287 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/dive_1.asm @@ -0,0 +1,264 @@ +dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2001, 2002, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C divisor +C odd even +C P54: 24.5 30.5 cycles/limb +C P55: 23.0 28.0 + + +C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C +C Plain divl is used for small sizes, since the inverse takes a while to +C setup. Multiplying works out faster for size>=3 when the divisor is odd, +C or size>=4 when the divisor is even. Actually on P55 size==2 for odd or +C size==3 for even are about the same speed for both divl or mul, but the +C former is used since it will use up less code cache. +C +C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as +C expected. On P54 in the even case the shrdl pairing nonsense (see +C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a +C further 1.5 slowdown for both odd and even. + +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(VAR_INVERSE,`PARAM_DST') + + TEXT + + ALIGN(32) +PROLOGUE(mpn_divexact_1) +deflit(`FRAME',0) + + movl PARAM_DIVISOR, %eax + movl PARAM_SIZE, %ecx + + pushl %esi FRAME_pushl() + push %edi FRAME_pushl() + + movl PARAM_SRC, %esi + andl $1, %eax + + movl PARAM_DST, %edi + addl %ecx, %eax C size if even, size+1 if odd + + cmpl $4, %eax + jae L(mul_by_inverse) + + + xorl %edx, %edx +L(div_top): + movl -4(%esi,%ecx,4), %eax + + divl PARAM_DIVISOR + + movl %eax, -4(%edi,%ecx,4) + decl %ecx + + jnz L(div_top) + + popl %edi + popl %esi + + ret + + + +L(mul_by_inverse): + movl PARAM_DIVISOR, %eax + movl $-1, %ecx + +L(strip_twos): + ASSERT(nz, `orl %eax, %eax') + shrl %eax + incl %ecx C shift count + + jnc L(strip_twos) + + leal 1(%eax,%eax), %edx C d + andl $127, %eax C d/2, 7 bits + + pushl %ebx FRAME_pushl() + pushl %ebp FRAME_pushl() + +ifdef(`PIC',`dnl + LEA( binvert_limb_table, %ebp) + movzbl (%eax,%ebp), %eax C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + + movl %eax, %ebp C inv + addl %eax, %eax 
C 2*inv + + imull %ebp, %ebp C inv*inv + + imull %edx, %ebp C inv*inv*d + + subl %ebp, %eax C inv = 2*inv - inv*inv*d + movl PARAM_SIZE, %ebx + + movl %eax, %ebp + addl %eax, %eax C 2*inv + + imull %ebp, %ebp C inv*inv + + imull %edx, %ebp C inv*inv*d + + subl %ebp, %eax C inv = 2*inv - inv*inv*d + movl %edx, PARAM_DIVISOR C d without twos + + leal (%esi,%ebx,4), %esi C src end + leal (%edi,%ebx,4), %edi C dst end + + negl %ebx C -size + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + imull PARAM_DIVISOR, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + movl %eax, VAR_INVERSE + xorl %ebp, %ebp C initial carry bit + + movl (%esi,%ebx,4), %eax C src low limb + orl %ecx, %ecx C shift + + movl 4(%esi,%ebx,4), %edx C src second limb (for even) + jz L(odd_entry) + + shrdl( %cl, %edx, %eax) + + incl %ebx + jmp L(even_entry) + + + ALIGN(8) +L(odd_top): + C eax scratch + C ebx counter, limbs, negative + C ecx + C edx + C esi src end + C edi dst end + C ebp carry bit, 0 or -1 + + mull PARAM_DIVISOR + + movl (%esi,%ebx,4), %eax + subl %ebp, %edx + + subl %edx, %eax + + sbbl %ebp, %ebp + +L(odd_entry): + imull VAR_INVERSE, %eax + + movl %eax, (%edi,%ebx,4) + + incl %ebx + jnz L(odd_top) + + + popl %ebp + popl %ebx + + popl %edi + popl %esi + + ret + + +L(even_top): + C eax scratch + C ebx counter, limbs, negative + C ecx twos + C edx + C esi src end + C edi dst end + C ebp carry bit, 0 or -1 + + mull PARAM_DIVISOR + + subl %ebp, %edx C carry bit + movl -4(%esi,%ebx,4), %eax C src limb + + movl (%esi,%ebx,4), %ebp C and one above it + + shrdl( %cl, %ebp, %eax) + + subl %edx, %eax C carry limb + + sbbl %ebp, %ebp + +L(even_entry): + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi,%ebx,4) + incl %ebx + + jnz L(even_top) + + + + mull PARAM_DIVISOR + + movl -4(%esi), %eax C src high limb + subl %ebp, %edx + + shrl %cl, %eax + + subl %edx, %eax C no carry if division is exact + + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi) C dst high limb + 
nop C protect against cache bank clash + + popl %ebp + popl %ebx + + popl %edi + popl %esi + + ret + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h new file mode 100644 index 0000000..befa6e2 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h @@ -0,0 +1,76 @@ +/* Intel P54 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* For mpn/x86/pentium/mod_1.asm */ +#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB + + +/* 166MHz P54 */ + +/* Generated by tuneup.c, 2004-02-10, gcc 2.95 */ + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 90 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 22 +#define SQR_TOOM3_THRESHOLD 122 + +#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_DC_THRESHOLD 52 +#define POWM_THRESHOLD 77 + +#define HGCD_THRESHOLD 121 +#define GCD_ACCEL_THRESHOLD 3 +#define GCD_DC_THRESHOLD 615 +#define JACOBI_BASE_METHOD 2 + +#define USE_PREINV_DIVREM_1 0 +#define USE_PREINV_MOD_1 1 /* native */ +#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */ + +#define GET_STR_DC_THRESHOLD 23 +#define GET_STR_PRECOMPUTE_THRESHOLD 33 +#define SET_STR_THRESHOLD 2788 + +#define MUL_FFT_TABLE { 432, 928, 1664, 3584, 10240, 40960, 0 } +#define MUL_FFT_MODF_THRESHOLD 448 +#define MUL_FFT_THRESHOLD 3328 + +#define SQR_FFT_TABLE { 496, 928, 1920, 4608, 10240, 40960, 0 } +#define SQR_FFT_MODF_THRESHOLD 512 +#define SQR_FFT_THRESHOLD 3328 diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/hamdist.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/hamdist.asm new file mode 100644 index 0000000..6c6c1a1 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/hamdist.asm @@ -0,0 +1,154 @@ +dnl Intel P5 mpn_hamdist -- mpn hamming distance. + +dnl Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 14.0 cycles/limb + + +C unsigned long mpn_hamdist (mp_srcptr src1, mp_srcptr src2, mp_size_t size); +C +C It might be possible to shave 1 cycle from the loop, and hence 2 +C cycles/limb. The xorb is taking 2 cycles, but a separate load and xor +C would be 1, if the right schedule could be found (not found so far). +C Wanting to avoid potential cache bank clashes makes it tricky. + +C The slightly strange quoting here helps the renaming done by tune/many.pl. +deflit(TABLE_NAME, +m4_assert_defined(`GSYM_PREFIX') +GSYM_PREFIX`'mpn_popcount``'_table') + +C FIXME: referencing popcount.asm's table is incorrect as it hurt incremental +C linking. 
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC2, 8) +defframe(PARAM_SRC1, 4) + + TEXT + ALIGN(8) + +PROLOGUE(mpn_hamdist) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %esi FRAME_pushl() + + shll %ecx C size in byte pairs + pushl %edi FRAME_pushl() + +ifdef(`PIC',` + pushl %ebx FRAME_pushl() + pushl %ebp FRAME_pushl() +ifdef(`DARWIN',` + movl PARAM_SRC1, %esi + movl PARAM_SRC2, %edi + LEA( TABLE_NAME, %ebp) + xorl %ebx, %ebx C byte + xorl %edx, %edx C byte + xorl %eax, %eax C total +',` + call L(here) FRAME_pushl() +L(here): + movl PARAM_SRC1, %esi + popl %ebp FRAME_popl() + + movl PARAM_SRC2, %edi + addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp + + xorl %ebx, %ebx C byte + xorl %edx, %edx C byte + + movl TABLE_NAME@GOT(%ebp), %ebp + xorl %eax, %eax C total +') +define(TABLE,`(%ebp,$1)') +',` +dnl non-PIC + movl PARAM_SRC1, %esi + movl PARAM_SRC2, %edi + + xorl %eax, %eax C total + pushl %ebx FRAME_pushl() + + xorl %edx, %edx C byte + xorl %ebx, %ebx C byte + +define(TABLE,`TABLE_NAME($1)') +') + + + C The nop after the xorb seems necessary. Although a movb might be + C expected to go down the V pipe in the second cycle of the xorb, it + C doesn't and costs an extra 2 cycles. 
+L(top): + C eax total + C ebx byte + C ecx counter, 2*size to 2 + C edx byte + C esi src1 + C edi src2 + C ebp [PIC] table + + addl %ebx, %eax + movb -1(%esi,%ecx,2), %bl + + addl %edx, %eax + movb -1(%edi,%ecx,2), %dl + + xorb %dl, %bl + movb -2(%esi,%ecx,2), %dl + + xorb -2(%edi,%ecx,2), %dl + nop + + movb TABLE(%ebx), %bl + decl %ecx + + movb TABLE(%edx), %dl + jnz L(top) + + +ifdef(`PIC',` + popl %ebp +') + addl %ebx, %eax + popl %ebx + + addl %edx, %eax + popl %edi + + popl %esi + + ret + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/logops_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/logops_n.asm new file mode 100644 index 0000000..1877317 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/logops_n.asm @@ -0,0 +1,176 @@ +dnl Intel Pentium mpn_and_n,...,mpn_xnor_n -- bitwise logical operations. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C P5: 3.0 c/l and, ior, xor +C 3.5 c/l andn, iorn, nand, nior, xnor + + +define(M4_choose_op, +`ifdef(`OPERATION_$1',` +define(`M4_function', `mpn_$1') +define(`M4_want_pre', `$4') +define(`M4op', `$3') +define(`M4_want_post',`$2') +')') +define(M4pre, `ifelse(M4_want_pre, yes,`$1')') +define(M4post,`ifelse(M4_want_post,yes,`$1')') + +M4_choose_op( and_n, , andl, ) +M4_choose_op( andn_n, , andl, yes) +M4_choose_op( nand_n, yes, andl, ) +M4_choose_op( ior_n, , orl, ) +M4_choose_op( iorn_n, , orl, yes) +M4_choose_op( nior_n, yes, orl, ) +M4_choose_op( xor_n, , xorl, ) +M4_choose_op( xnor_n, yes, xorl, ) + +ifdef(`M4_function',, +`m4_error(`Unrecognised or undefined OPERATION symbol +')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +NAILS_SUPPORT(0-31) + + +C void M4_function (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size); +C +C Nothing complicated here, just some care to avoid data cache bank clashes +C and AGIs. +C +C We're one register short of being able to do a simple 4 loads, 2 ops, 2 +C stores. Instead %ebp is juggled a bit and nops are introduced to keep the +C pairings as intended. An in-place operation would free up a register, for +C an 0.5 c/l speedup, if that's worth bothering with. +C +C This code seems best for P55 too. Data alignment is a big problem for MMX +C and the pairing restrictions on movq and integer instructions make life +C difficult. 
+ +defframe(PARAM_SIZE,16) +defframe(PARAM_YP, 12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + + TEXT + ALIGN(8) + +PROLOGUE(M4_function) +deflit(`FRAME',0) + + pushl %ebx FRAME_pushl() + pushl %esi FRAME_pushl() + + pushl %edi FRAME_pushl() + pushl %ebp FRAME_pushl() + + movl PARAM_SIZE, %ecx + movl PARAM_XP, %ebx + + movl PARAM_YP, %esi + movl PARAM_WP, %edi + + shrl %ecx + jnc L(entry) + + movl (%ebx,%ecx,8), %eax C risk of data cache bank clash here + movl (%esi,%ecx,8), %edx + +M4pre(` notl_or_xorl_GMP_NUMB_MASK(%edx)') + + M4op %edx, %eax + +M4post(`xorl $GMP_NUMB_MASK, %eax') + orl %ecx, %ecx + + movl %eax, (%edi,%ecx,8) + jz L(done) + + jmp L(entry) + + +L(top): + C eax + C ebx xp + C ecx counter, limb pairs, decrementing + C edx + C esi yp + C edi wp + C ebp + + M4op %ebp, %edx + nop + +M4post(`xorl $GMP_NUMB_MASK, %eax') +M4post(`xorl $GMP_NUMB_MASK, %edx') + + movl %eax, 4(%edi,%ecx,8) + movl %edx, (%edi,%ecx,8) + +L(entry): + movl -4(%ebx,%ecx,8), %ebp + nop + + movl -4(%esi,%ecx,8), %eax + movl -8(%esi,%ecx,8), %edx + +M4pre(` xorl $GMP_NUMB_MASK, %eax') +M4pre(` xorl $GMP_NUMB_MASK, %edx') + + M4op %ebp, %eax + movl -8(%ebx,%ecx,8), %ebp + + decl %ecx + jnz L(top) + + + M4op %ebp, %edx + nop + +M4post(`xorl $GMP_NUMB_MASK, %eax') +M4post(`xorl $GMP_NUMB_MASK, %edx') + + movl %eax, 4(%edi,%ecx,8) + movl %edx, (%edi,%ecx,8) + + +L(done): + popl %ebp + popl %edi + + popl %esi + popl %ebx + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/lshift.asm new file mode 100644 index 0000000..2a31f36 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/lshift.asm @@ -0,0 +1,243 @@ +dnl Intel Pentium mpn_lshift -- mpn left shift. + +dnl Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5,P54: 6.0 +C P55: 5.375 + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does, +C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_lshift) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%ebp + movl PARAM_SHIFT,%ecx + +C We can use faster code for shift-by-1 under certain conditions. 
+ cmp $1,%ecx
+ jne L(normal)
+ leal 4(%esi),%eax
+ cmpl %edi,%eax
+ jnc L(special) C jump if s_ptr + 1 >= res_ptr
+ leal (%esi,%ebp,4),%eax
+ cmpl %eax,%edi
+ jnc L(special) C jump if res_ptr >= s_ptr + size
+
+L(normal):
+ leal -4(%edi,%ebp,4),%edi
+ leal -4(%esi,%ebp,4),%esi
+
+ movl (%esi),%edx
+ subl $4,%esi
+ xorl %eax,%eax
+ shldl( %cl, %edx, %eax) C compute carry limb
+ pushl %eax C push carry limb onto stack
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+ jz L(end)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(oop): movl -28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl -4(%esi),%edx
+ shldl( %cl, %eax, %ebx)
+ shldl( %cl, %edx, %eax)
+ movl %ebx,(%edi)
+ movl %eax,-4(%edi)
+
+ movl -8(%esi),%ebx
+ movl -12(%esi),%eax
+ shldl( %cl, %ebx, %edx)
+ shldl( %cl, %eax, %ebx)
+ movl %edx,-8(%edi)
+ movl %ebx,-12(%edi)
+
+ movl -16(%esi),%edx
+ movl -20(%esi),%ebx
+ shldl( %cl, %edx, %eax)
+ shldl( %cl, %ebx, %edx)
+ movl %eax,-16(%edi)
+ movl %edx,-20(%edi)
+
+ movl -24(%esi),%eax
+ movl -28(%esi),%edx
+ shldl( %cl, %eax, %ebx)
+ shldl( %cl, %edx, %eax)
+ movl %ebx,-24(%edi)
+ movl %eax,-28(%edi)
+
+ subl $32,%esi
+ subl $32,%edi
+ decl %ebp
+ jnz L(oop)
+
+L(end): popl %ebp
+ andl $7,%ebp
+ jz L(end2)
+L(oop2):
+ movl (%esi),%eax
+ shldl( %cl,%eax,%edx)
+ movl %edx,(%edi)
+ movl %eax,%edx
+ subl $4,%esi
+ subl $4,%edi
+ decl %ebp
+ jnz L(oop2)
+
+L(end2):
+ shll %cl,%edx C compute least significant limb
+ movl %edx,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+
+C We loop from least significant end of the arrays, which is only
+C permissible if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination. 
+ +L(special): + movl (%esi),%edx + addl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + addl %edx,%edx + incl %ebp + decl %ebp + jz L(Lend) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(Loop): + movl 28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + adcl %eax,%eax + movl %ebx,(%edi) + adcl %edx,%edx + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + adcl %ebx,%ebx + movl %edx,8(%edi) + adcl %eax,%eax + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + adcl %edx,%edx + movl %eax,16(%edi) + adcl %ebx,%ebx + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + adcl %eax,%eax + movl %ebx,24(%edi) + adcl %edx,%edx + movl %eax,28(%edi) + + leal 32(%esi),%esi C use leal not to clobber carry + leal 32(%edi),%edi + decl %ebp + jnz L(Loop) + +L(Lend): + popl %ebp + sbbl %eax,%eax C save carry in %eax + andl $7,%ebp + jz L(Lend2) + addl %eax,%eax C restore carry from eax +L(Loop2): + movl %edx,%ebx + movl (%esi),%edx + adcl %edx,%edx + movl %ebx,(%edi) + + leal 4(%esi),%esi C use leal not to clobber carry + leal 4(%edi),%edi + decl %ebp + jnz L(Loop2) + + jmp L(L1) +L(Lend2): + addl %eax,%eax C restore carry from eax +L(L1): movl %edx,(%edi) C store last limb + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h new file mode 100644 index 0000000..02a0def --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h @@ -0,0 +1,163 @@ +/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004, 2009, 2010 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* For mpn/x86/pentium/mod_1.asm */ +#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB + + +/* 233MHz P55 */ + +#define MOD_1_NORM_THRESHOLD 5 +#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1U_TO_MOD_1_1_THRESHOLD 12 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 11 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 63 +#define USE_PREINV_DIVREM_1 0 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 51 + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 53 +#define MUL_TOOM44_THRESHOLD 128 +#define MUL_TOOM6H_THRESHOLD 189 +#define MUL_TOOM8H_THRESHOLD 260 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 90 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 20 +#define SQR_TOOM3_THRESHOLD 73 +#define 
SQR_TOOM4_THRESHOLD 178 +#define SQR_TOOM6_THRESHOLD 210 +#define SQR_TOOM8_THRESHOLD 375 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 12 + +#define MUL_FFT_MODF_THRESHOLD 364 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 364, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \ + { 47,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159, 8}, { 319, 9}, { 167,10}, \ + { 95, 9}, { 191, 8}, { 383,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 271, 9}, \ + { 543,10}, { 287,11}, { 159,10}, { 351,11}, \ + { 191,10}, { 415,11}, { 223,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 287,10}, { 575,11}, \ + { 351,12}, { 191,11}, { 415,13}, { 127,12}, \ + { 255,11}, { 575,12}, { 319,11}, { 703,12}, \ + { 383,11}, { 831,12}, { 447,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 90 +#define MUL_FFT_THRESHOLD 3520 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 17, 7}, { 9, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 29, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 27, 7}, { 55, 9}, \ + { 15, 8}, { 31, 7}, { 65, 8}, { 43, 9}, \ + { 23, 8}, { 47,10}, { 15, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \ + { 95,10}, { 31, 9}, { 63, 8}, { 127, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255, 9}, { 135,10}, \ + { 79, 9}, { 159, 8}, { 319,10}, { 95, 9}, \ + { 191,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 
511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575, 9}, { 303,10}, { 159, 9}, { 319,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207,12}, \ + { 63,11}, { 127,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \ + { 351,11}, { 191,10}, { 415,11}, { 223,10}, \ + { 447,12}, { 127,11}, { 255,10}, { 543,11}, \ + { 287,10}, { 607,11}, { 351,12}, { 191,11}, \ + { 479,13}, { 127,12}, { 255,11}, { 575,12}, \ + { 319,11}, { 703,12}, { 383,11}, { 767,12}, \ + { 447,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 96 +#define SQR_FFT_THRESHOLD 5504 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 48 +#define MULLO_MUL_N_THRESHOLD 6633 + +#define DC_DIV_QR_THRESHOLD 43 +#define DC_DIVAPPR_Q_THRESHOLD 170 +#define DC_BDIV_QR_THRESHOLD 43 +#define DC_BDIV_Q_THRESHOLD 110 + +#define INV_MULMOD_BNM1_THRESHOLD 30 +#define INV_NEWTON_THRESHOLD 177 +#define INV_APPR_THRESHOLD 171 + +#define BINV_NEWTON_THRESHOLD 194 +#define REDC_1_TO_REDC_N_THRESHOLD 50 + +#define MU_DIV_QR_THRESHOLD 1142 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 90 +#define MU_BDIV_QR_THRESHOLD 942 +#define MU_BDIV_Q_THRESHOLD 1017 + +#define MATRIX22_STRASSEN_THRESHOLD 13 +#define HGCD_THRESHOLD 92 +#define GCD_DC_THRESHOLD 283 +#define GCDEXT_DC_THRESHOLD 221 +#define JACOBI_BASE_METHOD 2 + +#define GET_STR_DC_THRESHOLD 18 +#define GET_STR_PRECOMPUTE_THRESHOLD 31 +#define SET_STR_DC_THRESHOLD 490 +#define SET_STR_PRECOMPUTE_THRESHOLD 994 diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm new file mode 100644 index 0000000..72e3196 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm @@ -0,0 +1,40 @@ +dnl Intel P55 mpn_hamdist -- mpn hamming distance. + +dnl Copyright 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P55: hamdist 12.0 cycles/limb + +C For reference, this code runs at 11.5 cycles/limb for popcount, which is +C slower than the plain integer mpn/x86/pentium/popcount.asm. + +MULFUNC_PROLOGUE(mpn_hamdist) +include_mpn(`x86/k6/mmx/popham.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm new file mode 100644 index 0000000..04b0ddc --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm @@ -0,0 +1,463 @@ +dnl Intel P5 mpn_lshift -- mpn left shift. + +dnl Copyright 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 1.75 cycles/limb. + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size left by shift many bits and store the result in dst,size. +C Zeros are shifted in at the right. Return the bits shifted out at the +C left. +C +C The comments in mpn_rshift apply here too. 
+ +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl minimum 5, because the unrolled loop can't handle less +deflit(UNROLL_THRESHOLD, 5) + + TEXT + ALIGN(8) + +PROLOGUE(mpn_lshift) + + pushl %ebx + pushl %edi +deflit(`FRAME',8) + + movl PARAM_SIZE, %eax + movl PARAM_DST, %edx + + movl PARAM_SRC, %ebx + movl PARAM_SHIFT, %ecx + + cmp $UNROLL_THRESHOLD, %eax + jae L(unroll) + + movl -4(%ebx,%eax,4), %edi C src high limb + decl %eax + + jnz L(simple) + + shldl( %cl, %edi, %eax) C eax was decremented to zero + + shll %cl, %edi + + movl %edi, (%edx) C dst low limb + popl %edi C risk of data cache bank clash + + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- +L(simple): + C eax size-1 + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx,%eax,4), %mm5 C src high limb + + movd %ecx, %mm6 C lshift + negl %ecx + + psllq %mm6, %mm5 + addl $32, %ecx + + movd %ecx, %mm7 + psrlq $32, %mm5 C retval + + +L(simple_top): + C eax counter, limbs, negative + C ebx src + C ecx + C edx dst + C esi + C edi + C + C mm0 scratch + C mm5 return value + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + C + + movd %mm0, 4(%edx,%eax,4) + jnz L(simple_top) + + + movd (%ebx), %mm0 + + movd %mm5, %eax + psllq %mm6, %mm0 + + popl %edi + popl %ebx + + movd %mm0, (%edx) + + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(unroll): + C eax size + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd -4(%ebx,%eax,4), %mm5 C src high limb + leal (%ebx,%eax,4), %edi + + movd %ecx, %mm6 C lshift + andl $4, %edi + + psllq %mm6, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process high limb separately (marked xxx) to + C make it so. 
+ C + C source -8(ebx,%eax,4) + C | + C +-------+-------+-------+-- + C | | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + C + C dest + C -4(edx,%eax,4) + C | + C +-------+-------+-- + C | xxx | | + C +-------+-------+-- + + movq -8(%ebx,%eax,4), %mm0 C unaligned load + + psllq %mm6, %mm0 + decl %eax + + psrlq $32, %mm0 + + C + + movd %mm0, (%edx,%eax,4) +L(start_src_aligned): + + movq -8(%ebx,%eax,4), %mm1 C src high qword + leal (%edx,%eax,4), %edi + + andl $4, %edi + psrlq $32, %mm5 C return value + + movq -16(%ebx,%eax,4), %mm3 C src second highest qword + jz L(start_dst_aligned) + + C dst isn't aligned, subtract 4 to make it so, and pretend the shift + C is 32 bits extra. High limb of dst (marked xxx) handled here + C separately. + C + C source -8(ebx,%eax,4) + C | + C +-------+-------+-- + C | mm1 | + C +-------+-------+-- + C 0mod8 4mod8 + C + C dest + C -4(edx,%eax,4) + C | + C +-------+-------+-------+-- + C | xxx | | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + + movq %mm1, %mm0 + addl $32, %ecx C new shift + + psllq %mm6, %mm0 + + movd %ecx, %mm6 + psrlq $32, %mm0 + + C wasted cycle here waiting for %mm0 + + movd %mm0, -4(%edx,%eax,4) + subl $4, %edx +L(start_dst_aligned): + + + psllq %mm6, %mm1 + negl %ecx C -shift + + addl $64, %ecx C 64-shift + movq %mm3, %mm2 + + movd %ecx, %mm7 + subl $8, %eax C size-8 + + psrlq %mm7, %mm3 + + por %mm1, %mm3 C mm3 ready to store + jc L(finish) + + + C The comments in mpn_rshift apply here too. 
+ + ALIGN(8) +L(unroll_loop): + C eax counter, limbs + C ebx src + C ecx + C edx dst + C esi + C edi + C + C mm0 + C mm1 + C mm2 src qword from 16(%ebx,%eax,4) + C mm3 dst qword ready to store to 24(%edx,%eax,4) + C + C mm5 return value + C mm6 lshift + C mm7 rshift + + movq 8(%ebx,%eax,4), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + movq %mm3, 24(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq (%ebx,%eax,4), %mm3 C + psllq %mm6, %mm1 C + + movq %mm0, 16(%edx,%eax,4) + movq %mm3, %mm2 C + + psrlq %mm7, %mm3 C + subl $4, %eax + + por %mm1, %mm3 C + jnc L(unroll_loop) + + + +L(finish): + C eax -4 to -1 representing respectively 0 to 3 limbs remaining + + testb $2, %al + + jz L(finish_no_two) + + movq 8(%ebx,%eax,4), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + movq %mm3, 24(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq %mm1, %mm2 + movq %mm0, %mm3 + + subl $2, %eax +L(finish_no_two): + + + C eax -4 or -3 representing respectively 0 or 1 limbs remaining + C + C mm2 src prev qword, from 16(%ebx,%eax,4) + C mm3 dst qword, for 24(%edx,%eax,4) + + testb $1, %al + movd %mm5, %eax C retval + + popl %edi + jz L(finish_zero) + + + C One extra src limb, destination was aligned. + C + C source ebx + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edx+12 edx+4 edx + C --+---------------+---------------+-------+ + C | mm3 | | | + C --+---------------+---------------+-------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C One extra src limb, destination was unaligned. 
+ C + C source ebx + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edx+12 edx+4 + C --+---------------+---------------+ + C | mm3 | | + C --+---------------+---------------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at 4(%edx), and in the aligned case + C there's an extra limb of dst to be formed from that extra src limb + C left shifted. + + + movd (%ebx), %mm0 + psllq %mm6, %mm2 + + movq %mm3, 12(%edx) + psllq $32, %mm0 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm2, %mm0 + psllq %mm6, %mm1 + + movq %mm0, 4(%edx) + psrlq $32, %mm1 + + andl $32, %ecx + popl %ebx + + jz L(finish_one_unaligned) + + movd %mm1, (%edx) +L(finish_one_unaligned): + + emms + + ret + + +L(finish_zero): + + C No extra src limbs, destination was aligned. + C + C source ebx + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edx+8 edx + C --+---------------+---------------+ + C | mm3 | | + C --+---------------+---------------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C No extra src limbs, destination was unaligned. + C + C source ebx + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edx+8 edx+4 + C --+---------------+-------+ + C | mm3 | | + C --+---------------+-------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C The movd for the unaligned case writes the same data to 4(%edx) + C that the movq does for the aligned case. 
+ + + movq %mm3, 8(%edx) + andl $32, %ecx + + psllq %mm6, %mm2 + jz L(finish_zero_unaligned) + + movq %mm2, (%edx) +L(finish_zero_unaligned): + + psrlq $32, %mm2 + popl %ebx + + movd %mm5, %eax C retval + + movd %mm2, 4(%edx) + + emms + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm new file mode 100644 index 0000000..4ced577 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm @@ -0,0 +1,371 @@ +dnl Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication. + +dnl Copyright 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5: 12.0 for 32-bit multiplier +C 7.0 for 16-bit multiplier + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); +C +C When the multiplier is 16 bits some special case MMX code is used. 
Small +C multipliers might arise reasonably often from mpz_mul_ui etc. If the size +C is odd there's roughly a 5 cycle penalty, so times for say size==7 and +C size==8 end up being quite close. If src isn't aligned to an 8 byte +C boundary then one limb is processed separately with roughly a 5 cycle +C penalty, so in that case it's say size==8 and size==9 which are close. +C +C Alternatives: +C +C MMX is not believed to be of any use for 32-bit multipliers, since for +C instance the current method would just have to be more or less duplicated +C for the high and low halves of the multiplier, and would probably +C therefore run at about 14 cycles, which is slower than the plain integer +C at 12. +C +C Adding the high and low MMX products using integer code seems best. An +C attempt at using paddd and carry bit propagation with pcmpgtd didn't give +C any joy. Perhaps something could be done keeping the values signed and +C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or +C perhaps not. +C +C Future: +C +C An mpn_mul_1c entrypoint would need a double carry out of the low result +C limb in the 16-bit code, unless it could be assumed the carry fits in 16 +C bits, possibly as carry<multiplier, this being true of a big calculation +C done piece by piece. But let's worry about that if/when mul_1c is +C actually used. 
+ +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + + ALIGN(8) +PROLOGUE(mpn_mul_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %edx + + cmpl $1, %ecx + jne L(two_or_more) + + C one limb only + + movl PARAM_MULTIPLIER, %eax + movl PARAM_DST, %ecx + + mull (%edx) + + movl %eax, (%ecx) + movl %edx, %eax + + ret + + +L(two_or_more): + C eax size + C ebx + C ecx carry + C edx + C esi src + C edi + C ebp + + pushl %esi FRAME_pushl() + pushl %edi FRAME_pushl() + + movl %edx, %esi C src + movl PARAM_DST, %edi + + movl PARAM_MULTIPLIER, %eax + pushl %ebx FRAME_pushl() + + leal (%esi,%ecx,4), %esi C src end + leal (%edi,%ecx,4), %edi C dst end + + negl %ecx C -size + + pushl %ebp FRAME_pushl() + cmpl $65536, %eax + + jb L(small) + + +L(big): + xorl %ebx, %ebx C carry limb + sarl %ecx C -size/2 + + jnc L(top) C with carry flag clear + + + C size was odd, process one limb separately + + mull 4(%esi,%ecx,8) C m * src[0] + + movl %eax, 4(%edi,%ecx,8) + incl %ecx + + orl %edx, %ebx C carry limb, and clear carry flag + + +L(top): + C eax + C ebx carry + C ecx counter, negative + C edx + C esi src end + C edi dst end + C ebp (scratch carry) + + adcl $0, %ebx + movl (%esi,%ecx,8), %eax + + mull PARAM_MULTIPLIER + + movl %edx, %ebp + addl %eax, %ebx + + adcl $0, %ebp + movl 4(%esi,%ecx,8), %eax + + mull PARAM_MULTIPLIER + + movl %ebx, (%edi,%ecx,8) + addl %ebp, %eax + + movl %eax, 4(%edi,%ecx,8) + incl %ecx + + movl %edx, %ebx + jnz L(top) + + + adcl $0, %ebx + popl %ebp + + movl %ebx, %eax + popl %ebx + + popl %edi + popl %esi + + ret + + +L(small): + C Special case for 16-bit multiplier. + C + C eax multiplier + C ebx + C ecx -size + C edx src + C esi src end + C edi dst end + C ebp multiplier + + C size<3 not supported here. At size==3 we're already a couple of + C cycles faster, so there's no threshold as such, just use the MMX + C as soon as possible. 
+ + cmpl $-3, %ecx + ja L(big) + + movd %eax, %mm7 C m + pxor %mm6, %mm6 C initial carry word + + punpcklwd %mm7, %mm7 C m replicated 2 times + addl $2, %ecx C -size+2 + + punpckldq %mm7, %mm7 C m replicated 4 times + andl $4, %edx C test alignment, clear carry flag + + movq %mm7, %mm0 C m + jz L(small_entry) + + + C Source is unaligned, process one limb separately. + C + C Plain integer code is used here, since it's smaller and is about + C the same 13 cycles as an mmx block would be. + C + C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence + C the use of separate incl and orl. + + mull -8(%esi,%ecx,4) C m * src[0] + + movl %eax, -8(%edi,%ecx,4) C dst[0] + incl %ecx C one limb processed + + movd %edx, %mm6 C initial carry + + orl %eax, %eax C clear carry flag + jmp L(small_entry) + + +C The scheduling here is quite tricky, since so many instructions have +C pairing restrictions. In particular the js won't pair with a movd, and +C can't be paired with an adc since it wants flags from the inc, so +C instructions are rotated to the top of the loop to find somewhere useful +C for it. +C +C Trouble has been taken to avoid overlapping successive loop iterations, +C since that would greatly increase the size of the startup and finishup +C code. Actually there's probably not much advantage to be had from +C overlapping anyway, since the difficulties are mostly with pairing, not +C with latencies as such. +C +C In the comments x represents the src data and m the multiplier (16 +C bits, but replicated 4 times). +C +C The m signs calculated in %mm3 are a loop invariant and could be held in +C say %mm5, but that would save only one instruction and hence be no faster. 
+ +L(small_top): + C eax l.low, then l.high + C ebx (h.low) + C ecx counter, -size+2 to 0 or 1 + C edx (h.high) + C esi &src[size] + C edi &dst[size] + C ebp + C + C %mm0 (high products) + C %mm1 (low products) + C %mm2 (adjust for m using x signs) + C %mm3 (adjust for x using m signs) + C %mm4 + C %mm5 + C %mm6 h.low, then carry + C %mm7 m replicated 4 times + + movd %mm6, %ebx C h.low + psrlq $32, %mm1 C l.high + + movd %mm0, %edx C h.high + movq %mm0, %mm6 C new c + + adcl %eax, %ebx + incl %ecx + + movd %mm1, %eax C l.high + movq %mm7, %mm0 + + adcl %eax, %edx + movl %ebx, -16(%edi,%ecx,4) + + movl %edx, -12(%edi,%ecx,4) + psrlq $32, %mm6 C c + +L(small_entry): + pmulhw -8(%esi,%ecx,4), %mm0 C h = (x*m).high + movq %mm7, %mm1 + + pmullw -8(%esi,%ecx,4), %mm1 C l = (x*m).low + movq %mm7, %mm3 + + movq -8(%esi,%ecx,4), %mm2 C x + psraw $15, %mm3 C m signs + + pand -8(%esi,%ecx,4), %mm3 C x selected by m signs + psraw $15, %mm2 C x signs + + paddw %mm3, %mm0 C add x to h if m neg + pand %mm7, %mm2 C m selected by x signs + + paddw %mm2, %mm0 C add m to h if x neg + incl %ecx + + movd %mm1, %eax C l.low + punpcklwd %mm0, %mm6 C c + h.low << 16 + + psrlq $16, %mm0 C h.high + js L(small_top) + + + + + movd %mm6, %ebx C h.low + psrlq $32, %mm1 C l.high + + adcl %eax, %ebx + popl %ebp FRAME_popl() + + movd %mm0, %edx C h.high + psrlq $32, %mm0 C l.high + + movd %mm1, %eax C l.high + + adcl %eax, %edx + movl %ebx, -12(%edi,%ecx,4) + + movd %mm0, %eax C c + + adcl $0, %eax + movl %edx, -8(%edi,%ecx,4) + + orl %ecx, %ecx + jnz L(small_done) C final %ecx==1 means even, ==0 odd + + + C Size odd, one extra limb to process. + C Plain integer code is used here, since it's smaller and is about + C the same speed as another mmx block would be. 
+ + movl %eax, %ecx + movl PARAM_MULTIPLIER, %eax + + mull -4(%esi) + + addl %ecx, %eax + + adcl $0, %edx + movl %eax, -4(%edi) + + movl %edx, %eax +L(small_done): + popl %ebx + + popl %edi + popl %esi + + emms + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm new file mode 100644 index 0000000..e3b274b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm @@ -0,0 +1,468 @@ +dnl Intel P5 mpn_rshift -- mpn right shift. + +dnl Copyright 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 1.75 cycles/limb. + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size right by shift many bits and store the result in dst,size. +C Zeros are shifted in at the left. Return the bits shifted out at the +C right. 
+C +C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb, +C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l. +C +C Full speed depends on source and destination being aligned. Unaligned mmx +C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy +C setups and finish-ups are done to ensure alignment for the loop. +C +C MMX shifts work out a bit faster even for the simple loop. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl Minimum 5, because the unrolled loop can't handle less. +deflit(UNROLL_THRESHOLD, 5) + + TEXT + ALIGN(8) + +PROLOGUE(mpn_rshift) + + pushl %ebx + pushl %edi +deflit(`FRAME',8) + + movl PARAM_SIZE, %eax + movl PARAM_DST, %edx + + movl PARAM_SRC, %ebx + movl PARAM_SHIFT, %ecx + + cmp $UNROLL_THRESHOLD, %eax + jae L(unroll) + + decl %eax + movl (%ebx), %edi C src low limb + + jnz L(simple) + + shrdl( %cl, %edi, %eax) C eax was decremented to zero + + shrl %cl, %edi + + movl %edi, (%edx) C dst low limb + popl %edi C risk of data cache bank clash + + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(simple): + C eax size-1 + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx), %mm5 C src[0] + leal (%ebx,%eax,4), %ebx C &src[size-1] + + movd %ecx, %mm6 C rshift + leal -4(%edx,%eax,4), %edx C &dst[size-2] + + psllq $32, %mm5 + negl %eax + + +C This loop is 5 or 8 cycles, with every second load unaligned and a wasted +C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4 +C cycles and would be 8 in a simple loop. Using mmx helps the return value +C and last limb calculations too. 
+ +L(simple_top): + C eax counter, limbs, negative + C ebx &src[size-1] + C ecx return value + C edx &dst[size-2] + C + C mm0 scratch + C mm5 return value + C mm6 shift + + movq (%ebx,%eax,4), %mm0 + incl %eax + + psrlq %mm6, %mm0 + + movd %mm0, (%edx,%eax,4) + jnz L(simple_top) + + + movd (%ebx), %mm0 + psrlq %mm6, %mm5 C return value + + psrlq %mm6, %mm0 + popl %edi + + movd %mm5, %eax + popl %ebx + + movd %mm0, 4(%edx) + + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(unroll): + C eax size + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx), %mm5 C src[0] + movl $4, %edi + + movd %ecx, %mm6 C rshift + testl %edi, %ebx + + psllq $32, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process low limb separately (marked xxx) and + C step src and dst by one limb, making src aligned. + C + C source ebx + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + C + C dest edx + C --+-------+-------+ + C | | xxx | + C --+-------+-------+ + + movq (%ebx), %mm0 C unaligned load + + psrlq %mm6, %mm0 + addl $4, %ebx + + decl %eax + + movd %mm0, (%edx) + addl $4, %edx +L(start_src_aligned): + + + movq (%ebx), %mm1 + testl %edi, %edx + + psrlq %mm6, %mm5 C retval + jz L(start_dst_aligned) + + C dst isn't aligned, add 4 to make it so, and pretend the shift is + C 32 bits extra. Low limb of dst (marked xxx) handled here + C separately. 
+ C + C source ebx + C --+-------+-------+ + C | mm1 | + C --+-------+-------+ + C 4mod8 0mod8 + C + C dest edx + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + + movq %mm1, %mm0 + addl $32, %ecx C new shift + + psrlq %mm6, %mm0 + + movd %ecx, %mm6 + + movd %mm0, (%edx) + addl $4, %edx +L(start_dst_aligned): + + + movq 8(%ebx), %mm3 + negl %ecx + + movq %mm3, %mm2 C mm2 src qword + addl $64, %ecx + + movd %ecx, %mm7 + psrlq %mm6, %mm1 + + leal -12(%ebx,%eax,4), %ebx + leal -20(%edx,%eax,4), %edx + + psllq %mm7, %mm3 + subl $7, %eax C size-7 + + por %mm1, %mm3 C mm3 ready to store + negl %eax C -(size-7) + + jns L(finish) + + + C This loop is the important bit, the rest is just support. Careful + C instruction scheduling achieves the claimed 1.75 c/l. The + C relevant parts of the pairing rules are: + C + C - mmx loads and stores execute only in the U pipe + C - only one mmx shift in a pair + C - wait one cycle before storing an mmx register result + C - the usual address generation interlock + C + C Two qword calculations are slightly interleaved. The instructions + C marked "C" belong to the second qword, and the "C prev" one is for + C the second qword from the previous iteration. 
+ + ALIGN(8) +L(unroll_loop): + C eax counter, limbs, negative + C ebx &src[size-12] + C ecx + C edx &dst[size-12] + C esi + C edi + C + C mm0 + C mm1 + C mm2 src qword from -8(%ebx,%eax,4) + C mm3 dst qword ready to store to -8(%edx,%eax,4) + C + C mm5 return value + C mm6 rshift + C mm7 lshift + + movq (%ebx,%eax,4), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, -8(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq 8(%ebx,%eax,4), %mm3 C + psrlq %mm6, %mm1 C + + movq %mm0, (%edx,%eax,4) + movq %mm3, %mm2 C + + psllq %mm7, %mm3 C + addl $4, %eax + + por %mm1, %mm3 C + js L(unroll_loop) + + +L(finish): + C eax 0 to 3 representing respectively 3 to 0 limbs remaining + + testb $2, %al + + jnz L(finish_no_two) + + movq (%ebx,%eax,4), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, -8(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq %mm1, %mm2 + movq %mm0, %mm3 + + addl $2, %eax +L(finish_no_two): + + + C eax 2 or 3 representing respectively 1 or 0 limbs remaining + C + C mm2 src prev qword, from -8(%ebx,%eax,4) + C mm3 dst qword, for -8(%edx,%eax,4) + + testb $1, %al + popl %edi + + movd %mm5, %eax C retval + jnz L(finish_zero) + + + C One extra limb, destination was aligned. + C + C source ebx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edx + C +-------+---------------+---------------+-- + C | | | mm3 | + C +-------+---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C One extra limb, destination was unaligned. 
+ C + C source ebx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edx + C +---------------+---------------+-- + C | | mm3 | + C +---------------+---------------+-- + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at 8(%edx), and in the aligned case + C there's a further extra limb of dst to be formed. + + + movd 8(%ebx), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, (%edx) + por %mm2, %mm0 + + psrlq %mm6, %mm1 + andl $32, %ecx + + popl %ebx + jz L(finish_one_unaligned) + + C dst was aligned, must store one extra limb + movd %mm1, 16(%edx) +L(finish_one_unaligned): + + movq %mm0, 8(%edx) + + emms + + ret + + +L(finish_zero): + + C No extra limbs, destination was aligned. + C + C source ebx + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edx+4 + C +---------------+---------------+-- + C | | mm3 | + C +---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C No extra limbs, destination was unaligned. + C + C source ebx + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edx+4 + C +-------+---------------+-- + C | | mm3 | + C +-------+---------------+-- + C + C mm6 = shift+32 + C mm7 = 64-(shift+32) + + + C The movd for the unaligned case is clearly the same data as the + C movq for the aligned case, it's just a choice between whether one + C or two limbs should be written. 
+ + + movq %mm3, 4(%edx) + psrlq %mm6, %mm2 + + movd %mm2, 12(%edx) + andl $32, %ecx + + popl %ebx + jz L(finish_zero_unaligned) + + movq %mm2, 12(%edx) +L(finish_zero_unaligned): + + emms + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mod_34lsub1.asm new file mode 100644 index 0000000..2d88223 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mod_34lsub1.asm @@ -0,0 +1,192 @@ +dnl Intel P5 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1. + +dnl Copyright 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C P5: 1.66 cycles/limb + + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size) +C + +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_34lsub1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %edx + + subl $2, %ecx + ja L(three_or_more) + + movl (%edx), %eax + jne L(one) + + + movl 4(%edx), %ecx + movl %eax, %edx + + shrl $24, %edx + andl $0xFFFFFF, %eax + + addl %edx, %eax + movl %ecx, %edx + + shrl $16, %ecx + andl $0xFFFF, %edx + + shll $8, %edx + addl %ecx, %eax + + addl %edx, %eax + +L(one): + ret + + +L(three_or_more): + C eax + C ebx + C ecx size-2 + C edx src + C esi + C edi + C ebp + + pushl %ebx FRAME_pushl() + pushl %esi FRAME_pushl() + + pushl %edi FRAME_pushl() + pushl %ebp FRAME_pushl() + + xorl %esi, %esi C 0mod3 + xorl %edi, %edi C 1mod3 + + xorl %ebp, %ebp C 2mod3, and clear carry + +L(top): + C eax scratch + C ebx scratch + C ecx counter, limbs + C edx src + C esi 0mod3 + C edi 1mod3 + C ebp 2mod3 + + movl (%edx), %eax + movl 4(%edx), %ebx + + adcl %eax, %esi + movl 8(%edx), %eax + + adcl %ebx, %edi + leal 12(%edx), %edx + + adcl %eax, %ebp + leal -2(%ecx), %ecx + + decl %ecx + jg L(top) + + + C ecx is -2, -1 or 0, representing 0, 1 or 2 more limbs, respectively + + movl $0xFFFFFFFF, %ebx C mask + incl %ecx + + js L(combine) C 0 more + + movl (%edx), %eax + movl $0xFFFFFF00, %ebx + + adcl %eax, %esi + decl %ecx + + js L(combine) C 1 more + + movl 4(%edx), %eax + movl $0xFFFF0000, %ebx + + adcl %eax, %edi + + + +L(combine): + C eax + C ebx mask + C ecx + C edx + C esi 0mod3 + C edi 1mod3 + C ebp 2mod3 + + sbbl %ecx, %ecx C carry + movl %esi, %eax C 0mod3 + + andl %ebx, %ecx C masked for position + andl $0xFFFFFF, %eax C 0mod3 low + + shrl $24, %esi C 0mod3 high + subl %ecx, %eax C apply carry + + addl %esi, %eax C apply 0mod3 + movl %edi, %ebx C 1mod3 + + shrl $16, %edi C 1mod3 high + andl $0x0000FFFF, %ebx + + shll $8, %ebx C 1mod3 low + addl %edi, %eax C 
apply 1mod3 high + + addl %ebx, %eax C apply 1mod3 low + movl %ebp, %ebx C 2mod3 + + shrl $8, %ebp C 2mod3 high + andl $0xFF, %ebx + + shll $16, %ebx C 2mod3 low + addl %ebp, %eax C apply 2mod3 high + + addl %ebx, %eax C apply 2mod3 low + + popl %ebp + popl %edi + + popl %esi + popl %ebx + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mode1o.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mode1o.asm new file mode 100644 index 0000000..a90abca --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mode1o.asm @@ -0,0 +1,279 @@ +dnl Intel Pentium mpn_modexact_1_odd -- exact division style remainder. + +dnl Copyright 2000-2002, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C P5: 23.0 cycles/limb + + +C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C +C There seems no way to pair up the two lone instructions in the main loop. +C +C The special case for size==1 saves about 20 cycles (non-PIC), making it +C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at +C all sizes. +C +C Alternatives: +C +C Using mmx for the multiplies might be possible, with pmullw and pmulhw +C having just 3 cycle latencies, but carry bit handling would probably be +C complicated. + +defframe(PARAM_CARRY, 16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +dnl re-using parameter space +define(VAR_INVERSE,`PARAM_SIZE') + + TEXT + + ALIGN(16) +PROLOGUE(mpn_modexact_1c_odd) +deflit(`FRAME',0) + + movl PARAM_DIVISOR, %eax + movl PARAM_CARRY, %edx + + jmp L(start_1c) + +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_modexact_1_odd) +deflit(`FRAME',0) + + movl PARAM_DIVISOR, %eax + xorl %edx, %edx C carry + +L(start_1c): + +ifdef(`PIC',` +ifdef(`DARWIN',` + shrl %eax C d/2 + LEA( binvert_limb_table, %ecx) + pushl %ebx FRAME_pushl() + movl PARAM_SIZE, %ebx + + andl $127, %eax + subl $2, %ebx + + movb (%eax,%ecx), %cl + jc L(one_limb) +',` + call L(here) FRAME_pushl() +L(here): + + shrl %eax C d/2 + movl (%esp), %ecx C eip + + addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx + movl %ebx, (%esp) C push ebx + + andl $127, %eax + movl PARAM_SIZE, %ebx + + movl binvert_limb_table@GOT(%ecx), %ecx + subl $2, %ebx + + movb (%eax,%ecx), %cl C inv 8 bits + jc L(one_limb) +') +',` +dnl non-PIC + shrl %eax C d/2 + pushl %ebx FRAME_pushl() + + movl PARAM_SIZE, %ebx + andl $127, %eax + + subl $2, %ebx + jc L(one_limb) + + movb binvert_limb_table(%eax), %cl C inv 8 bits +') + + movl %ecx, %eax + addl %ecx, %ecx C 2*inv + + imull %eax, %eax C inv*inv + + imull 
PARAM_DIVISOR, %eax C inv*inv*d + + subl %eax, %ecx C inv = 2*inv - inv*inv*d + + movl %ecx, %eax + addl %ecx, %ecx C 2*inv + + imull %eax, %eax C inv*inv + + imull PARAM_DIVISOR, %eax C inv*inv*d + + subl %eax, %ecx C inv = 2*inv - inv*inv*d + pushl %esi FRAME_pushl() + + ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS + movl %ecx, %eax + imull PARAM_DIVISOR, %eax + cmpl $1, %eax') + + movl PARAM_SRC, %esi + movl %ecx, VAR_INVERSE + + movl (%esi), %eax C src[0] + leal 4(%esi,%ebx,4), %esi C &src[size-1] + + xorl $-1, %ebx C -(size-1) + ASSERT(nz) + jmp L(entry) + + +C The use of VAR_INVERSE means only a store is needed for that value, rather +C than a push and pop of say %edi. + + ALIGN(16) +L(top): + C eax scratch, low product + C ebx counter, limbs, negative + C ecx carry bit + C edx scratch, high product + C esi &src[size-1] + C edi + C ebp + + mull PARAM_DIVISOR C h:dummy = q*d + + movl (%esi,%ebx,4), %eax C src[i] + subl %ecx, %edx C h -= -c + +L(entry): + subl %edx, %eax C s = src[i] - h + + sbbl %ecx, %ecx C new -c (0 or -1) + + imull VAR_INVERSE, %eax C q = s*i + + incl %ebx + jnz L(top) + + + mull PARAM_DIVISOR + + movl (%esi), %eax C src high + subl %ecx, %edx C h -= -c + + cmpl PARAM_DIVISOR, %eax + + jbe L(skip_last) +deflit(FRAME_LAST,FRAME) + + + subl %edx, %eax C s = src[i] - h + popl %esi FRAME_popl() + + sbbl %ecx, %ecx C c (0 or -1) + popl %ebx FRAME_popl() + + imull VAR_INVERSE, %eax C q = s*i + + mull PARAM_DIVISOR C h:dummy = q*d + + movl %edx, %eax + + subl %ecx, %eax + + ret + + +C When high<divisor can skip last step. + +L(skip_last): +deflit(`FRAME',FRAME_LAST) + C eax src high + C ebx + C ecx + C edx r + C esi + + subl %eax, %edx C r-s + popl %esi FRAME_popl() + + sbbl %eax, %eax C -1 if underflow + movl PARAM_DIVISOR, %ebx + + andl %ebx, %eax C divisor if underflow + popl %ebx FRAME_popl() + + addl %edx, %eax C addback if underflow + + ret + + +C Special case for size==1 using a division for r = c-a mod d. 
+C Could look for a-c<d and save a division sometimes, but that doesn't seem +C worth bothering about. + +L(one_limb): +deflit(`FRAME',4) + C eax + C ebx size-2 (==-1) + C ecx + C edx carry + C esi src end + C edi + C ebp + + movl %edx, %eax + movl PARAM_SRC, %edx + + movl PARAM_DIVISOR, %ecx + popl %ebx FRAME_popl() + + subl (%edx), %eax C c-a + + sbbl %edx, %edx + decl %ecx C d-1 + + andl %ecx, %edx C b*d+c-a if c<a, or c-a if c>=a + + divl PARAM_DIVISOR + + movl %edx, %eax + + ret + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_1.asm new file mode 100644 index 0000000..a0858af --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_1.asm @@ -0,0 +1,177 @@ +dnl Intel Pentium mpn_mul_1 -- mpn by limb multiplication. + +dnl Copyright 1992, 1994, 1996, 1999, 2000, 2002 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C P5: 12.0 cycles/limb + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); +C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier, mp_limb_t carry); +C + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_mul_1c) +deflit(`FRAME',0) + + movl PARAM_CARRY, %ecx + pushl %esi FRAME_pushl() + + jmp L(start_1c) + +EPILOGUE() + + + ALIGN(8) +PROLOGUE(mpn_mul_1) +deflit(`FRAME',0) + + xorl %ecx, %ecx + pushl %esi FRAME_pushl() + +L(start_1c): + movl PARAM_SRC, %esi + movl PARAM_SIZE, %eax + + shrl %eax + jnz L(two_or_more) + + + C one limb only + + movl (%esi), %eax + + mull PARAM_MULTIPLIER + + addl %eax, %ecx + movl PARAM_DST, %eax + + adcl $0, %edx + popl %esi + + movl %ecx, (%eax) + movl %edx, %eax + + ret + + +L(two_or_more): + C eax size/2 + C ebx + C ecx carry + C edx + C esi src + C edi + C ebp + + pushl %edi FRAME_pushl() + pushl %ebx FRAME_pushl() + + movl PARAM_DST, %edi + leal -1(%eax), %ebx C size/2-1 + + notl %ebx C -size, preserve carry + + leal (%esi,%eax,8), %esi C src end + leal (%edi,%eax,8), %edi C dst end + + pushl %ebp FRAME_pushl() + jnc L(top) + + + C size was odd, process one limb separately + + movl (%esi,%ebx,8), %eax + addl $4, %esi + + mull PARAM_MULTIPLIER + + addl %ecx, %eax + movl %edx, %ecx + + movl %eax, (%edi,%ebx,8) + leal 4(%edi), %edi + + +L(top): + C eax + C ebx counter, negative + C ecx carry + C edx + C esi src end + C edi dst end + C ebp + + adcl $0, %ecx + movl (%esi,%ebx,8), %eax + + mull PARAM_MULTIPLIER + + movl %edx, %ebp + addl %eax, %ecx + + adcl $0, %ebp + movl 4(%esi,%ebx,8), %eax + + mull PARAM_MULTIPLIER + + movl %ecx, (%edi,%ebx,8) + addl %ebp, %eax + + movl %eax, 4(%edi,%ebx,8) + incl %ebx + + movl %edx, %ecx + jnz L(top) + + + adcl $0, %ecx + popl %ebp + + movl %ecx, %eax + popl %ebx + + popl 
%edi + popl %esi + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mul_2.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_2.asm new file mode 100644 index 0000000..4c7beb5 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_2.asm @@ -0,0 +1,150 @@ +dnl Intel Pentium mpn_mul_2 -- mpn by 2-limb multiplication. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 24.0 cycles/limb + + +C mp_limb_t mpn_mul_2 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_srcptr mult); +C +C At 24 c/l this is only 2 cycles faster than a separate mul_1 and addmul_1, +C but has the advantage of making just one pass over the operands. +C +C There's not enough registers to use PARAM_MULT directly, so the multiplier +C limbs are transferred to local variables on the stack. 
+ +defframe(PARAM_MULT, 16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(VAR_MULT_LOW, `PARAM_SRC') +define(VAR_MULT_HIGH,`PARAM_DST') + + TEXT + ALIGN(8) +PROLOGUE(mpn_mul_2) +deflit(`FRAME',0) + + pushl %esi FRAME_pushl() + pushl %edi FRAME_pushl() + + movl PARAM_SRC, %esi + movl PARAM_DST, %edi + + movl PARAM_MULT, %eax + movl PARAM_SIZE, %ecx + + movl 4(%eax), %edx C mult high + movl (%eax), %eax C mult low + + movl %eax, VAR_MULT_LOW + movl %edx, VAR_MULT_HIGH + + pushl %ebx FRAME_pushl() + pushl %ebp FRAME_pushl() + + mull (%esi) C src[0] * mult[0] + + movl %eax, %ebp C in case src==dst + movl (%esi), %eax C src[0] + + movl %ebp, (%edi) C dst[0] + movl %edx, %ebx C initial low carry + + xorl %ebp, %ebp C initial high carry + leal (%edi,%ecx,4), %edi C dst end + + mull VAR_MULT_HIGH C src[0] * mult[1] + + subl $2, %ecx C size-2 + js L(done) + + leal 8(%esi,%ecx,4), %esi C &src[size] + xorl $-1, %ecx C -(size-1) + + + +L(top): + C eax low prod + C ebx low carry + C ecx counter, negative + C edx high prod + C esi src end + C edi dst end + C ebp high carry (0 or -1) + + andl $1, %ebp C 1 or 0 + addl %eax, %ebx + + adcl %edx, %ebp + ASSERT(nc) + movl (%esi,%ecx,4), %eax + + mull VAR_MULT_LOW + + addl %eax, %ebx C low carry + movl (%esi,%ecx,4), %eax + + adcl %ebp, %edx C high carry + movl %ebx, (%edi,%ecx,4) + + sbbl %ebp, %ebp C new high carry, -1 or 0 + movl %edx, %ebx C new low carry + + mull VAR_MULT_HIGH + + incl %ecx + jnz L(top) + + +L(done): + andl $1, %ebp C 1 or 0 + addl %ebx, %eax + + adcl %ebp, %edx + ASSERT(nc) + movl %eax, (%edi) C store carry low + + movl %edx, %eax C return carry high + + popl %ebp + popl %ebx + + popl %edi + popl %esi + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm new file mode 100644 index 0000000..e1d0f05 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm 
@@ -0,0 +1,142 @@ +dnl Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication. + +dnl Copyright 1996, 1998-2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C P5: 14.2 cycles/crossproduct (approx) + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); + +defframe(PARAM_YSIZE, 20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE, 12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + +defframe(VAR_COUNTER, -4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_mul_basecase) + + pushl %eax C dummy push for allocating stack slot + pushl %esi + pushl %ebp + pushl %edi +deflit(`FRAME',16) + + movl PARAM_XP,%esi + movl PARAM_WP,%edi + movl PARAM_YP,%ebp + + movl (%esi),%eax C load xp[0] + mull (%ebp) C multiply by yp[0] + movl %eax,(%edi) C store to wp[0] + movl PARAM_XSIZE,%ecx C xsize + decl %ecx C If xsize = 1, ysize = 1 too + jz L(done) + + movl PARAM_XSIZE,%eax + pushl %ebx +FRAME_pushl() + movl %edx,%ebx + leal (%esi,%eax,4),%esi C make xp point at end + leal (%edi,%eax,4),%edi C offset wp by xsize + negl %ecx C negate j size/index for inner loop + xorl %eax,%eax C clear carry + + ALIGN(8) +L(oop1): adcl $0,%ebx + movl (%esi,%ecx,4),%eax C load next limb at xp[j] + mull (%ebp) + addl %ebx,%eax + movl %eax,(%edi,%ecx,4) + incl %ecx + movl %edx,%ebx + jnz L(oop1) + + adcl $0,%ebx + movl PARAM_YSIZE,%eax + movl %ebx,(%edi) C most significant limb of product + addl $4,%edi C increment wp + decl %eax + jz L(skip) + movl %eax,VAR_COUNTER C set index i to ysize + +L(outer): + addl $4,%ebp C make ebp point to next y limb + movl PARAM_XSIZE,%ecx + negl %ecx + xorl %ebx,%ebx + + C code at 0x61 here, close enough to aligned +L(oop2): + adcl $0,%ebx + movl (%esi,%ecx,4),%eax + mull (%ebp) + addl %ebx,%eax + movl (%edi,%ecx,4),%ebx + adcl $0,%edx + addl %eax,%ebx + movl %ebx,(%edi,%ecx,4) + incl %ecx + movl %edx,%ebx + jnz L(oop2) + + adcl $0,%ebx + + movl %ebx,(%edi) + addl $4,%edi + movl VAR_COUNTER,%eax + decl %eax + movl %eax,VAR_COUNTER + jnz L(outer) + +L(skip): + popl %ebx + popl %edi + popl %ebp + popl %esi + addl $4,%esp + ret + +L(done): + movl %edx,4(%edi) C 
store to wp[1] + popl %edi + popl %ebp + popl %esi + popl %eax C dummy pop for deallocating stack slot + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/popcount.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/popcount.asm new file mode 100644 index 0000000..0e82144 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/popcount.asm @@ -0,0 +1,146 @@ +dnl Intel P5 mpn_popcount -- mpn bit population count. + +dnl Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 8.0 cycles/limb + + +C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); +C +C An arithmetic approach has been found to be slower than the table lookup, +C due to needing too many instructions. + +C The slightly strange quoting here helps the renaming done by tune/many.pl. 
+deflit(TABLE_NAME, +m4_assert_defined(`GSYM_PREFIX') +GSYM_PREFIX`'mpn_popcount``'_table') + +C FIXME: exporting the table to hamdist is incorrect as it hurt incremental +C linking. + + RODATA + ALIGN(8) + GLOBL TABLE_NAME +TABLE_NAME: +forloop(i,0,255, +` .byte m4_popcount(i) +') + +defframe(PARAM_SIZE,8) +defframe(PARAM_SRC, 4) + + TEXT + ALIGN(8) + +PROLOGUE(mpn_popcount) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %esi FRAME_pushl() + +ifdef(`PIC',` + pushl %ebx FRAME_pushl() + pushl %ebp FRAME_pushl() +ifdef(`DARWIN',` + shll %ecx C size in byte pairs + LEA( TABLE_NAME, %ebp) + movl PARAM_SRC, %esi + xorl %eax, %eax C total + xorl %ebx, %ebx C byte + xorl %edx, %edx C byte +',` + call L(here) +L(here): + popl %ebp + shll %ecx C size in byte pairs + + addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp + movl PARAM_SRC, %esi + + xorl %eax, %eax C total + xorl %ebx, %ebx C byte + + movl TABLE_NAME@GOT(%ebp), %ebp + xorl %edx, %edx C byte +') +define(TABLE,`(%ebp,$1)') +',` +dnl non-PIC + shll %ecx C size in byte pairs + movl PARAM_SRC, %esi + + pushl %ebx FRAME_pushl() + xorl %eax, %eax C total + + xorl %ebx, %ebx C byte + xorl %edx, %edx C byte + +define(TABLE,`TABLE_NAME`'($1)') +') + + + ALIGN(8) C necessary on P55 for claimed speed +L(top): + C eax total + C ebx byte + C ecx counter, 2*size to 2 + C edx byte + C esi src + C edi + C ebp [PIC] table + + addl %ebx, %eax + movb -1(%esi,%ecx,2), %bl + + addl %edx, %eax + movb -2(%esi,%ecx,2), %dl + + movb TABLE(%ebx), %bl + decl %ecx + + movb TABLE(%edx), %dl + jnz L(top) + + +ifdef(`PIC',` + popl %ebp +') + addl %ebx, %eax + popl %ebx + + addl %edx, %eax + popl %esi + + ret + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/rshift.asm new file mode 100644 index 0000000..2105c4c --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/rshift.asm @@ -0,0 +1,243 @@ +dnl Intel Pentium mpn_rshift -- mpn right shift. 
+ +dnl Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5,P54: 6.0 +C P55: 5.375 + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does, +C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_rshift) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%ebp + movl PARAM_SHIFT,%ecx + +C We can use faster code for shift-by-1 under certain conditions. 
+ cmp $1,%ecx + jne L(normal) + leal 4(%edi),%eax + cmpl %esi,%eax + jnc L(special) C jump if res_ptr + 1 >= s_ptr + leal (%edi,%ebp,4),%eax + cmpl %eax,%esi + jnc L(special) C jump if s_ptr >= res_ptr + size + +L(normal): + movl (%esi),%edx + addl $4,%esi + xorl %eax,%eax + shrdl( %cl, %edx, %eax) C compute carry limb + pushl %eax C push carry limb onto stack + + decl %ebp + pushl %ebp + shrl $3,%ebp + jz L(end) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(oop): movl 28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + shrdl( %cl, %eax, %ebx) + shrdl( %cl, %edx, %eax) + movl %ebx,(%edi) + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + shrdl( %cl, %ebx, %edx) + shrdl( %cl, %eax, %ebx) + movl %edx,8(%edi) + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + shrdl( %cl, %edx, %eax) + shrdl( %cl, %ebx, %edx) + movl %eax,16(%edi) + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + shrdl( %cl, %eax, %ebx) + shrdl( %cl, %edx, %eax) + movl %ebx,24(%edi) + movl %eax,28(%edi) + + addl $32,%esi + addl $32,%edi + decl %ebp + jnz L(oop) + +L(end): popl %ebp + andl $7,%ebp + jz L(end2) +L(oop2): + movl (%esi),%eax + shrdl( %cl,%eax,%edx) C compute result limb + movl %edx,(%edi) + movl %eax,%edx + addl $4,%esi + addl $4,%edi + decl %ebp + jnz L(oop2) + +L(end2): + shrl %cl,%edx C compute most significant limb + movl %edx,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + +C We loop from least significant end of the arrays, which is only +C permissable if the source and destination don't overlap, since the +C function is documented to work for overlapping source and destination. 
+ +L(special): + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + shrl %edx + incl %ebp + decl %ebp + jz L(Lend) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(Loop): + movl -28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + rcrl %eax + movl %ebx,(%edi) + rcrl %edx + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + rcrl %ebx + movl %edx,-8(%edi) + rcrl %eax + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + rcrl %edx + movl %eax,-16(%edi) + rcrl %ebx + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + rcrl %eax + movl %ebx,-24(%edi) + rcrl %edx + movl %eax,-28(%edi) + + leal -32(%esi),%esi C use leal not to clobber carry + leal -32(%edi),%edi + decl %ebp + jnz L(Loop) + +L(Lend): + popl %ebp + sbbl %eax,%eax C save carry in %eax + andl $7,%ebp + jz L(Lend2) + addl %eax,%eax C restore carry from eax +L(Loop2): + movl %edx,%ebx + movl (%esi),%edx + rcrl %edx + movl %ebx,(%edi) + + leal -4(%esi),%esi C use leal not to clobber carry + leal -4(%edi),%edi + decl %ebp + jnz L(Loop2) + + jmp L(L1) +L(Lend2): + addl %eax,%eax C restore carry from eax +L(L1): movl %edx,(%edi) C store last limb + + movl $0,%eax + rcrl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm new file mode 100644 index 0000000..b11d767 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm @@ -0,0 +1,528 @@ +dnl Intel P5 mpn_sqr_basecase -- square an mpn number. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular +C product at around 20x20 limbs. + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Calculate src,size squared, storing the result in dst,2*size. +C +C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a +C lot of function call overheads are avoided, especially when the size is +C small. 
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %edx + movl PARAM_SRC, %eax + + cmpl $2, %edx + movl PARAM_DST, %ecx + + je L(two_limbs) + + movl (%eax), %eax + ja L(three_or_more) + +C ----------------------------------------------------------------------------- +C one limb only + C eax src + C ebx + C ecx dst + C edx + + mull %eax + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + ret + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(two_limbs): + C eax src + C ebx + C ecx dst + C edx size + + pushl %ebp + pushl %edi + + pushl %esi + pushl %ebx + + movl %eax, %ebx + movl (%eax), %eax + + mull %eax C src[0]^2 + + movl %eax, (%ecx) C dst[0] + movl %edx, %esi C dst[1] + + movl 4(%ebx), %eax + + mull %eax C src[1]^2 + + movl %eax, %edi C dst[2] + movl %edx, %ebp C dst[3] + + movl (%ebx), %eax + + mull 4(%ebx) C src[0]*src[1] + + addl %eax, %esi + popl %ebx + + adcl %edx, %edi + + adcl $0, %ebp + addl %esi, %eax + + adcl %edi, %edx + movl %eax, 4(%ecx) + + adcl $0, %ebp + popl %esi + + movl %edx, 8(%ecx) + movl %ebp, 12(%ecx) + + popl %edi + popl %ebp + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(three_or_more): + C eax src low limb + C ebx + C ecx dst + C edx size + + cmpl $4, %edx + pushl %ebx +deflit(`FRAME',4) + + movl PARAM_SRC, %ebx + jae L(four_or_more) + + +C ----------------------------------------------------------------------------- +C three limbs + C eax src low limb + C ebx src + C ecx dst + C edx size + + pushl %ebp + pushl %edi + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + movl 4(%ebx), %eax + xorl %ebp, %ebp + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl %edx, 12(%ecx) + + movl 8(%ebx), %eax + pushl %esi C risk of cache bank clash + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl %edx, 
20(%ecx) + + movl (%ebx), %eax + + mull 4(%ebx) C src[0] * src[1] + + movl %eax, %esi + movl %edx, %edi + + movl (%ebx), %eax + + mull 8(%ebx) C src[0] * src[2] + + addl %eax, %edi + movl %edx, %ebp + + adcl $0, %ebp + movl 4(%ebx), %eax + + mull 8(%ebx) C src[1] * src[2] + + xorl %ebx, %ebx + addl %eax, %ebp + + C eax + C ebx zero, will be dst[5] + C ecx dst + C edx dst[4] + C esi dst[1] + C edi dst[2] + C ebp dst[3] + + adcl $0, %edx + addl %esi, %esi + + adcl %edi, %edi + + adcl %ebp, %ebp + + adcl %edx, %edx + movl 4(%ecx), %eax + + adcl $0, %ebx + addl %esi, %eax + + movl %eax, 4(%ecx) + movl 8(%ecx), %eax + + adcl %edi, %eax + movl 12(%ecx), %esi + + adcl %ebp, %esi + movl 16(%ecx), %edi + + movl %eax, 8(%ecx) + movl %esi, 12(%ecx) + + adcl %edx, %edi + popl %esi + + movl 20(%ecx), %eax + movl %edi, 16(%ecx) + + popl %edi + popl %ebp + + adcl %ebx, %eax C no carry out of this + popl %ebx + + movl %eax, 20(%ecx) + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(four_or_more): + C eax src low limb + C ebx src + C ecx dst + C edx size + C esi + C edi + C ebp + C + C First multiply src[0]*src[1..size-1] and store at dst[1..size]. + +deflit(`FRAME',4) + + pushl %edi +FRAME_pushl() + pushl %esi +FRAME_pushl() + + pushl %ebp +FRAME_pushl() + leal (%ecx,%edx,4), %edi C dst end of this mul1 + + leal (%ebx,%edx,4), %esi C src end + movl %ebx, %ebp C src + + negl %edx C -size + xorl %ebx, %ebx C clear carry limb and carry flag + + leal 1(%edx), %ecx C -(size-1) + +L(mul1): + C eax scratch + C ebx carry + C ecx counter, negative + C edx scratch + C esi &src[size] + C edi &dst[size] + C ebp src + + adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull (%ebp) + + addl %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(mul1) + + + C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for + C n=1..size-2. 
+ C + C The last two products, which are the end corner of the product + C triangle, are handled separately to save looping overhead. These + C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1]. + C If size is 4 then it's only these that need to be done. + C + C In the outer loop %esi is a constant, and %edi just advances by 1 + C limb each time. The size of the operation decreases by 1 limb + C each time. + + C eax + C ebx carry (needing carry flag added) + C ecx + C edx + C esi &src[size] + C edi &dst[size] + C ebp + + adcl $0, %ebx + movl PARAM_SIZE, %edx + + movl %ebx, (%edi) + subl $4, %edx + + negl %edx + jz L(corner) + + +L(outer): + C ebx previous carry limb to store + C edx outer loop counter (negative) + C esi &src[size] + C edi dst, pointing at stored carry limb of previous loop + + pushl %edx C new outer loop counter + leal -2(%edx), %ecx + + movl %ebx, (%edi) + addl $4, %edi + + addl $4, %ebp + xorl %ebx, %ebx C initial carry limb, clear carry flag + +L(inner): + C eax scratch + C ebx carry (needing carry flag added) + C ecx counter, negative + C edx scratch + C esi &src[size] + C edi dst end of this addmul + C ebp &src[j] + + adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull (%ebp) + + addl %ebx, %eax + movl (%edi,%ecx,4), %ebx + + adcl $0, %edx + addl %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(inner) + + + adcl $0, %ebx + popl %edx C outer loop counter + + incl %edx + jnz L(outer) + + + movl %ebx, (%edi) + +L(corner): + C esi &src[size] + C edi &dst[2*size-4] + + movl -8(%esi), %eax + movl -4(%edi), %ebx C risk of data cache bank clash here + + mull -12(%esi) C src[size-2]*src[size-3] + + addl %eax, %ebx + movl %edx, %ecx + + adcl $0, %ecx + movl -4(%esi), %eax + + mull -12(%esi) C src[size-1]*src[size-3] + + addl %ecx, %eax + movl (%edi), %ecx + + adcl $0, %edx + movl %ebx, -4(%edi) + + addl %eax, %ecx + movl %edx, %ebx + + adcl $0, %ebx + movl -4(%esi), %eax + + mull -8(%esi) C 
src[size-1]*src[size-2] + + movl %ecx, (%edi) + addl %eax, %ebx + + adcl $0, %edx + movl PARAM_SIZE, %eax + + negl %eax + movl %ebx, 4(%edi) + + addl $1, %eax C -(size-1) and clear carry + movl %edx, 8(%edi) + + +C ----------------------------------------------------------------------------- +C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. + +L(lshift): + C eax counter, negative + C ebx next limb + C ecx + C edx + C esi + C edi &dst[2*size-4] + C ebp + + movl 12(%edi,%eax,8), %ebx + + rcll %ebx + movl 16(%edi,%eax,8), %ecx + + rcll %ecx + movl %ebx, 12(%edi,%eax,8) + + movl %ecx, 16(%edi,%eax,8) + incl %eax + + jnz L(lshift) + + + adcl %eax, %eax C high bit out + movl PARAM_SRC, %esi + + movl PARAM_SIZE, %ecx C risk of cache bank clash + movl %eax, 12(%edi) C dst most significant limb + + +C ----------------------------------------------------------------------------- +C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the +C low limb of src[0]^2. + + movl (%esi), %eax C src[0] + leal (%esi,%ecx,4), %esi C src end + + negl %ecx + + mull %eax + + movl %eax, 16(%edi,%ecx,8) C dst[0] + movl %edx, %ebx + + addl $1, %ecx C size-1 and clear carry + +L(diag): + C eax scratch (low product) + C ebx carry limb + C ecx counter, negative + C edx scratch (high product) + C esi &src[size] + C edi &dst[2*size-4] + C ebp scratch (fetched dst limbs) + + movl (%esi,%ecx,4), %eax + adcl $0, %ebx + + mull %eax + + movl 16-4(%edi,%ecx,8), %ebp + + addl %ebp, %ebx + movl 16(%edi,%ecx,8), %ebp + + adcl %eax, %ebp + movl %ebx, 16-4(%edi,%ecx,8) + + movl %ebp, 16(%edi,%ecx,8) + incl %ecx + + movl %edx, %ebx + jnz L(diag) + + + adcl $0, %edx + movl 16-4(%edi), %eax C dst most significant limb + + addl %eax, %edx + popl %ebp + + movl %edx, 16-4(%edi) + popl %esi C risk of cache bank clash + + popl %edi + popl %ebx + + ret + +EPILOGUE() |