author     Thomas Voss <mail@thomasvoss.com>    2024-06-21 23:36:36 +0200
committer  Thomas Voss <mail@thomasvoss.com>    2024-06-21 23:42:26 +0200
commit     a89a14ef5da44684a16b204e7a70460cc8c4922a (patch)
tree       b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/x86/pentium4
parent     1db63fcedab0b288820d66e100b1877b1a5a8851 (diff)
Basic constant folding implementation
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/x86/pentium4')
27 files changed, 5649 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/README b/vendor/gmp-6.3.0/mpn/x86/pentium4/README new file mode 100644 index 0000000..90f752e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/README @@ -0,0 +1,124 @@ +Copyright 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + INTEL PENTIUM-4 MPN SUBROUTINES + + +This directory contains mpn functions optimized for Intel Pentium-4. + +The mmx subdirectory has routines using MMX instructions, the sse2 +subdirectory has routines using SSE2 instructions. All P4s have these, the +separate directories are just so configure can omit that code if the +assembler doesn't support it. + + +STATUS + + cycles/limb + + mpn_add_n/sub_n 4 normal, 6 in-place + + mpn_mul_1 4 normal, 6 in-place + mpn_addmul_1 6 + mpn_submul_1 7 + + mpn_mul_basecase 6 cycles/crossproduct (approx) + + mpn_sqr_basecase 3.5 cycles/crossproduct (approx) + or 7.0 cycles/triangleproduct (approx) + + mpn_l/rshift 1.75 + + + +The shifts ought to be able to go at 1.5 c/l, but not much effort has been +applied to them yet. + +In-place operations, and all addmul, submul, mul_basecase and sqr_basecase +calls, suffer from pipeline anomalies associated with write combining and +movd reads and writes to the same or nearby locations. The movq +instructions do not trigger the same hardware problems. Unfortunately, +using movq and splitting/combining seems to require too many extra +instructions to help. Perhaps future chip steppings will be better. + + + +NOTES + +The Pentium-4 pipeline "Netburst", provides for quite a number of surprises. +Many traditional x86 instructions run very slowly, requiring use of +alterative instructions for acceptable performance. + +adcl and sbbl are quite slow at 8 cycles for reg->reg. paddq of 32-bits +within a 64-bit mmx register seems better, though the combination +paddq/psrlq when propagating a carry is still a 4 cycle latency. + +incl and decl should be avoided, instead use add $1 and sub $1. Apparently +the carry flag is not separately renamed, so incl and decl depend on all +previous flags-setting instructions. + +shll and shrl have a 4 cycle latency, or 8 times the latency of the fastest +integer instructions (addl, subl, orl, andl, and some more). shldl and +shrdl seem to have 13 and 15 cycles latency, respectively. Bizarre. + +movq mmx -> mmx does have 6 cycle latency, as noted in the documentation. +pxor/por or similar combination at 2 cycles latency can be used instead. +The movq however executes in the float unit, thereby saving MMX execution +resources. 
With the right juggling, data moves shouldn't be on a dependent +chain. + +L1 is write-through, but the write-combining sounds like it does enough to +not require explicit destination prefetching. + +xmm registers so far haven't found a use, but not much effort has been +expended. A configure test for whether the operating system knows +fxsave/fxrestor will be needed if they're used. + + + +REFERENCES + +Intel Pentium-4 processor manuals, + + http://developer.intel.com/design/pentium4/manuals + +"Intel Pentium 4 Processor Optimization Reference Manual", Intel, 2001, +order number 248966. Available on-line: + + http://developer.intel.com/design/pentium4/manuals/248966.htm + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/copyd.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/copyd.asm new file mode 100644 index 0000000..82af81c --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/copyd.asm @@ -0,0 +1,71 @@ +dnl Pentium-4 mpn_copyd -- copy limb vector, decrementing. + +dnl Copyright 1999-2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl The std/rep/movsl/cld is very slow for small blocks on pentium4. Its +dnl startup time seems to be about 165 cycles. It then needs 2.6 c/l. +dnl We therefore use an open-coded 2 c/l copying loop. + +dnl Ultimately, we may want to use 64-bit movq or 128-bit movdqu in some +dnl nifty unrolled arrangement. Clearly, that could reach much higher +dnl speeds, at least for large blocks. + +include(`../config.m4') + + +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) + +PROLOGUE(mpn_copyd) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + movl %ebx, PARAM_SIZE + addl $-1, %ecx + js L(end) + +L(loop): + movl (%eax,%ecx,4), %ebx + movl %ebx, (%edx,%ecx,4) + addl $-1, %ecx + + jns L(loop) +L(end): + movl PARAM_SIZE, %ebx + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/copyi.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/copyi.asm new file mode 100644 index 0000000..b614887 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/copyi.asm @@ -0,0 +1,93 @@ +dnl Pentium-4 mpn_copyi -- copy limb vector, incrementing. + +dnl Copyright 1999-2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl The rep/movsl is very slow for small blocks on pentium4. Its startup +dnl time seems to be about 110 cycles. It then copies at a rate of one +dnl limb per cycle. We therefore fall back to an open-coded 2 c/l copying +dnl loop for smaller sizes. + +dnl Ultimately, we may want to use 64-bit movd or 128-bit movdqu in some +dnl nifty unrolled arrangement. Clearly, that could reach much higher +dnl speeds, at least for large blocks. + +include(`../config.m4') + + +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) + +PROLOGUE(mpn_copyi) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + cmpl $150, %ecx + jg L(replmovs) + + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + movl %ebx, PARAM_SIZE + testl %ecx, %ecx + jz L(end) + +L(loop): + movl (%eax), %ebx + leal 4(%eax), %eax + addl $-1, %ecx + movl %ebx, (%edx) + leal 4(%edx), %edx + + jnz L(loop) + +L(end): + movl PARAM_SIZE, %ebx + ret + +L(replmovs): + cld C better safe than sorry, see mpn/x86/README + + movl %esi, %eax + movl PARAM_SRC, %esi + movl %edi, %edx + movl PARAM_DST, %edi + + rep + movsl + + movl %eax, %esi + movl %edx, %edi + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/lshift.asm new file mode 100644 index 0000000..b5eca66 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/lshift.asm @@ -0,0 +1,39 @@ +dnl Intel Pentium-4 mpn_lshift -- left shift. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C P4 Willamette, Northwood: 1.75 cycles/limb +C P4 Prescott: 2.0 cycles/limb + + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86/pentium/mmx/lshift.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/popham.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/popham.asm new file mode 100644 index 0000000..9563cb5 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/popham.asm @@ -0,0 +1,203 @@ +dnl Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and +dnl hamming distance. + +dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C popcount hamdist +C P3 model 9 (Banias) ? ? +C P3 model 13 (Dothan) 6 6 +C P4 model 0 (Willamette) +C P4 model 1 (?) +C P4 model 2 (Northwood) 8 9 +C P4 model 3 (Prescott) 8 9 +C P4 model 4 (Nocona) + +C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); +C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); +C +C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided. +C Two movd's and a punpckldq seems to be the same speed as an aligned movq, +C and using them saves fiddling about with alignment testing on entry. +C +C For popcount there's 13 mmx instructions in the loop, so perhaps 6.5 c/l +C might be possible, but 8 c/l relying on out-of-order execution is already +C quite reasonable. 
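[Editorial note: the reduction described in the comments above is the standard SWAR population count: fold bits into 2-bit sums with the 0xAA... mask, then 4-bit sums with 0x33..., then byte sums with 0x0F..., and finally add the bytes (psadbw against zero in the loop below). As a rough portable-C illustration of the same idea on a 64-bit chunk -- a sketch only, not GMP code, and the helper name is made up:

    #include <stdint.h>

    /* SWAR popcount of a 64-bit chunk, mirroring the mask sequence used
       by the MMX loop (bit pairs -> nibbles -> bytes).  The final byte
       sum uses a multiply where the assembly uses psadbw.  */
    static unsigned
    swar_popcount64 (uint64_t x)
    {
      x -= (x & 0xAAAAAAAAAAAAAAAAULL) >> 1;                  /* 2-bit counts */
      x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
      x = (x & 0x0F0F0F0F0F0F0F0FULL) + ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);
      return (unsigned) ((x * 0x0101010101010101ULL) >> 56);  /* sum bytes */
    }
]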
+ +ifdef(`OPERATION_popcount',, +`ifdef(`OPERATION_hamdist',, +`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined +')')') + +define(HAM, +m4_assert_numargs(1) +`ifdef(`OPERATION_hamdist',`$1')') + +define(POP, +m4_assert_numargs(1) +`ifdef(`OPERATION_popcount',`$1')') + +HAM(` +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC2, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_hamdist) +') +POP(` +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_popcount) +') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + + +ifdef(`PIC',,` + dnl non-PIC + RODATA + ALIGN(8) +L(rodata_AAAAAAAAAAAAAAAA): + .long 0xAAAAAAAA + .long 0xAAAAAAAA +L(rodata_3333333333333333): + .long 0x33333333 + .long 0x33333333 +L(rodata_0F0F0F0F0F0F0F0F): + .long 0x0F0F0F0F + .long 0x0F0F0F0F +') + + TEXT + ALIGN(16) + +PROLOGUE(M4_function) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %eax + +ifdef(`PIC',` + movl $0xAAAAAAAA, %edx + movd %edx, %mm7 + punpckldq %mm7, %mm7 + + movl $0x33333333, %edx + movd %edx, %mm6 + punpckldq %mm6, %mm6 + + movl $0x0F0F0F0F, %edx + movd %edx, %mm5 + punpckldq %mm5, %mm5 + +HAM(` movl PARAM_SRC2, %edx') + +',` + dnl non-PIC +HAM(` movl PARAM_SRC2, %edx') + movq L(rodata_AAAAAAAAAAAAAAAA), %mm7 + movq L(rodata_3333333333333333), %mm6 + movq L(rodata_0F0F0F0F0F0F0F0F), %mm5 +') + + pxor %mm4, %mm4 C zero + pxor %mm0, %mm0 C total + + subl $1, %ecx + ja L(top) + +L(last): + movd (%eax,%ecx,4), %mm1 C src high limb +HAM(` movd (%edx,%ecx,4), %mm2 + pxor %mm2, %mm1 +') + jmp L(loaded) + + +L(top): + C eax src + C ebx + C ecx counter, size-1 to 2 or 1, inclusive + C edx [hamdist] src2 + C + C mm0 total (low dword) + C mm1 (scratch) + C mm2 (scratch) + C mm3 + C mm4 0x0000000000000000 + C mm5 0x0F0F0F0F0F0F0F0F + C mm6 0x3333333333333333 + C mm7 0xAAAAAAAAAAAAAAAA + + movd (%eax), %mm1 + movd 4(%eax), %mm2 + punpckldq %mm2, %mm1 + addl $8, %eax + +HAM(` movd (%edx), %mm2 + movd 4(%edx), %mm3 + punpckldq %mm3, %mm2 + pxor %mm2, %mm1 + addl $8, %edx +') + +L(loaded): + movq %mm7, %mm2 + pand %mm1, %mm2 + psrlq $1, %mm2 + psubd %mm2, %mm1 C bit pairs + + movq %mm6, %mm2 + pand %mm1, %mm2 + psrlq $2, %mm1 + pand %mm6, %mm1 + paddd %mm2, %mm1 C nibbles + + movq %mm5, %mm2 + pand %mm1, %mm2 + psrlq $4, %mm1 + pand %mm5, %mm1 + paddd %mm2, %mm1 C bytes + + psadbw( %mm4, %mm1) + paddd %mm1, %mm0 C to total + + subl $2, %ecx + jg L(top) + + C ecx is 0 or -1 representing respectively 1 or 0 further limbs + jz L(last) + + + movd %mm0, %eax + emms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/rshift.asm new file mode 100644 index 0000000..3ac0094 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/rshift.asm @@ -0,0 +1,39 @@ +dnl Intel Pentium-4 mpn_rshift -- right shift. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P4 Willamette, Northwood: 1.75 cycles/limb +C P4 Prescott: 2.0 cycles/limb + + +MULFUNC_PROLOGUE(mpn_rshift) +include_mpn(`x86/pentium/mmx/rshift.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm new file mode 100644 index 0000000..8e2380e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm @@ -0,0 +1,101 @@ +dnl Intel Pentium-4 mpn_add_n -- mpn addition. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C dst!=src1,2 dst==src1 dst==src2 +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) ? +C P4 model 0-1 (Willamette) ? 
+C P4 model 2 (Northwood) 4 6 6 +C P4 model 3-4 (Prescott) 4.25 7.5 7.5 + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(SAVE_EBX,`PARAM_SRC1') + + TEXT + ALIGN(8) + +PROLOGUE(mpn_add_nc) +deflit(`FRAME',0) + movd PARAM_CARRY, %mm0 + jmp L(start_nc) +EPILOGUE() + + ALIGN(8) +PROLOGUE(mpn_add_n) +deflit(`FRAME',0) + pxor %mm0, %mm0 +L(start_nc): + mov PARAM_SRC1, %eax + mov %ebx, SAVE_EBX + mov PARAM_SRC2, %ebx + mov PARAM_DST, %edx + mov PARAM_SIZE, %ecx + + lea (%eax,%ecx,4), %eax C src1 end + lea (%ebx,%ecx,4), %ebx C src2 end + lea (%edx,%ecx,4), %edx C dst end + neg %ecx C -size + +L(top): + C eax src1 end + C ebx src2 end + C ecx counter, limbs, negative + C edx dst end + C mm0 carry bit + + movd (%eax,%ecx,4), %mm1 + movd (%ebx,%ecx,4), %mm2 + paddq %mm2, %mm1 + + paddq %mm1, %mm0 + movd %mm0, (%edx,%ecx,4) + + psrlq $32, %mm0 + + add $1, %ecx + jnz L(top) + + movd %mm0, %eax + mov SAVE_EBX, %ebx + emms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm new file mode 100644 index 0000000..93b63b2 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm @@ -0,0 +1,108 @@ +dnl Intel Pentium-4 mpn_addlsh1_n -- mpn x+2*y. + +dnl Copyright 2001-2004, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C dst!=src1,2 dst==src1 dst==src2 +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) ? +C P4 model 0-1 (Willamette) ? +C P4 model 2 (Northwood) 4.25 6 6 +C P4 model 3-4 (Prescott) 5 8.5 8.5 + +C The slightly strange combination of indexing and pointer incrementing +C that's used seems to work best. Not sure why, but %ecx,4 with src1 and/or +C src2 is a slowdown. +C +C The dependent chain is simply the paddq of x+2*y to the previous carry, +C then psrlq to get the new carry. That makes 4 c/l the target speed, which +C is almost achieved for separate src/dst but when src==dst the write +C combining anomalies slow it down. 
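[Editorial note: in other words, each iteration folds x[i] + 2*y[i] and the previous carry into one 64-bit accumulator, stores the low half, and the psrlq of the high half becomes the next carry. A minimal C model of that carry chain, assuming 32-bit limbs (illustration only; the real mpn_addlsh1_n is the assembly below):

    #include <stddef.h>
    #include <stdint.h>

    /* Model of r = x + 2*y with carry propagated through a 64-bit
       accumulator, as in the loop below.  Returns the final carry.  */
    static uint32_t
    addlsh1_model (uint32_t *rp, const uint32_t *xp, const uint32_t *yp,
                   size_t n)
    {
      uint64_t acc = 0;                     /* carry, like %mm0 */
      for (size_t i = 0; i < n; i++)
        {
          acc += (uint64_t) xp[i] + 2 * (uint64_t) yp[i];
          rp[i] = (uint32_t) acc;           /* low 32 bits */
          acc >>= 32;                       /* next carry, 0..2 */
        }
      return (uint32_t) acc;
    }
]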
+ +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(SAVE_EBX,`PARAM_SRC1') + + TEXT + ALIGN(8) + +PROLOGUE(mpn_addlsh1_n) +deflit(`FRAME',0) + + mov PARAM_SRC1, %eax + mov %ebx, SAVE_EBX + + mov PARAM_SRC2, %ebx + pxor %mm0, %mm0 C initial carry + + mov PARAM_DST, %edx + + mov PARAM_SIZE, %ecx + + lea (%edx,%ecx,4), %edx C dst end + neg %ecx C -size + +L(top): + C eax src1 end + C ebx src2 end + C ecx counter, limbs, negative + C edx dst end + C mm0 carry + + movd (%ebx), %mm2 + movd (%eax), %mm1 + psrlq $32, %mm0 + lea 4(%eax), %eax + lea 4(%ebx), %ebx + + psllq $1, %mm2 + paddq %mm2, %mm1 + + paddq %mm1, %mm0 + + movd %mm0, (%edx,%ecx,4) + add $1, %ecx + jnz L(top) + + + psrlq $32, %mm0 + mov SAVE_EBX, %ebx + movd %mm0, %eax + emms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm new file mode 100644 index 0000000..7810207 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm @@ -0,0 +1,189 @@ +dnl mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). + +dnl Copyright 2005, 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +include(`../config.m4') + +C cycles/limb +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) 5.24 +C P6 model 13 (Dothan) 5.24 +C P4 model 0-1 (Willamette) 5 +C P4 model 2 (Northwood) 5 +C P4 model 3-4 (Prescott) 5 + +C TODO: +C * Tweak eax/edx offsets in loop as to save some lea's +C * Perhaps software pipeline small-case code + +C INPUT PARAMETERS +C rp sp + 4 +C up sp + 8 +C n sp + 12 +C v0 sp + 16 + + TEXT + ALIGN(16) +PROLOGUE(mpn_addmul_1) + pxor %mm6, %mm6 +L(ent): mov 4(%esp), %edx + mov 8(%esp), %eax + mov 12(%esp), %ecx + movd 16(%esp), %mm7 + cmp $4, %ecx + jnc L(big) + +L(lp0): movd (%eax), %mm0 + lea 4(%eax), %eax + movd (%edx), %mm4 + lea 4(%edx), %edx + pmuludq %mm7, %mm0 + paddq %mm0, %mm4 + paddq %mm4, %mm6 + movd %mm6, -4(%edx) + psrlq $32, %mm6 + dec %ecx + jnz L(lp0) + movd %mm6, %eax + emms + ret + +L(big): and $3, %ecx + je L(0) + cmp $2, %ecx + jc L(1) + je L(2) + jmp L(3) C FIXME: one case should fall through + +L(0): movd (%eax), %mm3 + sub 12(%esp), %ecx C loop count + lea -16(%eax), %eax + lea -12(%edx), %edx + pmuludq %mm7, %mm3 + movd 20(%eax), %mm0 + movd 12(%edx), %mm5 + pmuludq %mm7, %mm0 + movd 24(%eax), %mm1 + paddq %mm3, %mm5 + movd 16(%edx), %mm4 + jmp L(00) + +L(1): movd (%eax), %mm2 + sub 12(%esp), %ecx + lea -12(%eax), %eax + lea -8(%edx), %edx + movd 8(%edx), %mm4 + pmuludq %mm7, %mm2 + movd 16(%eax), %mm3 + pmuludq %mm7, %mm3 + movd 20(%eax), %mm0 + paddq %mm2, %mm4 + movd 12(%edx), %mm5 + jmp L(01) + +L(2): movd (%eax), %mm1 + sub 12(%esp), %ecx + lea -8(%eax), %eax + lea -4(%edx), %edx + pmuludq %mm7, %mm1 + movd 12(%eax), %mm2 + movd 4(%edx), %mm5 + pmuludq %mm7, %mm2 + movd 16(%eax), %mm3 + paddq %mm1, %mm5 + movd 8(%edx), %mm4 + jmp L(10) + +L(3): movd (%eax), %mm0 + sub 12(%esp), %ecx + lea -4(%eax), %eax + pmuludq %mm7, %mm0 + movd 8(%eax), %mm1 + movd (%edx), %mm4 + pmuludq %mm7, %mm1 + movd 12(%eax), %mm2 + paddq %mm0, %mm4 + movd 4(%edx), %mm5 + + ALIGN(16) +L(top): pmuludq %mm7, %mm2 + paddq %mm4, %mm6 + movd 16(%eax), %mm3 + paddq %mm1, %mm5 + movd 8(%edx), %mm4 + movd %mm6, 0(%edx) + psrlq $32, %mm6 +L(10): pmuludq %mm7, %mm3 + paddq %mm5, %mm6 + movd 20(%eax), %mm0 + paddq %mm2, %mm4 + movd 12(%edx), %mm5 + movd %mm6, 4(%edx) + psrlq $32, %mm6 +L(01): pmuludq %mm7, %mm0 + paddq %mm4, %mm6 + movd 24(%eax), %mm1 + paddq %mm3, %mm5 + movd 16(%edx), %mm4 + movd %mm6, 8(%edx) + psrlq $32, %mm6 +L(00): pmuludq %mm7, %mm1 + paddq %mm5, %mm6 + movd 28(%eax), %mm2 + paddq %mm0, %mm4 + movd 20(%edx), %mm5 + movd %mm6, 12(%edx) + psrlq $32, %mm6 + lea 16(%eax), %eax + lea 16(%edx), %edx + add $4, %ecx + jnz L(top) + +L(end): pmuludq %mm7, %mm2 + paddq %mm4, %mm6 + paddq %mm1, %mm5 + movd 8(%edx), %mm4 + movd %mm6, 0(%edx) + psrlq $32, %mm6 + paddq %mm5, %mm6 + paddq %mm2, %mm4 + movd %mm6, 4(%edx) + psrlq $32, %mm6 + paddq %mm4, %mm6 + movd %mm6, 8(%edx) + psrlq $32, %mm6 + movd %mm6, %eax + emms + ret +EPILOGUE() +PROLOGUE(mpn_addmul_1c) + movd 20(%esp), %mm6 + jmp L(ent) +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm new file mode 100644 index 0000000..354300e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm @@ -0,0 +1,141 @@ +dnl Intel Atom mpn_bdiv_dbm1. + +dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C cycles/limb +C P5 - +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) 9.75 +C P6 model 13 (Dothan) +C P4 model 0 (Willamette) +C P4 model 1 (?) +C P4 model 2 (Northwood) 8.25 +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C Intel Atom 8 +C AMD K6 - +C AMD K7 - +C AMD K8 +C AMD K10 + +C TODO: This code was optimised for atom-32, consider moving it back to atom +C dir(atom currently grabs this code), and write a 4-way version(7c/l). + +defframe(PARAM_CARRY,20) +defframe(PARAM_MUL, 16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(SAVE_RP,`PARAM_MUL') +define(SAVE_UP,`PARAM_SIZE') + +define(`rp', `%edi') +define(`up', `%esi') +define(`n', `%ecx') +define(`reg', `%edx') +define(`cy', `%eax') C contains the return value + +ASM_START() + TEXT + ALIGN(16) +deflit(`FRAME',0) + +PROLOGUE(mpn_bdiv_dbm1c) + mov PARAM_SIZE, n C size + mov up, SAVE_UP + mov PARAM_SRC, up + movd PARAM_MUL, %mm7 + mov rp, SAVE_RP + mov PARAM_DST, rp + + movd (up), %mm0 + pmuludq %mm7, %mm0 + shr n + mov PARAM_CARRY, cy + jz L(eq1) + + movd 4(up), %mm1 + jc L(odd) + + lea 4(up), up + pmuludq %mm7, %mm1 + movd %mm0, reg + psrlq $32, %mm0 + sub reg, cy + movd %mm0, reg + movq %mm1, %mm0 + dec n + mov cy, (rp) + lea 4(rp), rp + jz L(end) + +C ALIGN(16) +L(top): movd 4(up), %mm1 + sbb reg, cy +L(odd): movd %mm0, reg + psrlq $32, %mm0 + pmuludq %mm7, %mm1 + sub reg, cy + lea 8(up), up + movd %mm0, reg + movd (up), %mm0 + mov cy, (rp) + sbb reg, cy + movd %mm1, reg + psrlq $32, %mm1 + sub reg, cy + movd %mm1, reg + pmuludq %mm7, %mm0 + dec n + mov cy, 4(rp) + lea 8(rp), rp + jnz L(top) + +L(end): sbb reg, cy + +L(eq1): movd %mm0, reg + psrlq $32, %mm0 + mov SAVE_UP, up + sub reg, cy + movd %mm0, reg + emms + mov cy, (rp) + sbb reg, cy + + mov SAVE_RP, rp + ret +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm new file mode 100644 index 0000000..d5008f4 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm @@ -0,0 +1,234 @@ +dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division. + +dnl Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato. + +dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P4: 19.0 cycles/limb + +C Pairs of movd's are used to avoid unaligned loads. Despite the loads not +C being on the dependent chain and there being plenty of cycles available, +C using an unaligned movq on every second iteration measured about 23 c/l. +C + +defframe(PARAM_SHIFT, 24) +defframe(PARAM_INVERSE,20) +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + +C mp_limb_t +C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t inverse, int shift) + ALIGN(32) +PROLOGUE(mpn_pi1_bdiv_q_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %edx + + movl PARAM_SRC, %eax + + movl PARAM_DIVISOR, %ecx + + movd %ecx, %mm6 + movl PARAM_SHIFT, %ecx + + movd %ecx, %mm7 C shift + + C + + movl PARAM_INVERSE, %ecx + movd %ecx, %mm5 C inv + + movl PARAM_DST, %ecx + pxor %mm1, %mm1 C initial carry limb + pxor %mm0, %mm0 C initial carry bit + + subl $1, %edx + jz L(done) + + pcmpeqd %mm4, %mm4 + psrlq $32, %mm4 C 0x00000000FFFFFFFF + +C The dependent chain here is as follows. +C +C latency +C psubq s = (src-cbit) - climb 2 +C pmuludq q = s*inverse 8 +C pmuludq prod = q*divisor 8 +C psrlq climb = high(prod) 2 +C -- +C 20 +C +C Yet the loop measures 19.0 c/l, so obviously there's something gained +C there over a straight reading of the chip documentation. 
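[Editorial note: spelled out in C, the chain above is the usual exact-division recurrence: subtract the running borrow from the source limb, multiply by the precomputed inverse of the odd divisor to get the quotient limb, and take the high half of q*d as the next borrow. A sketch under those assumptions (not GMP's C code; the shift handling of even divisors is omitted and the helper name is made up):

    #include <stddef.h>
    #include <stdint.h>

    /* Exact division by an odd 32-bit d with inverse = d^-1 mod 2^32,
       modelling the psubq/pmuludq/psrlq chain of the loop below.  */
    static void
    bdiv_q_1_model (uint32_t *qp, const uint32_t *up, size_t n,
                    uint32_t d, uint32_t inverse)
    {
      uint32_t climb = 0;                       /* borrow limb */
      uint32_t cbit  = 0;                       /* borrow bit  */
      for (size_t i = 0; i < n; i++)
        {
          uint64_t s = (uint64_t) up[i] - cbit - climb;
          cbit  = (uint32_t) (s >> 63);         /* subtraction wrapped? */
          uint32_t q = (uint32_t) s * inverse;
          qp[i] = q;
          climb = (uint32_t) (((uint64_t) q * d) >> 32);
        }
    }
]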
+ +L(top): + C eax src, incrementing + C ebx + C ecx dst, incrementing + C edx counter, size-1 iterations + C + C mm0 carry bit + C mm1 carry limb + C mm4 0x00000000FFFFFFFF + C mm5 inverse + C mm6 divisor + C mm7 shift + + movd (%eax), %mm2 + movd 4(%eax), %mm3 + addl $4, %eax + punpckldq %mm3, %mm2 + + psrlq %mm7, %mm2 + pand %mm4, %mm2 C src + psubq %mm0, %mm2 C src - cbit + + psubq %mm1, %mm2 C src - cbit - climb + movq %mm2, %mm0 + psrlq $63, %mm0 C new cbit + + pmuludq %mm5, %mm2 C s*inverse + movd %mm2, (%ecx) C q + addl $4, %ecx + + movq %mm6, %mm1 + pmuludq %mm2, %mm1 C q*divisor + psrlq $32, %mm1 C new climb + +L(entry): + subl $1, %edx + jnz L(top) + +L(done): + movd (%eax), %mm2 + psrlq %mm7, %mm2 C src + psubq %mm0, %mm2 C src - cbit + + psubq %mm1, %mm2 C src - cbit - climb + + pmuludq %mm5, %mm2 C s*inverse + movd %mm2, (%ecx) C q + + emms + ret + +EPILOGUE() + + ALIGN(16) +C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C +PROLOGUE(mpn_bdiv_q_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %edx + + movl PARAM_DIVISOR, %ecx + + C eax src + C ebx + C ecx divisor + C edx size-1 + + movl %ecx, %eax + bsfl %ecx, %ecx C trailing twos + + shrl %cl, %eax C d = divisor without twos + movd %eax, %mm6 + movd %ecx, %mm7 C shift + + shrl %eax C d/2 + + andl $127, %eax C d/2, 7 bits + +ifdef(`PIC',` + LEA( binvert_limb_table, %ecx) + movzbl (%eax,%ecx), %eax C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + + C + + movd %eax, %mm5 C inv + + movd %eax, %mm0 C inv + + pmuludq %mm5, %mm5 C inv*inv + + C + + pmuludq %mm6, %mm5 C inv*inv*d + paddd %mm0, %mm0 C 2*inv + + C + + psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d + pxor %mm5, %mm5 + + paddd %mm0, %mm5 + pmuludq %mm0, %mm0 C inv*inv + + pcmpeqd %mm4, %mm4 + psrlq $32, %mm4 C 0x00000000FFFFFFFF + + C + + pmuludq %mm6, %mm0 C inv*inv*d + paddd %mm5, %mm5 C 2*inv + + movl PARAM_SRC, %eax + movl PARAM_DST, %ecx + pxor %mm1, %mm1 C initial carry limb + + C + + psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + movq %mm6, %mm0 + pmuludq %mm5, %mm0 + movd %mm0, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + pxor %mm0, %mm0 C initial carry bit + jmp L(entry) + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm new file mode 100644 index 0000000..b3f3474 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm @@ -0,0 +1,95 @@ +dnl Intel Pentium-4 mpn_cnd_add_n -- mpn addition. + +dnl Copyright 2001, 2002, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) 4.67 +C P4 model 0-1 (Willamette) ? +C P4 model 2 (Northwood) 5 +C P4 model 3-4 (Prescott) 5.25 + +defframe(PARAM_SIZE, 20) +defframe(PARAM_SRC2, 16) +defframe(PARAM_SRC1, 12) +defframe(PARAM_DST, 8) +defframe(PARAM_CND, 4) + +dnl re-use parameter space +define(SAVE_EBX,`PARAM_SRC1') + +define(`cnd', `%mm3') + + TEXT + ALIGN(8) + + ALIGN(8) +PROLOGUE(mpn_cnd_add_n) +deflit(`FRAME',0) + pxor %mm0, %mm0 + + mov PARAM_CND, %eax + neg %eax + sbb %eax, %eax + movd %eax, cnd + + mov PARAM_SRC1, %eax + mov %ebx, SAVE_EBX + mov PARAM_SRC2, %ebx + mov PARAM_DST, %edx + mov PARAM_SIZE, %ecx + + lea (%eax,%ecx,4), %eax C src1 end + lea (%ebx,%ecx,4), %ebx C src2 end + lea (%edx,%ecx,4), %edx C dst end + neg %ecx C -size + +L(top): movd (%ebx,%ecx,4), %mm2 + movd (%eax,%ecx,4), %mm1 + pand cnd, %mm2 + paddq %mm2, %mm1 + + paddq %mm1, %mm0 + movd %mm0, (%edx,%ecx,4) + + psrlq $32, %mm0 + + add $1, %ecx + jnz L(top) + + movd %mm0, %eax + mov SAVE_EBX, %ebx + emms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm new file mode 100644 index 0000000..339a23e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm @@ -0,0 +1,114 @@ +dnl Intel Pentium-4 mpn_cnd_sub_n -- mpn subtraction. + +dnl Copyright 2001, 2002, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) 4.67 +C P4 model 0-1 (Willamette) ? 
+C P4 model 2 (Northwood) 5 +C P4 model 3-4 (Prescott) 5.25 + +defframe(PARAM_SIZE, 20) +defframe(PARAM_SRC2, 16) +defframe(PARAM_SRC1, 12) +defframe(PARAM_DST, 8) +defframe(PARAM_CND, 4) + +dnl re-use parameter space +define(SAVE_EBX,`PARAM_SRC1') + +define(`cnd', `%mm3') + + TEXT + ALIGN(8) + + ALIGN(8) +PROLOGUE(mpn_cnd_sub_n) +deflit(`FRAME',0) + pxor %mm0, %mm0 + + mov PARAM_CND, %eax + neg %eax + sbb %eax, %eax + movd %eax, cnd + + mov PARAM_SRC1, %eax + mov %ebx, SAVE_EBX + mov PARAM_SRC2, %ebx + mov PARAM_DST, %edx + mov PARAM_SIZE, %ecx + + lea (%eax,%ecx,4), %eax C src1 end + lea (%ebx,%ecx,4), %ebx C src2 end + lea (%edx,%ecx,4), %edx C dst end + neg %ecx C -size + +L(top): movd (%ebx,%ecx,4), %mm2 + movd (%eax,%ecx,4), %mm1 + pand cnd, %mm2 + psubq %mm2, %mm1 + + psubq %mm0, %mm1 + movd %mm1, (%edx,%ecx,4) + + psrlq $63, %mm1 + + add $1, %ecx + jz L(done_mm1) + + movd (%ebx,%ecx,4), %mm2 + movd (%eax,%ecx,4), %mm0 + pand cnd, %mm2 + psubq %mm2, %mm0 + + psubq %mm1, %mm0 + movd %mm0, (%edx,%ecx,4) + + psrlq $63, %mm0 + + add $1, %ecx + jnz L(top) + + movd %mm0, %eax + mov SAVE_EBX, %ebx + emms + ret + +L(done_mm1): + movd %mm1, %eax + mov SAVE_EBX, %ebx + emms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm new file mode 100644 index 0000000..0ceef5b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm @@ -0,0 +1,216 @@ +dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P4: 19.0 cycles/limb + + +C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C +C Pairs of movd's are used to avoid unaligned loads. Despite the loads not +C being on the dependent chain and there being plenty of cycles available, +C using an unaligned movq on every second iteration measured about 23 c/l. +C +C Using divl for size==1 seems a touch quicker than mul-by-inverse. The mul +C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of +C that might be hidden by out-of-order execution, whereas divl is around 60. +C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul +C faster. 
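[Editorial note: the mul-by-inverse path, here and in mpn_pi1_bdiv_q_1 above, builds the divisor's inverse mod 2^32 from an 8-bit seed in binvert_limb_table, doubling the number of correct low bits with each Newton step inv = 2*inv - inv*inv*d; that is what the pairs of pmuludq/paddd/psubd below compute. A hedged C sketch of the same construction (the table's contents are assumed, not reproduced, and the function name is made up):

    #include <stdint.h>

    /* Modular inverse of an odd 32-bit d by Newton iteration from an
       8-bit table seed: each  inv = 2*inv - inv*inv*d  step doubles the
       number of correct low bits (8 -> 16 -> 32).  Sketch only.  */
    static uint32_t
    binvert32 (uint32_t d, const uint8_t table[128])
    {
      uint32_t inv = table[(d >> 1) & 0x7F];   /* correct mod 2^8  */
      inv = 2 * inv - inv * inv * d;           /* correct mod 2^16 */
      inv = 2 * inv - inv * inv * d;           /* correct mod 2^32 */
      return inv;                              /* d * inv == 1 (mod 2^32) */
    }
]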
+ +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + + ALIGN(16) +PROLOGUE(mpn_divexact_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %edx + + movl PARAM_SRC, %eax + + movl PARAM_DIVISOR, %ecx + subl $1, %edx + jnz L(two_or_more) + + movl (%eax), %eax + xorl %edx, %edx + + divl %ecx + movl PARAM_DST, %ecx + + movl %eax, (%ecx) + ret + + +L(two_or_more): + C eax src + C ebx + C ecx divisor + C edx size-1 + + movl %ecx, %eax + bsfl %ecx, %ecx C trailing twos + + shrl %cl, %eax C d = divisor without twos + movd %eax, %mm6 + movd %ecx, %mm7 C shift + + shrl %eax C d/2 + + andl $127, %eax C d/2, 7 bits + +ifdef(`PIC',` + LEA( binvert_limb_table, %ecx) + movzbl (%eax,%ecx), %eax C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + + C + + movd %eax, %mm5 C inv + + movd %eax, %mm0 C inv + + pmuludq %mm5, %mm5 C inv*inv + + C + + pmuludq %mm6, %mm5 C inv*inv*d + paddd %mm0, %mm0 C 2*inv + + C + + psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d + pxor %mm5, %mm5 + + paddd %mm0, %mm5 + pmuludq %mm0, %mm0 C inv*inv + + pcmpeqd %mm4, %mm4 + psrlq $32, %mm4 C 0x00000000FFFFFFFF + + C + + pmuludq %mm6, %mm0 C inv*inv*d + paddd %mm5, %mm5 C 2*inv + + movl PARAM_SRC, %eax + movl PARAM_DST, %ecx + pxor %mm1, %mm1 C initial carry limb + + C + + psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + movq %mm6, %mm0 + pmuludq %mm5, %mm0 + movd %mm0, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + pxor %mm0, %mm0 C initial carry bit + + +C The dependent chain here is as follows. +C +C latency +C psubq s = (src-cbit) - climb 2 +C pmuludq q = s*inverse 8 +C pmuludq prod = q*divisor 8 +C psrlq climb = high(prod) 2 +C -- +C 20 +C +C Yet the loop measures 19.0 c/l, so obviously there's something gained +C there over a straight reading of the chip documentation. + +L(top): + C eax src, incrementing + C ebx + C ecx dst, incrementing + C edx counter, size-1 iterations + C + C mm0 carry bit + C mm1 carry limb + C mm4 0x00000000FFFFFFFF + C mm5 inverse + C mm6 divisor + C mm7 shift + + movd (%eax), %mm2 + movd 4(%eax), %mm3 + addl $4, %eax + punpckldq %mm3, %mm2 + + psrlq %mm7, %mm2 + pand %mm4, %mm2 C src + psubq %mm0, %mm2 C src - cbit + + psubq %mm1, %mm2 C src - cbit - climb + movq %mm2, %mm0 + psrlq $63, %mm0 C new cbit + + pmuludq %mm5, %mm2 C s*inverse + movd %mm2, (%ecx) C q + addl $4, %ecx + + movq %mm6, %mm1 + pmuludq %mm2, %mm1 C q*divisor + psrlq $32, %mm1 C new climb + + subl $1, %edx + jnz L(top) + + +L(done): + movd (%eax), %mm2 + psrlq %mm7, %mm2 C src + psubq %mm0, %mm2 C src - cbit + + psubq %mm1, %mm2 C src - cbit - climb + + pmuludq %mm5, %mm2 C s*inverse + movd %mm2, (%ecx) C q + + emms + ret + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm new file mode 100644 index 0000000..0146fab --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm @@ -0,0 +1,645 @@ +dnl Intel Pentium-4 mpn_divrem_1 -- mpn by limb division. + +dnl Copyright 1999-2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part. + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t inverse, +C unsigned shift); +C +C Algorithm: +C +C The method and nomenclature follow part 8 of "Division by Invariant +C Integers using Multiplication" by Granlund and Montgomery, reference in +C gmp.texi. +C +C "m" is written for what is m' in the paper, and "d" for d_norm, which +C won't cause any confusion since it's only the normalized divisor that's of +C any use in the code. "b" is written for 2^N, the size of a limb, N being +C 32 here. +C +C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as +C "n-d - q1*d". This rearrangement gives the same two-limb answer but lets +C us have just a psubq on the dependent chain. +C +C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here, +C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much. +C +C Notes: +C +C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high +C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero +C carry, since in normal circumstances that will be a very rare event. +C +C The test for skipping a division is branch free (once size>=1 is tested). +C The store to the destination high limb is 0 when a divide is skipped, or +C if it's not skipped then a copy of the src high limb is stored. The +C latter is in case src==dst. +C +C There's a small bias towards expecting xsize==0, by having code for +C xsize==0 in a straight line and xsize!=0 under forward jumps. +C +C Enhancements: +C +C The loop measures 32 cycles, but the dependent chain would suggest it +C could be done with 30. Not sure where to start looking for the extras. +C +C Alternatives: +C +C If the divisor is normalized (high bit set) then a division step can +C always be skipped, since the high destination limb is always 0 or 1 in +C that case. It doesn't seem worth checking for this though, since it +C probably occurs infrequently. + + +dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by +dnl inverse method is used, rather than plain "divl"s. Minimum value 1. +dnl +dnl The inverse takes about 80-90 cycles to calculate, but after that the +dnl multiply is 32 c/l versus division at about 58 c/l. +dnl +dnl At 4 limbs the div is a touch faster than the mul (and of course +dnl simpler), so start the mul from 5 limbs. 
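[Editorial note: for reference, one step of the invariant-integer division described above can be written in C roughly as follows, for a normalized 32-bit d, B = 2^32 and precomputed m = floor((B*(B-d)-1)/d). This is only a model of what the MMX loop computes, keeping the "n-d - q1*d" rearrangement and the branch-free addback; the function name is made up:

    #include <stdint.h>

    /* One Granlund/Montgomery division step: divide n2*B + n10 (n2 < d)
       by a normalized 32-bit d with precomputed m, storing the quotient
       limb in *q and returning the remainder.  Model only.  */
    static uint32_t
    div_step_preinv (uint32_t *q, uint32_t n2, uint32_t n10,
                     uint32_t d, uint32_t m)
    {
      uint32_t n1   = n10 >> 31;                     /* top bit of n10    */
      uint32_t nadj = n10 + (n1 ? d : 0);            /* ignore overflow   */
      uint64_t t    = (((uint64_t) n2 << 32) | nadj) + (uint64_t) m * (n2 + n1);
      uint32_t q1   = (uint32_t) (t >> 32);
      uint64_t dr   = (((uint64_t) n2 << 32) | n10) - d - (uint64_t) q1 * d;
      uint32_t mask = (uint32_t) (dr >> 32);         /* 0 or 0xFFFFFFFF   */
      *q = q1 + 1 + mask;                            /* q1 or q1+1        */
      return (uint32_t) dr + (mask & d);             /* addback if needed */
    }
]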
+ +deflit(MUL_THRESHOLD, 5) + + +defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1 +defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1 +defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(SAVE_ESI,`PARAM_SIZE') +define(SAVE_EBP,`PARAM_SRC') +define(SAVE_EDI,`PARAM_DIVISOR') +define(SAVE_EBX,`PARAM_DST') + + TEXT + + ALIGN(16) +PROLOGUE(mpn_preinv_divrem_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + xorl %edx, %edx C carry if can't skip a div + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl -4(%esi,%ecx,4), %eax C src high limb + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movd PARAM_PREINV_INVERSE, %mm4 + + movd PARAM_PREINV_SHIFT, %mm7 C l + cmpl %ebp, %eax C high cmp divisor + + cmovc( %eax, %edx) C high is carry if high<divisor + movd %edx, %mm0 C carry + + movd %edx, %mm1 C carry + movl $0, %edx + + movd %ebp, %mm5 C d + cmovnc( %eax, %edx) C 0 if skip div, src high if not + C (the latter in case src==dst) + leal -4(%edi,%ebx,4), %edi C &dst[xsize-1] + + movl %edx, (%edi,%ecx,4) C dst high limb + sbbl $0, %ecx C skip one division if high<divisor + movl $32, %eax + + subl PARAM_PREINV_SHIFT, %eax + psllq %mm7, %mm5 C d normalized + leal (%edi,%ecx,4), %edi C &dst[xsize+size-1] + leal -4(%esi,%ecx,4), %esi C &src[size-1] + + movd %eax, %mm6 C 32-l + jmp L(start_preinv) + +EPILOGUE() + + + ALIGN(16) +PROLOGUE(mpn_divrem_1c) +deflit(`FRAME',0) + + movl PARAM_CARRY, %edx + + movl PARAM_SIZE, %ecx + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + leal -4(%edi,%ebx,4), %edi C &dst[xsize-1] + jmp L(start_1c) + +EPILOGUE() + + + ALIGN(16) +PROLOGUE(mpn_divrem_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + xorl %edx, %edx C initial carry (if can't skip a div) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + leal -4(%edi,%ebx,4), %edi C &dst[xsize-1] + + orl %ecx, %ecx C size + jz L(no_skip_div) C if size==0 + movl -4(%esi,%ecx,4), %eax C src high limb + + cmpl %ebp, %eax C high cmp divisor + + cmovnc( %eax, %edx) C 0 if skip div, src high if not + movl %edx, (%edi,%ecx,4) C dst high limb + + movl $0, %edx + cmovc( %eax, %edx) C high is carry if high<divisor + + sbbl $0, %ecx C size-1 if high<divisor +L(no_skip_div): + + +L(start_1c): + C eax + C ebx xsize + C ecx size + C edx carry + C esi src + C edi &dst[xsize-1] + C ebp divisor + + leal (%ebx,%ecx), %eax C size+xsize + leal -4(%esi,%ecx,4), %esi C &src[size-1] + leal (%edi,%ecx,4), %edi C &dst[size+xsize-1] + + cmpl $MUL_THRESHOLD, %eax + jae L(mul_by_inverse) + + + orl %ecx, %ecx + jz L(divide_no_integer) C if size==0 + +L(divide_integer): + C eax scratch (quotient) + C ebx xsize + C ecx counter + C edx carry + C esi src, decrementing + C edi dst, decrementing + C ebp divisor + + movl (%esi), %eax + subl $4, %esi + + divl %ebp + + movl %eax, (%edi) + subl $4, %edi + + subl $1, %ecx + jnz L(divide_integer) + + +L(divide_no_integer): + orl %ebx, %ebx + jnz L(divide_fraction) C if xsize!=0 + +L(divide_done): + movl SAVE_ESI, %esi + movl SAVE_EDI, %edi + movl SAVE_EBX, 
%ebx + movl SAVE_EBP, %ebp + movl %edx, %eax + ret + + +L(divide_fraction): + C eax scratch (quotient) + C ebx counter + C ecx + C edx carry + C esi + C edi dst, decrementing + C ebp divisor + + movl $0, %eax + + divl %ebp + + movl %eax, (%edi) + subl $4, %edi + + subl $1, %ebx + jnz L(divide_fraction) + + jmp L(divide_done) + + + +C ----------------------------------------------------------------------------- + +L(mul_by_inverse): + C eax + C ebx xsize + C ecx size + C edx carry + C esi &src[size-1] + C edi &dst[size+xsize-1] + C ebp divisor + + bsrl %ebp, %eax C 31-l + movd %edx, %mm0 C carry + movd %edx, %mm1 C carry + movl %ecx, %edx C size + movl $31, %ecx + + C + + xorl %eax, %ecx C l = leading zeros on d + addl $1, %eax + + shll %cl, %ebp C d normalized + movd %ecx, %mm7 C l + movl %edx, %ecx C size + + movd %eax, %mm6 C 32-l + movl $-1, %edx + movl $-1, %eax + + C + + subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1 + + divl %ebp C floor (b*(b-d)-1 / d) + movd %ebp, %mm5 C d + + C + + movd %eax, %mm4 C m + + +L(start_preinv): + C eax inverse + C ebx xsize + C ecx size + C edx + C esi &src[size-1] + C edi &dst[size+xsize-1] + C ebp + C + C mm0 carry + C mm1 carry + C mm2 + C mm4 m + C mm5 d + C mm6 31-l + C mm7 l + + psllq %mm7, %mm0 C n2 = carry << l, for size==0 + + subl $1, %ecx + jb L(integer_none) + + movd (%esi), %mm0 C src high limb + punpckldq %mm1, %mm0 + psrlq %mm6, %mm0 C n2 = high (carry:srchigh << l) + jz L(integer_last) + + +C The dependent chain here consists of +C +C 2 paddd n1+n2 +C 8 pmuludq m*(n1+n2) +C 2 paddq n2:nadj + m*(n1+n2) +C 2 psrlq q1 +C 8 pmuludq d*q1 +C 2 psubq (n-d)-q1*d +C 2 psrlq high n-(q1+1)*d mask +C 2 pand d masked +C 2 paddd n2+d addback +C -- +C 30 +C +C But it seems to run at 32 cycles, so presumably there's something else +C going on. 
+ + ALIGN(16) +L(integer_top): + C eax + C ebx + C ecx counter, size-1 to 0 + C edx + C esi src, decrementing + C edi dst, decrementing + C + C mm0 n2 + C mm4 m + C mm5 d + C mm6 32-l + C mm7 l + + ASSERT(b,`C n2<d + movd %mm0, %eax + movd %mm5, %edx + cmpl %edx, %eax') + + movd -4(%esi), %mm1 C next src limbs + movd (%esi), %mm2 + leal -4(%esi), %esi + + punpckldq %mm2, %mm1 + psrlq %mm6, %mm1 C n10 + + movq %mm1, %mm2 C n10 + movq %mm1, %mm3 C n10 + psrad $31, %mm1 C -n1 + pand %mm5, %mm1 C -n1 & d + paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow + + psrld $31, %mm2 C n1 + paddd %mm0, %mm2 C n2+n1 + punpckldq %mm0, %mm1 C n2:nadj + + pmuludq %mm4, %mm2 C m*(n2+n1) + + C + + paddq %mm2, %mm1 C n2:nadj + m*(n2+n1) + pxor %mm2, %mm2 C break dependency, saves 4 cycles + pcmpeqd %mm2, %mm2 C FF...FF + psrlq $63, %mm2 C 1 + + psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1)) + + paddd %mm1, %mm2 C q1+1 + pmuludq %mm5, %mm1 C q1*d + + punpckldq %mm0, %mm3 C n = n2:n10 + pxor %mm0, %mm0 + + psubq %mm5, %mm3 C n - d + + C + + psubq %mm1, %mm3 C n - (q1+1)*d + + por %mm3, %mm0 C copy remainder -> new n2 + psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1 + + ASSERT(be,`C 0 or -1 + movd %mm3, %eax + addl $1, %eax + cmpl $1, %eax') + + paddd %mm3, %mm2 C q + pand %mm5, %mm3 C mask & d + + paddd %mm3, %mm0 C addback if necessary + movd %mm2, (%edi) + leal -4(%edi), %edi + + subl $1, %ecx + ja L(integer_top) + + +L(integer_last): + C eax + C ebx xsize + C ecx + C edx + C esi &src[0] + C edi &dst[xsize] + C + C mm0 n2 + C mm4 m + C mm5 d + C mm6 + C mm7 l + + ASSERT(b,`C n2<d + movd %mm0, %eax + movd %mm5, %edx + cmpl %edx, %eax') + + movd (%esi), %mm1 C src[0] + psllq %mm7, %mm1 C n10 + + movq %mm1, %mm2 C n10 + movq %mm1, %mm3 C n10 + psrad $31, %mm1 C -n1 + pand %mm5, %mm1 C -n1 & d + paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow + + psrld $31, %mm2 C n1 + paddd %mm0, %mm2 C n2+n1 + punpckldq %mm0, %mm1 C n2:nadj + + pmuludq %mm4, %mm2 C m*(n2+n1) + + C + + paddq %mm2, %mm1 C n2:nadj + m*(n2+n1) + pcmpeqd %mm2, %mm2 C FF...FF + psrlq $63, %mm2 C 1 + + psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1)) + paddd %mm1, %mm2 C q1 + + pmuludq %mm5, %mm1 C q1*d + punpckldq %mm0, %mm3 C n + psubq %mm5, %mm3 C n - d + pxor %mm0, %mm0 + + C + + psubq %mm1, %mm3 C n - (q1+1)*d + + por %mm3, %mm0 C remainder -> n2 + psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1 + + ASSERT(be,`C 0 or -1 + movd %mm3, %eax + addl $1, %eax + cmpl $1, %eax') + + paddd %mm3, %mm2 C q + pand %mm5, %mm3 C mask & d + + paddd %mm3, %mm0 C addback if necessary + movd %mm2, (%edi) + leal -4(%edi), %edi + + +L(integer_none): + C eax + C ebx xsize + + orl %ebx, %ebx + jnz L(fraction_some) C if xsize!=0 + + +L(fraction_done): + movl SAVE_EBP, %ebp + psrld %mm7, %mm0 C remainder + + movl SAVE_EDI, %edi + movd %mm0, %eax + + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + emms + ret + + + +C ----------------------------------------------------------------------------- +C + +L(fraction_some): + C eax + C ebx xsize + C ecx + C edx + C esi + C edi &dst[xsize-1] + C ebp + + +L(fraction_top): + C eax + C ebx counter, xsize iterations + C ecx + C edx + C esi src, decrementing + C edi dst, decrementing + C + C mm0 n2 + C mm4 m + C mm5 d + C mm6 32-l + C mm7 l + + ASSERT(b,`C n2<d + movd %mm0, %eax + movd %mm5, %edx + cmpl %edx, %eax') + + movq %mm0, %mm1 C n2 + pmuludq %mm4, %mm0 C m*n2 + + pcmpeqd %mm2, %mm2 + psrlq $63, %mm2 + + C + + psrlq $32, %mm0 C high(m*n2) + + paddd %mm1, %mm0 C q1 = high(n2:0 + m*n2) + + paddd %mm0, %mm2 C q1+1 + pmuludq 
%mm5, %mm0 C q1*d + + psllq $32, %mm1 C n = n2:0 + psubq %mm5, %mm1 C n - d + + C + + psubq %mm0, %mm1 C r = n - (q1+1)*d + pxor %mm0, %mm0 + + por %mm1, %mm0 C r -> n2 + psrlq $32, %mm1 C high n - (q1+1)*d, 0 or -1 + + ASSERT(be,`C 0 or -1 + movd %mm1, %eax + addl $1, %eax + cmpl $1, %eax') + + paddd %mm1, %mm2 C q + pand %mm5, %mm1 C mask & d + + paddd %mm1, %mm0 C addback if necessary + movd %mm2, (%edi) + leal -4(%edi), %edi + + subl $1, %ebx + jne L(fraction_top) + + + jmp L(fraction_done) + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h new file mode 100644 index 0000000..a047a51 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h @@ -0,0 +1,213 @@ +/* Intel Pentium-4 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 2600 MHz P4 Northwood */ +/* FFT tuning limit = 23,700,309 */ +/* Generated by tuneup.c, 2019-11-09, gcc 8.2 */ + +#define MOD_1_NORM_THRESHOLD 5 +#define MOD_1_UNNORM_THRESHOLD 14 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 2 /* 4.36% faster than 1 */ +#define DIV_QR_1_NORM_THRESHOLD 16 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 21 + +#define DIV_1_VS_MUL_1_PERCENT 358 + +#define MUL_TOOM22_THRESHOLD 26 +#define MUL_TOOM33_THRESHOLD 101 +#define MUL_TOOM44_THRESHOLD 284 +#define MUL_TOOM6H_THRESHOLD 406 +#define MUL_TOOM8H_THRESHOLD 592 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 101 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 191 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 189 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 195 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 151 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 51 +#define SQR_TOOM3_THRESHOLD 163 +#define SQR_TOOM4_THRESHOLD 254 +#define SQR_TOOM6_THRESHOLD 614 +#define SQR_TOOM8_THRESHOLD 842 + +#define MULMID_TOOM42_THRESHOLD 58 + +#define MULMOD_BNM1_THRESHOLD 19 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define MUL_FFT_MODF_THRESHOLD 824 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 824, 5}, { 29, 6}, { 15, 5}, { 33, 6}, \ + { 17, 5}, { 36, 6}, { 19, 5}, { 39, 6}, \ + { 29, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \ + { 36, 7}, { 19, 6}, { 39, 7}, { 21, 6}, \ + { 43, 7}, { 23, 6}, { 48, 7}, { 29, 8}, \ + { 15, 7}, { 37, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 49, 8}, { 27, 7}, { 55, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 99, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \ + { 31,10}, { 63, 9}, { 143,10}, { 79, 9}, \ + { 167,10}, { 95, 9}, { 191,10}, { 111,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 271,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335,11}, { 191,10}, { 383, 9}, { 799,10}, \ + { 415,11}, { 223,12}, { 127,11}, { 255,10}, \ + { 527,11}, { 287,10}, { 607, 9}, { 1215,11}, \ + { 319,10}, { 671,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 863,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1119, 9}, { 2239,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,10}, { 1471, 9}, { 2943,12}, \ + { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \ + { 447,11}, { 927,10}, { 1855,11}, { 959,13}, \ + { 255,12}, { 511,11}, { 1119,12}, { 575,11}, \ + { 1215,10}, { 2431,11}, { 1247,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,10}, { 2943,13}, \ + { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \ + { 1727,10}, { 3455,12}, { 895,14}, { 255,13}, \ + { 511,12}, { 1087,11}, { 2239,10}, { 4479,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1343,11}, \ + { 2687,12}, { 1471,11}, { 2943,13}, { 767,12}, \ + { 1727,11}, { 3455,13}, { 895,12}, { 1983,14}, \ + { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \ + { 2495,11}, { 4991,13}, { 1407,12}, { 2943,14}, \ + { 767,13}, { 1535,12}, { 3135,13}, { 1663,12}, \ + { 3455,13}, { 1919,12}, { 3967,15}, { 511,14}, \ + { 1023,13}, { 2175,12}, { 
4479,13}, { 2431,12}, \ + { 4991,14}, { 1279,13}, { 2687,12}, { 5503,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 167 +#define MUL_FFT_THRESHOLD 7808 + +#define SQR_FFT_MODF_THRESHOLD 560 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 560, 5}, { 33, 6}, { 17, 5}, { 35, 6}, \ + { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 47, 8}, { 27, 7}, { 55, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 111,11}, { 63,10}, { 143, 9}, \ + { 287,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127, 9}, { 511, 8}, { 1023, 9}, \ + { 527,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 351,11}, { 191,10}, { 431,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 543,11}, { 287,10}, \ + { 607, 9}, { 1215,11}, { 319,10}, { 639,11}, \ + { 351,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1119,11}, { 607,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \ + { 927,10}, { 1855,11}, { 991,13}, { 255,12}, \ + { 511,11}, { 1055,10}, { 2111,11}, { 1087,12}, \ + { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1407,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1727,10}, \ + { 3455,12}, { 895,11}, { 1855,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,11}, { 2239,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1471,11}, \ + { 2943,13}, { 767,12}, { 1727,11}, { 3455,13}, \ + { 895,12}, { 1983,14}, { 511,13}, { 1023,12}, \ + { 2239,13}, { 1151,12}, { 2495,11}, { 4991,13}, \ + { 1279,12}, { 2623,13}, { 1407,12}, { 2943,14}, \ + { 767,13}, { 1663,12}, { 3455,13}, { 1919,12}, \ + { 3839,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4479,13}, { 2431,12}, { 4991,14}, { 1279,13}, \ + { 2687,12}, { 5503,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 149 +#define SQR_FFT_THRESHOLD 4800 + +#define MULLO_BASECASE_THRESHOLD 12 +#define MULLO_DC_THRESHOLD 44 +#define MULLO_MUL_N_THRESHOLD 14281 +#define SQRLO_BASECASE_THRESHOLD 13 +#define SQRLO_DC_THRESHOLD 42 +#define SQRLO_SQR_THRESHOLD 9449 + +#define DC_DIV_QR_THRESHOLD 38 +#define DC_DIVAPPR_Q_THRESHOLD 105 +#define DC_BDIV_QR_THRESHOLD 52 +#define DC_BDIV_Q_THRESHOLD 83 + +#define INV_MULMOD_BNM1_THRESHOLD 50 +#define INV_NEWTON_THRESHOLD 158 +#define INV_APPR_THRESHOLD 118 + +#define BINV_NEWTON_THRESHOLD 342 +#define REDC_1_TO_REDC_N_THRESHOLD 67 + +#define MU_DIV_QR_THRESHOLD 2130 +#define MU_DIVAPPR_Q_THRESHOLD 1895 +#define MUPI_DIV_QR_THRESHOLD 60 +#define MU_BDIV_QR_THRESHOLD 1652 +#define MU_BDIV_Q_THRESHOLD 2089 + +#define POWM_SEC_TABLE 1,22,96,446,723,1378 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_DC_THRESHOLD 298 +#define SET_STR_PRECOMPUTE_THRESHOLD 960 + +#define FAC_DSC_THRESHOLD 212 +#define FAC_ODD_THRESHOLD 71 + +#define MATRIX22_STRASSEN_THRESHOLD 26 +#define HGCD2_DIV1_METHOD 3 /* 0.68% faster than 1 */ +#define HGCD_THRESHOLD 80 +#define HGCD_APPR_THRESHOLD 138 +#define HGCD_REDUCE_THRESHOLD 4455 +#define GCD_DC_THRESHOLD 365 +#define GCDEXT_DC_THRESHOLD 245 +#define JACOBI_BASE_METHOD 4 /* 23.41% faster than 1 */ + +/* Tuneup completed successfully, took 63807 seconds */ diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm 
b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm new file mode 100644 index 0000000..ee88bab --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm @@ -0,0 +1,166 @@ +dnl x86-32 mpn_mod_1_1p for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2009, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO: +C * Optimize. The present code was written quite straightforwardly. +C * Optimize post-loop reduction code; it is from mod_1s_4p, thus overkill. +C * Write a cps function that uses sse2 insns. + +C cycles/limb +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) ? +C P4 model 0-1 (Willamette) ? +C P4 model 2 (Northwood) 16 +C P4 model 3-4 (Prescott) 18 + +C INPUT PARAMETERS +C ap sp + 4 +C n sp + 8 +C b sp + 12 +C cps sp + 16 + +define(`B1modb', `%mm1') +define(`B2modb', `%mm2') +define(`ap', `%edx') +define(`n', `%eax') + + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_1_1p) + push %ebx + mov 8(%esp), ap + mov 12(%esp), n + mov 20(%esp), %ecx + movd 8(%ecx), B1modb + movd 12(%ecx), B2modb + + lea -4(ap,n,4), ap + +C FIXME: See comment in generic/mod_1_1.c. 
+ movd (ap), %mm7 + movd -4(ap), %mm4 + pmuludq B1modb, %mm7 + paddq %mm4, %mm7 + add $-2, n + jz L(end) + + ALIGN(8) +L(top): movq %mm7, %mm6 + psrlq $32, %mm7 C rh + movd -8(ap), %mm0 + add $-4, ap + pmuludq B2modb, %mm7 + pmuludq B1modb, %mm6 + add $-1, n + paddq %mm0, %mm7 + paddq %mm6, %mm7 + jnz L(top) + +L(end): pcmpeqd %mm4, %mm4 + psrlq $32, %mm4 C 0x00000000FFFFFFFF + pand %mm7, %mm4 C rl + psrlq $32, %mm7 C rh + pmuludq B1modb, %mm7 C rh,cl + paddq %mm4, %mm7 C rh,rl + movd 4(%ecx), %mm4 C cnt + psllq %mm4, %mm7 C rh,rl normalized + movq %mm7, %mm2 C rl in low half + psrlq $32, %mm7 C rh + movd (%ecx), %mm1 C bi + pmuludq %mm7, %mm1 C qh,ql + paddq %mm2, %mm1 C qh-1,ql + movd %mm1, %ecx C ql + psrlq $32, %mm1 C qh-1 + movd 16(%esp), %mm3 C b + pmuludq %mm1, %mm3 C (qh-1) * b + psubq %mm3, %mm2 C r in low half (could use psubd) + movd %mm2, %eax C r + mov 16(%esp), %ebx + sub %ebx, %eax C r + cmp %eax, %ecx + lea (%eax,%ebx), %edx + cmovc( %edx, %eax) + movd %mm4, %ecx C cnt + cmp %ebx, %eax + jae L(fix) + emms + pop %ebx + shr %cl, %eax + ret + +L(fix): sub %ebx, %eax + emms + pop %ebx + shr %cl, %eax + ret +EPILOGUE() + +PROLOGUE(mpn_mod_1_1p_cps) +C CAUTION: This is the same code as in k7/mod_1_1.asm + push %ebp + mov 12(%esp), %ebp + push %esi + bsr %ebp, %ecx + push %ebx + xor $31, %ecx + mov 16(%esp), %esi + sal %cl, %ebp + mov %ebp, %edx + not %edx + mov $-1, %eax + div %ebp + mov %eax, (%esi) C store bi + mov %ecx, 4(%esi) C store cnt + xor %ebx, %ebx + sub %ebp, %ebx + mov $1, %edx + shld %cl, %eax, %edx + imul %edx, %ebx + mul %ebx + add %ebx, %edx + not %edx + imul %ebp, %edx + add %edx, %ebp + cmp %edx, %eax + cmovc( %ebp, %edx) + shr %cl, %ebx + mov %ebx, 8(%esi) C store B1modb + shr %cl, %edx + mov %edx, 12(%esi) C store B2modb + pop %ebx + pop %esi + pop %ebp + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm new file mode 100644 index 0000000..eb2edb6 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm @@ -0,0 +1,269 @@ +dnl x86-32 mpn_mod_1s_4p for Pentium 4 and P6 models with SSE2 (i.e. 9,D,E,F). + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2009, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO: +C * Optimize. The present code was written quite straightforwardly. +C * Optimize post-loop reduction code. +C * Write a cps function that uses sse2 insns. 
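The mod_1s_4p routine below, like the mod_1_1p loop above, folds limbs against precomputed powers of B = 2^32 modulo the divisor, so the per-limb work is a multiply-add instead of a division. A rough C sketch of that folding idea follows; the names are illustrative and this is not GMP's interface. For clarity the sketch reduces the accumulator every round and assumes b < 2^16 so no 64-bit term can overflow, whereas the real code works with a normalized divisor, keeps a two-limb accumulator (which is why it also precomputes B^5 mod b), and reduces only once after the loop.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Sketch only: fold four 32-bit limbs per round against Bk = B^k mod b. */
static uint32_t
mod_1s_4p_sketch (const uint32_t *ap, size_t n, uint32_t b)
{
  uint64_t B1 = (1ULL << 32) % b;            /* B   mod b */
  uint64_t B2 = (B1 << 32) % b;              /* B^2 mod b */
  uint64_t B3 = (B2 << 32) % b;              /* B^3 mod b */
  uint64_t B4 = (B3 << 32) % b;              /* B^4 mod b */
  uint64_t r = 0;

  assert (b < (1u << 16));                   /* keeps every term below 2^48 */
  assert (n % 4 == 0);                       /* tail cases omitted here */

  for (size_t i = n; i != 0; i -= 4)         /* r = r*B^4 + next four limbs */
    r = (ap[i-4] + ap[i-3] * B1 + ap[i-2] * B2 + ap[i-1] * B3 + r * B4) % b;

  return (uint32_t) r;                       /* the asm's final reduction uses
                                                a precomputed inverse, not % */
}

The leftover 1 to 3 limbs that the sketch asserts away are what the L(b1)/L(b2)/L(b3) entry paths of the assembly handle.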
+ +C cycles/limb +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) 3.4 +C P4 model 0-1 (Willamette) ? +C P4 model 2 (Northwood) 4 +C P4 model 3-4 (Prescott) 4.5 + +C INPUT PARAMETERS +C ap sp + 4 +C n sp + 8 +C b sp + 12 +C cps sp + 16 + +define(`B1modb', `%mm1') +define(`B2modb', `%mm2') +define(`B3modb', `%mm3') +define(`B4modb', `%mm4') +define(`B5modb', `%mm5') +define(`ap', `%edx') +define(`n', `%eax') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_1s_4p) + push %ebx + mov 8(%esp), ap + mov 12(%esp), n + mov 20(%esp), %ecx + + movd 8(%ecx), B1modb + movd 12(%ecx), B2modb + movd 16(%ecx), B3modb + movd 20(%ecx), B4modb + movd 24(%ecx), B5modb + + mov n, %ebx + lea -4(ap,n,4), ap + and $3, %ebx + je L(b0) + cmp $2, %ebx + jc L(b1) + je L(b2) + +L(b3): movd -4(ap), %mm7 + pmuludq B1modb, %mm7 + movd -8(ap), %mm6 + paddq %mm6, %mm7 + movd (ap), %mm6 + pmuludq B2modb, %mm6 + paddq %mm6, %mm7 + lea -24(ap), ap + add $-3, n + jz L(end) + jmp L(top) + +L(b0): movd -8(ap), %mm7 + pmuludq B1modb, %mm7 + movd -12(ap), %mm6 + paddq %mm6, %mm7 + movd -4(ap), %mm6 + pmuludq B2modb, %mm6 + paddq %mm6, %mm7 + movd (ap), %mm6 + pmuludq B3modb, %mm6 + paddq %mm6, %mm7 + lea -28(ap), ap + add $-4, n + jz L(end) + jmp L(top) + +L(b1): movd (ap), %mm7 + lea -16(ap), ap + dec n + jz L(x) + jmp L(top) + +L(b2): movd -4(ap), %mm7 C rl + punpckldq (ap), %mm7 C rh + lea -20(ap), ap + add $-2, n + jz L(end) + + ALIGN(8) +L(top): movd 4(ap), %mm0 + pmuludq B1modb, %mm0 + movd 0(ap), %mm6 + paddq %mm6, %mm0 + + movd 8(ap), %mm6 + pmuludq B2modb, %mm6 + paddq %mm6, %mm0 + + movd 12(ap), %mm6 + pmuludq B3modb, %mm6 + paddq %mm6, %mm0 + + movq %mm7, %mm6 + psrlq $32, %mm7 C rh + pmuludq B5modb, %mm7 + pmuludq B4modb, %mm6 + + paddq %mm0, %mm7 + paddq %mm6, %mm7 + + add $-16, ap + add $-4, n + jnz L(top) + +L(end): pcmpeqd %mm4, %mm4 + psrlq $32, %mm4 C 0x00000000FFFFFFFF + pand %mm7, %mm4 C rl + psrlq $32, %mm7 C rh + pmuludq B1modb, %mm7 C rh,cl + paddq %mm4, %mm7 C rh,rl +L(x): movd 4(%ecx), %mm4 C cnt + psllq %mm4, %mm7 C rh,rl normalized + movq %mm7, %mm2 C rl in low half + psrlq $32, %mm7 C rh + movd (%ecx), %mm1 C bi + pmuludq %mm7, %mm1 C qh,ql + paddq %mm2, %mm1 C qh-1,ql + movd %mm1, %ecx C ql + psrlq $32, %mm1 C qh-1 + movd 16(%esp), %mm3 C b + pmuludq %mm1, %mm3 C (qh-1) * b + psubq %mm3, %mm2 C r in low half (could use psubd) + movd %mm2, %eax C r + mov 16(%esp), %ebx + sub %ebx, %eax C r + cmp %eax, %ecx + lea (%eax,%ebx), %edx + cmovc( %edx, %eax) + movd %mm4, %ecx C cnt + cmp %ebx, %eax + jae L(fix) + emms + pop %ebx + shr %cl, %eax + ret + +L(fix): sub %ebx, %eax + emms + pop %ebx + shr %cl, %eax + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_mod_1s_4p_cps) +C CAUTION: This is the same code as in k7/mod_1_4.asm + push %ebp + push %edi + push %esi + push %ebx + mov 20(%esp), %ebp C FIXME: avoid bp for 0-idx + mov 24(%esp), %ebx + bsr %ebx, %ecx + xor $31, %ecx + sal %cl, %ebx C b << cnt + mov %ebx, %edx + not %edx + mov $-1, %eax + div %ebx + xor %edi, %edi + sub %ebx, %edi + mov $1, %esi + mov %eax, (%ebp) C store bi + mov %ecx, 4(%ebp) C store cnt + shld %cl, %eax, %esi + imul %edi, %esi + mov %eax, %edi + mul %esi + + add %esi, %edx + shr %cl, %esi + mov %esi, 8(%ebp) C store B1modb + + not %edx + imul %ebx, %edx + lea (%edx,%ebx), %esi + cmp %edx, %eax + cmovnc( %edx, %esi) + mov %edi, %eax + mul %esi + + add %esi, %edx + shr %cl, %esi + mov %esi, 12(%ebp) C store B2modb + + not %edx + imul %ebx, %edx + lea (%edx,%ebx), %esi + cmp %edx, %eax + cmovnc( %edx, %esi) + mov 
%edi, %eax + mul %esi + + add %esi, %edx + shr %cl, %esi + mov %esi, 16(%ebp) C store B3modb + + not %edx + imul %ebx, %edx + lea (%edx,%ebx), %esi + cmp %edx, %eax + cmovnc( %edx, %esi) + mov %edi, %eax + mul %esi + + add %esi, %edx + shr %cl, %esi + mov %esi, 20(%ebp) C store B4modb + + not %edx + imul %ebx, %edx + add %edx, %ebx + cmp %edx, %eax + cmovnc( %edx, %ebx) + + shr %cl, %ebx + mov %ebx, 24(%ebp) C store B5modb + + pop %ebx + pop %esi + pop %edi + pop %ebp + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm new file mode 100644 index 0000000..31e25b7 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm @@ -0,0 +1,175 @@ +dnl Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1. + +dnl Copyright 2000-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C Pentium4: 1.0 cycles/limb + + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size) +C +C Enhancements: +C +C There might be a couple of cycles to save by using plain integer code for +C more small sizes. 2 limbs measures about 20 cycles, but 3 limbs jumps to +C about 46 (inclusive of some function call overheads).
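The loop that follows keeps three 64-bit accumulators, one per residue class of the limb index mod 3, and then combines them at byte offsets 0, 8 and 16 before folding at 24 bits. The reason this works is that 2^24 == 1 (mod 2^24-1), so limb i, whose weight is 2^(32*i), reduces to weight 2^(8*(i mod 3)). A plain-C sketch of the same reduction (illustrative only, assuming 32-bit limbs and a size small enough that the 64-bit accumulators cannot overflow):

#include <stddef.h>
#include <stdint.h>

/* Sketch only: return a value congruent to {src,n} modulo 2^24-1; like
   mpn_mod_34lsub1 it need not be fully reduced. */
static uint64_t
mod_34lsub1_sketch (const uint32_t *src, size_t n)
{
  uint64_t acc[3] = { 0, 0, 0 };

  for (size_t i = 0; i < n; i++)
    acc[i % 3] += src[i];                 /* limb weight is 2^(8*(i%3)) */

  uint64_t r = acc[0] + (acc[1] << 8) + (acc[2] << 16);

  r = (r & 0xffffff) + (r >> 24);         /* fold: 2^24 == 1 (mod 2^24-1) */
  r = (r & 0xffffff) + (r >> 24);
  return r;
}

The assembly's combine stage does the same with psllq $8 and psllq $16 followed by a single fold against the low-24-bit mask.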
+ +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +dnl re-use parameter space +define(SAVE_EBX, `PARAM_SRC') +define(SAVE_ESI, `PARAM_SIZE') + + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_34lsub1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %edx + movl (%edx), %eax + + subl $2, %ecx + ja L(three_or_more) + jne L(one) + + movl 4(%edx), %edx + movl %eax, %ecx + shrl $24, %eax C src[0] high + + andl $0x00FFFFFF, %ecx C src[0] low + addl %ecx, %eax + + movl %edx, %ecx + shll $8, %edx + + shrl $16, %ecx C src[1] low + addl %ecx, %eax + + andl $0x00FFFF00, %edx C src[1] high + addl %edx, %eax + +L(one): + ret + + +L(three_or_more): + pxor %mm0, %mm0 + pxor %mm1, %mm1 + pxor %mm2, %mm2 + + pcmpeqd %mm7, %mm7 + psrlq $32, %mm7 C 0x00000000FFFFFFFF, low 32 bits + + pcmpeqd %mm6, %mm6 + psrlq $40, %mm6 C 0x0000000000FFFFFF, low 24 bits + +L(top): + C eax + C ebx + C ecx counter, size-2 to 0, -1 or -2 + C edx src, incrementing + C + C mm0 sum 0mod3 + C mm1 sum 1mod3 + C mm2 sum 2mod3 + C mm3 + C mm4 + C mm5 + C mm6 0x0000000000FFFFFF + C mm7 0x00000000FFFFFFFF + + movd (%edx), %mm3 + paddq %mm3, %mm0 + + movd 4(%edx), %mm3 + paddq %mm3, %mm1 + + movd 8(%edx), %mm3 + paddq %mm3, %mm2 + + addl $12, %edx + subl $3, %ecx + ja L(top) + + + C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively + + addl $1, %ecx + js L(combine) C 0 more + + movd (%edx), %mm3 + paddq %mm3, %mm0 + + jz L(combine) C 1 more + + movd 4(%edx), %mm3 + paddq %mm3, %mm1 + +L(combine): + movq %mm7, %mm3 C low halves + pand %mm0, %mm3 + + movq %mm7, %mm4 + pand %mm1, %mm4 + + movq %mm7, %mm5 + pand %mm2, %mm5 + + psrlq $32, %mm0 C high halves + psrlq $32, %mm1 + psrlq $32, %mm2 + + paddq %mm0, %mm4 C fold high halves to give 33 bits each + paddq %mm1, %mm5 + paddq %mm2, %mm3 + + psllq $8, %mm4 C combine at respective offsets + psllq $16, %mm5 + paddq %mm4, %mm3 + paddq %mm5, %mm3 C 0x000cxxxxxxxxxxxx, 50 bits + + pand %mm3, %mm6 C fold at 24 bits + psrlq $24, %mm3 + + paddq %mm6, %mm3 + movd %mm3, %eax + + ASSERT(z, C nothing left in high dword + `psrlq $32, %mm3 + movd %mm3, %ecx + orl %ecx, %ecx') + + emms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm new file mode 100644 index 0000000..aa9ef31 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm @@ -0,0 +1,175 @@ +dnl Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder. + +dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P4: 19.0 cycles/limb + + +C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C + +defframe(PARAM_CARRY, 16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + + TEXT + + ALIGN(16) +PROLOGUE(mpn_modexact_1c_odd) +deflit(`FRAME',0) + + movd PARAM_CARRY, %mm1 + jmp L(start_1c) + +EPILOGUE() + + + ALIGN(16) +PROLOGUE(mpn_modexact_1_odd) +deflit(`FRAME',0) + + pxor %mm1, %mm1 C carry limb +L(start_1c): + movl PARAM_DIVISOR, %eax + + movd PARAM_DIVISOR, %mm7 + + shrl %eax + + andl $127, %eax C d/2, 7 bits + +ifdef(`PIC',` + LEA( binvert_limb_table, %edx) + movzbl (%eax,%edx), %eax C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + + C + + movd %eax, %mm6 C inv + + movd %eax, %mm0 C inv + + pmuludq %mm6, %mm6 C inv*inv + + C + + pmuludq %mm7, %mm6 C inv*inv*d + paddd %mm0, %mm0 C 2*inv + + C + + psubd %mm6, %mm0 C inv = 2*inv - inv*inv*d + pxor %mm6, %mm6 + + paddd %mm0, %mm6 + pmuludq %mm0, %mm0 C inv*inv + + C + + pmuludq %mm7, %mm0 C inv*inv*d + paddd %mm6, %mm6 C 2*inv + + + movl PARAM_SRC, %eax + movl PARAM_SIZE, %ecx + + C + + psubd %mm0, %mm6 C inv = 2*inv - inv*inv*d + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + movd %mm6, %eax + imul PARAM_DIVISOR, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + pxor %mm0, %mm0 C carry bit + + +C The dependent chain here is as follows. +C +C latency +C psubq s = (src-cbit) - climb 2 +C pmuludq q = s*inverse 8 +C pmuludq prod = q*divisor 8 +C psrlq climb = high(prod) 2 +C -- +C 20 +C +C Yet the loop measures 19.0 c/l, so obviously there's something gained +C there over a straight reading of the chip documentation. + +L(top): + C eax src, incrementing + C ebx + C ecx counter, limbs + C edx + C + C mm0 carry bit + C mm1 carry limb + C mm6 inverse + C mm7 divisor + + movd (%eax), %mm2 + addl $4, %eax + + psubq %mm0, %mm2 C src - cbit + + psubq %mm1, %mm2 C src - cbit - climb + movq %mm2, %mm0 + psrlq $63, %mm0 C new cbit + + pmuludq %mm6, %mm2 C s*inverse + + movq %mm7, %mm1 + pmuludq %mm2, %mm1 C q*divisor + psrlq $32, %mm1 C new climb + + subl $1, %ecx + jnz L(top) + + +L(done): + paddq %mm1, %mm0 + movd %mm0, %eax + emms + ret + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm new file mode 100644 index 0000000..6347b8b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm @@ -0,0 +1,164 @@ +dnl mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). + +dnl Copyright 2005, 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) 4.17 +C P6 model 13 (Dothan) 4.17 +C P4 model 0-1 (Willamette) 4 +C P4 model 2 (Northwood) 4 +C P4 model 3-4 (Prescott) 4.55 + +C TODO: +C * Tweak eax/edx offsets in loop as to save some lea's +C * Perhaps software pipeline small-case code + +C INPUT PARAMETERS +C rp sp + 4 +C up sp + 8 +C n sp + 12 +C v0 sp + 16 + + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1) + pxor %mm6, %mm6 +L(ent): mov 4(%esp), %edx + mov 8(%esp), %eax + mov 12(%esp), %ecx + movd 16(%esp), %mm7 + cmp $4, %ecx + jnc L(big) + +L(lp0): movd (%eax), %mm0 + lea 4(%eax), %eax + lea 4(%edx), %edx + pmuludq %mm7, %mm0 + paddq %mm0, %mm6 + movd %mm6, -4(%edx) + psrlq $32, %mm6 + dec %ecx + jnz L(lp0) + movd %mm6, %eax + emms + ret + +L(big): and $3, %ecx + je L(0) + cmp $2, %ecx + jc L(1) + je L(2) + jmp L(3) C FIXME: one case should fall through + +L(0): movd (%eax), %mm3 + sub 12(%esp), %ecx C loop count + lea -16(%eax), %eax + lea -12(%edx), %edx + pmuludq %mm7, %mm3 + movd 20(%eax), %mm0 + pmuludq %mm7, %mm0 + movd 24(%eax), %mm1 + jmp L(00) + +L(1): movd (%eax), %mm2 + sub 12(%esp), %ecx + lea -12(%eax), %eax + lea -8(%edx), %edx + pmuludq %mm7, %mm2 + movd 16(%eax), %mm3 + pmuludq %mm7, %mm3 + movd 20(%eax), %mm0 + jmp L(01) + +L(2): movd (%eax), %mm1 + sub 12(%esp), %ecx + lea -8(%eax), %eax + lea -4(%edx), %edx + pmuludq %mm7, %mm1 + movd 12(%eax), %mm2 + pmuludq %mm7, %mm2 + movd 16(%eax), %mm3 + jmp L(10) + +L(3): movd (%eax), %mm0 + sub 12(%esp), %ecx + lea -4(%eax), %eax + pmuludq %mm7, %mm0 + movd 8(%eax), %mm1 + pmuludq %mm7, %mm1 + movd 12(%eax), %mm2 + + ALIGN(16) +L(top): pmuludq %mm7, %mm2 + paddq %mm0, %mm6 + movd 16(%eax), %mm3 + movd %mm6, 0(%edx) + psrlq $32, %mm6 +L(10): pmuludq %mm7, %mm3 + paddq %mm1, %mm6 + movd 20(%eax), %mm0 + movd %mm6, 4(%edx) + psrlq $32, %mm6 +L(01): pmuludq %mm7, %mm0 + paddq %mm2, %mm6 + movd 24(%eax), %mm1 + movd %mm6, 8(%edx) + psrlq $32, %mm6 +L(00): pmuludq %mm7, %mm1 + paddq %mm3, %mm6 + movd 28(%eax), %mm2 + movd %mm6, 12(%edx) + psrlq $32, %mm6 + lea 16(%eax), %eax + lea 16(%edx), %edx + add $4, %ecx + ja L(top) + +L(end): pmuludq %mm7, %mm2 + paddq %mm0, %mm6 + movd %mm6, 0(%edx) + psrlq $32, %mm6 + paddq %mm1, %mm6 + movd %mm6, 4(%edx) + psrlq $32, %mm6 + paddq %mm2, %mm6 + movd %mm6, 8(%edx) + psrlq $32, %mm6 + movd %mm6, %eax + emms + ret +EPILOGUE() +PROLOGUE(mpn_mul_1c) + movd 20(%esp), %mm6 + jmp L(ent) +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm new file mode 100644 index 0000000..6e3775a --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm @@ -0,0 +1,662 @@ +dnl mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). + +dnl Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO: +C * Improve ad-hoc outer loop code and register handling. Some feed-in +C scheduling could improve things by several cycles per outer iteration. +C * In code for un <= 3, try keeping accumulation operands in registers, +C without storing intermediates to rp. +C * We might want to keep 32 in a free mm register, since the register form is +C 3 bytes and the immediate form is 4 bytes. About 70 bytes to save. +C * Look into different loop alignment, we now expand the code about 50 bytes +C with possibly needless alignment. +C * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry. +C * Use OSP, should solve feed-in latency problems. +C * Save a few tens of bytes by doing cross-jumping for Loel0, etc. +C * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers +C so that they can share feed-in code, and changing the branch targets from +C L<n> to Lm<nn>. + +C cycles/limb +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) 5.24 +C P6 model 14 (Yonah) ? 
+C P4 model 0-1 (Willamette): 5 +C P4 model 2 (Northwood): 4.60 at 32 limbs +C P4 model 3-4 (Prescott): 4.94 at 32 limbs + +C INPUT PARAMETERS +C rp sp + 4 +C up sp + 8 +C un sp + 12 +C vp sp + 16 +C vn sp + 20 + + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + push %esi + push %ebx + mov 12(%esp), %edx C rp + mov 16(%esp), %eax C up + mov 20(%esp), %ecx C un + mov 24(%esp), %esi C vp + mov 28(%esp), %ebx C vn + movd (%esi), %mm7 C +L(ent): cmp $3, %ecx + ja L(big) + movd (%eax), %mm6 + pmuludq %mm7, %mm6 + jz L(un3) + cmp $2, %ecx + jz L(un2) + +L(un1): movd %mm6, (%edx) C un=1 + psrlq $32, %mm6 C un=1 + movd %mm6, 4(%edx) C un=1 + jmp L(rtr) C un=1 + +L(un2): movd 4(%eax), %mm1 C un=2 + pmuludq %mm7, %mm1 C un=2 + movd %mm6, (%edx) C un=2 + psrlq $32, %mm6 C un=2 + paddq %mm1, %mm6 C un=2 + movd %mm6, 4(%edx) C un=2 + psrlq $32, %mm6 C un=2 + movd %mm6, 8(%edx) C un=2 + dec %ebx C un=2 + jz L(rtr) C un=2 + movd 4(%esi), %mm7 C un=2 + movd (%eax), %mm6 C un=2 + pmuludq %mm7, %mm6 C un=2 + movd 4(%eax), %mm1 C un=2 + movd 4(%edx), %mm4 C un=2 + pmuludq %mm7, %mm1 C un=2 + movd 8(%edx), %mm5 C un=2 + paddq %mm4, %mm6 C un=2 + paddq %mm1, %mm5 C un=2 + movd %mm6, 4(%edx) C un=2 + psrlq $32, %mm6 C un=2 + paddq %mm5, %mm6 C un=2 + movd %mm6, 8(%edx) C un=2 + psrlq $32, %mm6 C un=2 + movd %mm6, 12(%edx) C un=2 +L(rtr): emms + pop %ebx + pop %esi + ret + +L(un3): movd 4(%eax), %mm1 C un=3 + pmuludq %mm7, %mm1 C un=3 + movd 8(%eax), %mm2 C un=3 + pmuludq %mm7, %mm2 C un=3 + movd %mm6, (%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm1, %mm6 C un=3 + movd %mm6, 4(%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm2, %mm6 C un=3 + movd %mm6, 8(%edx) C un=3 + psrlq $32, %mm6 C un=3 + movd %mm6, 12(%edx) C un=3 + dec %ebx C un=3 + jz L(rtr) C un=3 + movd 4(%esi), %mm7 C un=3 + movd (%eax), %mm6 C un=3 + pmuludq %mm7, %mm6 C un=3 + movd 4(%eax), %mm1 C un=3 + movd 4(%edx), %mm4 C un=3 + pmuludq %mm7, %mm1 C un=3 + movd 8(%eax), %mm2 C un=3 + movd 8(%edx), %mm5 C un=3 + pmuludq %mm7, %mm2 C un=3 + paddq %mm4, %mm6 C un=3 + paddq %mm1, %mm5 C un=3 + movd 12(%edx), %mm4 C un=3 + movd %mm6, 4(%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm5, %mm6 C un=3 + paddq %mm2, %mm4 C un=3 + movd %mm6, 8(%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm4, %mm6 C un=3 + movd %mm6, 12(%edx) C un=3 + psrlq $32, %mm6 C un=3 + movd %mm6, 16(%edx) C un=3 + dec %ebx C un=3 + jz L(rtr) C un=3 + movd 8(%esi), %mm7 C un=3 + movd (%eax), %mm6 C un=3 + pmuludq %mm7, %mm6 C un=3 + movd 4(%eax), %mm1 C un=3 + movd 8(%edx), %mm4 C un=3 + pmuludq %mm7, %mm1 C un=3 + movd 8(%eax), %mm2 C un=3 + movd 12(%edx), %mm5 C un=3 + pmuludq %mm7, %mm2 C un=3 + paddq %mm4, %mm6 C un=3 + paddq %mm1, %mm5 C un=3 + movd 16(%edx), %mm4 C un=3 + movd %mm6, 8(%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm5, %mm6 C un=3 + paddq %mm2, %mm4 C un=3 + movd %mm6, 12(%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm4, %mm6 C un=3 + movd %mm6, 16(%edx) C un=3 + psrlq $32, %mm6 C un=3 + movd %mm6, 20(%edx) C un=3 + jmp L(rtr) + + +L(big): push %edi + pxor %mm6, %mm6 + lea 4(%esi), %esi + and $3, %ecx + jz L(0) + cmp $2, %ecx + jc L(1) + jz L(2) + jmp L(3) C FIXME: one case should fall through + + +L(0): movd (%eax), %mm3 C m 0 + sub 24(%esp), %ecx C inner loop count m 0 + mov %ecx, 24(%esp) C update loop count for later m 0 + pmuludq %mm7, %mm3 C m 0 + movd 4(%eax), %mm0 C m 0 + pmuludq %mm7, %mm0 C m 0 + movd 8(%eax), %mm1 C m 0 + jmp L(m00) C m 0 + ALIGN(16) C m 0 +L(lpm0): + pmuludq %mm7, %mm4 C m 0 + paddq %mm0, %mm6 C m 0 + movd (%eax), %mm3 C m 0 
+ movd %mm6, -12(%edx) C m 0 + psrlq $32, %mm6 C m 0 + pmuludq %mm7, %mm3 C m 0 + paddq %mm1, %mm6 C m 0 + movd 4(%eax), %mm0 C m 0 + movd %mm6, -8(%edx) C m 0 + psrlq $32, %mm6 C m 0 + pmuludq %mm7, %mm0 C m 0 + paddq %mm4, %mm6 C m 0 + movd 8(%eax), %mm1 C m 0 + movd %mm6, -4(%edx) C m 0 + psrlq $32, %mm6 C m 0 +L(m00): pmuludq %mm7, %mm1 C m 0 + paddq %mm3, %mm6 C m 0 + movd 12(%eax), %mm4 C m 0 + movd %mm6, (%edx) C m 0 + psrlq $32, %mm6 C m 0 + lea 16(%eax), %eax C m 0 + lea 16(%edx), %edx C m 0 + add $4, %ecx C m 0 + ja L(lpm0) C m 0 + pmuludq %mm7, %mm4 C m 0 + paddq %mm0, %mm6 C m 0 + movd %mm6, -12(%edx) C m 0 + psrlq $32, %mm6 C m 0 + paddq %mm1, %mm6 C m 0 + mov 16(%esp), %edi C rp 0 + jmp L(x0) + +L(olp0): + lea 4(%edi), %edi C am 0 + movd (%esi), %mm7 C am 0 + lea 4(%esi), %esi C am 0 + mov %edi, %edx C rp am 0 + mov 20(%esp), %eax C up am 0 + movd (%eax), %mm3 C am 0 + mov 24(%esp), %ecx C inner loop count am 0 + pxor %mm6, %mm6 C am 0 + pmuludq %mm7, %mm3 C am 0 + movd 4(%eax), %mm0 C am 0 + movd (%edx), %mm5 C am 0 + pmuludq %mm7, %mm0 C am 0 + movd 8(%eax), %mm1 C am 0 + paddq %mm3, %mm5 C am 0 + movd 4(%edx), %mm4 C am 0 + jmp L(am00) C am 0 + ALIGN(16) C mm 0 +L(lam0): + pmuludq %mm7, %mm2 C am 0 + paddq %mm4, %mm6 C am 0 + movd (%eax), %mm3 C am 0 + paddq %mm1, %mm5 C am 0 + movd -4(%edx), %mm4 C am 0 + movd %mm6, -12(%edx) C am 0 + psrlq $32, %mm6 C am 0 + pmuludq %mm7, %mm3 C am 0 + paddq %mm5, %mm6 C am 0 + movd 4(%eax), %mm0 C am 0 + paddq %mm2, %mm4 C am 0 + movd (%edx), %mm5 C am 0 + movd %mm6, -8(%edx) C am 0 + psrlq $32, %mm6 C am 0 + pmuludq %mm7, %mm0 C am 0 + paddq %mm4, %mm6 C am 0 + movd 8(%eax), %mm1 C am 0 + paddq %mm3, %mm5 C am 0 + movd 4(%edx), %mm4 C am 0 + movd %mm6, -4(%edx) C am 0 + psrlq $32, %mm6 C am 0 +L(am00): + pmuludq %mm7, %mm1 C am 0 + paddq %mm5, %mm6 C am 0 + movd 12(%eax), %mm2 C am 0 + paddq %mm0, %mm4 C am 0 + movd 8(%edx), %mm5 C am 0 + movd %mm6, (%edx) C am 0 + psrlq $32, %mm6 C am 0 + lea 16(%eax), %eax C am 0 + lea 16(%edx), %edx C am 0 + add $4, %ecx C am 0 + jnz L(lam0) C am 0 + pmuludq %mm7, %mm2 C am 0 + paddq %mm4, %mm6 C am 0 + paddq %mm1, %mm5 C am 0 + movd -4(%edx), %mm4 C am 0 + movd %mm6, -12(%edx) C am 0 + psrlq $32, %mm6 C am 0 + paddq %mm5, %mm6 C am 0 + paddq %mm2, %mm4 C am 0 +L(x0): movd %mm6, -8(%edx) C am 0 + psrlq $32, %mm6 C am 0 + paddq %mm4, %mm6 C am 0 + movd %mm6, -4(%edx) C am 0 + psrlq $32, %mm6 C am 0 + movd %mm6, (%edx) C am 0 + dec %ebx C am 0 + jnz L(olp0) C am 0 +L(oel0): + emms C 0 + pop %edi C 0 + pop %ebx C 0 + pop %esi C 0 + ret C 0 + + +L(1): movd (%eax), %mm4 C m 1 + sub 24(%esp), %ecx C m 1 + mov %ecx, 24(%esp) C update loop count for later m 1 + pmuludq %mm7, %mm4 C m 1 + movd 4(%eax), %mm3 C m 1 + pmuludq %mm7, %mm3 C m 1 + movd 8(%eax), %mm0 C m 1 + jmp L(m01) C m 1 + ALIGN(16) C m 1 +L(lpm1): + pmuludq %mm7, %mm4 C m 1 + paddq %mm0, %mm6 C m 1 + movd 4(%eax), %mm3 C m 1 + movd %mm6, -8(%edx) C m 1 + psrlq $32, %mm6 C m 1 + pmuludq %mm7, %mm3 C m 1 + paddq %mm1, %mm6 C m 1 + movd 8(%eax), %mm0 C m 1 + movd %mm6, -4(%edx) C m 1 + psrlq $32, %mm6 C m 1 +L(m01): pmuludq %mm7, %mm0 C m 1 + paddq %mm4, %mm6 C m 1 + movd 12(%eax), %mm1 C m 1 + movd %mm6, (%edx) C m 1 + psrlq $32, %mm6 C m 1 + pmuludq %mm7, %mm1 C m 1 + paddq %mm3, %mm6 C m 1 + movd 16(%eax), %mm4 C m 1 + movd %mm6, 4(%edx) C m 1 + psrlq $32, %mm6 C m 1 + lea 16(%eax), %eax C m 1 + lea 16(%edx), %edx C m 1 + add $4, %ecx C m 1 + ja L(lpm1) C m 1 + pmuludq %mm7, %mm4 C m 1 + paddq %mm0, %mm6 C m 1 + movd %mm6, -8(%edx) C m 1 + 
psrlq $32, %mm6 C m 1 + paddq %mm1, %mm6 C m 1 + mov 16(%esp), %edi C rp 1 + jmp L(x1) + +L(olp1): + lea 4(%edi), %edi C am 1 + movd (%esi), %mm7 C am 1 + lea 4(%esi), %esi C am 1 + mov %edi, %edx C rp am 1 + mov 20(%esp), %eax C up am 1 + movd (%eax), %mm2 C am 1 + mov 24(%esp), %ecx C inner loop count am 1 + pxor %mm6, %mm6 C am 1 + pmuludq %mm7, %mm2 C am 1 + movd 4(%eax), %mm3 C am 1 + movd (%edx), %mm4 C am 1 + pmuludq %mm7, %mm3 C am 1 + movd 8(%eax), %mm0 C am 1 + paddq %mm2, %mm4 C am 1 + movd 4(%edx), %mm5 C am 1 + jmp L(am01) C am 1 + ALIGN(16) C am 1 +L(lam1): + pmuludq %mm7, %mm2 C am 1 + paddq %mm4, %mm6 C am 1 + movd 4(%eax), %mm3 C am 1 + paddq %mm1, %mm5 C am 1 + movd (%edx), %mm4 C am 1 + movd %mm6, -8(%edx) C am 1 + psrlq $32, %mm6 C am 1 + pmuludq %mm7, %mm3 C am 1 + paddq %mm5, %mm6 C am 1 + movd 8(%eax), %mm0 C am 1 + paddq %mm2, %mm4 C am 1 + movd 4(%edx), %mm5 C am 1 + movd %mm6, -4(%edx) C am 1 + psrlq $32, %mm6 C am 1 +L(am01): + pmuludq %mm7, %mm0 C am 1 + paddq %mm4, %mm6 C am 1 + movd 12(%eax), %mm1 C am 1 + paddq %mm3, %mm5 C am 1 + movd 8(%edx), %mm4 C am 1 + movd %mm6, (%edx) C am 1 + psrlq $32, %mm6 C am 1 + pmuludq %mm7, %mm1 C am 1 + paddq %mm5, %mm6 C am 1 + movd 16(%eax), %mm2 C am 1 + paddq %mm0, %mm4 C am 1 + movd 12(%edx), %mm5 C am 1 + movd %mm6, 4(%edx) C am 1 + psrlq $32, %mm6 C am 1 + lea 16(%eax), %eax C am 1 + lea 16(%edx), %edx C am 1 + add $4, %ecx C am 1 + jnz L(lam1) C am 1 + pmuludq %mm7, %mm2 C am 1 + paddq %mm4, %mm6 C am 1 + paddq %mm1, %mm5 C am 1 + movd (%edx), %mm4 C am 1 + movd %mm6, -8(%edx) C am 1 + psrlq $32, %mm6 C am 1 + paddq %mm5, %mm6 C am 1 + paddq %mm2, %mm4 C am 1 +L(x1): movd %mm6, -4(%edx) C am 1 + psrlq $32, %mm6 C am 1 + paddq %mm4, %mm6 C am 1 + movd %mm6, (%edx) C am 1 + psrlq $32, %mm6 C am 1 + movd %mm6, 4(%edx) C am 1 + dec %ebx C am 1 + jnz L(olp1) C am 1 +L(oel1): + emms C 1 + pop %edi C 1 + pop %ebx C 1 + pop %esi C 1 + ret C 1 + + +L(2): movd (%eax), %mm1 C m 2 + sub 24(%esp), %ecx C m 2 + mov %ecx, 24(%esp) C update loop count for later m 2 + pmuludq %mm7, %mm1 C m 2 + movd 4(%eax), %mm4 C m 2 + pmuludq %mm7, %mm4 C m 2 + movd 8(%eax), %mm3 C m 2 + jmp L(m10) C m 2 + ALIGN(16) C m 2 +L(lpm2): + pmuludq %mm7, %mm4 C m 2 + paddq %mm0, %mm6 C m 2 + movd 8(%eax), %mm3 C m 2 + movd %mm6, -4(%edx) C m 2 + psrlq $32, %mm6 C m 2 +L(m10): pmuludq %mm7, %mm3 C m 2 + paddq %mm1, %mm6 C m 2 + movd 12(%eax), %mm0 C m 2 + movd %mm6, (%edx) C m 2 + psrlq $32, %mm6 C m 2 + pmuludq %mm7, %mm0 C m 2 + paddq %mm4, %mm6 C m 2 + movd 16(%eax), %mm1 C m 2 + movd %mm6, 4(%edx) C m 2 + psrlq $32, %mm6 C m 2 + pmuludq %mm7, %mm1 C m 2 + paddq %mm3, %mm6 C m 2 + movd 20(%eax), %mm4 C m 2 + movd %mm6, 8(%edx) C m 2 + psrlq $32, %mm6 C m 2 + lea 16(%eax), %eax C m 2 + lea 16(%edx), %edx C m 2 + add $4, %ecx C m 2 + ja L(lpm2) C m 2 + pmuludq %mm7, %mm4 C m 2 + paddq %mm0, %mm6 C m 2 + movd %mm6, -4(%edx) C m 2 + psrlq $32, %mm6 C m 2 + paddq %mm1, %mm6 C m 2 + mov 16(%esp), %edi C rp 2 + jmp L(x2) + +L(olp2): + lea 4(%edi), %edi C am 2 + movd (%esi), %mm7 C am 2 + lea 4(%esi), %esi C am 2 + mov %edi, %edx C rp am 2 + mov 20(%esp), %eax C up am 2 + movd (%eax), %mm1 C am 2 + mov 24(%esp), %ecx C inner loop count am 2 + pxor %mm6, %mm6 C am 2 + pmuludq %mm7, %mm1 C am 2 + movd 4(%eax), %mm2 C am 2 + movd (%edx), %mm5 C am 2 + pmuludq %mm7, %mm2 C am 2 + movd 8(%eax), %mm3 C am 2 + paddq %mm1, %mm5 C am 2 + movd 4(%edx), %mm4 C am 2 + jmp L(am10) C am 2 + ALIGN(16) C am 2 +L(lam2): + pmuludq %mm7, %mm2 C am 2 + paddq %mm4, %mm6 C am 2 + 
movd 8(%eax), %mm3 C am 2 + paddq %mm1, %mm5 C am 2 + movd 4(%edx), %mm4 C am 2 + movd %mm6, -4(%edx) C am 2 + psrlq $32, %mm6 C am 2 +L(am10): + pmuludq %mm7, %mm3 C am 2 + paddq %mm5, %mm6 C am 2 + movd 12(%eax), %mm0 C am 2 + paddq %mm2, %mm4 C am 2 + movd 8(%edx), %mm5 C am 2 + movd %mm6, (%edx) C am 2 + psrlq $32, %mm6 C am 2 + pmuludq %mm7, %mm0 C am 2 + paddq %mm4, %mm6 C am 2 + movd 16(%eax), %mm1 C am 2 + paddq %mm3, %mm5 C am 2 + movd 12(%edx), %mm4 C am 2 + movd %mm6, 4(%edx) C am 2 + psrlq $32, %mm6 C am 2 + pmuludq %mm7, %mm1 C am 2 + paddq %mm5, %mm6 C am 2 + movd 20(%eax), %mm2 C am 2 + paddq %mm0, %mm4 C am 2 + movd 16(%edx), %mm5 C am 2 + movd %mm6, 8(%edx) C am 2 + psrlq $32, %mm6 C am 2 + lea 16(%eax), %eax C am 2 + lea 16(%edx), %edx C am 2 + add $4, %ecx C am 2 + jnz L(lam2) C am 2 + pmuludq %mm7, %mm2 C am 2 + paddq %mm4, %mm6 C am 2 + paddq %mm1, %mm5 C am 2 + movd 4(%edx), %mm4 C am 2 + movd %mm6, -4(%edx) C am 2 + psrlq $32, %mm6 C am 2 + paddq %mm5, %mm6 C am 2 + paddq %mm2, %mm4 C am 2 +L(x2): movd %mm6, (%edx) C am 2 + psrlq $32, %mm6 C am 2 + paddq %mm4, %mm6 C am 2 + movd %mm6, 4(%edx) C am 2 + psrlq $32, %mm6 C am 2 + movd %mm6, 8(%edx) C am 2 + dec %ebx C am 2 + jnz L(olp2) C am 2 +L(oel2): + emms C 2 + pop %edi C 2 + pop %ebx C 2 + pop %esi C 2 + ret C 2 + + +L(3): movd (%eax), %mm0 C m 3 + sub 24(%esp), %ecx C m 3 + mov %ecx, 24(%esp) C update loop count for later m 3 + pmuludq %mm7, %mm0 C m 3 + movd 4(%eax), %mm1 C m 3 + pmuludq %mm7, %mm1 C m 3 + movd 8(%eax), %mm4 C m 3 + jmp L(lpm3) C m 3 + ALIGN(16) C m 3 +L(lpm3): + pmuludq %mm7, %mm4 C m 3 + paddq %mm0, %mm6 C m 3 + movd 12(%eax), %mm3 C m 3 + movd %mm6, (%edx) C m 3 + psrlq $32, %mm6 C m 3 + pmuludq %mm7, %mm3 C m 3 + paddq %mm1, %mm6 C m 3 + movd 16(%eax), %mm0 C m 3 + movd %mm6, 4(%edx) C m 3 + psrlq $32, %mm6 C m 3 + pmuludq %mm7, %mm0 C m 3 + paddq %mm4, %mm6 C m 3 + movd 20(%eax), %mm1 C m 3 + movd %mm6, 8(%edx) C m 3 + psrlq $32, %mm6 C m 3 + pmuludq %mm7, %mm1 C m 3 + paddq %mm3, %mm6 C m 3 + movd 24(%eax), %mm4 C m 3 + movd %mm6, 12(%edx) C m 3 + psrlq $32, %mm6 C m 3 + lea 16(%eax), %eax C m 3 + lea 16(%edx), %edx C m 3 + add $4, %ecx C m 3 + ja L(lpm3) C m 3 + pmuludq %mm7, %mm4 C m 3 + paddq %mm0, %mm6 C m 3 + movd %mm6, (%edx) C m 3 + psrlq $32, %mm6 C m 3 + paddq %mm1, %mm6 C m 3 + mov 16(%esp), %edi C rp 3 + jmp L(x3) + +L(olp3): + lea 4(%edi), %edi C am 3 + movd (%esi), %mm7 C am 3 + lea 4(%esi), %esi C am 3 + mov %edi, %edx C rp am 3 + mov 20(%esp), %eax C up am 3 + movd (%eax), %mm0 C am 3 + mov 24(%esp), %ecx C inner loop count am 3 + pxor %mm6, %mm6 C am 3 + pmuludq %mm7, %mm0 C am 3 + movd 4(%eax), %mm1 C am 3 + movd (%edx), %mm4 C am 3 + pmuludq %mm7, %mm1 C am 3 + movd 8(%eax), %mm2 C am 3 + paddq %mm0, %mm4 C am 3 + movd 4(%edx), %mm5 C am 3 + jmp L(lam3) C am 3 + ALIGN(16) C am 3 +L(lam3): + pmuludq %mm7, %mm2 C am 3 + paddq %mm4, %mm6 C am 3 + movd 12(%eax), %mm3 C am 3 + paddq %mm1, %mm5 C am 3 + movd 8(%edx), %mm4 C am 3 + movd %mm6, (%edx) C am 3 + psrlq $32, %mm6 C am 3 + pmuludq %mm7, %mm3 C am 3 + paddq %mm5, %mm6 C am 3 + movd 16(%eax), %mm0 C am 3 + paddq %mm2, %mm4 C am 3 + movd 12(%edx), %mm5 C am 3 + movd %mm6, 4(%edx) C am 3 + psrlq $32, %mm6 C am 3 + pmuludq %mm7, %mm0 C am 3 + paddq %mm4, %mm6 C am 3 + movd 20(%eax), %mm1 C am 3 + paddq %mm3, %mm5 C am 3 + movd 16(%edx), %mm4 C am 3 + movd %mm6, 8(%edx) C am 3 + psrlq $32, %mm6 C am 3 + pmuludq %mm7, %mm1 C am 3 + paddq %mm5, %mm6 C am 3 + movd 24(%eax), %mm2 C am 3 + paddq %mm0, %mm4 C am 3 + movd 20(%edx), 
%mm5 C am 3 + movd %mm6, 12(%edx) C am 3 + psrlq $32, %mm6 C am 3 + lea 16(%eax), %eax C am 3 + lea 16(%edx), %edx C am 3 + add $4, %ecx C am 3 + jnz L(lam3) C am 3 + pmuludq %mm7, %mm2 C am 3 + paddq %mm4, %mm6 C am 3 + paddq %mm1, %mm5 C am 3 + movd 8(%edx), %mm4 C am 3 + movd %mm6, (%edx) C am 3 + psrlq $32, %mm6 C am 3 + paddq %mm5, %mm6 C am 3 + paddq %mm2, %mm4 C am 3 +L(x3): movd %mm6, 4(%edx) C am 3 + psrlq $32, %mm6 C am 3 + paddq %mm4, %mm6 C am 3 + movd %mm6, 8(%edx) C am 3 + psrlq $32, %mm6 C am 3 + movd %mm6, 12(%edx) C am 3 + dec %ebx C am 3 + jnz L(olp3) C am 3 +L(oel3): + emms C 3 + pop %edi C 3 + pop %ebx C 3 + pop %esi C 3 + ret C 3 +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm new file mode 100644 index 0000000..c7f4426 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm @@ -0,0 +1,281 @@ +dnl X86-32 and X86-64 mpn_popcount using SSE2. + +dnl Copyright 2006, 2007, 2011, 2015, 2020 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + + +C 32-bit popcount hamdist +C cycles/limb cycles/limb +C P5 - +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) 4 +C P4 model 0 (Willamette) ? +C P4 model 1 (?) ? +C P4 model 2 (Northwood) 3.9 +C P4 model 3 (Prescott) ? +C P4 model 4 (Nocona) ? +C AMD K6 - +C AMD K7 - +C AMD K8 ? + +C 64-bit popcount hamdist +C cycles/limb cycles/limb +C P4 model 4 (Nocona): 8 +C AMD K8,K9 7.5 +C AMD K10 3.5 +C Intel core2 3.68 +C Intel corei 3.15 +C Intel atom 10.8 +C VIA nano 6.5 + +C TODO +C * Make an mpn_hamdist based on this. Alignment could either be handled by +C using movdqu for one operand and movdqa for the other, or by painfully +C shifting as we go. Unfortunately, there seem to be no usable shift +C instruction, except for one that takes an immediate count. +C * It would probably be possible to cut a few cycles/limb using software +C pipelining. +C * There are 35 decode slots unused by the SSE2 instructions. Loop control +C needs just 2 or 3 slots, leaving around 32 slots. This allows a parallel +C integer based popcount. Such a combined loop would handle 6 limbs in +C about 30 cycles on K8. +C * We could save a byte or two by using 32-bit operations on areg. +C * Check if using movdqa to a temp of and then register-based pand is faster. 
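What the SSE2 loop below does, sixteen bytes at a time, is the classic mask-and-add bit count: form 2-bit, 4-bit and then 8-bit partial sums with the 0x55.., 0x33.. and 0x0f.. constants stored at the end of this file, then add the bytes together (psadbw against zero in the vector code). A scalar C sketch of the same reduction, one 32-bit word per step; the function name and the final byte-summing multiply are illustrative, not how the assembly finishes:

#include <stddef.h>
#include <stdint.h>

/* Sketch only: population count of n 32-bit words. */
static uint64_t
popcount_sketch (const uint32_t *p, size_t n)
{
  uint64_t total = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint32_t x = p[i];
      x = x - ((x >> 1) & 0x55555555u);                  /* 2-bit sums */
      x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);  /* 4-bit sums */
      x = (x + (x >> 4)) & 0x0f0f0f0fu;                  /* 8-bit sums */
      total += (x * 0x01010101u) >> 24;                  /* add the bytes */
    }
  return total;
}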
+ +ifelse(GMP_LIMB_BITS,`32', +` define(`up', `%edx') + define(`n', `%ecx') + define(`areg',`%eax') + define(`breg',`%ebx') + define(`zero',`%xmm4') + define(`LIMB32',` $1') + define(`LIMB64',`dnl') +',` + define(`up', `%rdi') + define(`n', `%rsi') + define(`areg',`%rax') + define(`breg',`%rdx') + define(`zero',`%xmm8') + define(`LIMB32',`dnl') + define(`LIMB64',` $1') +') + +define(`mm01010101',`%xmm6') +define(`mm00110011',`%xmm7') +define(`mm00001111',`%xmm2') + +define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8)) +define(`LIMBS_PER_XMM', eval(16/GMP_LIMB_BYTES)) +define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES)) + +undefine(`psadbw') C override inherited m4 version + +C This file is shared between 32-bit and 64-bit builds. Only the former has +C LEAL. Default LEAL as an alias of LEA. +ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')') + +ASM_START() + +C Make cnsts global to work around Apple relocation bug. +ifdef(`DARWIN',` + define(`cnsts', MPN(popccnsts)) + GLOBL cnsts') + + TEXT + ALIGN(32) +PROLOGUE(mpn_popcount) + +LIMB32(`mov 4(%esp), up ') +LIMB32(`mov 8(%esp), n ') +LIMB32(`push %ebx ') + + pxor %xmm3, %xmm3 C zero grand total count +LIMB64(`pxor zero, zero ') + + LEAL( cnsts, breg) + + movdqa -48(breg), mm01010101 + movdqa -32(breg), mm00110011 + movdqa -16(breg), mm00001111 + + mov up, areg + and $-16, up C round `up' down to 128-bit boundary + and $12, areg C 32:areg = 0, 4, 8, 12 + C 64:areg = 0, 8 + movdqa (up), %xmm0 + pand 64(breg,areg,4), %xmm0 + shr $m4_log2(GMP_LIMB_BYTES), %eax + add areg, n C compensate n for rounded down `up' + + pxor %xmm4, %xmm4 + sub $LIMBS_PER_XMM, n + jbe L(sum) + + sub $LIMBS_PER_XMM, n + ja L(ent) + jmp L(lsum) + + ALIGN(16) +L(top): movdqa (up), %xmm0 +L(ent): movdqa 16(up), %xmm4 + + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + psrld $1, %xmm0 + psrld $1, %xmm4 + pand mm01010101, %xmm0 + pand mm01010101, %xmm4 + psubd %xmm0, %xmm1 + psubd %xmm4, %xmm5 + + movdqa %xmm1, %xmm0 + movdqa %xmm5, %xmm4 + psrlq $2, %xmm1 + psrlq $2, %xmm5 + pand mm00110011, %xmm0 + pand mm00110011, %xmm4 + pand mm00110011, %xmm1 + pand mm00110011, %xmm5 + paddq %xmm0, %xmm1 + paddq %xmm4, %xmm5 + +LIMB32(`pxor zero, zero ') + + add $32, up + sub $LIMBS_PER_2XMM, n + + paddq %xmm5, %xmm1 + movdqa %xmm1, %xmm0 + psrlq $4, %xmm1 + pand mm00001111, %xmm0 + pand mm00001111, %xmm1 + paddq %xmm0, %xmm1 + + psadbw zero, %xmm1 + paddq %xmm1, %xmm3 C add to grand total + + jnc L(top) +L(end): + add $LIMBS_PER_2XMM, n + jz L(rt) + movdqa (up), %xmm0 + pxor %xmm4, %xmm4 + sub $LIMBS_PER_XMM, n + jbe L(sum) +L(lsum): + movdqa %xmm0, %xmm4 + movdqa 16(up), %xmm0 +L(sum): + shl $m4_log2(GMP_LIMB_BYTES), n + and $12, n + pand (breg,n,4), %xmm0 + + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + psrld $1, %xmm0 + psrld $1, %xmm4 + pand mm01010101, %xmm0 + pand mm01010101, %xmm4 + psubd %xmm0, %xmm1 + psubd %xmm4, %xmm5 + + movdqa %xmm1, %xmm0 + movdqa %xmm5, %xmm4 + psrlq $2, %xmm1 + psrlq $2, %xmm5 + pand mm00110011, %xmm0 + pand mm00110011, %xmm4 + pand mm00110011, %xmm1 + pand mm00110011, %xmm5 + paddq %xmm0, %xmm1 + paddq %xmm4, %xmm5 + +LIMB32(`pxor zero, zero ') + + paddq %xmm5, %xmm1 + movdqa %xmm1, %xmm0 + psrlq $4, %xmm1 + pand mm00001111, %xmm0 + pand mm00001111, %xmm1 + paddq %xmm0, %xmm1 + + psadbw zero, %xmm1 + paddq %xmm1, %xmm3 C add to grand total + + +C Add the two 64-bit halves of the grand total counter +L(rt): movdqa %xmm3, %xmm0 + psrldq $8, %xmm3 + paddq %xmm3, %xmm0 + movd %xmm0, areg C movq avoided due to gas bug + +LIMB32(`pop %ebx ') + ret + +EPILOGUE() 
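The TODO list above mentions deriving an mpn_hamdist from this routine. In plain C that would just be the same per-word reduction applied to the xor of the two operands; a hypothetical sketch, again assuming 32-bit limbs and names that are not part of this code:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical sketch: Hamming distance of two n-limb operands as the
   population count of their xor. */
static uint64_t
hamdist_sketch (const uint32_t *up, const uint32_t *vp, size_t n)
{
  uint64_t total = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint32_t x = up[i] ^ vp[i];
      x = x - ((x >> 1) & 0x55555555u);
      x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);
      x = (x + (x >> 4)) & 0x0f0f0f0fu;
      total += (x * 0x01010101u) >> 24;
    }
  return total;
}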
+DEF_OBJECT(dummy,16) +C Three magic constants used for masking out bits + .byte 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55 + .byte 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55 + + .byte 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33 + .byte 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33 + + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f +cnsts: +C Masks for high end of number + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + + .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 + .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +C Masks for low end of number + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + + .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + + .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + + .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff +END_OBJECT(dummy) +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm new file mode 100644 index 0000000..f421d13 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm @@ -0,0 +1,126 @@ +dnl Intel Pentium-4 mpn_rsh1add_n -- mpn (x+y)/2 + +dnl Copyright 2001-2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb (approx) +C dst!=src1,2 dst==src1 dst==src2 +C P4: 4.5 6.5 6.5 + + +C mp_limb_t mpn_rsh1add_n (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, +C mp_size_t size); +C +C The slightly strange combination of indexing and pointer incrementing +C that's used seems to work best. Not sure why, but for instance leal +C incrementing on %esi is a 1 or 2 cycle slowdown. +C +C The dependent chain is paddq combining the carry and next (shifted) part, +C plus psrlq to move the new carry down. That, and just 4 mmx instructions +C in total, makes 4 c/l the target speed, which is almost achieved for +C separate src/dst but when src==dst the write combining anomalies slow it +C down. 
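In C terms the operation is a limb-wise add followed by a one-bit right shift across the whole result: the addition's carry-out becomes the top bit of the last limb, and the bit shifted out at the bottom is the return value. A sketch under the assumption of 32-bit limbs (illustrative looping code, not the asm's single-pass mmx scheme):

#include <stddef.h>
#include <stdint.h>

/* Sketch only: wp[] = (xp[] + yp[]) >> 1 over n >= 1 limbs; returns the
   bit shifted out at the low end, as mpn_rsh1add_n does. */
static uint32_t
rsh1add_n_sketch (uint32_t *wp, const uint32_t *xp, const uint32_t *yp,
                  size_t n)
{
  uint64_t t = (uint64_t) xp[0] + yp[0];
  uint32_t prev = (uint32_t) t;            /* low sum limb */
  uint32_t cy = (uint32_t) (t >> 32);      /* carry between limbs */
  uint32_t ret = prev & 1;                 /* bit lost by the shift */

  for (size_t i = 1; i < n; i++)
    {
      t = (uint64_t) xp[i] + yp[i] + cy;
      uint32_t s = (uint32_t) t;
      cy = (uint32_t) (t >> 32);
      wp[i - 1] = (prev >> 1) | (s << 31);
      prev = s;
    }
  wp[n - 1] = (prev >> 1) | (cy << 31);    /* carry-out becomes the top bit */
  return ret;
}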
+ +defframe(PARAM_SIZE, 16) +defframe(PARAM_YP, 12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + +dnl re-use parameter space +define(SAVE_EBX,`PARAM_XP') +define(SAVE_ESI,`PARAM_YP') + + TEXT + ALIGN(8) + +PROLOGUE(mpn_rsh1add_n) +deflit(`FRAME',0) + + movl PARAM_XP, %edx + movl %ebx, SAVE_EBX + + movl PARAM_YP, %ebx + movl %esi, SAVE_ESI + + movl PARAM_WP, %esi + + movd (%edx), %mm0 C xp[0] + + movd (%ebx), %mm1 C yp[0] + movl PARAM_SIZE, %ecx + + movl (%edx), %eax C xp[0] + + addl (%ebx), %eax C xp[0]+yp[0] + + paddq %mm1, %mm0 C xp[0]+yp[0] + leal (%esi,%ecx,4), %esi C wp end + negl %ecx C -size + + psrlq $1, %mm0 C (xp[0]+yp[0])/2 + and $1, %eax C return value, rsh1 bit of xp[0]+yp[0] + addl $1, %ecx C -(size-1) + jz L(done) + + +L(top): + C eax return value + C ebx yp end + C ecx counter, limbs, -(size-1) to -1 inclusive + C edx xp end + C esi wp end + C mm0 carry (32 bits) + + movd 4(%edx), %mm1 C xp[i+1] + movd 4(%ebx), %mm2 C yp[i+1] + leal 4(%edx), %edx + leal 4(%ebx), %ebx + paddq %mm2, %mm1 C xp[i+1]+yp[i+1] + psllq $31, %mm1 C low bit at 31, further 32 above + + paddq %mm1, %mm0 C 31 and carry from prev add + movd %mm0, -4(%esi,%ecx,4) C low ready to store dst[i] + + psrlq $32, %mm0 C high becomes new carry + + addl $1, %ecx + jnz L(top) + + +L(done): + movd %mm0, -4(%esi) C dst[size-1] + movl SAVE_EBX, %ebx + + movl SAVE_ESI, %esi + emms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm new file mode 100644 index 0000000..0d548e0 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm @@ -0,0 +1,705 @@ +dnl mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). + +dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO: +C * Improve ad-hoc outer loop code and register handling. Some feed-in +C scheduling could improve things by several cycles per outer iteration. +C * In Lam3...Lam1 code for, keep accumulation operands in registers, without +C storing intermediates to rp. +C * We might want to keep 32 in a free mm register, since the register form is +C 3 bytes and the immediate form is 4 bytes. About 80 bytes to save. +C * Look into different loop alignment, we now expand the code about 50 bytes +C with possibly needless alignment. +C * Use OSP, should solve feed-in latency problems. +C * Address relative slowness for un<=3 for Pentium M. The old code is there +C considerably faster. 
(1:20/14, 2:34/32, 3:66/57) + +C INPUT PARAMETERS +C rp sp + 4 +C up sp + 8 +C un sp + 12 + + TEXT + ALIGN(16) +PROLOGUE(mpn_sqr_basecase) + mov 4(%esp), %edx C rp + mov 8(%esp), %eax C up + mov 12(%esp), %ecx C un + + cmp $2, %ecx + jc L(un1) + jz L(un2) + cmp $4, %ecx + jc L(un3) + jz L(un4) + jmp L(big) + +L(un1): mov (%eax), %eax + mov %edx, %ecx + mul %eax + mov %eax, (%ecx) + mov %edx, 4(%ecx) + ret +L(un2): movd (%eax), %mm0 C un=2 + movd (%eax), %mm2 C un=2 + movd 4(%eax), %mm1 C un=2 + pmuludq %mm0, %mm0 C 64b weight 0 un=2 + pmuludq %mm1, %mm2 C 64b weight 32 un=2 + pmuludq %mm1, %mm1 C 64b weight 64 un=2 + movd %mm0, (%edx) C un=2 + psrlq $32, %mm0 C 32b weight 32 un=2 + pcmpeqd %mm7, %mm7 C un=2 + psrlq $33, %mm7 C 0x000000007FFFFFFF un=2 + pand %mm2, %mm7 C 31b weight 32 un=2 + psrlq $31, %mm2 C 33b weight 65 un=2 + psllq $1, %mm7 C 31b weight 33 un=2 + paddq %mm7, %mm0 C un=2 + movd %mm0, 4(%edx) C un=2 + psrlq $32, %mm0 C un=2 + paddq %mm2, %mm1 C un=2 + paddq %mm0, %mm1 C un=2 + movd %mm1, 8(%edx) C un=2 + psrlq $32, %mm1 C un=2 + movd %mm1, 12(%edx) C un=2 + emms + ret +L(un3): movd (%eax), %mm7 C un=3 + movd 4(%eax), %mm6 C un=3 + pmuludq %mm7, %mm6 C un=3 + movd 8(%eax), %mm2 C un=3 + pmuludq %mm7, %mm2 C un=3 + movd %mm6, 4(%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm2, %mm6 C un=3 + movd %mm6, 8(%edx) C un=3 + psrlq $32, %mm6 C un=3 + movd %mm6, 12(%edx) C un=3 + lea 4(%edx), %edx C un=3 + lea 4(%eax), %eax C un=3 + jmp L(am1) +L(un4): movd (%eax), %mm7 C un=4 + movd 4(%eax), %mm6 C un=4 + pmuludq %mm7, %mm6 C un=4 + movd 8(%eax), %mm0 C un=4 + pmuludq %mm7, %mm0 C un=4 + movd 12(%eax), %mm1 C un=4 + pmuludq %mm7, %mm1 C un=4 + movd %mm6, 4(%edx) C un=4 + psrlq $32, %mm6 C un=4 + paddq %mm0, %mm6 C un=4 + movd %mm6, 8(%edx) C un=4 + psrlq $32, %mm6 C un=4 + paddq %mm1, %mm6 C un=4 + movd %mm6, 12(%edx) C un=4 + psrlq $32, %mm6 C un=4 + movd %mm6, 16(%edx) C un=4 + lea 4(%edx), %edx C un=4 + lea 4(%eax), %eax C un=4 + jmp L(am2) + +L(big): push %esi + push %ebx + push %edi + pxor %mm6, %mm6 + movd (%eax), %mm7 C + lea 4(%eax), %esi C init up, up++ + lea 4(%eax), %eax C up2++ FIXME: should fix offsets + lea 4(%edx), %edi C init rp, rp++ + lea 4(%edx), %edx C rp2++ + lea -4(%ecx), %ebx C loop count + and $3, %ecx + jz L(3m) + cmp $2, %ecx + ja L(2m) + jb L(0m) + +L(1m): + movd (%eax), %mm4 C m 1 + lea (%ebx), %ecx C inner loop count m 1 + pmuludq %mm7, %mm4 C m 1 + movd 4(%eax), %mm3 C m 1 + pmuludq %mm7, %mm3 C m 1 + movd 8(%eax), %mm0 C m 1 + jmp L(m01) C m 1 + ALIGN(16) C m 1 +L(lpm1): + pmuludq %mm7, %mm4 C m 1 + paddq %mm0, %mm6 C m 1 + movd 4(%eax), %mm3 C m 1 + movd %mm6, -8(%edx) C m 1 + psrlq $32, %mm6 C m 1 + pmuludq %mm7, %mm3 C m 1 + paddq %mm1, %mm6 C m 1 + movd 8(%eax), %mm0 C m 1 + movd %mm6, -4(%edx) C m 1 + psrlq $32, %mm6 C m 1 +L(m01): pmuludq %mm7, %mm0 C m 1 + paddq %mm4, %mm6 C m 1 + movd 12(%eax), %mm1 C m 1 + movd %mm6, (%edx) C m 1 + psrlq $32, %mm6 C m 1 + pmuludq %mm7, %mm1 C m 1 + paddq %mm3, %mm6 C m 1 + movd 16(%eax), %mm4 C m 1 + movd %mm6, 4(%edx) C m 1 + psrlq $32, %mm6 C m 1 + lea 16(%eax), %eax C m 1 + lea 16(%edx), %edx C m 1 + sub $4, %ecx C m 1 + ja L(lpm1) C m 1 + pmuludq %mm7, %mm4 C m 1 + paddq %mm0, %mm6 C m 1 + movd %mm6, -8(%edx) C m 1 + psrlq $32, %mm6 C m 1 + paddq %mm1, %mm6 C m 1 + jmp L(0) + +L(2m): + movd (%eax), %mm1 C m 2 + lea (%ebx), %ecx C inner loop count m 2 + pmuludq %mm7, %mm1 C m 2 + movd 4(%eax), %mm4 C m 2 + pmuludq %mm7, %mm4 C m 2 + movd 8(%eax), %mm3 C m 2 + jmp L(m10) C m 2 + ALIGN(16) C m 2 +L(lpm2): 
+ pmuludq %mm7, %mm4 C m 2 + paddq %mm0, %mm6 C m 2 + movd 8(%eax), %mm3 C m 2 + movd %mm6, -4(%edx) C m 2 + psrlq $32, %mm6 C m 2 +L(m10): pmuludq %mm7, %mm3 C m 2 + paddq %mm1, %mm6 C m 2 + movd 12(%eax), %mm0 C m 2 + movd %mm6, (%edx) C m 2 + psrlq $32, %mm6 C m 2 + pmuludq %mm7, %mm0 C m 2 + paddq %mm4, %mm6 C m 2 + movd 16(%eax), %mm1 C m 2 + movd %mm6, 4(%edx) C m 2 + psrlq $32, %mm6 C m 2 + pmuludq %mm7, %mm1 C m 2 + paddq %mm3, %mm6 C m 2 + movd 20(%eax), %mm4 C m 2 + movd %mm6, 8(%edx) C m 2 + psrlq $32, %mm6 C m 2 + lea 16(%eax), %eax C m 2 + lea 16(%edx), %edx C m 2 + sub $4, %ecx C m 2 + ja L(lpm2) C m 2 + pmuludq %mm7, %mm4 C m 2 + paddq %mm0, %mm6 C m 2 + movd %mm6, -4(%edx) C m 2 + psrlq $32, %mm6 C m 2 + paddq %mm1, %mm6 C m 2 + jmp L(1) + +L(3m): + movd (%eax), %mm0 C m 3 + lea (%ebx), %ecx C inner loop count m 3 + pmuludq %mm7, %mm0 C m 3 + movd 4(%eax), %mm1 C m 3 + pmuludq %mm7, %mm1 C m 3 + movd 8(%eax), %mm4 C m 3 + jmp L(lpm3) C m 3 + ALIGN(16) C m 3 +L(lpm3): + pmuludq %mm7, %mm4 C m 3 + paddq %mm0, %mm6 C m 3 + movd 12(%eax), %mm3 C m 3 + movd %mm6, (%edx) C m 3 + psrlq $32, %mm6 C m 3 + pmuludq %mm7, %mm3 C m 3 + paddq %mm1, %mm6 C m 3 + movd 16(%eax), %mm0 C m 3 + movd %mm6, 4(%edx) C m 3 + psrlq $32, %mm6 C m 3 + pmuludq %mm7, %mm0 C m 3 + paddq %mm4, %mm6 C m 3 + movd 20(%eax), %mm1 C m 3 + movd %mm6, 8(%edx) C m 3 + psrlq $32, %mm6 C m 3 + pmuludq %mm7, %mm1 C m 3 + paddq %mm3, %mm6 C m 3 + movd 24(%eax), %mm4 C m 3 + movd %mm6, 12(%edx) C m 3 + psrlq $32, %mm6 C m 3 + lea 16(%eax), %eax C m 3 + lea 16(%edx), %edx C m 3 + sub $4, %ecx C m 3 + ja L(lpm3) C m 3 + pmuludq %mm7, %mm4 C m 3 + paddq %mm0, %mm6 C m 3 + movd %mm6, (%edx) C m 3 + psrlq $32, %mm6 C m 3 + paddq %mm1, %mm6 C m 3 + jmp L(2) + +L(0m): + movd (%eax), %mm3 C m 0 + lea (%ebx), %ecx C inner loop count m 0 + pmuludq %mm7, %mm3 C m 0 + movd 4(%eax), %mm0 C m 0 + pmuludq %mm7, %mm0 C m 0 + movd 8(%eax), %mm1 C m 0 + jmp L(m00) C m 0 + ALIGN(16) C m 0 +L(lpm0): + pmuludq %mm7, %mm4 C m 0 + paddq %mm0, %mm6 C m 0 + movd (%eax), %mm3 C m 0 + movd %mm6, -12(%edx) C m 0 + psrlq $32, %mm6 C m 0 + pmuludq %mm7, %mm3 C m 0 + paddq %mm1, %mm6 C m 0 + movd 4(%eax), %mm0 C m 0 + movd %mm6, -8(%edx) C m 0 + psrlq $32, %mm6 C m 0 + pmuludq %mm7, %mm0 C m 0 + paddq %mm4, %mm6 C m 0 + movd 8(%eax), %mm1 C m 0 + movd %mm6, -4(%edx) C m 0 + psrlq $32, %mm6 C m 0 +L(m00): pmuludq %mm7, %mm1 C m 0 + paddq %mm3, %mm6 C m 0 + movd 12(%eax), %mm4 C m 0 + movd %mm6, (%edx) C m 0 + psrlq $32, %mm6 C m 0 + lea 16(%eax), %eax C m 0 + lea 16(%edx), %edx C m 0 + sub $4, %ecx C m 0 + ja L(lpm0) C m 0 + pmuludq %mm7, %mm4 C m 0 + paddq %mm0, %mm6 C m 0 + movd %mm6, -12(%edx) C m 0 + psrlq $32, %mm6 C m 0 + paddq %mm1, %mm6 C m 0 + jmp L(3) + +L(outer): + lea 8(%edi), %edi C rp += 2 + movd (%esi), %mm7 C am 3 + mov %edi, %edx C rp2 = rp am 3 + lea 4(%esi), %esi C up++ am 3 + lea (%esi), %eax C up2 = up am 3 + movd (%eax), %mm0 C am 3 + lea (%ebx), %ecx C inner loop count am 3 + pxor %mm6, %mm6 C am 3 + pmuludq %mm7, %mm0 C am 3 + movd 4(%eax), %mm1 C am 3 + movd (%edx), %mm4 C am 3 + pmuludq %mm7, %mm1 C am 3 + movd 8(%eax), %mm2 C am 3 + paddq %mm0, %mm4 C am 3 + movd 4(%edx), %mm5 C am 3 + jmp L(lam3) C am 3 + ALIGN(16) C am 3 +L(lam3): + pmuludq %mm7, %mm2 C am 3 + paddq %mm4, %mm6 C am 3 + movd 12(%eax), %mm3 C am 3 + paddq %mm1, %mm5 C am 3 + movd 8(%edx), %mm4 C am 3 + movd %mm6, (%edx) C am 3 + psrlq $32, %mm6 C am 3 + pmuludq %mm7, %mm3 C am 3 + paddq %mm5, %mm6 C am 3 + movd 16(%eax), %mm0 C am 3 + paddq %mm2, %mm4 C 
am 3 + movd 12(%edx), %mm5 C am 3 + movd %mm6, 4(%edx) C am 3 + psrlq $32, %mm6 C am 3 + pmuludq %mm7, %mm0 C am 3 + paddq %mm4, %mm6 C am 3 + movd 20(%eax), %mm1 C am 3 + paddq %mm3, %mm5 C am 3 + movd 16(%edx), %mm4 C am 3 + movd %mm6, 8(%edx) C am 3 + psrlq $32, %mm6 C am 3 + pmuludq %mm7, %mm1 C am 3 + paddq %mm5, %mm6 C am 3 + movd 24(%eax), %mm2 C am 3 + paddq %mm0, %mm4 C am 3 + movd 20(%edx), %mm5 C am 3 + movd %mm6, 12(%edx) C am 3 + psrlq $32, %mm6 C am 3 + lea 16(%eax), %eax C am 3 + lea 16(%edx), %edx C am 3 + sub $4, %ecx C am 3 + ja L(lam3) C am 3 + pmuludq %mm7, %mm2 C am 3 + paddq %mm4, %mm6 C am 3 + paddq %mm1, %mm5 C am 3 + movd 8(%edx), %mm4 C am 3 + movd %mm6, (%edx) C am 3 + psrlq $32, %mm6 C am 3 + paddq %mm5, %mm6 C am 3 + paddq %mm2, %mm4 C am 3 +L(2): movd %mm6, 4(%edx) C am 3 + psrlq $32, %mm6 C am 3 + paddq %mm4, %mm6 C am 3 + movd %mm6, 8(%edx) C am 3 + psrlq $32, %mm6 C am 3 + movd %mm6, 12(%edx) C am 3 + + lea 8(%edi), %edi C rp += 2 + movd (%esi), %mm7 C am 2 + mov %edi, %edx C rp2 = rp am 2 + lea 4(%esi), %esi C up++ am 2 + lea (%esi), %eax C up2 = up am 2 + movd (%eax), %mm1 C am 2 + lea (%ebx), %ecx C inner loop count am 2 + pxor %mm6, %mm6 C am 2 + pmuludq %mm7, %mm1 C am 2 + movd 4(%eax), %mm2 C am 2 + movd (%edx), %mm5 C am 2 + pmuludq %mm7, %mm2 C am 2 + movd 8(%eax), %mm3 C am 2 + paddq %mm1, %mm5 C am 2 + movd 4(%edx), %mm4 C am 2 + jmp L(am10) C am 2 + ALIGN(16) C am 2 +L(lam2): + pmuludq %mm7, %mm2 C am 2 + paddq %mm4, %mm6 C am 2 + movd 8(%eax), %mm3 C am 2 + paddq %mm1, %mm5 C am 2 + movd 4(%edx), %mm4 C am 2 + movd %mm6, -4(%edx) C am 2 + psrlq $32, %mm6 C am 2 +L(am10): + pmuludq %mm7, %mm3 C am 2 + paddq %mm5, %mm6 C am 2 + movd 12(%eax), %mm0 C am 2 + paddq %mm2, %mm4 C am 2 + movd 8(%edx), %mm5 C am 2 + movd %mm6, (%edx) C am 2 + psrlq $32, %mm6 C am 2 + pmuludq %mm7, %mm0 C am 2 + paddq %mm4, %mm6 C am 2 + movd 16(%eax), %mm1 C am 2 + paddq %mm3, %mm5 C am 2 + movd 12(%edx), %mm4 C am 2 + movd %mm6, 4(%edx) C am 2 + psrlq $32, %mm6 C am 2 + pmuludq %mm7, %mm1 C am 2 + paddq %mm5, %mm6 C am 2 + movd 20(%eax), %mm2 C am 2 + paddq %mm0, %mm4 C am 2 + movd 16(%edx), %mm5 C am 2 + movd %mm6, 8(%edx) C am 2 + psrlq $32, %mm6 C am 2 + lea 16(%eax), %eax C am 2 + lea 16(%edx), %edx C am 2 + sub $4, %ecx C am 2 + ja L(lam2) C am 2 + pmuludq %mm7, %mm2 C am 2 + paddq %mm4, %mm6 C am 2 + paddq %mm1, %mm5 C am 2 + movd 4(%edx), %mm4 C am 2 + movd %mm6, -4(%edx) C am 2 + psrlq $32, %mm6 C am 2 + paddq %mm5, %mm6 C am 2 + paddq %mm2, %mm4 C am 2 +L(1): movd %mm6, (%edx) C am 2 + psrlq $32, %mm6 C am 2 + paddq %mm4, %mm6 C am 2 + movd %mm6, 4(%edx) C am 2 + psrlq $32, %mm6 C am 2 + movd %mm6, 8(%edx) C am 2 + + lea 8(%edi), %edi C rp += 2 + movd (%esi), %mm7 C am 1 + mov %edi, %edx C rp2 = rp am 1 + lea 4(%esi), %esi C up++ am 1 + lea (%esi), %eax C up2 = up am 1 + movd (%eax), %mm2 C am 1 + lea (%ebx), %ecx C inner loop count am 1 + pxor %mm6, %mm6 C am 1 + pmuludq %mm7, %mm2 C am 1 + movd 4(%eax), %mm3 C am 1 + movd (%edx), %mm4 C am 1 + pmuludq %mm7, %mm3 C am 1 + movd 8(%eax), %mm0 C am 1 + paddq %mm2, %mm4 C am 1 + movd 4(%edx), %mm5 C am 1 + jmp L(am01) C am 1 + ALIGN(16) C am 1 +L(lam1): + pmuludq %mm7, %mm2 C am 1 + paddq %mm4, %mm6 C am 1 + movd 4(%eax), %mm3 C am 1 + paddq %mm1, %mm5 C am 1 + movd (%edx), %mm4 C am 1 + movd %mm6, -8(%edx) C am 1 + psrlq $32, %mm6 C am 1 + pmuludq %mm7, %mm3 C am 1 + paddq %mm5, %mm6 C am 1 + movd 8(%eax), %mm0 C am 1 + paddq %mm2, %mm4 C am 1 + movd 4(%edx), %mm5 C am 1 + movd %mm6, -4(%edx) C am 1 + psrlq $32, 
%mm6 C am 1 +L(am01): + pmuludq %mm7, %mm0 C am 1 + paddq %mm4, %mm6 C am 1 + movd 12(%eax), %mm1 C am 1 + paddq %mm3, %mm5 C am 1 + movd 8(%edx), %mm4 C am 1 + movd %mm6, (%edx) C am 1 + psrlq $32, %mm6 C am 1 + pmuludq %mm7, %mm1 C am 1 + paddq %mm5, %mm6 C am 1 + movd 16(%eax), %mm2 C am 1 + paddq %mm0, %mm4 C am 1 + movd 12(%edx), %mm5 C am 1 + movd %mm6, 4(%edx) C am 1 + psrlq $32, %mm6 C am 1 + lea 16(%eax), %eax C am 1 + lea 16(%edx), %edx C am 1 + sub $4, %ecx C am 1 + ja L(lam1) C am 1 + pmuludq %mm7, %mm2 C am 1 + paddq %mm4, %mm6 C am 1 + paddq %mm1, %mm5 C am 1 + movd (%edx), %mm4 C am 1 + movd %mm6, -8(%edx) C am 1 + psrlq $32, %mm6 C am 1 + paddq %mm5, %mm6 C am 1 + paddq %mm2, %mm4 C am 1 +L(0): movd %mm6, -4(%edx) C am 1 + psrlq $32, %mm6 C am 1 + paddq %mm4, %mm6 C am 1 + movd %mm6, (%edx) C am 1 + psrlq $32, %mm6 C am 1 + movd %mm6, 4(%edx) C am 1 + + lea 8(%edi), %edi C rp += 2 + movd (%esi), %mm7 C am 0 + mov %edi, %edx C rp2 = rp am 0 + lea 4(%esi), %esi C up++ am 0 + lea (%esi), %eax C up2 = up am 0 + movd (%eax), %mm3 C am 0 + lea (%ebx), %ecx C inner loop count am 0 + pxor %mm6, %mm6 C am 0 + pmuludq %mm7, %mm3 C am 0 + movd 4(%eax), %mm0 C am 0 + movd (%edx), %mm5 C am 0 + pmuludq %mm7, %mm0 C am 0 + movd 8(%eax), %mm1 C am 0 + paddq %mm3, %mm5 C am 0 + movd 4(%edx), %mm4 C am 0 + jmp L(am00) C am 0 + ALIGN(16) C am 0 +L(lam0): + pmuludq %mm7, %mm2 C am 0 + paddq %mm4, %mm6 C am 0 + movd (%eax), %mm3 C am 0 + paddq %mm1, %mm5 C am 0 + movd -4(%edx), %mm4 C am 0 + movd %mm6, -12(%edx) C am 0 + psrlq $32, %mm6 C am 0 + pmuludq %mm7, %mm3 C am 0 + paddq %mm5, %mm6 C am 0 + movd 4(%eax), %mm0 C am 0 + paddq %mm2, %mm4 C am 0 + movd (%edx), %mm5 C am 0 + movd %mm6, -8(%edx) C am 0 + psrlq $32, %mm6 C am 0 + pmuludq %mm7, %mm0 C am 0 + paddq %mm4, %mm6 C am 0 + movd 8(%eax), %mm1 C am 0 + paddq %mm3, %mm5 C am 0 + movd 4(%edx), %mm4 C am 0 + movd %mm6, -4(%edx) C am 0 + psrlq $32, %mm6 C am 0 +L(am00): + pmuludq %mm7, %mm1 C am 0 + paddq %mm5, %mm6 C am 0 + movd 12(%eax), %mm2 C am 0 + paddq %mm0, %mm4 C am 0 + movd 8(%edx), %mm5 C am 0 + movd %mm6, (%edx) C am 0 + psrlq $32, %mm6 C am 0 + lea 16(%eax), %eax C am 0 + lea 16(%edx), %edx C am 0 + sub $4, %ecx C am 0 + ja L(lam0) C am 0 + pmuludq %mm7, %mm2 C am 0 + paddq %mm4, %mm6 C am 0 + paddq %mm1, %mm5 C am 0 + movd -4(%edx), %mm4 C am 0 + movd %mm6, -12(%edx) C am 0 + psrlq $32, %mm6 C am 0 + paddq %mm5, %mm6 C am 0 + paddq %mm2, %mm4 C am 0 +L(3): movd %mm6, -8(%edx) C am 0 + psrlq $32, %mm6 C am 0 + paddq %mm4, %mm6 C am 0 + movd %mm6, -4(%edx) C am 0 + psrlq $32, %mm6 C am 0 + movd %mm6, (%edx) C am 0 + sub $4, %ebx C am 0 + ja L(outer) C am 0 + + mov %edi, %edx + mov %esi, %eax + pop %edi + pop %ebx + pop %esi + +L(am3): C up[un-1..un-3] x up[un-4] + lea 8(%edx), %edx C rp2 += 2 + movd (%eax), %mm7 + movd 4(%eax), %mm1 + movd 8(%eax), %mm2 + movd 12(%eax), %mm3 + movd (%edx), %mm4 + pmuludq %mm7, %mm1 + movd 4(%edx), %mm5 + pmuludq %mm7, %mm2 + movd 8(%edx), %mm6 + pmuludq %mm7, %mm3 + paddq %mm1, %mm4 + paddq %mm2, %mm5 + paddq %mm3, %mm6 + movd %mm4, (%edx) + psrlq $32, %mm4 + paddq %mm5, %mm4 + movd %mm4, 4(%edx) + psrlq $32, %mm4 + paddq %mm6, %mm4 + movd %mm4, 8(%edx) + psrlq $32, %mm4 + movd %mm4, 12(%edx) C FIXME feed through! 
+ lea 4(%eax), %eax + +L(am2): C up[un-1..un-2] x up[un-3] + lea 8(%edx), %edx C rp2 += 2 + movd (%eax), %mm7 + movd 4(%eax), %mm1 + movd 8(%eax), %mm2 + movd (%edx), %mm4 + movd 4(%edx), %mm5 + pmuludq %mm7, %mm1 + pmuludq %mm7, %mm2 + paddq %mm1, %mm4 + paddq %mm2, %mm5 + movd %mm4, (%edx) + psrlq $32, %mm4 + paddq %mm5, %mm4 + movd %mm4, 4(%edx) + psrlq $32, %mm4 + movd %mm4, 8(%edx) C FIXME feed through! + lea 4(%eax), %eax + +L(am1): C up[un-1] x up[un-2] + lea 8(%edx), %edx C rp2 += 2 + movd (%eax), %mm7 + movd 4(%eax), %mm2 + movd (%edx), %mm4 + pmuludq %mm7, %mm2 + paddq %mm2, %mm4 + movd %mm4, (%edx) + psrlq $32, %mm4 + movd %mm4, 4(%edx) + +C *** diag stuff, use elementary code for now + + mov 4(%esp), %edx C rp + mov 8(%esp), %eax C up + mov 12(%esp), %ecx C un + + movd (%eax), %mm2 + pmuludq %mm2, %mm2 C src[0]^2 + + pcmpeqd %mm7, %mm7 + psrlq $32, %mm7 + + movd 4(%edx), %mm3 C dst[1] + + movd %mm2, (%edx) + psrlq $32, %mm2 + + psllq $1, %mm3 C 2*dst[1] + paddq %mm3, %mm2 + movd %mm2, 4(%edx) + psrlq $32, %mm2 + + sub $2, %ecx + +L(diag): + movd 4(%eax), %mm0 C src limb + add $4, %eax + pmuludq %mm0, %mm0 + movq %mm7, %mm1 + pand %mm0, %mm1 C diagonal low + psrlq $32, %mm0 C diagonal high + + movd 8(%edx), %mm3 + psllq $1, %mm3 C 2*dst[i] + paddq %mm3, %mm1 + paddq %mm1, %mm2 + movd %mm2, 8(%edx) + psrlq $32, %mm2 + + movd 12(%edx), %mm3 + psllq $1, %mm3 C 2*dst[i+1] + paddq %mm3, %mm0 + paddq %mm0, %mm2 + movd %mm2, 12(%edx) + add $8, %edx + psrlq $32, %mm2 + + sub $1, %ecx + jnz L(diag) + + movd 4(%eax), %mm0 C src[size-1] + pmuludq %mm0, %mm0 + pand %mm0, %mm7 C diagonal low + psrlq $32, %mm0 C diagonal high + + movd 8(%edx), %mm3 C dst[2*size-2] + psllq $1, %mm3 + paddq %mm3, %mm7 + paddq %mm7, %mm2 + movd %mm2, 8(%edx) + psrlq $32, %mm2 + + paddq %mm0, %mm2 + movd %mm2, 12(%edx) C dst[2*size-1] + + emms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm new file mode 100644 index 0000000..5ba1c01 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm @@ -0,0 +1,119 @@ +dnl Intel Pentium-4 mpn_sub_n -- mpn subtraction. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C dst!=src1,2 dst==src1 dst==src2 +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) ? +C P4 model 0-1 (Willamette) ? 
+C P4 model 2 (Northwood) 4 6 6 +C P4 model 3-4 (Prescott) 4.25 7.5 7.5 + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(SAVE_EBX,`PARAM_SRC1') + + TEXT + ALIGN(8) + +PROLOGUE(mpn_sub_nc) +deflit(`FRAME',0) + movd PARAM_CARRY, %mm0 + jmp L(start_nc) +EPILOGUE() + + ALIGN(8) +PROLOGUE(mpn_sub_n) +deflit(`FRAME',0) + pxor %mm0, %mm0 +L(start_nc): + mov PARAM_SRC1, %eax + mov %ebx, SAVE_EBX + mov PARAM_SRC2, %ebx + mov PARAM_DST, %edx + mov PARAM_SIZE, %ecx + + lea (%eax,%ecx,4), %eax C src1 end + lea (%ebx,%ecx,4), %ebx C src2 end + lea (%edx,%ecx,4), %edx C dst end + neg %ecx C -size + +L(top): + C eax src1 end + C ebx src2 end + C ecx counter, limbs, negative + C edx dst end + C mm0 carry bit + + movd (%eax,%ecx,4), %mm1 + movd (%ebx,%ecx,4), %mm2 + psubq %mm2, %mm1 + + psubq %mm0, %mm1 + movd %mm1, (%edx,%ecx,4) + + psrlq $63, %mm1 + + add $1, %ecx + jz L(done_mm1) + + movd (%eax,%ecx,4), %mm0 + movd (%ebx,%ecx,4), %mm2 + psubq %mm2, %mm0 + + psubq %mm1, %mm0 + movd %mm0, (%edx,%ecx,4) + + psrlq $63, %mm0 + + add $1, %ecx + jnz L(top) + + movd %mm0, %eax + mov SAVE_EBX, %ebx + emms + ret + +L(done_mm1): + movd %mm1, %eax + mov SAVE_EBX, %ebx + emms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm new file mode 100644 index 0000000..020675b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm @@ -0,0 +1,182 @@ +dnl Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) 6.8 +C P6 model 13 (Dothan) 6.9 +C P4 model 0-1 (Willamette) ? +C P4 model 2 (Northwood) 5.87 +C P4 model 3-4 (Prescott) 6.5 + +C This code represents a step forwards compared to the code available before +C GMP 5.1, but it is not carefully tuned for either P6 or P4. In fact, it is +C not good for P6. For P4 it saved a bit over 1 c/l for both Northwood and +C Prescott compared to the old code. +C +C The arrangements made here to get a two instruction dependent chain are +C slightly subtle. 
In the loop the carry (or borrow rather) is a negative so +C that a paddq can be used to give a low limb ready to store, and a high limb +C ready to become the new carry after a psrlq. +C +C If the carry was a simple twos complement negative then the psrlq shift would +C need to bring in 0 bits or 1 bits according to whether the high was zero or +C non-zero, since a non-zero value would represent a negative needing sign +C extension. That wouldn't be particularly easy to arrange and certainly would +C add an instruction to the dependent chain, so instead an offset is applied so +C that the high limb will be 0xFFFFFFFF+c. With c in the range -0xFFFFFFFF to +C 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and is therefore +C always positive and can always have 0 bits shifted in, which is what psrlq +C does. +C +C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be +C done off the dependent chain. The total adjustment then is to add +C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF +C to remove the offset from the current carry, for a net add of +C 0xFFFFFFFE00000001. In the code this is applied to the destination limb when +C fetched. +C +C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement +C negative, which is how it's undone for the return value, but that doesn't +C seem as clear. + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(16) + +PROLOGUE(mpn_submul_1c) +deflit(`FRAME',0) + movd PARAM_CARRY, %mm1 + jmp L(start_1c) +EPILOGUE() + +PROLOGUE(mpn_submul_1) +deflit(`FRAME',0) + pxor %mm1, %mm1 C initial borrow + +L(start_1c): + mov PARAM_SRC, %eax + pcmpeqd %mm0, %mm0 + + movd PARAM_MULTIPLIER, %mm7 + pcmpeqd %mm6, %mm6 + + mov PARAM_DST, %edx + psrlq $32, %mm0 C 0x00000000FFFFFFFF + + mov PARAM_SIZE, %ecx + psllq $32, %mm6 C 0xFFFFFFFF00000000 + + psubq %mm0, %mm6 C 0xFFFFFFFE00000001 + + psubq %mm1, %mm0 C 0xFFFFFFFF - borrow + + + movd (%eax), %mm3 C up + movd (%edx), %mm4 C rp + + add $-1, %ecx + paddq %mm6, %mm4 C add 0xFFFFFFFE00000001 + pmuludq %mm7, %mm3 + jnz L(gt1) + psubq %mm3, %mm4 C prod + paddq %mm4, %mm0 C borrow + movd %mm0, (%edx) C result + jmp L(rt) + +L(gt1): movd 4(%eax), %mm1 C up + movd 4(%edx), %mm2 C rp + + add $-1, %ecx + jz L(eev) + + ALIGN(16) +L(top): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001 + pmuludq %mm7, %mm1 + psubq %mm3, %mm4 C prod + movd 8(%eax), %mm3 C up + paddq %mm4, %mm0 C borrow + movd 8(%edx), %mm4 C rp + movd %mm0, (%edx) C result + psrlq $32, %mm0 + + add $-1, %ecx + jz L(eod) + + paddq %mm6, %mm4 C add 0xFFFFFFFE00000001 + pmuludq %mm7, %mm3 + psubq %mm1, %mm2 C prod + movd 12(%eax), %mm1 C up + paddq %mm2, %mm0 C borrow + movd 12(%edx), %mm2 C rp + movd %mm0, 4(%edx) C result + psrlq $32, %mm0 + + lea 8(%eax), %eax + lea 8(%edx), %edx + add $-1, %ecx + jnz L(top) + + +L(eev): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001 + pmuludq %mm7, %mm1 + psubq %mm3, %mm4 C prod + paddq %mm4, %mm0 C borrow + movd %mm0, (%edx) C result + psrlq $32, %mm0 + psubq %mm1, %mm2 C prod + paddq %mm2, %mm0 C borrow + movd %mm0, 4(%edx) C result +L(rt): psrlq $32, %mm0 + movd %mm0, %eax + not %eax + emms + ret + +L(eod): paddq %mm6, %mm4 C add 0xFFFFFFFE00000001 + pmuludq %mm7, %mm3 + psubq %mm1, %mm2 C prod + paddq %mm2, %mm0 C borrow + movd %mm0, 4(%edx) C result + psrlq $32, %mm0 + psubq %mm3, %mm4 C prod + paddq %mm4, %mm0 C borrow + movd %mm0, 8(%edx) C result + jmp L(rt) 
+EPILOGUE()
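To make the offset-borrow scheme described in the submul_1 header comment concrete, here is a scalar C rendering of the per-limb step, assuming 32-bit limbs; ref_submul_1 is an illustrative name, not the GMP entry point. The destination limb is biased by 0xFFFFFFFE00000001 when fetched, the product is subtracted, and the offset borrow (0xFFFFFFFF plus the negative carry) is added; the low half is stored and the high half, which never needs sign extension thanks to the bias, becomes the next offset borrow, mirroring what the paddq/psubq/paddq/psrlq sequence in the loop computes. The final ones-complement corresponds to the not %eax that removes the offset from the returned borrow.

    #include <stdint.h>
    #include <stddef.h>

    uint32_t ref_submul_1(uint32_t *rp, const uint32_t *up, size_t n, uint32_t v)
    {
        uint64_t b = 0xFFFFFFFFull;            /* offset borrow: 0xFFFFFFFF + c, c = 0 */
        for (size_t i = 0; i < n; i++) {
            uint64_t prod = (uint64_t)up[i] * v;                    /* pmuludq           */
            uint64_t acc  = (uint64_t)rp[i] + 0xFFFFFFFE00000001ull; /* biased rp limb   */
            acc -= prod;                                            /* subtract product  */
            acc += b;                                               /* add offset borrow */
            rp[i] = (uint32_t)acc;                                  /* low half stored   */
            b = acc >> 32;                                          /* new offset borrow */
        }
        return ~(uint32_t)b;                   /* undo the offset: the true borrow */
    }

For mpn_submul_1c the only change would be starting from b equal to 0xFFFFFFFF minus the incoming carry, matching the initial psubq %mm1, %mm0.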