| field | value | date |
|---|---|---|
| author | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:36:36 +0200 |
| committer | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:42:26 +0200 |
| commit | a89a14ef5da44684a16b204e7a70460cc8c4922a (patch) | |
| tree | b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/x86/pentium4 | |
| parent | 1db63fcedab0b288820d66e100b1877b1a5a8851 (diff) | |
Basic constant folding implementation
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/x86/pentium4')
27 files changed, 5649 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/README b/vendor/gmp-6.3.0/mpn/x86/pentium4/README new file mode 100644 index 0000000..90f752e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/README @@ -0,0 +1,124 @@ +Copyright 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + +  * the GNU Lesser General Public License as published by the Free +    Software Foundation; either version 3 of the License, or (at your +    option) any later version. + +or + +  * the GNU General Public License as published by the Free Software +    Foundation; either version 2 of the License, or (at your option) any +    later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library.  If not, +see https://www.gnu.org/licenses/. + + + + +                   INTEL PENTIUM-4 MPN SUBROUTINES + + +This directory contains mpn functions optimized for Intel Pentium-4. + +The mmx subdirectory has routines using MMX instructions, the sse2 +subdirectory has routines using SSE2 instructions.  All P4s have these, the +separate directories are just so configure can omit that code if the +assembler doesn't support it. + + +STATUS + +                                cycles/limb + +	mpn_add_n/sub_n            4 normal, 6 in-place + +	mpn_mul_1                  4 normal, 6 in-place +	mpn_addmul_1               6 +	mpn_submul_1               7 + +	mpn_mul_basecase           6 cycles/crossproduct (approx) + +	mpn_sqr_basecase           3.5 cycles/crossproduct (approx) +                                   or 7.0 cycles/triangleproduct (approx) + +	mpn_l/rshift               1.75 + + + +The shifts ought to be able to go at 1.5 c/l, but not much effort has been +applied to them yet. + +In-place operations, and all addmul, submul, mul_basecase and sqr_basecase +calls, suffer from pipeline anomalies associated with write combining and +movd reads and writes to the same or nearby locations.  The movq +instructions do not trigger the same hardware problems.  Unfortunately, +using movq and splitting/combining seems to require too many extra +instructions to help.  Perhaps future chip steppings will be better. + + + +NOTES + +The Pentium-4 pipeline "Netburst", provides for quite a number of surprises. +Many traditional x86 instructions run very slowly, requiring use of +alterative instructions for acceptable performance. + +adcl and sbbl are quite slow at 8 cycles for reg->reg.  paddq of 32-bits +within a 64-bit mmx register seems better, though the combination +paddq/psrlq when propagating a carry is still a 4 cycle latency. + +incl and decl should be avoided, instead use add $1 and sub $1.  Apparently +the carry flag is not separately renamed, so incl and decl depend on all +previous flags-setting instructions. + +shll and shrl have a 4 cycle latency, or 8 times the latency of the fastest +integer instructions (addl, subl, orl, andl, and some more).  shldl and +shrdl seem to have 13 and 15 cycles latency, respectively.  Bizarre. + +movq mmx -> mmx does have 6 cycle latency, as noted in the documentation. 
+pxor/por or similar combination at 2 cycles latency can be used instead. +The movq however executes in the float unit, thereby saving MMX execution +resources.  With the right juggling, data moves shouldn't be on a dependent +chain. + +L1 is write-through, but the write-combining sounds like it does enough to +not require explicit destination prefetching. + +xmm registers so far haven't found a use, but not much effort has been +expended.  A configure test for whether the operating system knows +fxsave/fxrestor will be needed if they're used. + + + +REFERENCES + +Intel Pentium-4 processor manuals, + +	http://developer.intel.com/design/pentium4/manuals + +"Intel Pentium 4 Processor Optimization Reference Manual", Intel, 2001, +order number 248966.  Available on-line: + +	http://developer.intel.com/design/pentium4/manuals/248966.htm + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/copyd.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/copyd.asm new file mode 100644 index 0000000..82af81c --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/copyd.asm @@ -0,0 +1,71 @@ +dnl  Pentium-4 mpn_copyd -- copy limb vector, decrementing. + +dnl  Copyright 1999-2001 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + + +dnl  The std/rep/movsl/cld is very slow for small blocks on pentium4.  Its +dnl  startup time seems to be about 165 cycles.  It then needs 2.6 c/l. +dnl  We therefore use an open-coded 2 c/l copying loop. + +dnl  Ultimately, we may want to use 64-bit movq or 128-bit movdqu in some +dnl  nifty unrolled arrangement.  Clearly, that could reach much higher +dnl  speeds, at least for large blocks. + +include(`../config.m4') + + +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST,  4) + +	TEXT +	ALIGN(8) + +PROLOGUE(mpn_copyd) +deflit(`FRAME',0) + +	movl	PARAM_SIZE, %ecx + +	movl	PARAM_SRC, %eax +	movl	PARAM_DST, %edx +	movl	%ebx, PARAM_SIZE +	addl	$-1, %ecx +	js	L(end) + +L(loop): +	movl	(%eax,%ecx,4), %ebx +	movl	%ebx, (%edx,%ecx,4) +	addl	$-1, %ecx + +	jns	L(loop) +L(end): +	movl	PARAM_SIZE, %ebx +	ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/copyi.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/copyi.asm new file mode 100644 index 0000000..b614887 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/copyi.asm @@ -0,0 +1,93 @@ +dnl  Pentium-4 mpn_copyi -- copy limb vector, incrementing. + +dnl  Copyright 1999-2001 Free Software Foundation, Inc. 
+ +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + + +dnl  The rep/movsl is very slow for small blocks on pentium4.  Its startup +dnl  time seems to be about 110 cycles.  It then copies at a rate of one +dnl  limb per cycle.  We therefore fall back to an open-coded 2 c/l copying +dnl  loop for smaller sizes. + +dnl  Ultimately, we may want to use 64-bit movd or 128-bit movdqu in some +dnl  nifty unrolled arrangement.  Clearly, that could reach much higher +dnl  speeds, at least for large blocks. + +include(`../config.m4') + + +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST,  4) + +	TEXT +	ALIGN(8) + +PROLOGUE(mpn_copyi) +deflit(`FRAME',0) + +	movl	PARAM_SIZE, %ecx +	cmpl	$150, %ecx +	jg	L(replmovs) + +	movl	PARAM_SRC, %eax +	movl	PARAM_DST, %edx +	movl	%ebx, PARAM_SIZE +	testl	%ecx, %ecx +	jz	L(end) + +L(loop): +	movl	(%eax), %ebx +	leal	4(%eax), %eax +	addl	$-1, %ecx +	movl	%ebx, (%edx) +	leal	4(%edx), %edx + +	jnz	L(loop) + +L(end): +	movl	PARAM_SIZE, %ebx +	ret + +L(replmovs): +	cld	C better safe than sorry, see mpn/x86/README + +	movl	%esi, %eax +	movl	PARAM_SRC, %esi +	movl	%edi, %edx +	movl	PARAM_DST, %edi + +	rep +	movsl + +	movl	%eax, %esi +	movl	%edx, %edi + +	ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/lshift.asm new file mode 100644 index 0000000..b5eca66 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/lshift.asm @@ -0,0 +1,39 @@ +dnl  Intel Pentium-4 mpn_lshift -- left shift. + +dnl  Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. 
+dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P4 Willamette, Northwood: 1.75 cycles/limb +C P4 Prescott:		    2.0 cycles/limb + + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86/pentium/mmx/lshift.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/popham.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/popham.asm new file mode 100644 index 0000000..9563cb5 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/popham.asm @@ -0,0 +1,203 @@ +dnl  Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and +dnl  hamming distance. + +dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C			     popcount	     hamdist +C P3 model 9  (Banias)		?		? +C P3 model 13 (Dothan)		6		6 +C P4 model 0  (Willamette) +C P4 model 1  (?) +C P4 model 2  (Northwood)	8		9 +C P4 model 3  (Prescott)	8		9 +C P4 model 4  (Nocona) + +C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); +C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); +C +C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided. +C Two movd's and a punpckldq seems to be the same speed as an aligned movq, +C and using them saves fiddling about with alignment testing on entry. +C +C For popcount there's 13 mmx instructions in the loop, so perhaps 6.5 c/l +C might be possible, but 8 c/l relying on out-of-order execution is already +C quite reasonable. 
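[editor's note] The loop that follows reduces 64 bits at a time with the classic mask-and-add steps (the 0xAAAA…, 0x3333…, 0x0F0F… constants loaded into mm7/mm6/mm5) and a final psadbw against zero to sum the byte counts. Below is a minimal C sketch of the same reduction on one 64-bit word; it is not part of GMP, the function name is illustrative, and the psadbw step is modelled by a multiply.

```c
#include <stdint.h>

/* Same mask-and-add reduction the MMX loop performs, on one 64-bit word.
   For hamdist the two operands would simply be XORed before counting. */
static unsigned
popcount_word (uint64_t x)
{
  x -= (x & 0xAAAAAAAAAAAAAAAAull) >> 1;                                /* bit pairs */
  x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull); /* nibbles   */
  x = (x & 0x0F0F0F0F0F0F0F0Full) + ((x >> 4) & 0x0F0F0F0F0F0F0F0Full); /* bytes     */
  return (unsigned) ((x * 0x0101010101010101ull) >> 56);                /* sum bytes */
}
```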
+ +ifdef(`OPERATION_popcount',, +`ifdef(`OPERATION_hamdist',, +`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined +')')') + +define(HAM, +m4_assert_numargs(1) +`ifdef(`OPERATION_hamdist',`$1')') + +define(POP, +m4_assert_numargs(1) +`ifdef(`OPERATION_popcount',`$1')') + +HAM(` +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC2,  8) +defframe(PARAM_SRC,   4) +define(M4_function,mpn_hamdist) +') +POP(` +defframe(PARAM_SIZE,  8) +defframe(PARAM_SRC,   4) +define(M4_function,mpn_popcount) +') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + + +ifdef(`PIC',,` +	dnl  non-PIC +	RODATA +	ALIGN(8) +L(rodata_AAAAAAAAAAAAAAAA): +	.long	0xAAAAAAAA +	.long	0xAAAAAAAA +L(rodata_3333333333333333): +	.long	0x33333333 +	.long	0x33333333 +L(rodata_0F0F0F0F0F0F0F0F): +	.long	0x0F0F0F0F +	.long	0x0F0F0F0F +') + +	TEXT +	ALIGN(16) + +PROLOGUE(M4_function) +deflit(`FRAME',0) + +	movl	PARAM_SIZE, %ecx +	movl	PARAM_SRC, %eax + +ifdef(`PIC',` +	movl	$0xAAAAAAAA, %edx +	movd	%edx, %mm7 +	punpckldq %mm7, %mm7 + +	movl	$0x33333333, %edx +	movd	%edx, %mm6 +	punpckldq %mm6, %mm6 + +	movl	$0x0F0F0F0F, %edx +	movd	%edx, %mm5 +	punpckldq %mm5, %mm5 + +HAM(`	movl	PARAM_SRC2, %edx') + +',` +	dnl non-PIC +HAM(`	movl	PARAM_SRC2, %edx') +	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7 +	movq	L(rodata_3333333333333333), %mm6 +	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5 +') + +	pxor	%mm4, %mm4		C zero +	pxor	%mm0, %mm0		C total + +	subl	$1, %ecx +	ja	L(top) + +L(last): +	movd	(%eax,%ecx,4), %mm1		C src high limb +HAM(`	movd	(%edx,%ecx,4), %mm2 +	pxor	%mm2, %mm1 +') +	jmp	L(loaded) + + +L(top): +	C eax	src +	C ebx +	C ecx	counter, size-1 to 2 or 1, inclusive +	C edx	[hamdist] src2 +	C +	C mm0	total (low dword) +	C mm1	(scratch) +	C mm2	(scratch) +	C mm3 +	C mm4	0x0000000000000000 +	C mm5	0x0F0F0F0F0F0F0F0F +	C mm6	0x3333333333333333 +	C mm7	0xAAAAAAAAAAAAAAAA + +	movd	(%eax), %mm1 +	movd	4(%eax), %mm2 +	punpckldq %mm2, %mm1 +	addl	$8, %eax + +HAM(`	movd	(%edx), %mm2 +	movd	4(%edx), %mm3 +	punpckldq %mm3, %mm2 +	pxor	%mm2, %mm1 +	addl	$8, %edx +') + +L(loaded): +	movq	%mm7, %mm2 +	pand	%mm1, %mm2 +	psrlq	$1, %mm2 +	psubd	%mm2, %mm1	C bit pairs + +	movq	%mm6, %mm2 +	pand	%mm1, %mm2 +	psrlq	$2, %mm1 +	pand	%mm6, %mm1 +	paddd	%mm2, %mm1	C nibbles + +	movq	%mm5, %mm2 +	pand	%mm1, %mm2 +	psrlq	$4, %mm1 +	pand	%mm5, %mm1 +	paddd	%mm2, %mm1	C bytes + +	psadbw(	%mm4, %mm1) +	paddd	%mm1, %mm0	C to total + +	subl	$2, %ecx +	jg	L(top) + +	C ecx is 0 or -1 representing respectively 1 or 0 further limbs +	jz	L(last) + + +	movd	%mm0, %eax +	emms +	ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/rshift.asm new file mode 100644 index 0000000..3ac0094 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/rshift.asm @@ -0,0 +1,39 @@ +dnl  Intel Pentium-4 mpn_rshift -- right shift. + +dnl  Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. 
+dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P4 Willamette, Northwood: 1.75 cycles/limb +C P4 Prescott:		    2.0 cycles/limb + + +MULFUNC_PROLOGUE(mpn_rshift) +include_mpn(`x86/pentium/mmx/rshift.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm new file mode 100644 index 0000000..8e2380e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm @@ -0,0 +1,101 @@ +dnl  Intel Pentium-4 mpn_add_n -- mpn addition. + +dnl  Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C					cycles/limb +C			     dst!=src1,2  dst==src1  dst==src2 +C P6 model 0-8,10-12		- +C P6 model 9   (Banias)		? +C P6 model 13  (Dothan)		? +C P4 model 0-1 (Willamette)	? 
+C P4 model 2   (Northwood)	4	     6		6 +C P4 model 3-4 (Prescott)	4.25	     7.5	7.5 + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST,  4) + +dnl  re-use parameter space +define(SAVE_EBX,`PARAM_SRC1') + +	TEXT +	ALIGN(8) + +PROLOGUE(mpn_add_nc) +deflit(`FRAME',0) +	movd	PARAM_CARRY, %mm0 +	jmp	L(start_nc) +EPILOGUE() + +	ALIGN(8) +PROLOGUE(mpn_add_n) +deflit(`FRAME',0) +	pxor	%mm0, %mm0 +L(start_nc): +	mov	PARAM_SRC1, %eax +	mov	%ebx, SAVE_EBX +	mov	PARAM_SRC2, %ebx +	mov	PARAM_DST, %edx +	mov	PARAM_SIZE, %ecx + +	lea	(%eax,%ecx,4), %eax	C src1 end +	lea	(%ebx,%ecx,4), %ebx	C src2 end +	lea	(%edx,%ecx,4), %edx	C dst end +	neg	%ecx			C -size + +L(top): +	C eax	src1 end +	C ebx	src2 end +	C ecx	counter, limbs, negative +	C edx	dst end +	C mm0	carry bit + +	movd	(%eax,%ecx,4), %mm1 +	movd	(%ebx,%ecx,4), %mm2 +	paddq	%mm2, %mm1 + +	paddq	%mm1, %mm0 +	movd	%mm0, (%edx,%ecx,4) + +	psrlq	$32, %mm0 + +	add	$1, %ecx +	jnz	L(top) + +	movd	%mm0, %eax +	mov	SAVE_EBX, %ebx +	emms +	ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm new file mode 100644 index 0000000..93b63b2 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm @@ -0,0 +1,108 @@ +dnl  Intel Pentium-4 mpn_addlsh1_n -- mpn x+2*y. + +dnl  Copyright 2001-2004, 2006 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C					cycles/limb +C			     dst!=src1,2  dst==src1  dst==src2 +C P6 model 0-8,10-12		- +C P6 model 9   (Banias)		? +C P6 model 13  (Dothan)		? +C P4 model 0-1 (Willamette)	? +C P4 model 2   (Northwood)	4.25	     6		6 +C P4 model 3-4 (Prescott)	5	     8.5	8.5 + +C The slightly strange combination of indexing and pointer incrementing +C that's used seems to work best.  Not sure why, but %ecx,4 with src1 and/or +C src2 is a slowdown. +C +C The dependent chain is simply the paddq of x+2*y to the previous carry, +C then psrlq to get the new carry.  That makes 4 c/l the target speed, which +C is almost achieved for separate src/dst but when src==dst the write +C combining anomalies slow it down. 
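[editor's note] The loop below keeps the running carry in %mm0: each iteration adds src1[i] + 2*src2[i] into it with paddq, stores the low 32 bits, and shifts the register down for the next carry. A portable C sketch of the same computation follows; the function name and limb typedef are illustrative, not GMP's.

```c
#include <stdint.h>

typedef uint32_t mp_limb_t;   /* 32-bit limbs, matching this x86 code */

/* {dst,n} = {src1,n} + 2*{src2,n}, returning the carry out (0, 1 or 2).
   The 64-bit accumulator plays the role of %mm0: the low 32 bits are
   stored, the rest carries into the next iteration (paddq/psrlq pair). */
static mp_limb_t
addlsh1_n_ref (mp_limb_t *dst, const mp_limb_t *src1,
               const mp_limb_t *src2, long n)
{
  uint64_t acc = 0;                                /* carry in, 0..2 */
  for (long i = 0; i < n; i++)
    {
      acc += (uint64_t) src1[i] + 2 * (uint64_t) src2[i];
      dst[i] = (mp_limb_t) acc;                    /* low limb */
      acc >>= 32;                                  /* new carry */
    }
  return (mp_limb_t) acc;
}
```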
+ +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST,  4) + +dnl  re-use parameter space +define(SAVE_EBX,`PARAM_SRC1') + +	TEXT +	ALIGN(8) + +PROLOGUE(mpn_addlsh1_n) +deflit(`FRAME',0) + +	mov	PARAM_SRC1, %eax +	mov	%ebx, SAVE_EBX + +	mov	PARAM_SRC2, %ebx +	pxor	%mm0, %mm0		C initial carry + +	mov	PARAM_DST, %edx + +	mov	PARAM_SIZE, %ecx + +	lea	(%edx,%ecx,4), %edx	C dst end +	neg	%ecx			C -size + +L(top): +	C eax	src1 end +	C ebx	src2 end +	C ecx	counter, limbs, negative +	C edx	dst end +	C mm0	carry + +	movd	(%ebx), %mm2 +	movd	(%eax), %mm1 +	psrlq	$32, %mm0 +	lea	4(%eax), %eax +	lea	4(%ebx), %ebx + +	psllq	$1, %mm2 +	paddq	%mm2, %mm1 + +	paddq	%mm1, %mm0 + +	movd	%mm0, (%edx,%ecx,4) +	add	$1, %ecx +	jnz	L(top) + + +	psrlq	$32, %mm0 +	mov	SAVE_EBX, %ebx +	movd	%mm0, %eax +	emms +	ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm new file mode 100644 index 0000000..7810207 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm @@ -0,0 +1,189 @@ +dnl  mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). + +dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. 
+ + +include(`../config.m4') + +C			    cycles/limb +C P6 model 0-8,10-12		- +C P6 model 9   (Banias)		5.24 +C P6 model 13  (Dothan)		5.24 +C P4 model 0-1 (Willamette)	5 +C P4 model 2   (Northwood)	5 +C P4 model 3-4 (Prescott)	5 + +C TODO: +C  * Tweak eax/edx offsets in loop as to save some lea's +C  * Perhaps software pipeline small-case code + +C INPUT PARAMETERS +C rp		sp + 4 +C up		sp + 8 +C n		sp + 12 +C v0		sp + 16 + +	TEXT +	ALIGN(16) +PROLOGUE(mpn_addmul_1) +	pxor	%mm6, %mm6 +L(ent):	mov	4(%esp), %edx +	mov	8(%esp), %eax +	mov	12(%esp), %ecx +	movd	16(%esp), %mm7 +	cmp	$4, %ecx +	jnc	L(big) + +L(lp0):	movd	(%eax), %mm0 +	lea	4(%eax), %eax +	movd	(%edx), %mm4 +	lea	4(%edx), %edx +	pmuludq	%mm7, %mm0 +	paddq	%mm0, %mm4 +	paddq	%mm4, %mm6 +	movd	%mm6, -4(%edx) +	psrlq	$32, %mm6 +	dec	%ecx +	jnz	L(lp0) +	movd	%mm6, %eax +	emms +	ret + +L(big):	and	$3, %ecx +	je	L(0) +	cmp	$2, %ecx +	jc	L(1) +	je	L(2) +	jmp	L(3)			C FIXME: one case should fall through + +L(0):	movd	(%eax), %mm3 +	sub	12(%esp), %ecx		C loop count +	lea	-16(%eax), %eax +	lea	-12(%edx), %edx +	pmuludq	%mm7, %mm3 +	movd	20(%eax), %mm0 +	movd	12(%edx), %mm5 +	pmuludq	%mm7, %mm0 +	movd	24(%eax), %mm1 +	paddq	%mm3, %mm5 +	movd	16(%edx), %mm4 +	jmp	L(00) + +L(1):	movd	(%eax), %mm2 +	sub	12(%esp), %ecx +	lea	-12(%eax), %eax +	lea	-8(%edx), %edx +	movd	8(%edx), %mm4 +	pmuludq	%mm7, %mm2 +	movd	16(%eax), %mm3 +	pmuludq	%mm7, %mm3 +	movd	20(%eax), %mm0 +	paddq	%mm2, %mm4 +	movd	12(%edx), %mm5 +	jmp	L(01) + +L(2):	movd	(%eax), %mm1 +	sub	12(%esp), %ecx +	lea	-8(%eax), %eax +	lea	-4(%edx), %edx +	pmuludq	%mm7, %mm1 +	movd	12(%eax), %mm2 +	movd	4(%edx), %mm5 +	pmuludq	%mm7, %mm2 +	movd	16(%eax), %mm3 +	paddq	%mm1, %mm5 +	movd	8(%edx), %mm4 +	jmp	L(10) + +L(3):	movd	(%eax), %mm0 +	sub	12(%esp), %ecx +	lea	-4(%eax), %eax +	pmuludq	%mm7, %mm0 +	movd	8(%eax), %mm1 +	movd	(%edx), %mm4 +	pmuludq	%mm7, %mm1 +	movd	12(%eax), %mm2 +	paddq	%mm0, %mm4 +	movd	4(%edx), %mm5 + +	ALIGN(16) +L(top):	pmuludq	%mm7, %mm2 +	paddq	%mm4, %mm6 +	movd	16(%eax), %mm3 +	paddq	%mm1, %mm5 +	movd	8(%edx), %mm4 +	movd	%mm6, 0(%edx) +	psrlq	$32, %mm6 +L(10):	pmuludq	%mm7, %mm3 +	paddq	%mm5, %mm6 +	movd	20(%eax), %mm0 +	paddq	%mm2, %mm4 +	movd	12(%edx), %mm5 +	movd	%mm6, 4(%edx) +	psrlq	$32, %mm6 +L(01):	pmuludq	%mm7, %mm0 +	paddq	%mm4, %mm6 +	movd	24(%eax), %mm1 +	paddq	%mm3, %mm5 +	movd	16(%edx), %mm4 +	movd	%mm6, 8(%edx) +	psrlq	$32, %mm6 +L(00):	pmuludq	%mm7, %mm1 +	paddq	%mm5, %mm6 +	movd	28(%eax), %mm2 +	paddq	%mm0, %mm4 +	movd	20(%edx), %mm5 +	movd	%mm6, 12(%edx) +	psrlq	$32, %mm6 +	lea	16(%eax), %eax +	lea	16(%edx), %edx +	add	$4, %ecx +	jnz	L(top) + +L(end):	pmuludq	%mm7, %mm2 +	paddq	%mm4, %mm6 +	paddq	%mm1, %mm5 +	movd	8(%edx), %mm4 +	movd	%mm6, 0(%edx) +	psrlq	$32, %mm6 +	paddq	%mm5, %mm6 +	paddq	%mm2, %mm4 +	movd	%mm6, 4(%edx) +	psrlq	$32, %mm6 +	paddq	%mm4, %mm6 +	movd	%mm6, 8(%edx) +	psrlq	$32, %mm6 +	movd	%mm6, %eax +	emms +	ret +EPILOGUE() +PROLOGUE(mpn_addmul_1c) +	movd	20(%esp), %mm6 +	jmp	L(ent) +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm new file mode 100644 index 0000000..354300e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm @@ -0,0 +1,141 @@ +dnl  Intel Atom  mpn_bdiv_dbm1. + +dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + +dnl  Copyright 2011 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. 
+dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C			    cycles/limb +C			    cycles/limb +C P5				 - +C P6 model 0-8,10-12		 - +C P6 model 9  (Banias)		 9.75 +C P6 model 13 (Dothan) +C P4 model 0  (Willamette) +C P4 model 1  (?) +C P4 model 2  (Northwood)	 8.25 +C P4 model 3  (Prescott) +C P4 model 4  (Nocona) +C Intel Atom			 8 +C AMD K6			 - +C AMD K7			 - +C AMD K8 +C AMD K10 + +C TODO: This code was optimised for atom-32, consider moving it back to atom +C	dir(atom currently grabs this code), and write a 4-way version(7c/l). + +defframe(PARAM_CARRY,20) +defframe(PARAM_MUL,  16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC,  8) +defframe(PARAM_DST,  4) + +dnl  re-use parameter space +define(SAVE_RP,`PARAM_MUL') +define(SAVE_UP,`PARAM_SIZE') + +define(`rp', `%edi') +define(`up', `%esi') +define(`n',  `%ecx') +define(`reg', `%edx') +define(`cy', `%eax')	C contains the return value + +ASM_START() +	TEXT +	ALIGN(16) +deflit(`FRAME',0) + +PROLOGUE(mpn_bdiv_dbm1c) +	mov	PARAM_SIZE, n		C size +	mov	up, SAVE_UP +	mov	PARAM_SRC, up +	movd	PARAM_MUL, %mm7 +	mov	rp, SAVE_RP +	mov	PARAM_DST, rp + +	movd	(up), %mm0 +	pmuludq	%mm7, %mm0 +	shr	n +	mov	PARAM_CARRY, cy +	jz	L(eq1) + +	movd	4(up), %mm1 +	jc	L(odd) + +	lea	4(up), up +	pmuludq	%mm7, %mm1 +	movd	%mm0, reg +	psrlq	$32, %mm0 +	sub	reg, cy +	movd	%mm0, reg +	movq	%mm1, %mm0 +	dec	n +	mov	cy, (rp) +	lea	4(rp), rp +	jz	L(end) + +C	ALIGN(16) +L(top):	movd	4(up), %mm1 +	sbb	reg, cy +L(odd):	movd	%mm0, reg +	psrlq	$32, %mm0 +	pmuludq	%mm7, %mm1 +	sub	reg, cy +	lea	8(up), up +	movd	%mm0, reg +	movd	(up), %mm0 +	mov	cy, (rp) +	sbb	reg, cy +	movd	%mm1, reg +	psrlq	$32, %mm1 +	sub	reg, cy +	movd	%mm1, reg +	pmuludq	%mm7, %mm0 +	dec	n +	mov	cy, 4(rp) +	lea	8(rp), rp +	jnz	L(top) + +L(end):	sbb	reg, cy + +L(eq1):	movd	%mm0, reg +	psrlq	$32, %mm0 +	mov	SAVE_UP, up +	sub	reg, cy +	movd	%mm0, reg +	emms +	mov	cy, (rp) +	sbb	reg, cy + +	mov	SAVE_RP, rp +	ret +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm new file mode 100644 index 0000000..d5008f4 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm @@ -0,0 +1,234 @@ +dnl  Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division. + +dnl  Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato. + +dnl  Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. 
+dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P4: 19.0 cycles/limb + +C Pairs of movd's are used to avoid unaligned loads.  Despite the loads not +C being on the dependent chain and there being plenty of cycles available, +C using an unaligned movq on every second iteration measured about 23 c/l. +C + +defframe(PARAM_SHIFT,  24) +defframe(PARAM_INVERSE,20) +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE,   12) +defframe(PARAM_SRC,    8) +defframe(PARAM_DST,    4) + +	TEXT + +C mp_limb_t +C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C		    mp_limb_t inverse, int shift) +	ALIGN(32) +PROLOGUE(mpn_pi1_bdiv_q_1) +deflit(`FRAME',0) + +	movl	PARAM_SIZE, %edx + +	movl	PARAM_SRC, %eax + +	movl	PARAM_DIVISOR, %ecx + +	movd	%ecx, %mm6 +	movl	PARAM_SHIFT, %ecx + +	movd	%ecx, %mm7		C shift + +	C + +	movl	PARAM_INVERSE, %ecx +	movd	%ecx, %mm5		C inv + +	movl	PARAM_DST, %ecx +	pxor	%mm1, %mm1		C initial carry limb +	pxor	%mm0, %mm0		C initial carry bit + +	subl	$1, %edx +	jz	L(done) + +	pcmpeqd	%mm4, %mm4 +	psrlq	$32, %mm4		C 0x00000000FFFFFFFF + +C The dependent chain here is as follows. +C +C					latency +C	psubq	 s = (src-cbit) - climb	   2 +C	pmuludq	 q = s*inverse		   8 +C	pmuludq	 prod = q*divisor	   8 +C	psrlq	 climb = high(prod)	   2 +C					  -- +C					  20 +C +C Yet the loop measures 19.0 c/l, so obviously there's something gained +C there over a straight reading of the chip documentation. 
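[editor's note] The dependent chain listed above is the whole algorithm: subtract the running borrow from the next source limb, multiply by the precomputed inverse of d mod 2^32 to get the quotient limb, and take the high half of q*d as the borrow into the next limb. A C sketch of that loop for an odd divisor is given below; the trailing-zero shift handled by %mm7 is omitted, the borrow bit and borrow limb the asm keeps in separate registers are folded into one value, and the names are illustrative rather than GMP's.

```c
#include <stdint.h>

typedef uint32_t mp_limb_t;

/* Exact division by an odd d: inverse is d^-1 mod 2^32, and on return
   {qp,n} * d == {up,n} modulo 2^(32*n). */
static void
bdiv_q_1_ref (mp_limb_t *qp, const mp_limb_t *up, long n,
              mp_limb_t d, mp_limb_t inverse)
{
  mp_limb_t borrow = 0;
  for (long i = 0; i < n; i++)
    {
      mp_limb_t cy = up[i] < borrow;     /* borrow out of the subtract      */
      mp_limb_t s = up[i] - borrow;      /* s = src - cbit - climb          */
      mp_limb_t q = s * inverse;         /* q = s * inverse, mod 2^32       */
      qp[i] = q;
      borrow = cy + (mp_limb_t) (((uint64_t) q * d) >> 32);  /* new climb   */
    }
}
```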
+ +L(top): +	C eax	src, incrementing +	C ebx +	C ecx	dst, incrementing +	C edx	counter, size-1 iterations +	C +	C mm0	carry bit +	C mm1	carry limb +	C mm4	0x00000000FFFFFFFF +	C mm5	inverse +	C mm6	divisor +	C mm7	shift + +	movd	(%eax), %mm2 +	movd	4(%eax), %mm3 +	addl	$4, %eax +	punpckldq %mm3, %mm2 + +	psrlq	%mm7, %mm2 +	pand	%mm4, %mm2		C src +	psubq	%mm0, %mm2		C src - cbit + +	psubq	%mm1, %mm2		C src - cbit - climb +	movq	%mm2, %mm0 +	psrlq	$63, %mm0		C new cbit + +	pmuludq	%mm5, %mm2		C s*inverse +	movd	%mm2, (%ecx)		C q +	addl	$4, %ecx + +	movq	%mm6, %mm1 +	pmuludq	%mm2, %mm1		C q*divisor +	psrlq	$32, %mm1		C new climb + +L(entry): +	subl	$1, %edx +	jnz	L(top) + +L(done): +	movd	(%eax), %mm2 +	psrlq	%mm7, %mm2		C src +	psubq	%mm0, %mm2		C src - cbit + +	psubq	%mm1, %mm2		C src - cbit - climb + +	pmuludq	%mm5, %mm2		C s*inverse +	movd	%mm2, (%ecx)		C q + +	emms +	ret + +EPILOGUE() + +	ALIGN(16) +C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C                           mp_limb_t divisor); +C +PROLOGUE(mpn_bdiv_q_1) +deflit(`FRAME',0) + +	movl	PARAM_SIZE, %edx + +	movl	PARAM_DIVISOR, %ecx + +	C eax	src +	C ebx +	C ecx	divisor +	C edx	size-1 + +	movl	%ecx, %eax +	bsfl	%ecx, %ecx		C trailing twos + +	shrl	%cl, %eax		C d = divisor without twos +	movd	%eax, %mm6 +	movd	%ecx, %mm7		C shift + +	shrl	%eax			C d/2 + +	andl	$127, %eax		C d/2, 7 bits + +ifdef(`PIC',` +	LEA(	binvert_limb_table, %ecx) +	movzbl	(%eax,%ecx), %eax		C inv 8 bits +',` +	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits +') + +	C + +	movd	%eax, %mm5		C inv + +	movd	%eax, %mm0		C inv + +	pmuludq	%mm5, %mm5		C inv*inv + +	C + +	pmuludq	%mm6, %mm5		C inv*inv*d +	paddd	%mm0, %mm0		C 2*inv + +	C + +	psubd	%mm5, %mm0		C inv = 2*inv - inv*inv*d +	pxor	%mm5, %mm5 + +	paddd	%mm0, %mm5 +	pmuludq	%mm0, %mm0		C inv*inv + +	pcmpeqd	%mm4, %mm4 +	psrlq	$32, %mm4		C 0x00000000FFFFFFFF + +	C + +	pmuludq	%mm6, %mm0		C inv*inv*d +	paddd	%mm5, %mm5		C 2*inv + +	movl	PARAM_SRC, %eax +	movl	PARAM_DST, %ecx +	pxor	%mm1, %mm1		C initial carry limb + +	C + +	psubd	%mm0, %mm5		C inv = 2*inv - inv*inv*d + +	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS +	pushl	%eax	FRAME_pushl() +	movq	%mm6, %mm0 +	pmuludq	%mm5, %mm0 +	movd	%mm0, %eax +	cmpl	$1, %eax +	popl	%eax	FRAME_popl()') + +	pxor	%mm0, %mm0		C initial carry bit +	jmp	L(entry) + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm new file mode 100644 index 0000000..b3f3474 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm @@ -0,0 +1,95 @@ +dnl  Intel Pentium-4 mpn_cnd_add_n -- mpn addition. + +dnl  Copyright 2001, 2002, 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE. 
 See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C			    cycles/limb +C P6 model 0-8,10-12		 - +C P6 model 9   (Banias)		 ? +C P6 model 13  (Dothan)		 4.67 +C P4 model 0-1 (Willamette)	 ? +C P4 model 2   (Northwood)	 5 +C P4 model 3-4 (Prescott)	 5.25 + +defframe(PARAM_SIZE, 20) +defframe(PARAM_SRC2, 16) +defframe(PARAM_SRC1, 12) +defframe(PARAM_DST,  8) +defframe(PARAM_CND,  4) + +dnl  re-use parameter space +define(SAVE_EBX,`PARAM_SRC1') + +define(`cnd', `%mm3') + +	TEXT +	ALIGN(8) + +	ALIGN(8) +PROLOGUE(mpn_cnd_add_n) +deflit(`FRAME',0) +	pxor	%mm0, %mm0 + +	mov	PARAM_CND, %eax +	neg	%eax +	sbb	%eax, %eax +	movd	%eax, cnd + +	mov	PARAM_SRC1, %eax +	mov	%ebx, SAVE_EBX +	mov	PARAM_SRC2, %ebx +	mov	PARAM_DST, %edx +	mov	PARAM_SIZE, %ecx + +	lea	(%eax,%ecx,4), %eax	C src1 end +	lea	(%ebx,%ecx,4), %ebx	C src2 end +	lea	(%edx,%ecx,4), %edx	C dst end +	neg	%ecx			C -size + +L(top):	movd	(%ebx,%ecx,4), %mm2 +	movd	(%eax,%ecx,4), %mm1 +	pand	cnd, %mm2 +	paddq	%mm2, %mm1 + +	paddq	%mm1, %mm0 +	movd	%mm0, (%edx,%ecx,4) + +	psrlq	$32, %mm0 + +	add	$1, %ecx +	jnz	L(top) + +	movd	%mm0, %eax +	mov	SAVE_EBX, %ebx +	emms +	ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm new file mode 100644 index 0000000..339a23e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm @@ -0,0 +1,114 @@ +dnl  Intel Pentium-4 mpn_cnd_sub_n -- mpn subtraction. + +dnl  Copyright 2001, 2002, 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C			    cycles/limb +C P6 model 0-8,10-12		 - +C P6 model 9   (Banias)		 ? +C P6 model 13  (Dothan)		 4.67 +C P4 model 0-1 (Willamette)	 ? 
+C P4 model 2   (Northwood)	 5 +C P4 model 3-4 (Prescott)	 5.25 + +defframe(PARAM_SIZE, 20) +defframe(PARAM_SRC2, 16) +defframe(PARAM_SRC1, 12) +defframe(PARAM_DST,  8) +defframe(PARAM_CND,  4) + +dnl  re-use parameter space +define(SAVE_EBX,`PARAM_SRC1') + +define(`cnd', `%mm3') + +	TEXT +	ALIGN(8) + +	ALIGN(8) +PROLOGUE(mpn_cnd_sub_n) +deflit(`FRAME',0) +	pxor	%mm0, %mm0 + +	mov	PARAM_CND, %eax +	neg	%eax +	sbb	%eax, %eax +	movd	%eax, cnd + +	mov	PARAM_SRC1, %eax +	mov	%ebx, SAVE_EBX +	mov	PARAM_SRC2, %ebx +	mov	PARAM_DST, %edx +	mov	PARAM_SIZE, %ecx + +	lea	(%eax,%ecx,4), %eax	C src1 end +	lea	(%ebx,%ecx,4), %ebx	C src2 end +	lea	(%edx,%ecx,4), %edx	C dst end +	neg	%ecx			C -size + +L(top):	movd	(%ebx,%ecx,4), %mm2 +	movd	(%eax,%ecx,4), %mm1 +	pand	cnd, %mm2 +	psubq	%mm2, %mm1 + +	psubq	%mm0, %mm1 +	movd	%mm1, (%edx,%ecx,4) + +	psrlq	$63, %mm1 + +	add	$1, %ecx +	jz	L(done_mm1) + +	movd	(%ebx,%ecx,4), %mm2 +	movd	(%eax,%ecx,4), %mm0 +	pand	cnd, %mm2 +	psubq	%mm2, %mm0 + +	psubq	%mm1, %mm0 +	movd	%mm0, (%edx,%ecx,4) + +	psrlq	$63, %mm0 + +	add	$1, %ecx +	jnz	L(top) + +	movd	%mm0, %eax +	mov	SAVE_EBX, %ebx +	emms +	ret + +L(done_mm1): +	movd	%mm1, %eax +	mov	SAVE_EBX, %ebx +	emms +	ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm new file mode 100644 index 0000000..0ceef5b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm @@ -0,0 +1,216 @@ +dnl  Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division. + +dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P4: 19.0 cycles/limb + + +C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C                      mp_limb_t divisor); +C +C Pairs of movd's are used to avoid unaligned loads.  Despite the loads not +C being on the dependent chain and there being plenty of cycles available, +C using an unaligned movq on every second iteration measured about 23 c/l. +C +C Using divl for size==1 seems a touch quicker than mul-by-inverse.  The mul +C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of +C that might be hidden by out-of-order execution, whereas divl is around 60. +C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul +C faster. 
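[editor's note] Before its loop, the code below builds d^-1 mod 2^32 for the odd part of the divisor: it seeds an 8-bit inverse from binvert_limb_table and then applies the Newton step inv = 2*inv - inv*inv*d, each application doubling the number of correct low bits. The sketch below shows the same iteration in C; to stay self-contained it is seeded with the (3*d)^2 trick instead of GMP's table, which is an assumption of this sketch, not what the asm does.

```c
#include <stdint.h>

/* Inverse of an odd d modulo 2^32 by Newton iteration.  If d*inv == 1
   (mod 2^k) then the step below gives d*inv' == 1 (mod 2^(2k)). */
static uint32_t
binvert_u32 (uint32_t d)                 /* d must be odd */
{
  uint32_t inv = (3 * d) ^ 2;            /* at least 4 correct low bits     */
  inv = 2 * inv - inv * inv * d;         /* at least 8 bits                 */
  inv = 2 * inv - inv * inv * d;         /* at least 16 bits                */
  inv = 2 * inv - inv * inv * d;         /* 32 bits: d*inv == 1 (mod 2^32)  */
  return inv;
}
```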
+ +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE,   12) +defframe(PARAM_SRC,    8) +defframe(PARAM_DST,    4) + +	TEXT + +	ALIGN(16) +PROLOGUE(mpn_divexact_1) +deflit(`FRAME',0) + +	movl	PARAM_SIZE, %edx + +	movl	PARAM_SRC, %eax + +	movl	PARAM_DIVISOR, %ecx +	subl	$1, %edx +	jnz	L(two_or_more) + +	movl	(%eax), %eax +	xorl	%edx, %edx + +	divl	%ecx +	movl	PARAM_DST, %ecx + +	movl	%eax, (%ecx) +	ret + + +L(two_or_more): +	C eax	src +	C ebx +	C ecx	divisor +	C edx	size-1 + +	movl	%ecx, %eax +	bsfl	%ecx, %ecx		C trailing twos + +	shrl	%cl, %eax		C d = divisor without twos +	movd	%eax, %mm6 +	movd	%ecx, %mm7		C shift + +	shrl	%eax			C d/2 + +	andl	$127, %eax		C d/2, 7 bits + +ifdef(`PIC',` +	LEA(	binvert_limb_table, %ecx) +	movzbl	(%eax,%ecx), %eax		C inv 8 bits +',` +	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits +') + +	C + +	movd	%eax, %mm5		C inv + +	movd	%eax, %mm0		C inv + +	pmuludq	%mm5, %mm5		C inv*inv + +	C + +	pmuludq	%mm6, %mm5		C inv*inv*d +	paddd	%mm0, %mm0		C 2*inv + +	C + +	psubd	%mm5, %mm0		C inv = 2*inv - inv*inv*d +	pxor	%mm5, %mm5 + +	paddd	%mm0, %mm5 +	pmuludq	%mm0, %mm0		C inv*inv + +	pcmpeqd	%mm4, %mm4 +	psrlq	$32, %mm4		C 0x00000000FFFFFFFF + +	C + +	pmuludq	%mm6, %mm0		C inv*inv*d +	paddd	%mm5, %mm5		C 2*inv + +	movl	PARAM_SRC, %eax +	movl	PARAM_DST, %ecx +	pxor	%mm1, %mm1		C initial carry limb + +	C + +	psubd	%mm0, %mm5		C inv = 2*inv - inv*inv*d + +	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS +	pushl	%eax	FRAME_pushl() +	movq	%mm6, %mm0 +	pmuludq	%mm5, %mm0 +	movd	%mm0, %eax +	cmpl	$1, %eax +	popl	%eax	FRAME_popl()') + +	pxor	%mm0, %mm0		C initial carry bit + + +C The dependent chain here is as follows. +C +C					latency +C	psubq	 s = (src-cbit) - climb	   2 +C	pmuludq	 q = s*inverse		   8 +C	pmuludq	 prod = q*divisor	   8 +C	psrlq	 climb = high(prod)	   2 +C					  -- +C					  20 +C +C Yet the loop measures 19.0 c/l, so obviously there's something gained +C there over a straight reading of the chip documentation. + +L(top): +	C eax	src, incrementing +	C ebx +	C ecx	dst, incrementing +	C edx	counter, size-1 iterations +	C +	C mm0	carry bit +	C mm1	carry limb +	C mm4	0x00000000FFFFFFFF +	C mm5	inverse +	C mm6	divisor +	C mm7	shift + +	movd	(%eax), %mm2 +	movd	4(%eax), %mm3 +	addl	$4, %eax +	punpckldq %mm3, %mm2 + +	psrlq	%mm7, %mm2 +	pand	%mm4, %mm2		C src +	psubq	%mm0, %mm2		C src - cbit + +	psubq	%mm1, %mm2		C src - cbit - climb +	movq	%mm2, %mm0 +	psrlq	$63, %mm0		C new cbit + +	pmuludq	%mm5, %mm2		C s*inverse +	movd	%mm2, (%ecx)		C q +	addl	$4, %ecx + +	movq	%mm6, %mm1 +	pmuludq	%mm2, %mm1		C q*divisor +	psrlq	$32, %mm1		C new climb + +	subl	$1, %edx +	jnz	L(top) + + +L(done): +	movd	(%eax), %mm2 +	psrlq	%mm7, %mm2		C src +	psubq	%mm0, %mm2		C src - cbit + +	psubq	%mm1, %mm2		C src - cbit - climb + +	pmuludq	%mm5, %mm2		C s*inverse +	movd	%mm2, (%ecx)		C q + +	emms +	ret + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm new file mode 100644 index 0000000..0146fab --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm @@ -0,0 +1,645 @@ +dnl  Intel Pentium-4 mpn_divrem_1 -- mpn by limb division. + +dnl  Copyright 1999-2004 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. 
+dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part. + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C                         mp_srcptr src, mp_size_t size, +C                         mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C                          mp_srcptr src, mp_size_t size, +C                          mp_limb_t divisor, mp_limb_t carry); +C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, +C                                mp_srcptr src, mp_size_t size, +C                                mp_limb_t divisor, mp_limb_t inverse, +C                                unsigned shift); +C +C Algorithm: +C +C The method and nomenclature follow part 8 of "Division by Invariant +C Integers using Multiplication" by Granlund and Montgomery, reference in +C gmp.texi. +C +C "m" is written for what is m' in the paper, and "d" for d_norm, which +C won't cause any confusion since it's only the normalized divisor that's of +C any use in the code.  "b" is written for 2^N, the size of a limb, N being +C 32 here. +C +C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as +C "n-d - q1*d".  This rearrangement gives the same two-limb answer but lets +C us have just a psubq on the dependent chain. +C +C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here, +C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much. +C +C Notes: +C +C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high +C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero +C carry, since in normal circumstances that will be a very rare event. +C +C The test for skipping a division is branch free (once size>=1 is tested). +C The store to the destination high limb is 0 when a divide is skipped, or +C if it's not skipped then a copy of the src high limb is stored.  The +C latter is in case src==dst. +C +C There's a small bias towards expecting xsize==0, by having code for +C xsize==0 in a straight line and xsize!=0 under forward jumps. +C +C Enhancements: +C +C The loop measures 32 cycles, but the dependent chain would suggest it +C could be done with 30.  Not sure where to start looking for the extras. +C +C Alternatives: +C +C If the divisor is normalized (high bit set) then a division step can +C always be skipped, since the high destination limb is always 0 or 1 in +C that case.  
It doesn't seem worth checking for this though, since it +C probably occurs infrequently. + + +dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by +dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1. +dnl +dnl  The inverse takes about 80-90 cycles to calculate, but after that the +dnl  multiply is 32 c/l versus division at about 58 c/l. +dnl +dnl  At 4 limbs the div is a touch faster than the mul (and of course +dnl  simpler), so start the mul from 5 limbs. + +deflit(MUL_THRESHOLD, 5) + + +defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1 +defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1 +defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE,   16) +defframe(PARAM_SRC,    12) +defframe(PARAM_XSIZE,  8) +defframe(PARAM_DST,    4) + +dnl  re-use parameter space +define(SAVE_ESI,`PARAM_SIZE') +define(SAVE_EBP,`PARAM_SRC') +define(SAVE_EDI,`PARAM_DIVISOR') +define(SAVE_EBX,`PARAM_DST') + +	TEXT + +	ALIGN(16) +PROLOGUE(mpn_preinv_divrem_1) +deflit(`FRAME',0) + +	movl	PARAM_SIZE, %ecx +	xorl	%edx, %edx		C carry if can't skip a div + +	movl	%esi, SAVE_ESI +	movl	PARAM_SRC, %esi + +	movl	%ebp, SAVE_EBP +	movl	PARAM_DIVISOR, %ebp + +	movl	%edi, SAVE_EDI +	movl	PARAM_DST, %edi + +	movl	-4(%esi,%ecx,4), %eax	C src high limb + +	movl	%ebx, SAVE_EBX +	movl	PARAM_XSIZE, %ebx + +	movd	PARAM_PREINV_INVERSE, %mm4 + +	movd	PARAM_PREINV_SHIFT, %mm7  C l +	cmpl	%ebp, %eax		C high cmp divisor + +	cmovc(	%eax, %edx)		C high is carry if high<divisor +	movd	%edx, %mm0		C carry + +	movd	%edx, %mm1		C carry +	movl	$0, %edx + +	movd	%ebp, %mm5		C d +	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not +					C (the latter in case src==dst) +	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1] + +	movl	%edx, (%edi,%ecx,4)	C dst high limb +	sbbl	$0, %ecx		C skip one division if high<divisor +	movl	$32, %eax + +	subl	PARAM_PREINV_SHIFT, %eax +	psllq	%mm7, %mm5		C d normalized +	leal	(%edi,%ecx,4), %edi	C &dst[xsize+size-1] +	leal	-4(%esi,%ecx,4), %esi	C &src[size-1] + +	movd	%eax, %mm6		C 32-l +	jmp	L(start_preinv) + +EPILOGUE() + + +	ALIGN(16) +PROLOGUE(mpn_divrem_1c) +deflit(`FRAME',0) + +	movl	PARAM_CARRY, %edx + +	movl	PARAM_SIZE, %ecx + +	movl	%esi, SAVE_ESI +	movl	PARAM_SRC, %esi + +	movl	%ebp, SAVE_EBP +	movl	PARAM_DIVISOR, %ebp + +	movl	%edi, SAVE_EDI +	movl	PARAM_DST, %edi + +	movl	%ebx, SAVE_EBX +	movl	PARAM_XSIZE, %ebx + +	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1] +	jmp	L(start_1c) + +EPILOGUE() + + +	ALIGN(16) +PROLOGUE(mpn_divrem_1) +deflit(`FRAME',0) + +	movl	PARAM_SIZE, %ecx +	xorl	%edx, %edx		C initial carry (if can't skip a div) + +	movl	%esi, SAVE_ESI +	movl	PARAM_SRC, %esi + +	movl	%ebp, SAVE_EBP +	movl	PARAM_DIVISOR, %ebp + +	movl	%edi, SAVE_EDI +	movl	PARAM_DST, %edi + +	movl	%ebx, SAVE_EBX +	movl	PARAM_XSIZE, %ebx +	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1] + +	orl	%ecx, %ecx		C size +	jz	L(no_skip_div)		C if size==0 +	movl	-4(%esi,%ecx,4), %eax	C src high limb + +	cmpl	%ebp, %eax		C high cmp divisor + +	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not +	movl	%edx, (%edi,%ecx,4)	C dst high limb + +	movl	$0, %edx +	cmovc(	%eax, %edx)		C high is carry if high<divisor + +	sbbl	$0, %ecx		C size-1 if high<divisor +L(no_skip_div): + + +L(start_1c): +	C eax +	C ebx	xsize +	C ecx	size +	C edx	carry +	C esi	src +	C edi	&dst[xsize-1] +	C ebp	divisor + +	leal	(%ebx,%ecx), %eax	C size+xsize +	leal	-4(%esi,%ecx,4), %esi	C &src[size-1] +	leal	(%edi,%ecx,4), %edi	C &dst[size+xsize-1] + +	cmpl	
$MUL_THRESHOLD, %eax +	jae	L(mul_by_inverse) + + +	orl	%ecx, %ecx +	jz	L(divide_no_integer)	C if size==0 + +L(divide_integer): +	C eax	scratch (quotient) +	C ebx	xsize +	C ecx	counter +	C edx	carry +	C esi	src, decrementing +	C edi	dst, decrementing +	C ebp	divisor + +	movl	(%esi), %eax +	subl	$4, %esi + +	divl	%ebp + +	movl	%eax, (%edi) +	subl	$4, %edi + +	subl	$1, %ecx +	jnz	L(divide_integer) + + +L(divide_no_integer): +	orl	%ebx, %ebx +	jnz	L(divide_fraction)	C if xsize!=0 + +L(divide_done): +	movl	SAVE_ESI, %esi +	movl	SAVE_EDI, %edi +	movl	SAVE_EBX, %ebx +	movl	SAVE_EBP, %ebp +	movl	%edx, %eax +	ret + + +L(divide_fraction): +	C eax	scratch (quotient) +	C ebx	counter +	C ecx +	C edx	carry +	C esi +	C edi	dst, decrementing +	C ebp	divisor + +	movl	$0, %eax + +	divl	%ebp + +	movl	%eax, (%edi) +	subl	$4, %edi + +	subl	$1, %ebx +	jnz	L(divide_fraction) + +	jmp	L(divide_done) + + + +C ----------------------------------------------------------------------------- + +L(mul_by_inverse): +	C eax +	C ebx	xsize +	C ecx	size +	C edx	carry +	C esi	&src[size-1] +	C edi	&dst[size+xsize-1] +	C ebp	divisor + +	bsrl	%ebp, %eax		C 31-l +	movd	%edx, %mm0		C carry +	movd	%edx, %mm1		C carry +	movl	%ecx, %edx		C size +	movl	$31, %ecx + +	C + +	xorl	%eax, %ecx		C l = leading zeros on d +	addl	$1, %eax + +	shll	%cl, %ebp		C d normalized +	movd	%ecx, %mm7		C l +	movl	%edx, %ecx		C size + +	movd	%eax, %mm6		C 32-l +	movl	$-1, %edx +	movl	$-1, %eax + +	C + +	subl	%ebp, %edx		C (b-d)-1 so  edx:eax = b*(b-d)-1 + +	divl	%ebp			C floor (b*(b-d)-1 / d) +	movd	%ebp, %mm5		C d + +	C + +	movd	%eax, %mm4		C m + + +L(start_preinv): +	C eax	inverse +	C ebx	xsize +	C ecx	size +	C edx +	C esi	&src[size-1] +	C edi	&dst[size+xsize-1] +	C ebp +	C +	C mm0	carry +	C mm1	carry +	C mm2 +	C mm4	m +	C mm5	d +	C mm6	31-l +	C mm7	l + +	psllq	%mm7, %mm0		C n2 = carry << l, for size==0 + +	subl	$1, %ecx +	jb	L(integer_none) + +	movd	(%esi), %mm0		C src high limb +	punpckldq %mm1, %mm0 +	psrlq	%mm6, %mm0		C n2 = high (carry:srchigh << l) +	jz	L(integer_last) + + +C The dependent chain here consists of +C +C	2   paddd    n1+n2 +C	8   pmuludq  m*(n1+n2) +C	2   paddq    n2:nadj + m*(n1+n2) +C	2   psrlq    q1 +C	8   pmuludq  d*q1 +C	2   psubq    (n-d)-q1*d +C	2   psrlq    high n-(q1+1)*d mask +C	2   pand     d masked +C	2   paddd    n2+d addback +C	-- +C	30 +C +C But it seems to run at 32 cycles, so presumably there's something else +C going on. 
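[editor's note] For reference, here is one iteration of the integer loop written out in C, following the notation of the comments: divide n2*2^32 + n10 by the normalized divisor d (high bit set), with m = floor((2^32*(2^32-d) - 1)/d) precomputed as in the code above, and n2 < d maintained by the loop. The asm performs the final adjustment branch-free with a mask; a branch is used here for clarity, and the function and variable names are illustrative only.

```c
#include <stdint.h>

typedef uint32_t mp_limb_t;

static mp_limb_t                         /* returns remainder (the next n2) */
divstep_preinv (mp_limb_t *q_out, mp_limb_t n2, mp_limb_t n10,
                mp_limb_t d, mp_limb_t m)
{
  mp_limb_t n1 = n10 >> 31;                       /* top bit of n10          */
  mp_limb_t nadj = n10 + (-n1 & d);               /* n10 + (n1?d:0), mod 2^32 */
  uint64_t t = (((uint64_t) n2 << 32) + nadj) + (uint64_t) m * (n2 + n1);
  mp_limb_t q1 = (mp_limb_t) (t >> 32);           /* quotient estimate       */

  uint64_t n = ((uint64_t) n2 << 32) | n10;
  uint64_t r = n - d - (uint64_t) q1 * d;         /* n - (q1+1)*d, may wrap  */
  mp_limb_t q = q1 + 1;
  if (r >> 63)                    /* negative: q1+1 was one too big, add back */
    {
      q = q1;
      r += d;
    }
  *q_out = q;
  return (mp_limb_t) r;
}
```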
+ +	ALIGN(16) +L(integer_top): +	C eax +	C ebx +	C ecx	counter, size-1 to 0 +	C edx +	C esi	src, decrementing +	C edi	dst, decrementing +	C +	C mm0	n2 +	C mm4	m +	C mm5	d +	C mm6	32-l +	C mm7	l + +	ASSERT(b,`C n2<d +	 movd	%mm0, %eax +	 movd	%mm5, %edx +	 cmpl	%edx, %eax') + +	movd	-4(%esi), %mm1		C next src limbs +	movd	(%esi), %mm2 +	leal	-4(%esi), %esi + +	punpckldq %mm2, %mm1 +	psrlq	%mm6, %mm1		C n10 + +	movq	%mm1, %mm2		C n10 +	movq	%mm1, %mm3		C n10 +	psrad	$31, %mm1		C -n1 +	pand	%mm5, %mm1		C -n1 & d +	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow + +	psrld	$31, %mm2		C n1 +	paddd	%mm0, %mm2		C n2+n1 +	punpckldq %mm0, %mm1		C n2:nadj + +	pmuludq	%mm4, %mm2		C m*(n2+n1) + +	C + +	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1) +	pxor	%mm2, %mm2		C break dependency, saves 4 cycles +	pcmpeqd	%mm2, %mm2		C FF...FF +	psrlq	$63, %mm2		C 1 + +	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1)) + +	paddd	%mm1, %mm2		C q1+1 +	pmuludq	%mm5, %mm1		C q1*d + +	punpckldq %mm0, %mm3		C n = n2:n10 +	pxor	%mm0, %mm0 + +	psubq	%mm5, %mm3		C n - d + +	C + +	psubq	%mm1, %mm3		C n - (q1+1)*d + +	por	%mm3, %mm0		C copy remainder -> new n2 +	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1 + +	ASSERT(be,`C 0 or -1 +	 movd	%mm3, %eax +	 addl	$1, %eax +	 cmpl	$1, %eax') + +	paddd	%mm3, %mm2		C q +	pand	%mm5, %mm3		C mask & d + +	paddd	%mm3, %mm0		C addback if necessary +	movd	%mm2, (%edi) +	leal	-4(%edi), %edi + +	subl	$1, %ecx +	ja	L(integer_top) + + +L(integer_last): +	C eax +	C ebx	xsize +	C ecx +	C edx +	C esi	&src[0] +	C edi	&dst[xsize] +	C +	C mm0	n2 +	C mm4	m +	C mm5	d +	C mm6 +	C mm7	l + +	ASSERT(b,`C n2<d +	 movd	%mm0, %eax +	 movd	%mm5, %edx +	 cmpl	%edx, %eax') + +	movd	(%esi), %mm1		C src[0] +	psllq	%mm7, %mm1		C n10 + +	movq	%mm1, %mm2		C n10 +	movq	%mm1, %mm3		C n10 +	psrad	$31, %mm1		C -n1 +	pand	%mm5, %mm1		C -n1 & d +	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow + +	psrld	$31, %mm2		C n1 +	paddd	%mm0, %mm2		C n2+n1 +	punpckldq %mm0, %mm1		C n2:nadj + +	pmuludq	%mm4, %mm2		C m*(n2+n1) + +	C + +	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1) +	pcmpeqd	%mm2, %mm2		C FF...FF +	psrlq	$63, %mm2		C 1 + +	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1)) +	paddd	%mm1, %mm2		C q1 + +	pmuludq	%mm5, %mm1		C q1*d +	punpckldq %mm0, %mm3		C n +	psubq	%mm5, %mm3		C n - d +	pxor	%mm0, %mm0 + +	C + +	psubq	%mm1, %mm3		C n - (q1+1)*d + +	por	%mm3, %mm0		C remainder -> n2 +	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1 + +	ASSERT(be,`C 0 or -1 +	 movd	%mm3, %eax +	 addl	$1, %eax +	 cmpl	$1, %eax') + +	paddd	%mm3, %mm2		C q +	pand	%mm5, %mm3		C mask & d + +	paddd	%mm3, %mm0		C addback if necessary +	movd	%mm2, (%edi) +	leal	-4(%edi), %edi + + +L(integer_none): +	C eax +	C ebx	xsize + +	orl	%ebx, %ebx +	jnz	L(fraction_some)	C if xsize!=0 + + +L(fraction_done): +	movl	SAVE_EBP, %ebp +	psrld	%mm7, %mm0		C remainder + +	movl	SAVE_EDI, %edi +	movd	%mm0, %eax + +	movl	SAVE_ESI, %esi +	movl	SAVE_EBX, %ebx +	emms +	ret + + + +C ----------------------------------------------------------------------------- +C + +L(fraction_some): +	C eax +	C ebx	xsize +	C ecx +	C edx +	C esi +	C edi	&dst[xsize-1] +	C ebp + + +L(fraction_top): +	C eax +	C ebx	counter, xsize iterations +	C ecx +	C edx +	C esi	src, decrementing +	C edi	dst, decrementing +	C +	C mm0	n2 +	C mm4	m +	C mm5	d +	C mm6	32-l +	C mm7	l + +	ASSERT(b,`C n2<d +	 movd	%mm0, %eax +	 movd	%mm5, %edx +	 cmpl	%edx, %eax') + +	movq	%mm0, %mm1		C n2 +	pmuludq	%mm4, %mm0		C m*n2 + +	pcmpeqd	%mm2, %mm2 +	psrlq	$63, %mm2 + +	C + +	psrlq	$32, %mm0		C high(m*n2) + +	paddd	
%mm1, %mm0		C q1 = high(n2:0 + m*n2) + +	paddd	%mm0, %mm2		C q1+1 +	pmuludq	%mm5, %mm0		C q1*d + +	psllq	$32, %mm1		C n = n2:0 +	psubq	%mm5, %mm1		C n - d + +	C + +	psubq	%mm0, %mm1		C r = n - (q1+1)*d +	pxor	%mm0, %mm0 + +	por	%mm1, %mm0		C r -> n2 +	psrlq	$32, %mm1		C high n - (q1+1)*d, 0 or -1 + +	ASSERT(be,`C 0 or -1 +	 movd	%mm1, %eax +	 addl	$1, %eax +	 cmpl	$1, %eax') + +	paddd	%mm1, %mm2		C q +	pand	%mm5, %mm1		C mask & d + +	paddd	%mm1, %mm0		C addback if necessary +	movd	%mm2, (%edi) +	leal	-4(%edi), %edi + +	subl	$1, %ebx +	jne	L(fraction_top) + + +	jmp	L(fraction_done) + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h new file mode 100644 index 0000000..a047a51 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h @@ -0,0 +1,213 @@ +/* Intel Pentium-4 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + +  * the GNU Lesser General Public License as published by the Free +    Software Foundation; either version 3 of the License, or (at your +    option) any later version. + +or + +  * the GNU General Public License as published by the Free Software +    Foundation; either version 2 of the License, or (at your option) any +    later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library.  If not, +see https://www.gnu.org/licenses/.  
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 2600 MHz P4 Northwood */ +/* FFT tuning limit = 23,700,309 */ +/* Generated by tuneup.c, 2019-11-09, gcc 8.2 */ + +#define MOD_1_NORM_THRESHOLD                 5 +#define MOD_1_UNNORM_THRESHOLD              14 +#define MOD_1N_TO_MOD_1_1_THRESHOLD          7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD          5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD        13 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      7 +#define USE_PREINV_DIVREM_1                  1  /* native */ +#define DIV_QR_1N_PI1_METHOD                 2  /* 4.36% faster than 1 */ +#define DIV_QR_1_NORM_THRESHOLD             16 +#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */ +#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */ +#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD           21 + +#define DIV_1_VS_MUL_1_PERCENT             358 + +#define MUL_TOOM22_THRESHOLD                26 +#define MUL_TOOM33_THRESHOLD               101 +#define MUL_TOOM44_THRESHOLD               284 +#define MUL_TOOM6H_THRESHOLD               406 +#define MUL_TOOM8H_THRESHOLD               592 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD     101 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD     191 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD     189 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD     195 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD     151 + +#define SQR_BASECASE_THRESHOLD               0  /* always (native) */ +#define SQR_TOOM2_THRESHOLD                 51 +#define SQR_TOOM3_THRESHOLD                163 +#define SQR_TOOM4_THRESHOLD                254 +#define SQR_TOOM6_THRESHOLD                614 +#define SQR_TOOM8_THRESHOLD                842 + +#define MULMID_TOOM42_THRESHOLD             58 + +#define MULMOD_BNM1_THRESHOLD               19 +#define SQRMOD_BNM1_THRESHOLD               23 + +#define MUL_FFT_MODF_THRESHOLD             824  /* k = 5 */ +#define MUL_FFT_TABLE3                                      \ +  { {    824, 5}, {     29, 6}, {     15, 5}, {     33, 6}, \ +    {     17, 5}, {     36, 6}, {     19, 5}, {     39, 6}, \ +    {     29, 7}, {     15, 6}, {     32, 7}, {     17, 6}, \ +    {     36, 7}, {     19, 6}, {     39, 7}, {     21, 6}, \ +    {     43, 7}, {     23, 6}, {     48, 7}, {     29, 8}, \ +    {     15, 7}, {     37, 8}, {     19, 7}, {     43, 8}, \ +    {     23, 7}, {     49, 8}, {     27, 7}, {     55, 8}, \ +    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \ +    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \ +    {     79, 9}, {     47, 8}, {     99, 9}, {     55,10}, \ +    {     31, 9}, {     79,10}, {     47, 9}, {    103,11}, \ +    {     31,10}, {     63, 9}, {    143,10}, {     79, 9}, \ +    {    167,10}, {     95, 9}, {    191,10}, {    111,11}, \ +    {     63,10}, {    127, 9}, {    255,10}, {    159,11}, \ +    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \ +    {    271,11}, {    159,10}, {    319, 9}, {    639,10}, \ +    {    335,11}, {    191,10}, {    383, 9}, {    799,10}, \ +    {    415,11}, {    223,12}, {    127,11}, {    255,10}, \ +    {    527,11}, {    287,10}, {    607, 9}, {   1215,11}, \ +    {    319,10}, {    671,12}, {    191,11}, {    383,10}, \ +    {    799,11}, {    415,10}, {    863,13}, {    127,12}, \ +    {    255,11}, {    543,10}, {   1119, 9}, {   2239,11}, \ +    {    607,10}, {   1215,12}, {    319,11}, {    671,10}, \ +  
  {   1343,11}, {    735,10}, {   1471, 9}, {   2943,12}, \ +    {    383,11}, {    799,10}, {   1599,11}, {    863,12}, \ +    {    447,11}, {    927,10}, {   1855,11}, {    959,13}, \ +    {    255,12}, {    511,11}, {   1119,12}, {    575,11}, \ +    {   1215,10}, {   2431,11}, {   1247,12}, {    639,11}, \ +    {   1343,12}, {    703,11}, {   1471,10}, {   2943,13}, \ +    {    383,12}, {    767,11}, {   1599,12}, {    831,11}, \ +    {   1727,10}, {   3455,12}, {    895,14}, {    255,13}, \ +    {    511,12}, {   1087,11}, {   2239,10}, {   4479,12}, \ +    {   1215,11}, {   2431,13}, {    639,12}, {   1343,11}, \ +    {   2687,12}, {   1471,11}, {   2943,13}, {    767,12}, \ +    {   1727,11}, {   3455,13}, {    895,12}, {   1983,14}, \ +    {    511,13}, {   1023,12}, {   2239,13}, {   1151,12}, \ +    {   2495,11}, {   4991,13}, {   1407,12}, {   2943,14}, \ +    {    767,13}, {   1535,12}, {   3135,13}, {   1663,12}, \ +    {   3455,13}, {   1919,12}, {   3967,15}, {    511,14}, \ +    {   1023,13}, {   2175,12}, {   4479,13}, {   2431,12}, \ +    {   4991,14}, {   1279,13}, {   2687,12}, {   5503,13}, \ +    {   8192,14}, {  16384,15}, {  32768,16} } +#define MUL_FFT_TABLE3_SIZE 167 +#define MUL_FFT_THRESHOLD                 7808 + +#define SQR_FFT_MODF_THRESHOLD             560  /* k = 5 */ +#define SQR_FFT_TABLE3                                      \ +  { {    560, 5}, {     33, 6}, {     17, 5}, {     35, 6}, \ +    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \ +    {     39, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \ +    {     23, 7}, {     47, 8}, {     27, 7}, {     55, 8}, \ +    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \ +    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \ +    {     79, 9}, {     47, 8}, {     95, 9}, {     55,10}, \ +    {     31, 9}, {     79,10}, {     47, 9}, {     95,11}, \ +    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \ +    {    159,10}, {    111,11}, {     63,10}, {    143, 9}, \ +    {    287,10}, {    159,11}, {     95,10}, {    191,12}, \ +    {     63,11}, {    127, 9}, {    511, 8}, {   1023, 9}, \ +    {    527,11}, {    159,10}, {    319, 9}, {    639,10}, \ +    {    351,11}, {    191,10}, {    431,11}, {    223,12}, \ +    {    127,11}, {    255,10}, {    543,11}, {    287,10}, \ +    {    607, 9}, {   1215,11}, {    319,10}, {    639,11}, \ +    {    351,12}, {    191,11}, {    383,10}, {    767,11}, \ +    {    415,10}, {    831,13}, {    127,12}, {    255,11}, \ +    {    543,10}, {   1119,11}, {    607,12}, {    319,11}, \ +    {    671,10}, {   1343,11}, {    735,12}, {    383,11}, \ +    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \ +    {    927,10}, {   1855,11}, {    991,13}, {    255,12}, \ +    {    511,11}, {   1055,10}, {   2111,11}, {   1087,12}, \ +    {    575,11}, {   1215,10}, {   2431,12}, {    639,11}, \ +    {   1343,12}, {    703,11}, {   1407,13}, {    383,12}, \ +    {    767,11}, {   1599,12}, {    831,11}, {   1727,10}, \ +    {   3455,12}, {    895,11}, {   1855,12}, {    959,14}, \ +    {    255,13}, {    511,12}, {   1087,11}, {   2239,12}, \ +    {   1215,11}, {   2431,13}, {    639,12}, {   1471,11}, \ +    {   2943,13}, {    767,12}, {   1727,11}, {   3455,13}, \ +    {    895,12}, {   1983,14}, {    511,13}, {   1023,12}, \ +    {   2239,13}, {   1151,12}, {   2495,11}, {   4991,13}, \ +    {   1279,12}, {   2623,13}, {   1407,12}, {   2943,14}, \ +    {    767,13}, {   1663,12}, {   3455,13}, {   1919,12}, \ +    {   3839,15}, {    
511,14}, {   1023,13}, {   2175,12}, \ +    {   4479,13}, {   2431,12}, {   4991,14}, {   1279,13}, \ +    {   2687,12}, {   5503,13}, {   8192,14}, {  16384,15}, \ +    {  32768,16} } +#define SQR_FFT_TABLE3_SIZE 149 +#define SQR_FFT_THRESHOLD                 4800 + +#define MULLO_BASECASE_THRESHOLD            12 +#define MULLO_DC_THRESHOLD                  44 +#define MULLO_MUL_N_THRESHOLD            14281 +#define SQRLO_BASECASE_THRESHOLD            13 +#define SQRLO_DC_THRESHOLD                  42 +#define SQRLO_SQR_THRESHOLD               9449 + +#define DC_DIV_QR_THRESHOLD                 38 +#define DC_DIVAPPR_Q_THRESHOLD             105 +#define DC_BDIV_QR_THRESHOLD                52 +#define DC_BDIV_Q_THRESHOLD                 83 + +#define INV_MULMOD_BNM1_THRESHOLD           50 +#define INV_NEWTON_THRESHOLD               158 +#define INV_APPR_THRESHOLD                 118 + +#define BINV_NEWTON_THRESHOLD              342 +#define REDC_1_TO_REDC_N_THRESHOLD          67 + +#define MU_DIV_QR_THRESHOLD               2130 +#define MU_DIVAPPR_Q_THRESHOLD            1895 +#define MUPI_DIV_QR_THRESHOLD               60 +#define MU_BDIV_QR_THRESHOLD              1652 +#define MU_BDIV_Q_THRESHOLD               2089 + +#define POWM_SEC_TABLE  1,22,96,446,723,1378 + +#define GET_STR_DC_THRESHOLD                13 +#define GET_STR_PRECOMPUTE_THRESHOLD        20 +#define SET_STR_DC_THRESHOLD               298 +#define SET_STR_PRECOMPUTE_THRESHOLD       960 + +#define FAC_DSC_THRESHOLD                  212 +#define FAC_ODD_THRESHOLD                   71 + +#define MATRIX22_STRASSEN_THRESHOLD         26 +#define HGCD2_DIV1_METHOD                    3  /* 0.68% faster than 1 */ +#define HGCD_THRESHOLD                      80 +#define HGCD_APPR_THRESHOLD                138 +#define HGCD_REDUCE_THRESHOLD             4455 +#define GCD_DC_THRESHOLD                   365 +#define GCDEXT_DC_THRESHOLD                245 +#define JACOBI_BASE_METHOD                   4  /* 23.41% faster than 1 */ + +/* Tuneup completed successfully, took 63807 seconds */ diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm new file mode 100644 index 0000000..ee88bab --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm @@ -0,0 +1,166 @@ +dnl  x86-32 mpn_mod_1_1p for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). + +dnl  Contributed to the GNU project by Torbjorn Granlund. + +dnl  Copyright 2009, 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  
If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO: +C  * Optimize.  The present code was written quite straightforwardly. +C  * Optimize post-loop reduction code; it is from mod_1s_4p, thus overkill. +C  * Write a cps function that uses sse2 insns. + +C                           cycles/limb +C P6 model 0-8,10-12		- +C P6 model 9   (Banias)		? +C P6 model 13  (Dothan)		? +C P4 model 0-1 (Willamette)	? +C P4 model 2   (Northwood)     16 +C P4 model 3-4 (Prescott)      18 + +C INPUT PARAMETERS +C ap		sp + 4 +C n		sp + 8 +C b		sp + 12 +C cps		sp + 16 + +define(`B1modb', `%mm1') +define(`B2modb', `%mm2') +define(`ap',     `%edx') +define(`n',      `%eax') + +	TEXT +	ALIGN(16) +PROLOGUE(mpn_mod_1_1p) +	push	%ebx +	mov	8(%esp), ap +	mov	12(%esp), n +	mov	20(%esp), %ecx +	movd	8(%ecx), B1modb +	movd	12(%ecx), B2modb + +	lea	-4(ap,n,4), ap + +C FIXME: See comment in generic/mod_1_1.c. +	movd	(ap), %mm7 +	movd	-4(ap), %mm4 +	pmuludq B1modb, %mm7 +	paddq	%mm4, %mm7 +	add	$-2, n +	jz	L(end) + +	ALIGN(8) +L(top):	movq	%mm7, %mm6 +	psrlq	$32, %mm7		C rh +	movd	-8(ap), %mm0 +	add	$-4, ap +	pmuludq	B2modb, %mm7 +	pmuludq	B1modb, %mm6 +	add	$-1, n +	paddq	%mm0, %mm7 +	paddq	%mm6, %mm7 +	jnz	L(top) + +L(end):	pcmpeqd	%mm4, %mm4 +	psrlq	$32, %mm4		C 0x00000000FFFFFFFF +	pand	%mm7, %mm4		C rl +	psrlq	$32, %mm7		C rh +	pmuludq	B1modb, %mm7		C rh,cl +	paddq	%mm4, %mm7		C rh,rl +	movd	4(%ecx), %mm4		C cnt +	psllq	%mm4, %mm7		C rh,rl normalized +	movq	%mm7, %mm2		C rl in low half +	psrlq	$32, %mm7		C rh +	movd	(%ecx), %mm1		C bi +	pmuludq	%mm7, %mm1		C qh,ql +	paddq	%mm2, %mm1		C qh-1,ql +	movd	%mm1, %ecx		C ql +	psrlq	$32, %mm1		C qh-1 +	movd	16(%esp), %mm3		C b +	pmuludq	%mm1, %mm3		C (qh-1) * b +	psubq	%mm3, %mm2		C r in low half (could use psubd) +	movd	%mm2, %eax		C r +	mov	16(%esp), %ebx +	sub	%ebx, %eax		C r +	cmp	%eax, %ecx +	lea	(%eax,%ebx), %edx +	cmovc(	%edx, %eax) +	movd	%mm4, %ecx		C cnt +	cmp	%ebx, %eax +	jae	L(fix) +	emms +	pop	%ebx +	shr	%cl, %eax +	ret + +L(fix):	sub	%ebx, %eax +	emms +	pop	%ebx +	shr	%cl, %eax +	ret +EPILOGUE() + +PROLOGUE(mpn_mod_1_1p_cps) +C CAUTION: This is the same code as in k7/mod_1_1.asm +	push	%ebp +	mov	12(%esp), %ebp +	push	%esi +	bsr	%ebp, %ecx +	push	%ebx +	xor	$31, %ecx +	mov	16(%esp), %esi +	sal	%cl, %ebp +	mov	%ebp, %edx +	not	%edx +	mov	$-1, %eax +	div	%ebp +	mov	%eax, (%esi)		C store bi +	mov	%ecx, 4(%esi)		C store cnt +	xor	%ebx, %ebx +	sub	%ebp, %ebx +	mov	$1, %edx +	shld	%cl, %eax, %edx +	imul	%edx, %ebx +	mul	%ebx +	add	%ebx, %edx +	not	%edx +	imul	%ebp, %edx +	add	%edx, %ebp +	cmp	%edx, %eax +	cmovc(	%ebp, %edx) +	shr	%cl, %ebx +	mov	%ebx, 8(%esi)		C store B1modb +	shr	%cl, %edx +	mov	%edx, 12(%esi)		C store B2modb +	pop	%ebx +	pop	%esi +	pop	%ebp +	ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm new file mode 100644 index 0000000..eb2edb6 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm @@ -0,0 +1,269 @@ +dnl  x86-32 mpn_mod_1s_4p for Pentium 4 and P6 models with SSE2 (i.e. 9,D,E,F). + +dnl  Contributed to the GNU project by Torbjorn Granlund. + +dnl  Copyright 2009, 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. 
+dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO: +C  * Optimize.  The present code was written quite straightforwardly. +C  * Optimize post-loop reduction code. +C  * Write a cps function that uses sse2 insns. + +C			    cycles/limb +C P6 model 0-8,10-12		- +C P6 model 9   (Banias)		? +C P6 model 13  (Dothan)		3.4 +C P4 model 0-1 (Willamette)	? +C P4 model 2   (Northwood)	4 +C P4 model 3-4 (Prescott)	4.5 + +C INPUT PARAMETERS +C ap		sp + 4 +C n		sp + 8 +C b		sp + 12 +C cps		sp + 16 + +define(`B1modb', `%mm1') +define(`B2modb', `%mm2') +define(`B3modb', `%mm3') +define(`B4modb', `%mm4') +define(`B5modb', `%mm5') +define(`ap',     `%edx') +define(`n',      `%eax') + +ASM_START() +	TEXT +	ALIGN(16) +PROLOGUE(mpn_mod_1s_4p) +	push	%ebx +	mov	8(%esp), ap +	mov	12(%esp), n +	mov	20(%esp), %ecx + +	movd	8(%ecx), B1modb +	movd	12(%ecx), B2modb +	movd	16(%ecx), B3modb +	movd	20(%ecx), B4modb +	movd	24(%ecx), B5modb + +	mov	n, %ebx +	lea	-4(ap,n,4), ap +	and	$3, %ebx +	je	L(b0) +	cmp	$2, %ebx +	jc	L(b1) +	je	L(b2) + +L(b3):	movd	-4(ap), %mm7 +	pmuludq	B1modb, %mm7 +	movd	-8(ap), %mm6 +	paddq	%mm6, %mm7 +	movd	(ap), %mm6 +	pmuludq	B2modb, %mm6 +	paddq	%mm6, %mm7 +	lea	-24(ap), ap +	add	$-3, n +	jz	L(end) +	jmp	L(top) + +L(b0):	movd	-8(ap), %mm7 +	pmuludq	B1modb, %mm7 +	movd	-12(ap), %mm6 +	paddq	%mm6, %mm7 +	movd	-4(ap), %mm6 +	pmuludq	B2modb, %mm6 +	paddq	%mm6, %mm7 +	movd	(ap), %mm6 +	pmuludq	B3modb, %mm6 +	paddq	%mm6, %mm7 +	lea	-28(ap), ap +	add	$-4, n +	jz	L(end) +	jmp	L(top) + +L(b1):	movd	(ap), %mm7 +	lea	-16(ap), ap +	dec	n +	jz	L(x) +	jmp	L(top) + +L(b2):	movd	-4(ap), %mm7		C rl +	punpckldq (ap), %mm7		C rh +	lea	-20(ap), ap +	add	$-2, n +	jz	L(end) + +	ALIGN(8) +L(top):	movd	4(ap), %mm0 +	pmuludq	B1modb, %mm0 +	movd	0(ap), %mm6 +	paddq	%mm6, %mm0 + +	movd	8(ap), %mm6 +	pmuludq	B2modb, %mm6 +	paddq	%mm6, %mm0 + +	movd	12(ap), %mm6 +	pmuludq	B3modb, %mm6 +	paddq	%mm6, %mm0 + +	movq	%mm7, %mm6 +	psrlq	$32, %mm7		C rh +	pmuludq	B5modb, %mm7 +	pmuludq	B4modb, %mm6 + +	paddq	%mm0, %mm7 +	paddq	%mm6, %mm7 + +	add	$-16, ap +	add	$-4, n +	jnz	L(top) + +L(end):	pcmpeqd	%mm4, %mm4 +	psrlq	$32, %mm4		C 0x00000000FFFFFFFF +	pand	%mm7, %mm4		C rl +	psrlq	$32, %mm7		C rh +	pmuludq	B1modb, %mm7		C rh,cl +	paddq	%mm4, %mm7		C rh,rl +L(x):	movd	4(%ecx), %mm4		C cnt +	psllq	%mm4, %mm7		C rh,rl normalized +	movq	%mm7, %mm2		C rl in low half +	psrlq	$32, %mm7		C rh +	movd	(%ecx), %mm1		C bi +	pmuludq	%mm7, %mm1		C qh,ql +	paddq	%mm2, %mm1		C qh-1,ql +	movd	%mm1, %ecx		C ql +	psrlq	$32, %mm1		C qh-1 +	movd	16(%esp), 
%mm3		C b +	pmuludq	%mm1, %mm3		C (qh-1) * b +	psubq	%mm3, %mm2		C r in low half (could use psubd) +	movd	%mm2, %eax		C r +	mov	16(%esp), %ebx +	sub	%ebx, %eax		C r +	cmp	%eax, %ecx +	lea	(%eax,%ebx), %edx +	cmovc(	%edx, %eax) +	movd	%mm4, %ecx		C cnt +	cmp	%ebx, %eax +	jae	L(fix) +	emms +	pop	%ebx +	shr	%cl, %eax +	ret + +L(fix):	sub	%ebx, %eax +	emms +	pop	%ebx +	shr	%cl, %eax +	ret +EPILOGUE() + +	ALIGN(16) +PROLOGUE(mpn_mod_1s_4p_cps) +C CAUTION: This is the same code as in k7/mod_1_4.asm +	push	%ebp +	push	%edi +	push	%esi +	push	%ebx +	mov	20(%esp), %ebp		C FIXME: avoid bp for 0-idx +	mov	24(%esp), %ebx +	bsr	%ebx, %ecx +	xor	$31, %ecx +	sal	%cl, %ebx		C b << cnt +	mov	%ebx, %edx +	not	%edx +	mov	$-1, %eax +	div	%ebx +	xor	%edi, %edi +	sub	%ebx, %edi +	mov	$1, %esi +	mov	%eax, (%ebp)		C store bi +	mov	%ecx, 4(%ebp)		C store cnt +	shld	%cl, %eax, %esi +	imul	%edi, %esi +	mov	%eax, %edi +	mul	%esi + +	add	%esi, %edx +	shr	%cl, %esi +	mov	%esi, 8(%ebp)		C store B1modb + +	not	%edx +	imul	%ebx, %edx +	lea	(%edx,%ebx), %esi +	cmp	%edx, %eax +	cmovnc(	%edx, %esi) +	mov	%edi, %eax +	mul	%esi + +	add	%esi, %edx +	shr	%cl, %esi +	mov	%esi, 12(%ebp)		C store B2modb + +	not	%edx +	imul	%ebx, %edx +	lea	(%edx,%ebx), %esi +	cmp	%edx, %eax +	cmovnc(	%edx, %esi) +	mov	%edi, %eax +	mul	%esi + +	add	%esi, %edx +	shr	%cl, %esi +	mov	%esi, 16(%ebp)		C store B3modb + +	not	%edx +	imul	%ebx, %edx +	lea	(%edx,%ebx), %esi +	cmp	%edx, %eax +	cmovnc(	%edx, %esi) +	mov	%edi, %eax +	mul	%esi + +	add	%esi, %edx +	shr	%cl, %esi +	mov	%esi, 20(%ebp)		C store B4modb + +	not	%edx +	imul	%ebx, %edx +	add	%edx, %ebx +	cmp	%edx, %eax +	cmovnc(	%edx, %ebx) + +	shr	%cl, %ebx +	mov	%ebx, 24(%ebp)		C store B5modb + +	pop	%ebx +	pop	%esi +	pop	%edi +	pop	%ebp +	ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm new file mode 100644 index 0000000..31e25b7 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm @@ -0,0 +1,175 @@ +dnl  Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1. + +dnl  Copyright 2000-2003 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C Pentium4: 1.0 cycles/limb + + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size) +C +C Enhancements: +C +C There might a couple of cycles to save by using plain integer code for +C more small sizes.  
2 limbs measures about 20 cycles, but 3 limbs jumps to +C about 46 (inclusive of some function call overheads). + +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC,  4) + +dnl  re-use parameter space +define(SAVE_EBX, `PARAM_SRC') +define(SAVE_ESI, `PARAM_SIZE') + +	TEXT +	ALIGN(16) +PROLOGUE(mpn_mod_34lsub1) +deflit(`FRAME',0) + +	movl	PARAM_SIZE, %ecx +	movl	PARAM_SRC, %edx +	movl	(%edx), %eax + +	subl	$2, %ecx +	ja	L(three_or_more) +	jne	L(one) + +	movl	4(%edx), %edx +	movl	%eax, %ecx +	shrl	$24, %eax		C src[0] high + +	andl	$0x00FFFFFF, %ecx	C src[0] low +	addl	%ecx, %eax + +	movl	%edx, %ecx +	shll	$8, %edx + +	shrl	$16, %ecx		C src[1] low +	addl	%ecx, %eax + +	andl	$0x00FFFF00, %edx	C src[1] high +	addl	%edx, %eax + +L(one): +	ret + + +L(three_or_more): +	pxor	%mm0, %mm0 +	pxor	%mm1, %mm1 +	pxor	%mm2, %mm2 + +	pcmpeqd	%mm7, %mm7 +	psrlq	$32, %mm7	C 0x00000000FFFFFFFF, low 32 bits + +	pcmpeqd	%mm6, %mm6 +	psrlq	$40, %mm6	C 0x0000000000FFFFFF, low 24 bits + +L(top): +	C eax +	C ebx +	C ecx	counter, size-2 to 0, -1 or -2 +	C edx	src, incrementing +	C +	C mm0	sum 0mod3 +	C mm1	sum 1mod3 +	C mm2	sum 2mod3 +	C mm3 +	C mm4 +	C mm5 +	C mm6	0x0000000000FFFFFF +	C mm7	0x00000000FFFFFFFF + +	movd	(%edx), %mm3 +	paddq	%mm3, %mm0 + +	movd	4(%edx), %mm3 +	paddq	%mm3, %mm1 + +	movd	8(%edx), %mm3 +	paddq	%mm3, %mm2 + +	addl	$12, %edx +	subl	$3, %ecx +	ja	L(top) + + +	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively + +	addl	$1, %ecx +	js	L(combine)		C 0 more + +	movd	(%edx), %mm3 +	paddq	%mm3, %mm0 + +	jz	L(combine)		C 1 more + +	movd	4(%edx), %mm3 +	paddq	%mm3, %mm1 + +L(combine): +	movq	%mm7, %mm3		C low halves +	pand	%mm0, %mm3 + +	movq	%mm7, %mm4 +	pand	%mm1, %mm4 + +	movq	%mm7, %mm5 +	pand	%mm2, %mm5 + +	psrlq	$32, %mm0		C high halves +	psrlq	$32, %mm1 +	psrlq	$32, %mm2 + +	paddq	%mm0, %mm4		C fold high halves to give 33 bits each +	paddq	%mm1, %mm5 +	paddq	%mm2, %mm3 + +	psllq	$8, %mm4		C combine at respective offsets +	psllq	$16, %mm5 +	paddq	%mm4, %mm3 +	paddq	%mm5, %mm3		C 0x000cxxxxxxxxxxxx, 50 bits + +	pand	%mm3, %mm6		C fold at 24 bits +	psrlq	$24, %mm3 + +	paddq	%mm6, %mm3 +	movd	%mm3, %eax + +	ASSERT(z,	C nothing left in high dword +	`psrlq	$32, %mm3 +	movd	%mm3, %ecx +	orl	%ecx, %ecx') + +	emms +	ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm new file mode 100644 index 0000000..aa9ef31 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm @@ -0,0 +1,175 @@ +dnl  Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder. + +dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. 
+dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P4: 19.0 cycles/limb + + +C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, +C                               mp_limb_t divisor); +C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, +C                                mp_limb_t divisor, mp_limb_t carry); +C + +defframe(PARAM_CARRY,  16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE,   8) +defframe(PARAM_SRC,    4) + +	TEXT + +	ALIGN(16) +PROLOGUE(mpn_modexact_1c_odd) +deflit(`FRAME',0) + +	movd	PARAM_CARRY, %mm1 +	jmp	L(start_1c) + +EPILOGUE() + + +	ALIGN(16) +PROLOGUE(mpn_modexact_1_odd) +deflit(`FRAME',0) + +	pxor	%mm1, %mm1		C carry limb +L(start_1c): +	movl	PARAM_DIVISOR, %eax + +	movd	PARAM_DIVISOR, %mm7 + +	shrl	%eax + +	andl	$127, %eax		C d/2, 7 bits + +ifdef(`PIC',` +	LEA(	binvert_limb_table, %edx) +	movzbl	(%eax,%edx), %eax		C inv 8 bits +',` +	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits +') + +	C + +	movd	%eax, %mm6		C inv + +	movd	%eax, %mm0		C inv + +	pmuludq	%mm6, %mm6		C inv*inv + +	C + +	pmuludq	%mm7, %mm6		C inv*inv*d +	paddd	%mm0, %mm0		C 2*inv + +	C + +	psubd	%mm6, %mm0		C inv = 2*inv - inv*inv*d +	pxor	%mm6, %mm6 + +	paddd	%mm0, %mm6 +	pmuludq	%mm0, %mm0		C inv*inv + +	C + +	pmuludq	%mm7, %mm0		C inv*inv*d +	paddd	%mm6, %mm6		C 2*inv + + +	movl	PARAM_SRC, %eax +	movl	PARAM_SIZE, %ecx + +	C + +	psubd	%mm0, %mm6		C inv = 2*inv - inv*inv*d + +	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS +	pushl	%eax	FRAME_pushl() +	movd	%mm6, %eax +	imul	PARAM_DIVISOR, %eax +	cmpl	$1, %eax +	popl	%eax	FRAME_popl()') + +	pxor	%mm0, %mm0		C carry bit + + +C The dependent chain here is as follows. +C +C					latency +C	psubq	 s = (src-cbit) - climb	   2 +C	pmuludq	 q = s*inverse		   8 +C	pmuludq	 prod = q*divisor	   8 +C	psrlq	 climb = high(prod)	   2 +C					  -- +C					  20 +C +C Yet the loop measures 19.0 c/l, so obviously there's something gained +C there over a straight reading of the chip documentation. + +L(top): +	C eax	src, incrementing +	C ebx +	C ecx	counter, limbs +	C edx +	C +	C mm0	carry bit +	C mm1	carry limb +	C mm6	inverse +	C mm7	divisor + +	movd	(%eax), %mm2 +	addl	$4, %eax + +	psubq	%mm0, %mm2		C src - cbit + +	psubq	%mm1, %mm2		C src - cbit - climb +	movq	%mm2, %mm0 +	psrlq	$63, %mm0		C new cbit + +	pmuludq	%mm6, %mm2		C s*inverse + +	movq	%mm7, %mm1 +	pmuludq	%mm2, %mm1		C q*divisor +	psrlq	$32, %mm1		C new climb + +	subl	$1, %ecx +	jnz	L(top) + + +L(done): +	paddq	%mm1, %mm0 +	movd	%mm0, %eax +	emms +	ret + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm new file mode 100644 index 0000000..6347b8b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm @@ -0,0 +1,164 @@ +dnl  mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). + +dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. 
+dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C                           cycles/limb +C P6 model 0-8,10-12		- +C P6 model 9   (Banias)		4.17 +C P6 model 13  (Dothan)		4.17 +C P4 model 0-1 (Willamette)	4 +C P4 model 2   (Northwood)	4 +C P4 model 3-4 (Prescott)	4.55 + +C TODO: +C  * Tweak eax/edx offsets in loop as to save some lea's +C  * Perhaps software pipeline small-case code + +C INPUT PARAMETERS +C rp		sp + 4 +C up		sp + 8 +C n		sp + 12 +C v0		sp + 16 + +	TEXT +	ALIGN(16) +PROLOGUE(mpn_mul_1) +	pxor	%mm6, %mm6 +L(ent):	mov	4(%esp), %edx +	mov	8(%esp), %eax +	mov	12(%esp), %ecx +	movd	16(%esp), %mm7 +	cmp	$4, %ecx +	jnc	L(big) + +L(lp0):	movd	(%eax), %mm0 +	lea	4(%eax), %eax +	lea	4(%edx), %edx +	pmuludq	%mm7, %mm0 +	paddq	%mm0, %mm6 +	movd	%mm6, -4(%edx) +	psrlq	$32, %mm6 +	dec	%ecx +	jnz	L(lp0) +	movd	%mm6, %eax +	emms +	ret + +L(big):	and	$3, %ecx +	je	L(0) +	cmp	$2, %ecx +	jc	L(1) +	je	L(2) +	jmp	L(3)			C FIXME: one case should fall through + +L(0):	movd	(%eax), %mm3 +	sub	12(%esp), %ecx		C loop count +	lea	-16(%eax), %eax +	lea	-12(%edx), %edx +	pmuludq	%mm7, %mm3 +	movd	20(%eax), %mm0 +	pmuludq	%mm7, %mm0 +	movd	24(%eax), %mm1 +	jmp	L(00) + +L(1):	movd	(%eax), %mm2 +	sub	12(%esp), %ecx +	lea	-12(%eax), %eax +	lea	-8(%edx), %edx +	pmuludq	%mm7, %mm2 +	movd	16(%eax), %mm3 +	pmuludq	%mm7, %mm3 +	movd	20(%eax), %mm0 +	jmp	L(01) + +L(2):	movd	(%eax), %mm1 +	sub	12(%esp), %ecx +	lea	-8(%eax), %eax +	lea	-4(%edx), %edx +	pmuludq	%mm7, %mm1 +	movd	12(%eax), %mm2 +	pmuludq	%mm7, %mm2 +	movd	16(%eax), %mm3 +	jmp	L(10) + +L(3):	movd	(%eax), %mm0 +	sub	12(%esp), %ecx +	lea	-4(%eax), %eax +	pmuludq	%mm7, %mm0 +	movd	8(%eax), %mm1 +	pmuludq	%mm7, %mm1 +	movd	12(%eax), %mm2 + +	ALIGN(16) +L(top):	pmuludq	%mm7, %mm2 +	paddq	%mm0, %mm6 +	movd	16(%eax), %mm3 +	movd	%mm6, 0(%edx) +	psrlq	$32, %mm6 +L(10):	pmuludq	%mm7, %mm3 +	paddq	%mm1, %mm6 +	movd	20(%eax), %mm0 +	movd	%mm6, 4(%edx) +	psrlq	$32, %mm6 +L(01):	pmuludq	%mm7, %mm0 +	paddq	%mm2, %mm6 +	movd	24(%eax), %mm1 +	movd	%mm6, 8(%edx) +	psrlq	$32, %mm6 +L(00):	pmuludq	%mm7, %mm1 +	paddq	%mm3, %mm6 +	movd	28(%eax), %mm2 +	movd	%mm6, 12(%edx) +	psrlq	$32, %mm6 +	lea	16(%eax), %eax +	lea	16(%edx), %edx +	add	$4, %ecx +	ja	L(top) + +L(end):	pmuludq	%mm7, %mm2 +	paddq	%mm0, %mm6 +	movd	%mm6, 0(%edx) +	psrlq	$32, %mm6 +	paddq	%mm1, %mm6 +	movd	%mm6, 4(%edx) +	psrlq	$32, %mm6 +	paddq	%mm2, %mm6 +	movd	%mm6, 8(%edx) +	psrlq	$32, %mm6 +	movd	%mm6, %eax +	emms +	ret +EPILOGUE() +PROLOGUE(mpn_mul_1c) +	movd	20(%esp), %mm6 +	jmp	L(ent) +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm new file mode 100644 index 0000000..6e3775a --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm @@ -0,0 +1,662 @@ +dnl  mpn_mul_basecase for Pentium 4 and P6 models with SSE2 
(i.e., 9,D,E,F). + +dnl  Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO: +C  * Improve ad-hoc outer loop code and register handling.  Some feed-in +C    scheduling could improve things by several cycles per outer iteration. +C  * In code for un <= 3, try keeping accumulation operands in registers, +C    without storing intermediates to rp. +C  * We might want to keep 32 in a free mm register, since the register form is +C    3 bytes and the immediate form is 4 bytes.  About 70 bytes to save. +C  * Look into different loop alignment, we now expand the code about 50 bytes +C    with possibly needless alignment. +C  * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry. +C  * Use OSP, should solve feed-in latency problems. +C  * Save a few tens of bytes by doing cross-jumping for Loel0, etc. +C  * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers +C    so that they can share feed-in code, and changing the branch targets from +C    L<n> to Lm<nn>. + +C                           cycles/limb +C P6 model 9   (Banias)         ? +C P6 model 13  (Dothan)         5.24 +C P6 model 14  (Yonah)          ? 
+C P4 model 0-1 (Willamette):    5 +C P4 model 2   (Northwood):     4.60 at 32 limbs +C P4 model 3-4 (Prescott):      4.94 at 32 limbs + +C INPUT PARAMETERS +C rp		sp + 4 +C up		sp + 8 +C un		sp + 12 +C vp		sp + 16 +C vn		sp + 20 + +	TEXT +	ALIGN(16) +PROLOGUE(mpn_mul_basecase) +	push	%esi +	push	%ebx +	mov	12(%esp), %edx		C rp +	mov	16(%esp), %eax		C up +	mov	20(%esp), %ecx		C un +	mov	24(%esp), %esi		C vp +	mov	28(%esp), %ebx		C vn +	movd	(%esi), %mm7		C +L(ent):	cmp	$3, %ecx +	ja	L(big) +	movd	(%eax), %mm6 +	pmuludq	%mm7, %mm6 +	jz	L(un3) +	cmp	$2, %ecx +	jz	L(un2) + +L(un1):	movd	%mm6, (%edx)		C				un=1 +	psrlq	$32, %mm6		C				un=1 +	movd	%mm6, 4(%edx)		C				un=1 +	jmp	L(rtr)			C				un=1 + +L(un2):	movd	4(%eax), %mm1		C				un=2 +	pmuludq	%mm7, %mm1		C				un=2 +	movd	%mm6, (%edx)		C				un=2 +	psrlq	$32, %mm6		C				un=2 +	paddq	%mm1, %mm6		C				un=2 +	movd	%mm6, 4(%edx)		C				un=2 +	psrlq	$32, %mm6		C				un=2 +	movd	%mm6, 8(%edx)		C				un=2 +      dec	%ebx			C				un=2 +      jz	L(rtr)			C				un=2 +	movd	4(%esi), %mm7		C				un=2 +	movd	(%eax), %mm6		C				un=2 +	pmuludq	%mm7, %mm6		C				un=2 +	movd	4(%eax), %mm1		C				un=2 +	movd	4(%edx), %mm4		C				un=2 +	pmuludq	%mm7, %mm1		C				un=2 +	movd	8(%edx), %mm5		C				un=2 +	paddq	%mm4, %mm6		C				un=2 +	paddq	%mm1, %mm5		C				un=2 +	movd	%mm6, 4(%edx)		C				un=2 +	psrlq	$32, %mm6		C				un=2 +	paddq	%mm5, %mm6		C				un=2 +	movd	%mm6, 8(%edx)		C				un=2 +	psrlq	$32, %mm6		C				un=2 +	movd	%mm6, 12(%edx)		C				un=2 +L(rtr):	emms +	pop	%ebx +	pop	%esi +	ret + +L(un3):	movd	4(%eax), %mm1		C				un=3 +	pmuludq	%mm7, %mm1		C				un=3 +	movd	8(%eax), %mm2		C				un=3 +	pmuludq	%mm7, %mm2		C				un=3 +	movd	%mm6, (%edx)		C				un=3 +	psrlq	$32, %mm6		C				un=3 +	paddq	%mm1, %mm6		C				un=3 +	movd	%mm6, 4(%edx)		C				un=3 +	psrlq	$32, %mm6		C				un=3 +	paddq	%mm2, %mm6		C				un=3 +	movd	%mm6, 8(%edx)		C				un=3 +	psrlq	$32, %mm6		C				un=3 +	movd	%mm6, 12(%edx)		C				un=3 +      dec	%ebx			C				un=3 +      jz	L(rtr)			C				un=3 +	movd	4(%esi), %mm7		C				un=3 +	movd	(%eax), %mm6		C				un=3 +	pmuludq	%mm7, %mm6		C				un=3 +	movd	4(%eax), %mm1		C				un=3 +	movd	4(%edx), %mm4		C				un=3 +	pmuludq	%mm7, %mm1		C				un=3 +	movd	8(%eax), %mm2		C				un=3 +	movd	8(%edx), %mm5		C				un=3 +	pmuludq	%mm7, %mm2		C				un=3 +	paddq	%mm4, %mm6		C				un=3 +	paddq	%mm1, %mm5		C				un=3 +	movd	12(%edx), %mm4		C				un=3 +	movd	%mm6, 4(%edx)		C				un=3 +	psrlq	$32, %mm6		C				un=3 +	paddq	%mm5, %mm6		C				un=3 +	paddq	%mm2, %mm4		C				un=3 +	movd	%mm6, 8(%edx)		C				un=3 +	psrlq	$32, %mm6		C				un=3 +	paddq	%mm4, %mm6		C				un=3 +	movd	%mm6, 12(%edx)		C				un=3 +	psrlq	$32, %mm6		C				un=3 +	movd	%mm6, 16(%edx)		C				un=3 +      dec	%ebx			C				un=3 +      jz	L(rtr)			C				un=3 +	movd	8(%esi), %mm7		C				un=3 +	movd	(%eax), %mm6		C				un=3 +	pmuludq	%mm7, %mm6		C				un=3 +	movd	4(%eax), %mm1		C				un=3 +	movd	8(%edx), %mm4		C				un=3 +	pmuludq	%mm7, %mm1		C				un=3 +	movd	8(%eax), %mm2		C				un=3 +	movd	12(%edx), %mm5		C				un=3 +	pmuludq	%mm7, %mm2		C				un=3 +	paddq	%mm4, %mm6		C				un=3 +	paddq	%mm1, %mm5		C				un=3 +	movd	16(%edx), %mm4		C				un=3 +	movd	%mm6, 8(%edx)		C				un=3 +	psrlq	$32, %mm6		C				un=3 +	paddq	%mm5, %mm6		C				un=3 +	paddq	%mm2, %mm4		C				un=3 +	movd	%mm6, 12(%edx)		C				un=3 +	psrlq	$32, %mm6		C				un=3 +	paddq	%mm4, %mm6		C				un=3 +	movd	%mm6, 16(%edx)		C				un=3 +	psrlq	$32, %mm6		C				un=3 +	movd	%mm6, 20(%edx)		C				un=3 +	jmp	L(rtr) + + +L(big):	push	%edi +	pxor	%mm6, %mm6 +	lea	4(%esi), %esi +	and	$3, %ecx +	jz	L(0) +	cmp	$2, %ecx +	jc	L(1) +	jz	
L(2) +	jmp	L(3)			C FIXME: one case should fall through + + +L(0):	movd	(%eax), %mm3		C				m 0 +	sub	24(%esp), %ecx		C inner loop count		m 0 +	mov	%ecx, 24(%esp)		C update loop count for later	m 0 +	pmuludq	%mm7, %mm3		C				m 0 +	movd	4(%eax), %mm0		C				m 0 +	pmuludq	%mm7, %mm0		C				m 0 +	movd	8(%eax), %mm1		C				m 0 +	jmp	L(m00)			C				m 0 +	ALIGN(16)			C				m 0 +L(lpm0): +	pmuludq	%mm7, %mm4		C				m 0 +	paddq	%mm0, %mm6		C				m 0 +	movd	(%eax), %mm3		C				m 0 +	movd	%mm6, -12(%edx)		C				m 0 +	psrlq	$32, %mm6		C				m 0 +	pmuludq	%mm7, %mm3		C				m 0 +	paddq	%mm1, %mm6		C				m 0 +	movd	4(%eax), %mm0		C				m 0 +	movd	%mm6, -8(%edx)		C				m 0 +	psrlq	$32, %mm6		C				m 0 +	pmuludq	%mm7, %mm0		C				m 0 +	paddq	%mm4, %mm6		C				m 0 +	movd	8(%eax), %mm1		C				m 0 +	movd	%mm6, -4(%edx)		C				m 0 +	psrlq	$32, %mm6		C				m 0 +L(m00):	pmuludq	%mm7, %mm1		C				m 0 +	paddq	%mm3, %mm6		C				m 0 +	movd	12(%eax), %mm4		C				m 0 +	movd	%mm6, (%edx)		C				m 0 +	psrlq	$32, %mm6		C				m 0 +	lea	16(%eax), %eax		C				m 0 +	lea	16(%edx), %edx		C				m 0 +	add	$4, %ecx		C				m 0 +	ja	L(lpm0)			C				m 0 +	pmuludq	%mm7, %mm4		C				m 0 +	paddq	%mm0, %mm6		C				m 0 +	movd	%mm6, -12(%edx)		C				m 0 +	psrlq	$32, %mm6		C				m 0 +	paddq	%mm1, %mm6		C				m 0 +	mov	16(%esp), %edi		C rp				  0 +	jmp	L(x0) + +L(olp0): +	lea	4(%edi), %edi		C				am 0 +	movd	(%esi), %mm7		C				am 0 +	lea	4(%esi), %esi		C				am 0 +	mov	%edi, %edx		C rp				am 0 +	mov	20(%esp), %eax		C up				am 0 +	movd	(%eax), %mm3		C				am 0 +	mov	24(%esp), %ecx		C inner loop count		am 0 +	pxor	%mm6, %mm6		C				am 0 +	pmuludq	%mm7, %mm3		C				am 0 +	movd	4(%eax), %mm0		C				am 0 +	movd	(%edx), %mm5		C				am 0 +	pmuludq	%mm7, %mm0		C				am 0 +	movd	8(%eax), %mm1		C				am 0 +	paddq	%mm3, %mm5		C				am 0 +	movd	4(%edx), %mm4		C				am 0 +	jmp	L(am00)			C				am 0 +	ALIGN(16)			C				mm 0 +L(lam0): +	pmuludq	%mm7, %mm2		C				am 0 +	paddq	%mm4, %mm6		C				am 0 +	movd	(%eax), %mm3		C				am 0 +	paddq	%mm1, %mm5		C				am 0 +	movd	-4(%edx), %mm4		C				am 0 +	movd	%mm6, -12(%edx)		C				am 0 +	psrlq	$32, %mm6		C				am 0 +	pmuludq	%mm7, %mm3		C				am 0 +	paddq	%mm5, %mm6		C				am 0 +	movd	4(%eax), %mm0		C				am 0 +	paddq	%mm2, %mm4		C				am 0 +	movd	(%edx), %mm5		C				am 0 +	movd	%mm6, -8(%edx)		C				am 0 +	psrlq	$32, %mm6		C				am 0 +	pmuludq	%mm7, %mm0		C				am 0 +	paddq	%mm4, %mm6		C				am 0 +	movd	8(%eax), %mm1		C				am 0 +	paddq	%mm3, %mm5		C				am 0 +	movd	4(%edx), %mm4		C				am 0 +	movd	%mm6, -4(%edx)		C				am 0 +	psrlq	$32, %mm6		C				am 0 +L(am00): +	pmuludq	%mm7, %mm1		C				am 0 +	paddq	%mm5, %mm6		C				am 0 +	movd	12(%eax), %mm2		C				am 0 +	paddq	%mm0, %mm4		C				am 0 +	movd	8(%edx), %mm5		C				am 0 +	movd	%mm6, (%edx)		C				am 0 +	psrlq	$32, %mm6		C				am 0 +	lea	16(%eax), %eax		C				am 0 +	lea	16(%edx), %edx		C				am 0 +	add	$4, %ecx		C				am 0 +	jnz	L(lam0)			C				am 0 +	pmuludq	%mm7, %mm2		C				am 0 +	paddq	%mm4, %mm6		C				am 0 +	paddq	%mm1, %mm5		C				am 0 +	movd	-4(%edx), %mm4		C				am 0 +	movd	%mm6, -12(%edx)		C				am 0 +	psrlq	$32, %mm6		C				am 0 +	paddq	%mm5, %mm6		C				am 0 +	paddq	%mm2, %mm4		C				am 0 +L(x0):	movd	%mm6, -8(%edx)		C				am 0 +	psrlq	$32, %mm6		C				am 0 +	paddq	%mm4, %mm6		C				am 0 +	movd	%mm6, -4(%edx)		C				am 0 +	psrlq	$32, %mm6		C				am 0 +	movd	%mm6, (%edx)		C				am 0 +	dec	%ebx			C				am 0 +	jnz	L(olp0)			C				am 0 +L(oel0): +	emms				C				   0 +	pop	%edi			C				   0 +	pop	%ebx			C				   0 +	pop	%esi			C				   0 +	ret				C				   0 + + +L(1):	movd	(%eax), %mm4		C				m 1 +	sub	24(%esp), %ecx		C				m 1 +	mov	%ecx, 24(%esp)		C 
update loop count for later	m 1 +	pmuludq	%mm7, %mm4		C				m 1 +	movd	4(%eax), %mm3		C				m 1 +	pmuludq	%mm7, %mm3		C				m 1 +	movd	8(%eax), %mm0		C				m 1 +	jmp	L(m01)			C				m 1 +	ALIGN(16)			C				m 1 +L(lpm1): +	pmuludq	%mm7, %mm4		C				m 1 +	paddq	%mm0, %mm6		C				m 1 +	movd	4(%eax), %mm3		C				m 1 +	movd	%mm6, -8(%edx)		C				m 1 +	psrlq	$32, %mm6		C				m 1 +	pmuludq	%mm7, %mm3		C				m 1 +	paddq	%mm1, %mm6		C				m 1 +	movd	8(%eax), %mm0		C				m 1 +	movd	%mm6, -4(%edx)		C				m 1 +	psrlq	$32, %mm6		C				m 1 +L(m01):	pmuludq	%mm7, %mm0		C				m 1 +	paddq	%mm4, %mm6		C				m 1 +	movd	12(%eax), %mm1		C				m 1 +	movd	%mm6, (%edx)		C				m 1 +	psrlq	$32, %mm6		C				m 1 +	pmuludq	%mm7, %mm1		C				m 1 +	paddq	%mm3, %mm6		C				m 1 +	movd	16(%eax), %mm4		C				m 1 +	movd	%mm6, 4(%edx)		C				m 1 +	psrlq	$32, %mm6		C				m 1 +	lea	16(%eax), %eax		C				m 1 +	lea	16(%edx), %edx		C				m 1 +	add	$4, %ecx		C				m 1 +	ja	L(lpm1)			C				m 1 +	pmuludq	%mm7, %mm4		C				m 1 +	paddq	%mm0, %mm6		C				m 1 +	movd	%mm6, -8(%edx)		C				m 1 +	psrlq	$32, %mm6		C				m 1 +	paddq	%mm1, %mm6		C				m 1 +	mov	16(%esp), %edi		C rp				  1 +	jmp	L(x1) + +L(olp1): +	lea	4(%edi), %edi		C				am 1 +	movd	(%esi), %mm7		C				am 1 +	lea	4(%esi), %esi		C				am 1 +	mov	%edi, %edx		C rp				am 1 +	mov	20(%esp), %eax		C up				am 1 +	movd	(%eax), %mm2		C				am 1 +	mov	24(%esp), %ecx		C inner loop count		am 1 +	pxor	%mm6, %mm6		C				am 1 +	pmuludq	%mm7, %mm2		C				am 1 +	movd	4(%eax), %mm3		C				am 1 +	movd	(%edx), %mm4		C				am 1 +	pmuludq	%mm7, %mm3		C				am 1 +	movd	8(%eax), %mm0		C				am 1 +	paddq	%mm2, %mm4		C				am 1 +	movd	4(%edx), %mm5		C				am 1 +	jmp	L(am01)			C				am 1 +	ALIGN(16)			C				am 1 +L(lam1): +	pmuludq	%mm7, %mm2		C				am 1 +	paddq	%mm4, %mm6		C				am 1 +	movd	4(%eax), %mm3		C				am 1 +	paddq	%mm1, %mm5		C				am 1 +	movd	(%edx), %mm4		C				am 1 +	movd	%mm6, -8(%edx)		C				am 1 +	psrlq	$32, %mm6		C				am 1 +	pmuludq	%mm7, %mm3		C				am 1 +	paddq	%mm5, %mm6		C				am 1 +	movd	8(%eax), %mm0		C				am 1 +	paddq	%mm2, %mm4		C				am 1 +	movd	4(%edx), %mm5		C				am 1 +	movd	%mm6, -4(%edx)		C				am 1 +	psrlq	$32, %mm6		C				am 1 +L(am01): +	pmuludq	%mm7, %mm0		C				am 1 +	paddq	%mm4, %mm6		C				am 1 +	movd	12(%eax), %mm1		C				am 1 +	paddq	%mm3, %mm5		C				am 1 +	movd	8(%edx), %mm4		C				am 1 +	movd	%mm6, (%edx)		C				am 1 +	psrlq	$32, %mm6		C				am 1 +	pmuludq	%mm7, %mm1		C				am 1 +	paddq	%mm5, %mm6		C				am 1 +	movd	16(%eax), %mm2		C				am 1 +	paddq	%mm0, %mm4		C				am 1 +	movd	12(%edx), %mm5		C				am 1 +	movd	%mm6, 4(%edx)		C				am 1 +	psrlq	$32, %mm6		C				am 1 +	lea	16(%eax), %eax		C				am 1 +	lea	16(%edx), %edx		C				am 1 +	add	$4, %ecx		C				am 1 +	jnz	L(lam1)			C				am 1 +	pmuludq	%mm7, %mm2		C				am 1 +	paddq	%mm4, %mm6		C				am 1 +	paddq	%mm1, %mm5		C				am 1 +	movd	(%edx), %mm4		C				am 1 +	movd	%mm6, -8(%edx)		C				am 1 +	psrlq	$32, %mm6		C				am 1 +	paddq	%mm5, %mm6		C				am 1 +	paddq	%mm2, %mm4		C				am 1 +L(x1):	movd	%mm6, -4(%edx)		C				am 1 +	psrlq	$32, %mm6		C				am 1 +	paddq	%mm4, %mm6		C				am 1 +	movd	%mm6, (%edx)		C				am 1 +	psrlq	$32, %mm6		C				am 1 +	movd	%mm6, 4(%edx)		C				am 1 +	dec	%ebx			C				am 1 +	jnz	L(olp1)			C				am 1 +L(oel1): +	emms				C				   1 +	pop	%edi			C				   1 +	pop	%ebx			C				   1 +	pop	%esi			C				   1 +	ret				C				   1 + + +L(2):	movd	(%eax), %mm1		C				m 2 +	sub	24(%esp), %ecx		C				m 2 +	mov	%ecx, 24(%esp)		C update loop count for later	m 2 +	pmuludq	%mm7, %mm1		C				m 2 +	movd	4(%eax), %mm4		C				m 2 +	pmuludq	%mm7, %mm4		C				m 2 +	movd	8(%eax), %mm3		C				m 2 +	jmp	L(m10)			C		
		m 2 +	ALIGN(16)			C				m 2 +L(lpm2): +	pmuludq	%mm7, %mm4		C				m 2 +	paddq	%mm0, %mm6		C				m 2 +	movd	8(%eax), %mm3		C				m 2 +	movd	%mm6, -4(%edx)		C				m 2 +	psrlq	$32, %mm6		C				m 2 +L(m10):	pmuludq	%mm7, %mm3		C				m 2 +	paddq	%mm1, %mm6		C				m 2 +	movd	12(%eax), %mm0		C				m 2 +	movd	%mm6, (%edx)		C				m 2 +	psrlq	$32, %mm6		C				m 2 +	pmuludq	%mm7, %mm0		C				m 2 +	paddq	%mm4, %mm6		C				m 2 +	movd	16(%eax), %mm1		C				m 2 +	movd	%mm6, 4(%edx)		C				m 2 +	psrlq	$32, %mm6		C				m 2 +	pmuludq	%mm7, %mm1		C				m 2 +	paddq	%mm3, %mm6		C				m 2 +	movd	20(%eax), %mm4		C				m 2 +	movd	%mm6, 8(%edx)		C				m 2 +	psrlq	$32, %mm6		C				m 2 +	lea	16(%eax), %eax		C				m 2 +	lea	16(%edx), %edx		C				m 2 +	add	$4, %ecx		C				m 2 +	ja	L(lpm2)			C				m 2 +	pmuludq	%mm7, %mm4		C				m 2 +	paddq	%mm0, %mm6		C				m 2 +	movd	%mm6, -4(%edx)		C				m 2 +	psrlq	$32, %mm6		C				m 2 +	paddq	%mm1, %mm6		C				m 2 +	mov	16(%esp), %edi		C rp				  2 +	jmp	L(x2) + +L(olp2): +	lea	4(%edi), %edi		C				am 2 +	movd	(%esi), %mm7		C				am 2 +	lea	4(%esi), %esi		C				am 2 +	mov	%edi, %edx		C rp				am 2 +	mov	20(%esp), %eax		C up				am 2 +	movd	(%eax), %mm1		C				am 2 +	mov	24(%esp), %ecx		C inner loop count		am 2 +	pxor	%mm6, %mm6		C				am 2 +	pmuludq	%mm7, %mm1		C				am 2 +	movd	4(%eax), %mm2		C				am 2 +	movd	(%edx), %mm5		C				am 2 +	pmuludq	%mm7, %mm2		C				am 2 +	movd	8(%eax), %mm3		C				am 2 +	paddq	%mm1, %mm5		C				am 2 +	movd	4(%edx), %mm4		C				am 2 +	jmp	L(am10)			C				am 2 +	ALIGN(16)			C				am 2 +L(lam2): +	pmuludq	%mm7, %mm2		C				am 2 +	paddq	%mm4, %mm6		C				am 2 +	movd	8(%eax), %mm3		C				am 2 +	paddq	%mm1, %mm5		C				am 2 +	movd	4(%edx), %mm4		C				am 2 +	movd	%mm6, -4(%edx)		C				am 2 +	psrlq	$32, %mm6		C				am 2 +L(am10): +	pmuludq	%mm7, %mm3		C				am 2 +	paddq	%mm5, %mm6		C				am 2 +	movd	12(%eax), %mm0		C				am 2 +	paddq	%mm2, %mm4		C				am 2 +	movd	8(%edx), %mm5		C				am 2 +	movd	%mm6, (%edx)		C				am 2 +	psrlq	$32, %mm6		C				am 2 +	pmuludq	%mm7, %mm0		C				am 2 +	paddq	%mm4, %mm6		C				am 2 +	movd	16(%eax), %mm1		C				am 2 +	paddq	%mm3, %mm5		C				am 2 +	movd	12(%edx), %mm4		C				am 2 +	movd	%mm6, 4(%edx)		C				am 2 +	psrlq	$32, %mm6		C				am 2 +	pmuludq	%mm7, %mm1		C				am 2 +	paddq	%mm5, %mm6		C				am 2 +	movd	20(%eax), %mm2		C				am 2 +	paddq	%mm0, %mm4		C				am 2 +	movd	16(%edx), %mm5		C				am 2 +	movd	%mm6, 8(%edx)		C				am 2 +	psrlq	$32, %mm6		C				am 2 +	lea	16(%eax), %eax		C				am 2 +	lea	16(%edx), %edx		C				am 2 +	add	$4, %ecx		C				am 2 +	jnz	L(lam2)			C				am 2 +	pmuludq	%mm7, %mm2		C				am 2 +	paddq	%mm4, %mm6		C				am 2 +	paddq	%mm1, %mm5		C				am 2 +	movd	4(%edx), %mm4		C				am 2 +	movd	%mm6, -4(%edx)		C				am 2 +	psrlq	$32, %mm6		C				am 2 +	paddq	%mm5, %mm6		C				am 2 +	paddq	%mm2, %mm4		C				am 2 +L(x2):	movd	%mm6, (%edx)		C				am 2 +	psrlq	$32, %mm6		C				am 2 +	paddq	%mm4, %mm6		C				am 2 +	movd	%mm6, 4(%edx)		C				am 2 +	psrlq	$32, %mm6		C				am 2 +	movd	%mm6, 8(%edx)		C				am 2 +	dec	%ebx			C				am 2 +	jnz	L(olp2)			C				am 2 +L(oel2): +	emms				C				   2 +	pop	%edi			C				   2 +	pop	%ebx			C				   2 +	pop	%esi			C				   2 +	ret				C				   2 + + +L(3):	movd	(%eax), %mm0		C				m 3 +	sub	24(%esp), %ecx		C				m 3 +	mov	%ecx, 24(%esp)		C update loop count for later	m 3 +	pmuludq	%mm7, %mm0		C				m 3 +	movd	4(%eax), %mm1		C				m 3 +	pmuludq	%mm7, %mm1		C				m 3 +	movd	8(%eax), %mm4		C				m 3 +	jmp	L(lpm3)			C				m 3 +	ALIGN(16)			C				m 3 +L(lpm3): +	pmuludq	%mm7, %mm4		C				m 3 +	paddq	%mm0, %mm6		C				m 3 +	movd	12(%eax), %mm3		C				m 3 +	movd	%mm6, (%edx)		C				m 3 +	psrlq	
$32, %mm6		C				m 3 +	pmuludq	%mm7, %mm3		C				m 3 +	paddq	%mm1, %mm6		C				m 3 +	movd	16(%eax), %mm0		C				m 3 +	movd	%mm6, 4(%edx)		C				m 3 +	psrlq	$32, %mm6		C				m 3 +	pmuludq	%mm7, %mm0		C				m 3 +	paddq	%mm4, %mm6		C				m 3 +	movd	20(%eax), %mm1		C				m 3 +	movd	%mm6, 8(%edx)		C				m 3 +	psrlq	$32, %mm6		C				m 3 +	pmuludq	%mm7, %mm1		C				m 3 +	paddq	%mm3, %mm6		C				m 3 +	movd	24(%eax), %mm4		C				m 3 +	movd	%mm6, 12(%edx)		C				m 3 +	psrlq	$32, %mm6		C				m 3 +	lea	16(%eax), %eax		C				m 3 +	lea	16(%edx), %edx		C				m 3 +	add	$4, %ecx		C				m 3 +	ja	L(lpm3)			C				m 3 +	pmuludq	%mm7, %mm4		C				m 3 +	paddq	%mm0, %mm6		C				m 3 +	movd	%mm6, (%edx)		C				m 3 +	psrlq	$32, %mm6		C				m 3 +	paddq	%mm1, %mm6		C				m 3 +	mov	16(%esp), %edi		C rp				  3 +	jmp	L(x3) + +L(olp3): +	lea	4(%edi), %edi		C				am 3 +	movd	(%esi), %mm7		C				am 3 +	lea	4(%esi), %esi		C				am 3 +	mov	%edi, %edx		C rp				am 3 +	mov	20(%esp), %eax		C up				am 3 +	movd	(%eax), %mm0		C				am 3 +	mov	24(%esp), %ecx		C inner loop count		am 3 +	pxor	%mm6, %mm6		C				am 3 +	pmuludq	%mm7, %mm0		C				am 3 +	movd	4(%eax), %mm1		C				am 3 +	movd	(%edx), %mm4		C				am 3 +	pmuludq	%mm7, %mm1		C				am 3 +	movd	8(%eax), %mm2		C				am 3 +	paddq	%mm0, %mm4		C				am 3 +	movd	4(%edx), %mm5		C				am 3 +	jmp	L(lam3)			C				am 3 +	ALIGN(16)			C				am 3 +L(lam3): +	pmuludq	%mm7, %mm2		C				am 3 +	paddq	%mm4, %mm6		C				am 3 +	movd	12(%eax), %mm3		C				am 3 +	paddq	%mm1, %mm5		C				am 3 +	movd	8(%edx), %mm4		C				am 3 +	movd	%mm6, (%edx)		C				am 3 +	psrlq	$32, %mm6		C				am 3 +	pmuludq	%mm7, %mm3		C				am 3 +	paddq	%mm5, %mm6		C				am 3 +	movd	16(%eax), %mm0		C				am 3 +	paddq	%mm2, %mm4		C				am 3 +	movd	12(%edx), %mm5		C				am 3 +	movd	%mm6, 4(%edx)		C				am 3 +	psrlq	$32, %mm6		C				am 3 +	pmuludq	%mm7, %mm0		C				am 3 +	paddq	%mm4, %mm6		C				am 3 +	movd	20(%eax), %mm1		C				am 3 +	paddq	%mm3, %mm5		C				am 3 +	movd	16(%edx), %mm4		C				am 3 +	movd	%mm6, 8(%edx)		C				am 3 +	psrlq	$32, %mm6		C				am 3 +	pmuludq	%mm7, %mm1		C				am 3 +	paddq	%mm5, %mm6		C				am 3 +	movd	24(%eax), %mm2		C				am 3 +	paddq	%mm0, %mm4		C				am 3 +	movd	20(%edx), %mm5		C				am 3 +	movd	%mm6, 12(%edx)		C				am 3 +	psrlq	$32, %mm6		C				am 3 +	lea	16(%eax), %eax		C				am 3 +	lea	16(%edx), %edx		C				am 3 +	add	$4, %ecx		C				am 3 +	jnz	L(lam3)			C				am 3 +	pmuludq	%mm7, %mm2		C				am 3 +	paddq	%mm4, %mm6		C				am 3 +	paddq	%mm1, %mm5		C				am 3 +	movd	8(%edx), %mm4		C				am 3 +	movd	%mm6, (%edx)		C				am 3 +	psrlq	$32, %mm6		C				am 3 +	paddq	%mm5, %mm6		C				am 3 +	paddq	%mm2, %mm4		C				am 3 +L(x3):	movd	%mm6, 4(%edx)		C				am 3 +	psrlq	$32, %mm6		C				am 3 +	paddq	%mm4, %mm6		C				am 3 +	movd	%mm6, 8(%edx)		C				am 3 +	psrlq	$32, %mm6		C				am 3 +	movd	%mm6, 12(%edx)		C				am 3 +	dec	%ebx			C				am 3 +	jnz	L(olp3)			C				am 3 +L(oel3): +	emms				C				   3 +	pop	%edi			C				   3 +	pop	%ebx			C				   3 +	pop	%esi			C				   3 +	ret				C				   3 +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm new file mode 100644 index 0000000..c7f4426 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm @@ -0,0 +1,281 @@ +dnl  X86-32 and X86-64 mpn_popcount using SSE2. + +dnl  Copyright 2006, 2007, 2011, 2015, 2020 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. 
+dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + + +include(`../config.m4') + + +C 32-bit		     popcount	     hamdist +C			    cycles/limb	    cycles/limb +C P5				- +C P6 model 0-8,10-12		- +C P6 model 9  (Banias)		? +C P6 model 13 (Dothan)		4 +C P4 model 0  (Willamette)	? +C P4 model 1  (?)		? +C P4 model 2  (Northwood)	3.9 +C P4 model 3  (Prescott)	? +C P4 model 4  (Nocona)		? +C AMD K6			- +C AMD K7			- +C AMD K8			? + +C 64-bit		     popcount	     hamdist +C			    cycles/limb	    cycles/limb +C P4 model 4 (Nocona):		8 +C AMD K8,K9			7.5 +C AMD K10			3.5 +C Intel core2			3.68 +C Intel corei			3.15 +C Intel atom		       10.8 +C VIA nano			6.5 + +C TODO +C  * Make an mpn_hamdist based on this.  Alignment could either be handled by +C    using movdqu for one operand and movdqa for the other, or by painfully +C    shifting as we go.  Unfortunately, there seem to be no usable shift +C    instruction, except for one that takes an immediate count. +C  * It would probably be possible to cut a few cycles/limb using software +C    pipelining. +C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control +C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel +C    integer based popcount.  Such a combined loop would handle 6 limbs in +C    about 30 cycles on K8. +C  * We could save a byte or two by using 32-bit operations on areg. +C  * Check if using movdqa to a temp of and then register-based pand is faster. + +ifelse(GMP_LIMB_BITS,`32', +`	define(`up',  `%edx') +	define(`n',   `%ecx') +	define(`areg',`%eax') +	define(`breg',`%ebx') +	define(`zero',`%xmm4') +	define(`LIMB32',`	$1') +	define(`LIMB64',`dnl') +',` +	define(`up',  `%rdi') +	define(`n',   `%rsi') +	define(`areg',`%rax') +	define(`breg',`%rdx') +	define(`zero',`%xmm8') +	define(`LIMB32',`dnl') +	define(`LIMB64',`	$1') +') + +define(`mm01010101',`%xmm6') +define(`mm00110011',`%xmm7') +define(`mm00001111',`%xmm2') + +define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8)) +define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES)) +define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES)) + +undefine(`psadbw')			C override inherited m4 version + +C This file is shared between 32-bit and 64-bit builds.  Only the former has +C LEAL.  Default LEAL as an alias of LEA. +ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')') + +ASM_START() + +C Make cnsts global to work around Apple relocation bug. 
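The loop further down counts bits with the classic mask reduction: subtract the odd bits to get 2-bit field counts (the 0x55... mask), fold those into 4-bit fields (0x33...), then into byte counts (0x0f...), and finally sum the bytes with psadbw. For reference, here is a minimal scalar C model of the same reduction for one 32-bit word; the name popcount32 is illustrative and not part of GMP:

#include <stdint.h>

/* Scalar sketch of the 0x55/0x33/0x0f reduction that the SSE2 loop performs
   128 bits at a time.  The final byte sum is done here with a multiply; the
   assembly uses psadbw against zero instead. */
static unsigned popcount32(uint32_t x)
{
    x = x - ((x >> 1) & 0x55555555u);                  /* 2-bit counts */
    x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);  /* 4-bit counts */
    x = (x + (x >> 4)) & 0x0f0f0f0fu;                  /* byte counts  */
    return (x * 0x01010101u) >> 24;                    /* sum of bytes */
}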
+ifdef(`DARWIN',` +	define(`cnsts', MPN(popccnsts)) +	GLOBL	cnsts') + +	TEXT +	ALIGN(32) +PROLOGUE(mpn_popcount) + +LIMB32(`mov	4(%esp), up	') +LIMB32(`mov	8(%esp), n	') +LIMB32(`push	%ebx		') + +	pxor	%xmm3, %xmm3		C zero grand total count +LIMB64(`pxor	zero, zero	') + +	LEAL(	cnsts, breg) + +	movdqa	-48(breg), mm01010101 +	movdqa	-32(breg), mm00110011 +	movdqa	-16(breg), mm00001111 + +	mov	up, areg +	and	$-16, up		C round `up' down to 128-bit boundary +	and	$12, areg		C 32:areg = 0, 4, 8, 12 +					C 64:areg = 0, 8 +	movdqa	(up), %xmm0 +	pand	64(breg,areg,4), %xmm0 +	shr	$m4_log2(GMP_LIMB_BYTES), %eax +	add	areg, n			C compensate n for rounded down `up' + +	pxor	%xmm4, %xmm4 +	sub	$LIMBS_PER_XMM, n +	jbe	L(sum) + +	sub	$LIMBS_PER_XMM, n +	ja	L(ent) +	jmp	L(lsum) + +	ALIGN(16) +L(top):	movdqa	(up), %xmm0 +L(ent):	movdqa	16(up), %xmm4 + +	movdqa	%xmm0, %xmm1 +	movdqa	%xmm4, %xmm5 +	psrld	$1, %xmm0 +	psrld	$1, %xmm4 +	pand	mm01010101, %xmm0 +	pand	mm01010101, %xmm4 +	psubd	%xmm0, %xmm1 +	psubd	%xmm4, %xmm5 + +	movdqa	%xmm1, %xmm0 +	movdqa	%xmm5, %xmm4 +	psrlq	$2, %xmm1 +	psrlq	$2, %xmm5 +	pand	mm00110011, %xmm0 +	pand	mm00110011, %xmm4 +	pand	mm00110011, %xmm1 +	pand	mm00110011, %xmm5 +	paddq	%xmm0, %xmm1 +	paddq	%xmm4, %xmm5 + +LIMB32(`pxor	zero, zero	') + +	add	$32, up +	sub	$LIMBS_PER_2XMM, n + +	paddq	%xmm5, %xmm1 +	movdqa	%xmm1, %xmm0 +	psrlq	$4, %xmm1 +	pand	mm00001111, %xmm0 +	pand	mm00001111, %xmm1 +	paddq	%xmm0, %xmm1 + +	psadbw	zero, %xmm1 +	paddq	%xmm1, %xmm3		C add to grand total + +	jnc	L(top) +L(end): +	add	$LIMBS_PER_2XMM, n +	jz	L(rt) +	movdqa	(up), %xmm0 +	pxor	%xmm4, %xmm4 +	sub	$LIMBS_PER_XMM, n +	jbe	L(sum) +L(lsum): +	movdqa	%xmm0, %xmm4 +	movdqa	16(up), %xmm0 +L(sum): +	shl	$m4_log2(GMP_LIMB_BYTES), n +	and	$12, n +	pand	(breg,n,4), %xmm0 + +	movdqa	%xmm0, %xmm1 +	movdqa	%xmm4, %xmm5 +	psrld	$1, %xmm0 +	psrld	$1, %xmm4 +	pand	mm01010101, %xmm0 +	pand	mm01010101, %xmm4 +	psubd	%xmm0, %xmm1 +	psubd	%xmm4, %xmm5 + +	movdqa	%xmm1, %xmm0 +	movdqa	%xmm5, %xmm4 +	psrlq	$2, %xmm1 +	psrlq	$2, %xmm5 +	pand	mm00110011, %xmm0 +	pand	mm00110011, %xmm4 +	pand	mm00110011, %xmm1 +	pand	mm00110011, %xmm5 +	paddq	%xmm0, %xmm1 +	paddq	%xmm4, %xmm5 + +LIMB32(`pxor	zero, zero	') + +	paddq	%xmm5, %xmm1 +	movdqa	%xmm1, %xmm0 +	psrlq	$4, %xmm1 +	pand	mm00001111, %xmm0 +	pand	mm00001111, %xmm1 +	paddq	%xmm0, %xmm1 + +	psadbw	zero, %xmm1 +	paddq	%xmm1, %xmm3		C add to grand total + + +C Add the two 64-bit halves of the grand total counter +L(rt):	movdqa	%xmm3, %xmm0 +	psrldq	$8, %xmm3 +	paddq	%xmm3, %xmm0 +	movd	%xmm0, areg		C movq avoided due to gas bug + +LIMB32(`pop	%ebx		') +	ret + +EPILOGUE() +DEF_OBJECT(dummy,16) +C Three magic constants used for masking out bits +	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55 +	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55 + +	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33 +	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33 + +	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f +	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f +cnsts: +C Masks for high end of number +	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + +	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + +	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + +	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +C Masks for low end of number +	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +	.byte	
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + +	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff +	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + +	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + +	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff +END_OBJECT(dummy) +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm new file mode 100644 index 0000000..f421d13 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm @@ -0,0 +1,126 @@ +dnl  Intel Pentium-4 mpn_rsh1add_n -- mpn (x+y)/2 + +dnl  Copyright 2001-2004 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C        cycles/limb (approx) +C      dst!=src1,2  dst==src1  dst==src2 +C P4:      4.5         6.5        6.5 + + +C mp_limb_t mpn_rsh1add_n (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, +C                          mp_size_t size); +C +C The slightly strange combination of indexing and pointer incrementing +C that's used seems to work best.  Not sure why, but for instance leal +C incrementing on %esi is a 1 or 2 cycle slowdown. +C +C The dependent chain is paddq combining the carry and next (shifted) part, +C plus psrlq to move the new carry down.  That, and just 4 mmx instructions +C in total, makes 4 c/l the target speed, which is almost achieved for +C separate src/dst but when src==dst the write combining anomalies slow it +C down. 
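For reference, the operation can be modelled in plain C with 32-bit limbs and a 64-bit accumulator, much as the mmx loop below does with paddq/psrlq: each shifted result limb combines the high bits of the current sum limb with the low bit of the next one, the top limb keeps the final carry in its most significant bit, and the bit shifted out at the bottom is returned. This is only an illustrative sketch (ref_rsh1add_n is not a GMP name):

#include <stdint.h>
#include <stddef.h>

/* {wp,n} = ({xp,n} + {yp,n}) >> 1; returns the bit shifted out.  Assumes n >= 1. */
static uint32_t ref_rsh1add_n(uint32_t *wp, const uint32_t *xp,
                              const uint32_t *yp, size_t n)
{
    uint64_t s = (uint64_t)xp[0] + yp[0];      /* limb 0 of the full sum      */
    uint32_t ret = (uint32_t)s & 1;            /* rsh1 bit of xp[0]+yp[0]     */
    for (size_t i = 1; i < n; i++) {
        uint64_t t = (uint64_t)xp[i] + yp[i] + (s >> 32);   /* next limb + carry */
        wp[i - 1] = ((uint32_t)s >> 1) | ((uint32_t)t << 31);
        s = t;
    }
    wp[n - 1] = (uint32_t)(s >> 1);            /* carry out lands in the MSB  */
    return ret;
}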
+ +defframe(PARAM_SIZE, 16) +defframe(PARAM_YP,   12) +defframe(PARAM_XP,   8) +defframe(PARAM_WP,   4) + +dnl  re-use parameter space +define(SAVE_EBX,`PARAM_XP') +define(SAVE_ESI,`PARAM_YP') + +	TEXT +	ALIGN(8) + +PROLOGUE(mpn_rsh1add_n) +deflit(`FRAME',0) + +	movl	PARAM_XP, %edx +	movl	%ebx, SAVE_EBX + +	movl	PARAM_YP, %ebx +	movl	%esi, SAVE_ESI + +	movl	PARAM_WP, %esi + +	movd	(%edx), %mm0		C xp[0] + +	movd	(%ebx), %mm1		C yp[0] +	movl	PARAM_SIZE, %ecx + +	movl	(%edx), %eax		C xp[0] + +	addl	(%ebx), %eax		C xp[0]+yp[0] + +	paddq	%mm1, %mm0		C xp[0]+yp[0] +	leal	(%esi,%ecx,4), %esi	C wp end +	negl	%ecx			C -size + +	psrlq	$1, %mm0		C (xp[0]+yp[0])/2 +	and	$1, %eax		C return value, rsh1 bit of xp[0]+yp[0] +	addl	$1, %ecx		C -(size-1) +	jz	L(done) + + +L(top): +	C eax	return value +	C ebx	yp end +	C ecx	counter, limbs, -(size-1) to -1 inclusive +	C edx	xp end +	C esi	wp end +	C mm0	carry (32 bits) + +	movd	4(%edx), %mm1	C xp[i+1] +	movd	4(%ebx), %mm2	C yp[i+1] +	leal	4(%edx), %edx +	leal	4(%ebx), %ebx +	paddq	%mm2, %mm1		C xp[i+1]+yp[i+1] +	psllq	$31, %mm1		C low bit at 31, further 32 above + +	paddq	%mm1, %mm0		C 31 and carry from prev add +	movd	%mm0, -4(%esi,%ecx,4)	C low ready to store dst[i] + +	psrlq	$32, %mm0		C high becomes new carry + +	addl	$1, %ecx +	jnz	L(top) + + +L(done): +	movd	%mm0, -4(%esi)		C dst[size-1] +	movl	SAVE_EBX, %ebx + +	movl	SAVE_ESI, %esi +	emms +	ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm new file mode 100644 index 0000000..0d548e0 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm @@ -0,0 +1,705 @@ +dnl  mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). + +dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO: +C  * Improve ad-hoc outer loop code and register handling.  Some feed-in +C    scheduling could improve things by several cycles per outer iteration. +C  * In Lam3...Lam1 code for, keep accumulation operands in registers, without +C    storing intermediates to rp. +C  * We might want to keep 32 in a free mm register, since the register form is +C    3 bytes and the immediate form is 4 bytes.  About 80 bytes to save. +C  * Look into different loop alignment, we now expand the code about 50 bytes +C    with possibly needless alignment. +C  * Use OSP, should solve feed-in latency problems. 
+C  * Address relative slowness for un<=3 for Pentium M.  The old code is there +C    considerably faster.  (1:20/14, 2:34/32, 3:66/57) + +C INPUT PARAMETERS +C rp		sp + 4 +C up		sp + 8 +C un		sp + 12 + +	TEXT +	ALIGN(16) +PROLOGUE(mpn_sqr_basecase) +	mov	4(%esp), %edx		C rp +	mov	8(%esp), %eax		C up +	mov	12(%esp), %ecx		C un + +	cmp	$2, %ecx +	jc	L(un1) +	jz	L(un2) +	cmp	$4, %ecx +	jc	L(un3) +	jz	L(un4) +	jmp	L(big) + +L(un1):	mov	(%eax), %eax +	mov	%edx, %ecx +	mul	%eax +	mov	%eax, (%ecx) +	mov	%edx, 4(%ecx) +	ret +L(un2):	movd	(%eax), %mm0		C				un=2 +	movd	(%eax), %mm2		C				un=2 +	movd	4(%eax), %mm1		C				un=2 +	pmuludq	%mm0, %mm0		C 64b weight 0			un=2 +	pmuludq	%mm1, %mm2		C 64b weight 32			un=2 +	pmuludq	%mm1, %mm1		C 64b weight 64			un=2 +	movd	%mm0, (%edx)		C				un=2 +	psrlq	$32, %mm0		C 32b weight 32			un=2 +	pcmpeqd	%mm7, %mm7		C				un=2 +	psrlq	$33, %mm7		C 0x000000007FFFFFFF		un=2 +	pand	%mm2, %mm7		C 31b weight 32			un=2 +	psrlq	$31, %mm2		C 33b weight 65			un=2 +	psllq	$1, %mm7		C 31b weight 33			un=2 +	paddq	%mm7, %mm0		C				un=2 +	movd	%mm0, 4(%edx)		C				un=2 +	psrlq	$32, %mm0		C				un=2 +	paddq	%mm2, %mm1		C				un=2 +	paddq	%mm0, %mm1		C				un=2 +	movd	%mm1, 8(%edx)		C				un=2 +	psrlq	$32, %mm1		C				un=2 +	movd	%mm1, 12(%edx)		C				un=2 +	emms +	ret +L(un3):	movd	(%eax), %mm7		C				un=3 +	movd	4(%eax), %mm6		C				un=3 +	pmuludq	%mm7, %mm6		C				un=3 +	movd	8(%eax), %mm2		C				un=3 +	pmuludq	%mm7, %mm2		C				un=3 +	movd	%mm6, 4(%edx)		C				un=3 +	psrlq	$32, %mm6		C				un=3 +	paddq	%mm2, %mm6		C				un=3 +	movd	%mm6, 8(%edx)		C				un=3 +	psrlq	$32, %mm6		C				un=3 +	movd	%mm6, 12(%edx)		C				un=3 +	lea	4(%edx), %edx		C				un=3 +	lea	4(%eax), %eax		C				un=3 +	jmp	L(am1) +L(un4):	movd	(%eax), %mm7		C				un=4 +	movd	4(%eax), %mm6		C				un=4 +	pmuludq	%mm7, %mm6		C				un=4 +	movd	8(%eax), %mm0		C				un=4 +	pmuludq	%mm7, %mm0		C				un=4 +	movd	12(%eax), %mm1		C				un=4 +	pmuludq	%mm7, %mm1		C				un=4 +	movd	%mm6, 4(%edx)		C				un=4 +	psrlq	$32, %mm6		C				un=4 +	paddq	%mm0, %mm6		C				un=4 +	movd	%mm6, 8(%edx)		C				un=4 +	psrlq	$32, %mm6		C				un=4 +	paddq	%mm1, %mm6		C				un=4 +	movd	%mm6, 12(%edx)		C				un=4 +	psrlq	$32, %mm6		C				un=4 +	movd	%mm6, 16(%edx)		C				un=4 +	lea	4(%edx), %edx		C				un=4 +	lea	4(%eax), %eax		C				un=4 +	jmp	L(am2) + +L(big):	push	%esi +	push	%ebx +	push	%edi +	pxor	%mm6, %mm6 +	movd	(%eax), %mm7		C +	lea	4(%eax), %esi		C init up, up++ +	lea	4(%eax), %eax		C up2++  FIXME: should fix offsets +	lea	4(%edx), %edi		C init rp, rp++ +	lea	4(%edx), %edx		C rp2++ +	lea	-4(%ecx), %ebx		C loop count +	and	$3, %ecx +	jz	L(3m) +	cmp	$2, %ecx +	ja	L(2m) +	jb	L(0m) + +L(1m): +	movd	(%eax), %mm4		C				m 1 +	lea	(%ebx), %ecx		C inner loop count		m 1 +	pmuludq	%mm7, %mm4		C				m 1 +	movd	4(%eax), %mm3		C				m 1 +	pmuludq	%mm7, %mm3		C				m 1 +	movd	8(%eax), %mm0		C				m 1 +	jmp	L(m01)			C				m 1 +	ALIGN(16)			C				m 1 +L(lpm1): +	pmuludq	%mm7, %mm4		C				m 1 +	paddq	%mm0, %mm6		C				m 1 +	movd	4(%eax), %mm3		C				m 1 +	movd	%mm6, -8(%edx)		C				m 1 +	psrlq	$32, %mm6		C				m 1 +	pmuludq	%mm7, %mm3		C				m 1 +	paddq	%mm1, %mm6		C				m 1 +	movd	8(%eax), %mm0		C				m 1 +	movd	%mm6, -4(%edx)		C				m 1 +	psrlq	$32, %mm6		C				m 1 +L(m01):	pmuludq	%mm7, %mm0		C				m 1 +	paddq	%mm4, %mm6		C				m 1 +	movd	12(%eax), %mm1		C				m 1 +	movd	%mm6, (%edx)		C				m 1 +	psrlq	$32, %mm6		C				m 1 +	pmuludq	%mm7, %mm1		C				m 1 +	paddq	%mm3, %mm6		C				m 1 +	movd	16(%eax), %mm4		C				m 1 +	movd	%mm6, 4(%edx)		C				m 1 +	psrlq	$32, %mm6		C				m 1 +	lea	16(%eax), %eax		C				m 1 +	
lea	16(%edx), %edx		C				m 1 +	sub	$4, %ecx		C				m 1 +	ja	L(lpm1)			C				m 1 +	pmuludq	%mm7, %mm4		C				m 1 +	paddq	%mm0, %mm6		C				m 1 +	movd	%mm6, -8(%edx)		C				m 1 +	psrlq	$32, %mm6		C				m 1 +	paddq	%mm1, %mm6		C				m 1 +	jmp	L(0) + +L(2m): +	movd	(%eax), %mm1		C				m 2 +	lea	(%ebx), %ecx		C inner loop count		m 2 +	pmuludq	%mm7, %mm1		C				m 2 +	movd	4(%eax), %mm4		C				m 2 +	pmuludq	%mm7, %mm4		C				m 2 +	movd	8(%eax), %mm3		C				m 2 +	jmp	L(m10)			C				m 2 +	ALIGN(16)			C				m 2 +L(lpm2): +	pmuludq	%mm7, %mm4		C				m 2 +	paddq	%mm0, %mm6		C				m 2 +	movd	8(%eax), %mm3		C				m 2 +	movd	%mm6, -4(%edx)		C				m 2 +	psrlq	$32, %mm6		C				m 2 +L(m10):	pmuludq	%mm7, %mm3		C				m 2 +	paddq	%mm1, %mm6		C				m 2 +	movd	12(%eax), %mm0		C				m 2 +	movd	%mm6, (%edx)		C				m 2 +	psrlq	$32, %mm6		C				m 2 +	pmuludq	%mm7, %mm0		C				m 2 +	paddq	%mm4, %mm6		C				m 2 +	movd	16(%eax), %mm1		C				m 2 +	movd	%mm6, 4(%edx)		C				m 2 +	psrlq	$32, %mm6		C				m 2 +	pmuludq	%mm7, %mm1		C				m 2 +	paddq	%mm3, %mm6		C				m 2 +	movd	20(%eax), %mm4		C				m 2 +	movd	%mm6, 8(%edx)		C				m 2 +	psrlq	$32, %mm6		C				m 2 +	lea	16(%eax), %eax		C				m 2 +	lea	16(%edx), %edx		C				m 2 +	sub	$4, %ecx		C				m 2 +	ja	L(lpm2)			C				m 2 +	pmuludq	%mm7, %mm4		C				m 2 +	paddq	%mm0, %mm6		C				m 2 +	movd	%mm6, -4(%edx)		C				m 2 +	psrlq	$32, %mm6		C				m 2 +	paddq	%mm1, %mm6		C				m 2 +	jmp	L(1) + +L(3m): +	movd	(%eax), %mm0		C				m 3 +	lea	(%ebx), %ecx		C inner loop count		m 3 +	pmuludq	%mm7, %mm0		C				m 3 +	movd	4(%eax), %mm1		C				m 3 +	pmuludq	%mm7, %mm1		C				m 3 +	movd	8(%eax), %mm4		C				m 3 +	jmp	L(lpm3)			C				m 3 +	ALIGN(16)			C				m 3 +L(lpm3): +	pmuludq	%mm7, %mm4		C				m 3 +	paddq	%mm0, %mm6		C				m 3 +	movd	12(%eax), %mm3		C				m 3 +	movd	%mm6, (%edx)		C				m 3 +	psrlq	$32, %mm6		C				m 3 +	pmuludq	%mm7, %mm3		C				m 3 +	paddq	%mm1, %mm6		C				m 3 +	movd	16(%eax), %mm0		C				m 3 +	movd	%mm6, 4(%edx)		C				m 3 +	psrlq	$32, %mm6		C				m 3 +	pmuludq	%mm7, %mm0		C				m 3 +	paddq	%mm4, %mm6		C				m 3 +	movd	20(%eax), %mm1		C				m 3 +	movd	%mm6, 8(%edx)		C				m 3 +	psrlq	$32, %mm6		C				m 3 +	pmuludq	%mm7, %mm1		C				m 3 +	paddq	%mm3, %mm6		C				m 3 +	movd	24(%eax), %mm4		C				m 3 +	movd	%mm6, 12(%edx)		C				m 3 +	psrlq	$32, %mm6		C				m 3 +	lea	16(%eax), %eax		C				m 3 +	lea	16(%edx), %edx		C				m 3 +	sub	$4, %ecx		C				m 3 +	ja	L(lpm3)			C				m 3 +	pmuludq	%mm7, %mm4		C				m 3 +	paddq	%mm0, %mm6		C				m 3 +	movd	%mm6, (%edx)		C				m 3 +	psrlq	$32, %mm6		C				m 3 +	paddq	%mm1, %mm6		C				m 3 +	jmp	L(2) + +L(0m): +	movd	(%eax), %mm3		C				m 0 +	lea	(%ebx), %ecx		C inner loop count		m 0 +	pmuludq	%mm7, %mm3		C				m 0 +	movd	4(%eax), %mm0		C				m 0 +	pmuludq	%mm7, %mm0		C				m 0 +	movd	8(%eax), %mm1		C				m 0 +	jmp	L(m00)			C				m 0 +	ALIGN(16)			C				m 0 +L(lpm0): +	pmuludq	%mm7, %mm4		C				m 0 +	paddq	%mm0, %mm6		C				m 0 +	movd	(%eax), %mm3		C				m 0 +	movd	%mm6, -12(%edx)		C				m 0 +	psrlq	$32, %mm6		C				m 0 +	pmuludq	%mm7, %mm3		C				m 0 +	paddq	%mm1, %mm6		C				m 0 +	movd	4(%eax), %mm0		C				m 0 +	movd	%mm6, -8(%edx)		C				m 0 +	psrlq	$32, %mm6		C				m 0 +	pmuludq	%mm7, %mm0		C				m 0 +	paddq	%mm4, %mm6		C				m 0 +	movd	8(%eax), %mm1		C				m 0 +	movd	%mm6, -4(%edx)		C				m 0 +	psrlq	$32, %mm6		C				m 0 +L(m00):	pmuludq	%mm7, %mm1		C				m 0 +	paddq	%mm3, %mm6		C				m 0 +	movd	12(%eax), %mm4		C				m 0 +	movd	%mm6, (%edx)		C				m 0 +	psrlq	$32, %mm6		C				m 0 +	lea	16(%eax), %eax		C				m 0 +	lea	16(%edx), %edx		C				m 0 +	sub	$4, %ecx		C				m 0 +	ja	L(lpm0)			C				m 0 +	pmuludq	%mm7, %mm4		C				m 0 +	paddq	%mm0, 
%mm6		C				m 0 +	movd	%mm6, -12(%edx)		C				m 0 +	psrlq	$32, %mm6		C				m 0 +	paddq	%mm1, %mm6		C				m 0 +	jmp	L(3) + +L(outer): +	lea	8(%edi), %edi		C rp += 2 +	movd	(%esi), %mm7		C				am 3 +	mov	%edi, %edx		C rp2 = rp			am 3 +	lea	4(%esi), %esi		C up++				am 3 +	lea	(%esi), %eax		C up2 = up			am 3 +	movd	(%eax), %mm0		C				am 3 +	lea	(%ebx), %ecx		C inner loop count		am 3 +	pxor	%mm6, %mm6		C				am 3 +	pmuludq	%mm7, %mm0		C				am 3 +	movd	4(%eax), %mm1		C				am 3 +	movd	(%edx), %mm4		C				am 3 +	pmuludq	%mm7, %mm1		C				am 3 +	movd	8(%eax), %mm2		C				am 3 +	paddq	%mm0, %mm4		C				am 3 +	movd	4(%edx), %mm5		C				am 3 +	jmp	L(lam3)			C				am 3 +	ALIGN(16)			C				am 3 +L(lam3): +	pmuludq	%mm7, %mm2		C				am 3 +	paddq	%mm4, %mm6		C				am 3 +	movd	12(%eax), %mm3		C				am 3 +	paddq	%mm1, %mm5		C				am 3 +	movd	8(%edx), %mm4		C				am 3 +	movd	%mm6, (%edx)		C				am 3 +	psrlq	$32, %mm6		C				am 3 +	pmuludq	%mm7, %mm3		C				am 3 +	paddq	%mm5, %mm6		C				am 3 +	movd	16(%eax), %mm0		C				am 3 +	paddq	%mm2, %mm4		C				am 3 +	movd	12(%edx), %mm5		C				am 3 +	movd	%mm6, 4(%edx)		C				am 3 +	psrlq	$32, %mm6		C				am 3 +	pmuludq	%mm7, %mm0		C				am 3 +	paddq	%mm4, %mm6		C				am 3 +	movd	20(%eax), %mm1		C				am 3 +	paddq	%mm3, %mm5		C				am 3 +	movd	16(%edx), %mm4		C				am 3 +	movd	%mm6, 8(%edx)		C				am 3 +	psrlq	$32, %mm6		C				am 3 +	pmuludq	%mm7, %mm1		C				am 3 +	paddq	%mm5, %mm6		C				am 3 +	movd	24(%eax), %mm2		C				am 3 +	paddq	%mm0, %mm4		C				am 3 +	movd	20(%edx), %mm5		C				am 3 +	movd	%mm6, 12(%edx)		C				am 3 +	psrlq	$32, %mm6		C				am 3 +	lea	16(%eax), %eax		C				am 3 +	lea	16(%edx), %edx		C				am 3 +	sub	$4, %ecx		C				am 3 +	ja	L(lam3)			C				am 3 +	pmuludq	%mm7, %mm2		C				am 3 +	paddq	%mm4, %mm6		C				am 3 +	paddq	%mm1, %mm5		C				am 3 +	movd	8(%edx), %mm4		C				am 3 +	movd	%mm6, (%edx)		C				am 3 +	psrlq	$32, %mm6		C				am 3 +	paddq	%mm5, %mm6		C				am 3 +	paddq	%mm2, %mm4		C				am 3 +L(2):	movd	%mm6, 4(%edx)		C				am 3 +	psrlq	$32, %mm6		C				am 3 +	paddq	%mm4, %mm6		C				am 3 +	movd	%mm6, 8(%edx)		C				am 3 +	psrlq	$32, %mm6		C				am 3 +	movd	%mm6, 12(%edx)		C				am 3 + +	lea	8(%edi), %edi		C rp += 2 +	movd	(%esi), %mm7		C				am 2 +	mov	%edi, %edx		C rp2 = rp			am 2 +	lea	4(%esi), %esi		C up++				am 2 +	lea	(%esi), %eax		C up2 = up			am 2 +	movd	(%eax), %mm1		C				am 2 +	lea	(%ebx), %ecx		C inner loop count		am 2 +	pxor	%mm6, %mm6		C				am 2 +	pmuludq	%mm7, %mm1		C				am 2 +	movd	4(%eax), %mm2		C				am 2 +	movd	(%edx), %mm5		C				am 2 +	pmuludq	%mm7, %mm2		C				am 2 +	movd	8(%eax), %mm3		C				am 2 +	paddq	%mm1, %mm5		C				am 2 +	movd	4(%edx), %mm4		C				am 2 +	jmp	L(am10)			C				am 2 +	ALIGN(16)			C				am 2 +L(lam2): +	pmuludq	%mm7, %mm2		C				am 2 +	paddq	%mm4, %mm6		C				am 2 +	movd	8(%eax), %mm3		C				am 2 +	paddq	%mm1, %mm5		C				am 2 +	movd	4(%edx), %mm4		C				am 2 +	movd	%mm6, -4(%edx)		C				am 2 +	psrlq	$32, %mm6		C				am 2 +L(am10): +	pmuludq	%mm7, %mm3		C				am 2 +	paddq	%mm5, %mm6		C				am 2 +	movd	12(%eax), %mm0		C				am 2 +	paddq	%mm2, %mm4		C				am 2 +	movd	8(%edx), %mm5		C				am 2 +	movd	%mm6, (%edx)		C				am 2 +	psrlq	$32, %mm6		C				am 2 +	pmuludq	%mm7, %mm0		C				am 2 +	paddq	%mm4, %mm6		C				am 2 +	movd	16(%eax), %mm1		C				am 2 +	paddq	%mm3, %mm5		C				am 2 +	movd	12(%edx), %mm4		C				am 2 +	movd	%mm6, 4(%edx)		C				am 2 +	psrlq	$32, %mm6		C				am 2 +	pmuludq	%mm7, %mm1		C				am 2 +	paddq	%mm5, %mm6		C				am 2 +	movd	20(%eax), %mm2		C				am 2 +	paddq	%mm0, %mm4		C				am 2 +	movd	16(%edx), %mm5		C				am 2 +	movd	%mm6, 8(%edx)		C				am 2 +	psrlq	$32, %mm6		C				am 2 
+	lea	16(%eax), %eax		C				am 2 +	lea	16(%edx), %edx		C				am 2 +	sub	$4, %ecx		C				am 2 +	ja	L(lam2)			C				am 2 +	pmuludq	%mm7, %mm2		C				am 2 +	paddq	%mm4, %mm6		C				am 2 +	paddq	%mm1, %mm5		C				am 2 +	movd	4(%edx), %mm4		C				am 2 +	movd	%mm6, -4(%edx)		C				am 2 +	psrlq	$32, %mm6		C				am 2 +	paddq	%mm5, %mm6		C				am 2 +	paddq	%mm2, %mm4		C				am 2 +L(1):	movd	%mm6, (%edx)		C				am 2 +	psrlq	$32, %mm6		C				am 2 +	paddq	%mm4, %mm6		C				am 2 +	movd	%mm6, 4(%edx)		C				am 2 +	psrlq	$32, %mm6		C				am 2 +	movd	%mm6, 8(%edx)		C				am 2 + +	lea	8(%edi), %edi		C rp += 2 +	movd	(%esi), %mm7		C				am 1 +	mov	%edi, %edx		C rp2 = rp			am 1 +	lea	4(%esi), %esi		C up++				am 1 +	lea	(%esi), %eax		C up2 = up			am 1 +	movd	(%eax), %mm2		C				am 1 +	lea	(%ebx), %ecx		C inner loop count		am 1 +	pxor	%mm6, %mm6		C				am 1 +	pmuludq	%mm7, %mm2		C				am 1 +	movd	4(%eax), %mm3		C				am 1 +	movd	(%edx), %mm4		C				am 1 +	pmuludq	%mm7, %mm3		C				am 1 +	movd	8(%eax), %mm0		C				am 1 +	paddq	%mm2, %mm4		C				am 1 +	movd	4(%edx), %mm5		C				am 1 +	jmp	L(am01)			C				am 1 +	ALIGN(16)			C				am 1 +L(lam1): +	pmuludq	%mm7, %mm2		C				am 1 +	paddq	%mm4, %mm6		C				am 1 +	movd	4(%eax), %mm3		C				am 1 +	paddq	%mm1, %mm5		C				am 1 +	movd	(%edx), %mm4		C				am 1 +	movd	%mm6, -8(%edx)		C				am 1 +	psrlq	$32, %mm6		C				am 1 +	pmuludq	%mm7, %mm3		C				am 1 +	paddq	%mm5, %mm6		C				am 1 +	movd	8(%eax), %mm0		C				am 1 +	paddq	%mm2, %mm4		C				am 1 +	movd	4(%edx), %mm5		C				am 1 +	movd	%mm6, -4(%edx)		C				am 1 +	psrlq	$32, %mm6		C				am 1 +L(am01): +	pmuludq	%mm7, %mm0		C				am 1 +	paddq	%mm4, %mm6		C				am 1 +	movd	12(%eax), %mm1		C				am 1 +	paddq	%mm3, %mm5		C				am 1 +	movd	8(%edx), %mm4		C				am 1 +	movd	%mm6, (%edx)		C				am 1 +	psrlq	$32, %mm6		C				am 1 +	pmuludq	%mm7, %mm1		C				am 1 +	paddq	%mm5, %mm6		C				am 1 +	movd	16(%eax), %mm2		C				am 1 +	paddq	%mm0, %mm4		C				am 1 +	movd	12(%edx), %mm5		C				am 1 +	movd	%mm6, 4(%edx)		C				am 1 +	psrlq	$32, %mm6		C				am 1 +	lea	16(%eax), %eax		C				am 1 +	lea	16(%edx), %edx		C				am 1 +	sub	$4, %ecx		C				am 1 +	ja	L(lam1)			C				am 1 +	pmuludq	%mm7, %mm2		C				am 1 +	paddq	%mm4, %mm6		C				am 1 +	paddq	%mm1, %mm5		C				am 1 +	movd	(%edx), %mm4		C				am 1 +	movd	%mm6, -8(%edx)		C				am 1 +	psrlq	$32, %mm6		C				am 1 +	paddq	%mm5, %mm6		C				am 1 +	paddq	%mm2, %mm4		C				am 1 +L(0):	movd	%mm6, -4(%edx)		C				am 1 +	psrlq	$32, %mm6		C				am 1 +	paddq	%mm4, %mm6		C				am 1 +	movd	%mm6, (%edx)		C				am 1 +	psrlq	$32, %mm6		C				am 1 +	movd	%mm6, 4(%edx)		C				am 1 + +	lea	8(%edi), %edi		C rp += 2 +	movd	(%esi), %mm7		C				am 0 +	mov	%edi, %edx		C rp2 = rp			am 0 +	lea	4(%esi), %esi		C up++				am 0 +	lea	(%esi), %eax		C up2 = up			am 0 +	movd	(%eax), %mm3		C				am 0 +	lea	(%ebx), %ecx		C inner loop count		am 0 +	pxor	%mm6, %mm6		C				am 0 +	pmuludq	%mm7, %mm3		C				am 0 +	movd	4(%eax), %mm0		C				am 0 +	movd	(%edx), %mm5		C				am 0 +	pmuludq	%mm7, %mm0		C				am 0 +	movd	8(%eax), %mm1		C				am 0 +	paddq	%mm3, %mm5		C				am 0 +	movd	4(%edx), %mm4		C				am 0 +	jmp	L(am00)			C				am 0 +	ALIGN(16)			C				am 0 +L(lam0): +	pmuludq	%mm7, %mm2		C				am 0 +	paddq	%mm4, %mm6		C				am 0 +	movd	(%eax), %mm3		C				am 0 +	paddq	%mm1, %mm5		C				am 0 +	movd	-4(%edx), %mm4		C				am 0 +	movd	%mm6, -12(%edx)		C				am 0 +	psrlq	$32, %mm6		C				am 0 +	pmuludq	%mm7, %mm3		C				am 0 +	paddq	%mm5, %mm6		C				am 0 +	movd	4(%eax), %mm0		C				am 0 +	paddq	%mm2, %mm4		C				am 0 +	movd	(%edx), %mm5		C				am 0 +	movd	%mm6, -8(%edx)		C				am 0 +	psrlq	$32, %mm6		C				am 0 +	pmuludq	%mm7, 
%mm0		C				am 0 +	paddq	%mm4, %mm6		C				am 0 +	movd	8(%eax), %mm1		C				am 0 +	paddq	%mm3, %mm5		C				am 0 +	movd	4(%edx), %mm4		C				am 0 +	movd	%mm6, -4(%edx)		C				am 0 +	psrlq	$32, %mm6		C				am 0 +L(am00): +	pmuludq	%mm7, %mm1		C				am 0 +	paddq	%mm5, %mm6		C				am 0 +	movd	12(%eax), %mm2		C				am 0 +	paddq	%mm0, %mm4		C				am 0 +	movd	8(%edx), %mm5		C				am 0 +	movd	%mm6, (%edx)		C				am 0 +	psrlq	$32, %mm6		C				am 0 +	lea	16(%eax), %eax		C				am 0 +	lea	16(%edx), %edx		C				am 0 +	sub	$4, %ecx		C				am 0 +	ja	L(lam0)			C				am 0 +	pmuludq	%mm7, %mm2		C				am 0 +	paddq	%mm4, %mm6		C				am 0 +	paddq	%mm1, %mm5		C				am 0 +	movd	-4(%edx), %mm4		C				am 0 +	movd	%mm6, -12(%edx)		C				am 0 +	psrlq	$32, %mm6		C				am 0 +	paddq	%mm5, %mm6		C				am 0 +	paddq	%mm2, %mm4		C				am 0 +L(3):	movd	%mm6, -8(%edx)		C				am 0 +	psrlq	$32, %mm6		C				am 0 +	paddq	%mm4, %mm6		C				am 0 +	movd	%mm6, -4(%edx)		C				am 0 +	psrlq	$32, %mm6		C				am 0 +	movd	%mm6, (%edx)		C				am 0 +	sub	$4, %ebx		C				am 0 +	ja	L(outer)			C				am 0 + +	mov	%edi, %edx +	mov	%esi, %eax +	pop	%edi +	pop	%ebx +	pop	%esi + +L(am3):	C up[un-1..un-3] x up[un-4] +	lea	8(%edx), %edx		C rp2 += 2 +	movd	(%eax), %mm7 +	movd	4(%eax), %mm1 +	movd	8(%eax), %mm2 +	movd	12(%eax), %mm3 +	movd	(%edx), %mm4 +	pmuludq	%mm7, %mm1 +	movd	4(%edx), %mm5 +	pmuludq	%mm7, %mm2 +	movd	8(%edx), %mm6 +	pmuludq	%mm7, %mm3 +	paddq	%mm1, %mm4 +	paddq	%mm2, %mm5 +	paddq	%mm3, %mm6 +	movd	%mm4, (%edx) +	psrlq	$32, %mm4 +	paddq	%mm5, %mm4 +	movd	%mm4, 4(%edx) +	psrlq	$32, %mm4 +	paddq	%mm6, %mm4 +	movd	%mm4, 8(%edx) +	psrlq	$32, %mm4 +	movd	%mm4, 12(%edx)		C FIXME feed through! +	lea	4(%eax), %eax + +L(am2):	C up[un-1..un-2] x up[un-3] +	lea	8(%edx), %edx		C rp2 += 2 +	movd	(%eax), %mm7 +	movd	4(%eax), %mm1 +	movd	8(%eax), %mm2 +	movd	(%edx), %mm4 +	movd	4(%edx), %mm5 +	pmuludq	%mm7, %mm1 +	pmuludq	%mm7, %mm2 +	paddq	%mm1, %mm4 +	paddq	%mm2, %mm5 +	movd	%mm4, (%edx) +	psrlq	$32, %mm4 +	paddq	%mm5, %mm4 +	movd	%mm4, 4(%edx) +	psrlq	$32, %mm4 +	movd	%mm4, 8(%edx)		C FIXME feed through! 
+	lea	4(%eax), %eax + +L(am1):	C up[un-1] x up[un-2] +	lea	8(%edx), %edx		C rp2 += 2 +	movd	(%eax), %mm7 +	movd	4(%eax), %mm2 +	movd	(%edx), %mm4 +	pmuludq	%mm7, %mm2 +	paddq	%mm2, %mm4 +	movd	%mm4, (%edx) +	psrlq	$32, %mm4 +	movd	%mm4, 4(%edx) + +C *** diag stuff, use elementary code for now + +	mov	4(%esp), %edx		C rp +	mov	8(%esp), %eax		C up +	mov	12(%esp), %ecx		C un + +	movd	(%eax), %mm2 +	pmuludq	%mm2, %mm2		C src[0]^2 + +	pcmpeqd	%mm7, %mm7 +	psrlq	$32, %mm7 + +	movd	4(%edx), %mm3		C dst[1] + +	movd	%mm2, (%edx) +	psrlq	$32, %mm2 + +	psllq	$1, %mm3		C 2*dst[1] +	paddq	%mm3, %mm2 +	movd	%mm2, 4(%edx) +	psrlq	$32, %mm2 + +	sub	$2, %ecx + +L(diag): +	movd	4(%eax), %mm0		C src limb +	add	$4, %eax +	pmuludq	%mm0, %mm0 +	movq	%mm7, %mm1 +	pand	%mm0, %mm1		C diagonal low +	psrlq	$32, %mm0		C diagonal high + +	movd	8(%edx), %mm3 +	psllq	$1, %mm3		C 2*dst[i] +	paddq	%mm3, %mm1 +	paddq	%mm1, %mm2 +	movd	%mm2, 8(%edx) +	psrlq	$32, %mm2 + +	movd	12(%edx), %mm3 +	psllq	$1, %mm3		C 2*dst[i+1] +	paddq	%mm3, %mm0 +	paddq	%mm0, %mm2 +	movd	%mm2, 12(%edx) +	add	$8, %edx +	psrlq	$32, %mm2 + +	sub	$1, %ecx +	jnz	L(diag) + +	movd	4(%eax), %mm0		C src[size-1] +	pmuludq	%mm0, %mm0 +	pand	%mm0, %mm7		C diagonal low +	psrlq	$32, %mm0		C diagonal high + +	movd	8(%edx), %mm3		C dst[2*size-2] +	psllq	$1, %mm3 +	paddq	%mm3, %mm7 +	paddq	%mm7, %mm2 +	movd	%mm2, 8(%edx) +	psrlq	$32, %mm2 + +	paddq	%mm0, %mm2 +	movd	%mm2, 12(%edx)		C dst[2*size-1] + +	emms +	ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm new file mode 100644 index 0000000..5ba1c01 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm @@ -0,0 +1,119 @@ +dnl  Intel Pentium-4 mpn_sub_n -- mpn subtraction. + +dnl  Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C					cycles/limb +C			     dst!=src1,2  dst==src1  dst==src2 +C P6 model 0-8,10-12		- +C P6 model 9   (Banias)		? +C P6 model 13  (Dothan)		? +C P4 model 0-1 (Willamette)	? 
+C P4 model 2   (Northwood)	4	     6		6 +C P4 model 3-4 (Prescott)	4.25	     7.5	7.5 + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST,  4) + +dnl  re-use parameter space +define(SAVE_EBX,`PARAM_SRC1') + +	TEXT +	ALIGN(8) + +PROLOGUE(mpn_sub_nc) +deflit(`FRAME',0) +	movd	PARAM_CARRY, %mm0 +	jmp	L(start_nc) +EPILOGUE() + +	ALIGN(8) +PROLOGUE(mpn_sub_n) +deflit(`FRAME',0) +	pxor	%mm0, %mm0 +L(start_nc): +	mov	PARAM_SRC1, %eax +	mov	%ebx, SAVE_EBX +	mov	PARAM_SRC2, %ebx +	mov	PARAM_DST, %edx +	mov	PARAM_SIZE, %ecx + +	lea	(%eax,%ecx,4), %eax	C src1 end +	lea	(%ebx,%ecx,4), %ebx	C src2 end +	lea	(%edx,%ecx,4), %edx	C dst end +	neg	%ecx			C -size + +L(top): +	C eax	src1 end +	C ebx	src2 end +	C ecx	counter, limbs, negative +	C edx	dst end +	C mm0	carry bit + +	movd	(%eax,%ecx,4), %mm1 +	movd	(%ebx,%ecx,4), %mm2 +	psubq	%mm2, %mm1 + +	psubq	%mm0, %mm1 +	movd	%mm1, (%edx,%ecx,4) + +	psrlq	$63, %mm1 + +	add	$1, %ecx +	jz	L(done_mm1) + +	movd	(%eax,%ecx,4), %mm0 +	movd	(%ebx,%ecx,4), %mm2 +	psubq	%mm2, %mm0 + +	psubq	%mm1, %mm0 +	movd	%mm0, (%edx,%ecx,4) + +	psrlq	$63, %mm0 + +	add	$1, %ecx +	jnz	L(top) + +	movd	%mm0, %eax +	mov	SAVE_EBX, %ebx +	emms +	ret + +L(done_mm1): +	movd	%mm1, %eax +	mov	SAVE_EBX, %ebx +	emms +	ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm new file mode 100644 index 0000000..020675b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm @@ -0,0 +1,182 @@ +dnl  Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and +dnl  subtract the result from a second limb vector. + +dnl  Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C			    cycles/limb +C P6 model 0-8,10-12		- +C P6 model 9   (Banias)		6.8 +C P6 model 13  (Dothan)		6.9 +C P4 model 0-1 (Willamette)	? +C P4 model 2   (Northwood)	5.87 +C P4 model 3-4 (Prescott)	6.5 + +C This code represents a step forwards compared to the code available before +C GMP 5.1, but it is not carefully tuned for either P6 or P4.  In fact, it is +C not good for P6.  For P4 it saved a bit over 1 c/l for both Northwood and +C Prescott compared to the old code. +C +C The arrangements made here to get a two instruction dependent chain are +C slightly subtle.  
In the loop the carry (or borrow rather) is a negative so +C that a paddq can be used to give a low limb ready to store, and a high limb +C ready to become the new carry after a psrlq. +C +C If the carry was a simple twos complement negative then the psrlq shift would +C need to bring in 0 bits or 1 bits according to whether the high was zero or +C non-zero, since a non-zero value would represent a negative needing sign +C extension.  That wouldn't be particularly easy to arrange and certainly would +C add an instruction to the dependent chain, so instead an offset is applied so +C that the high limb will be 0xFFFFFFFF+c.  With c in the range -0xFFFFFFFF to +C 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and is therefore +C always positive and can always have 0 bits shifted in, which is what psrlq +C does. +C +C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be +C done off the dependent chain.  The total adjustment then is to add +C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF +C to remove the offset from the current carry, for a net add of +C 0xFFFFFFFE00000001.  In the code this is applied to the destination limb when +C fetched. +C +C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement +C negative, which is how it's undone for the return value, but that doesn't +C seem as clear. + +defframe(PARAM_CARRY,     20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE,      12) +defframe(PARAM_SRC,       8) +defframe(PARAM_DST,       4) + +	TEXT +	ALIGN(16) + +PROLOGUE(mpn_submul_1c) +deflit(`FRAME',0) +	movd	PARAM_CARRY, %mm1 +	jmp	L(start_1c) +EPILOGUE() + +PROLOGUE(mpn_submul_1) +deflit(`FRAME',0) +	pxor	%mm1, %mm1		C initial borrow + +L(start_1c): +	mov	PARAM_SRC, %eax +	pcmpeqd	%mm0, %mm0 + +	movd	PARAM_MULTIPLIER, %mm7 +	pcmpeqd	%mm6, %mm6 + +	mov	PARAM_DST, %edx +	psrlq	$32, %mm0		C 0x00000000FFFFFFFF + +	mov	PARAM_SIZE, %ecx +	psllq	$32, %mm6		C 0xFFFFFFFF00000000 + +	psubq	%mm0, %mm6		C 0xFFFFFFFE00000001 + +	psubq	%mm1, %mm0		C 0xFFFFFFFF - borrow + + +	movd	(%eax), %mm3		C up +	movd	(%edx), %mm4		C rp + +	add	$-1, %ecx +	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001 +	pmuludq	%mm7, %mm3 +	jnz	L(gt1) +	psubq	%mm3, %mm4		C prod +	paddq	%mm4, %mm0		C borrow +	movd	%mm0, (%edx)		C result +	jmp	L(rt) + +L(gt1):	movd	4(%eax), %mm1		C up +	movd	4(%edx), %mm2		C rp + +	add	$-1, %ecx +	jz	L(eev) + +	ALIGN(16) +L(top):	paddq	%mm6, %mm2		C add 0xFFFFFFFE00000001 +	pmuludq	%mm7, %mm1 +	psubq	%mm3, %mm4		C prod +	movd	8(%eax), %mm3		C up +	paddq	%mm4, %mm0		C borrow +	movd	8(%edx), %mm4		C rp +	movd	%mm0, (%edx)		C result +	psrlq	$32, %mm0 + +	add	$-1, %ecx +	jz	L(eod) + +	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001 +	pmuludq	%mm7, %mm3 +	psubq	%mm1, %mm2		C prod +	movd	12(%eax), %mm1		C up +	paddq	%mm2, %mm0		C borrow +	movd	12(%edx), %mm2		C rp +	movd	%mm0, 4(%edx)		C result +	psrlq	$32, %mm0 + +	lea	8(%eax), %eax +	lea	8(%edx), %edx +	add	$-1, %ecx +	jnz	L(top) + + +L(eev):	paddq	%mm6, %mm2		C add 0xFFFFFFFE00000001 +	pmuludq	%mm7, %mm1 +	psubq	%mm3, %mm4		C prod +	paddq	%mm4, %mm0		C borrow +	movd	%mm0, (%edx)		C result +	psrlq	$32, %mm0 +	psubq	%mm1, %mm2		C prod +	paddq	%mm2, %mm0		C borrow +	movd	%mm0, 4(%edx)		C result +L(rt):	psrlq	$32, %mm0 +	movd	%mm0, %eax +	not	%eax +	emms +	ret + +L(eod):	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001 +	pmuludq	%mm7, %mm3 +	psubq	%mm1, %mm2		C prod +	paddq	%mm2, %mm0		C borrow +	movd	%mm0, 4(%edx)		C result +	psrlq	$32, %mm0 +	psubq	%mm3, %mm4		C prod +	paddq	%mm4, 
%mm0		C borrow +	movd	%mm0, 8(%edx)		C result +	jmp	L(rt) +EPILOGUE()  |
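The borrow-offset scheme described in the comments above is easier to follow in scalar form. The sketch below uses 32-bit limbs and a 64-bit accumulator: adding 0xFFFFFFFE00000001 to each destination limb keeps the high word of the accumulator equal to 0xFFFFFFFF plus the (negative) borrow, so it is always non-negative and can simply be shifted down, and a final bitwise not recovers the conventional borrow, just as the code does before returning. Names and the standalone prototype are illustrative, not GMP's:

#include <stdint.h>
#include <stddef.h>

/* {rp,n} -= {up,n} * v; returns the borrow out of the top limb. */
static uint32_t ref_submul_1(uint32_t *rp, const uint32_t *up,
                             size_t n, uint32_t v)
{
    uint64_t acc = 0xFFFFFFFFu;           /* offset borrow: 0xFFFFFFFF - 0      */
    for (size_t i = 0; i < n; i++) {
        uint64_t adj  = rp[i] + 0xFFFFFFFE00000001ull;  /* rp limb + offset     */
        uint64_t prod = (uint64_t)up[i] * v;
        acc += adj - prod;                /* low half: result limb              */
        rp[i] = (uint32_t)acc;            /* high half: 0xFFFFFFFF - new borrow */
        acc >>= 32;                       /* non-negative, so shift in zeros    */
    }
    return ~(uint32_t)acc;                /* undo the offset => borrow          */
}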