Diffstat (limited to 'vendor/gmp-6.3.0/mpn/x86_64/k8')
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm  | 153
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm         | 195
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm        | 217
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm         | 179
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm    | 249
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h         | 237
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm     | 469
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm   | 436
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm  | 559
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm           | 591
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm     | 807
11 files changed, 4092 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm new file mode 100644 index 0000000..3e1898b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm @@ -0,0 +1,153 @@ +dnl  AMD64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63. + +dnl  Copyright 2008, 2021 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C	     cycles/limb +C AMD K8,K9	 2.167 +C AMD K10	 2.167 +C Intel P4	12.0 +C Intel core2	 4.0 +C Intel corei	 ? +C Intel atom	 ? +C VIA nano	 ? + +C TODO +C  * Perhaps handle various n mod 3 sizes better.  The code now is too large. + +C INPUT PARAMETERS +define(`rp',	`%rdi') +define(`ap',	`%rsi') +define(`bp_param', `%rdx') +define(`n',	`%rcx') +define(`u0',	`%r8') +define(`v0',	`%r9') + + +define(`bp', `%rbp') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() +	TEXT +	ALIGN(16) +PROLOGUE(mpn_addaddmul_1msb0) +        FUNC_ENTRY(4) +IFDOS(`	mov	56(%rsp), %r8	') +IFDOS(`	mov	64(%rsp), %r9	') +	push	%rbp + +	lea	(ap,n,8), ap +	lea	(bp_param,n,8), bp +	lea	(rp,n,8), rp +	neg	n + +	mov	(ap,n,8), %rax +	mul	%r8 +	mov	%rax, %r11 +	mov	(bp,n,8), %rax +	mov	%rdx, %r10 +	add	$3, n +	jns	L(end) + +	push	%r13 + +	ALIGN(16) +L(top):	mul	%r9 +	add	%rax, %r11 +	mov	-16(ap,n,8), %rax +	adc	%rdx, %r10 +	mov	%r11, -24(rp,n,8) +	mul	%r8 +	add	%rax, %r10 +	mov	-16(bp,n,8), %rax +	mov	$0, R32(%r13) +	adc	%rdx, %r13 +	mul	%r9 +	add	%rax, %r10 +	mov	-8(ap,n,8), %rax +	adc	%rdx, %r13 +	mov	%r10, -16(rp,n,8) +	mul	%r8 +	add	%rax, %r13 +	mov	-8(bp,n,8), %rax +	mov	$0, R32(%r11) +	adc	%rdx, %r11 +	mul	%r9 +	add	%rax, %r13 +	adc	%rdx, %r11 +	mov	(ap,n,8), %rax +	mul	%r8 +	add	%rax, %r11 +	mov	%r13, -8(rp,n,8) +	mov	(bp,n,8), %rax +	mov	$0, R32(%r10) +	adc	%rdx, %r10 +	add	$3, n +	js	L(top) + +	pop	%r13 + +L(end):	mul	%r9 +	add	%rax, %r11 +	adc	%rdx, %r10 +	cmp	$1, R32(n) +	ja	L(two) +	mov	-16(ap,n,8), %rax +	mov	%r11, -24(rp,n,8) +	mov	%r10, %r11 +	jz	L(one) + +L(nul):	mul	%r8 +	add	%rax, %r10 +	mov	-16(bp), %rax +	mov	$0, R32(%r11) +	adc	%rdx, %r11 +	mul	%r9 +	add	%rax, %r10 +	mov	-8(ap), %rax +	adc	%rdx, %r11 +	mov	%r10, -16(rp) +L(one):	mul	%r8 +	add	%rax, %r11 +	mov	-8(bp), %rax +	mov	$0, R32(%r10) +	adc	%rdx, %r10 +	mul	%r9 +	add	%rax, %r11 +	adc	%rdx, %r10 + +L(two):	mov	%r11, -8(rp) +	mov	%r10, %rax +L(ret):	pop	%rbp +	FUNC_EXIT() +	ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm new 
file mode 100644 index 0000000..78bcba1 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm @@ -0,0 +1,195 @@ +dnl  AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and +dnl  add the result to a third limb vector. + +dnl  Copyright 2008, 2011, 2012, 2016 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C	     cycles/limb     cycles/limb cfg	cycles/limb am1+am1 +C AMD K8,K9	 2.375 +C AMD K10	 2.375 +C AMD bull	 5.2		<-		4.6-4.75		bad +C AMD pile	 4.96		<-		4.6-4.75		bad +C AMD steam	 ? +C AMD excavator	 ? +C AMD bobcat	 5.75				5.0			bad +C AMD jaguar	 5.9				5.2-5.4			bad +C Intel P4	15-16 +C Intel core2	 4.5				4.25-4.5		bad +C Intel NHM	 4.33				4.55			bad +C Intel SBR	 3.4		 2.93		3.24			bad +C Intel IBR	 3.35		 2.6		2.95			bad +C Intel HWL	 3.3		 2.15		2.3			bad +C Intel BWL	 2.33		 2.33		1.65			bad +C Intel SKL	 2.37		 2.21		1.64			bad +C Intel atom	20		18.7 +C Intel SLM	 8		 8.5 +C VIA nano	 4.4 + +C This code is the result of running a code generation and optimization tool +C suite written by David Harvey and Torbjorn Granlund. + +C TODO +C  * Tune feed-in and wind-down code. 
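As the header comment says, mpn_addmul_2 multiplies an n-limb vector {up,n} by the two limbs {v0,v1} and adds the product into the result vector; the unrolled loop below keeps the running two-limb carry in w0..w3. A portable C sketch of that operation, for reference only (the limb/dlimb typedefs are local to the sketch, and the exact store/return convention for the two leftover high limbs is an assumption, not the GMP contract):

#include <stdint.h>

typedef uint64_t limb;          /* one 64-bit limb */
typedef __uint128_t dlimb;      /* double limb; GCC/Clang extension */

/* Add {up,n} * {v0,v1} into {rp,n}; store the low leftover carry limb at
   rp[n] and return the high one.  rp needs n+1 writable limbs. */
static limb addmul_2_sketch(limb *rp, const limb *up, long n, limb v0, limb v1)
{
    limb c0 = 0, c1 = 0;                /* two-limb running carry */
    for (long i = 0; i < n; i++) {
        dlimb lo = (dlimb) up[i] * v0 + rp[i] + c0;
        dlimb hi = (dlimb) up[i] * v1 + c1 + (limb) (lo >> 64);
        rp[i] = (limb) lo;
        c0 = (limb) hi;
        c1 = (limb) (hi >> 64);
    }
    rp[n] = c0;
    return c1;
}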
+ +C INPUT PARAMETERS +define(`rp',     `%rdi') +define(`up',     `%rsi') +define(`n_param',`%rdx') +define(`vp',     `%rcx') + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n',  `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() +	TEXT +	ALIGN(16) +PROLOGUE(mpn_addmul_2) +	FUNC_ENTRY(4) +	mov	n_param, n +	push	%rbx +	push	%rbp + +	mov	0(vp), v0 +	mov	8(vp), v1 + +	mov	R32(n_param), R32(%rbx) +	mov	(up), %rax +	lea	-8(up,n_param,8), up +	lea	-8(rp,n_param,8), rp +	mul	v0 +	neg	n +	and	$3, R32(%rbx) +	jz	L(b0) +	cmp	$2, R32(%rbx) +	jc	L(b1) +	jz	L(b2) + +L(b3):	mov	%rax, w1 +	mov	%rdx, w2 +	xor	R32(w3), R32(w3) +	mov	8(up,n,8), %rax +	dec	n +	jmp	L(lo3) + +L(b2):	mov	%rax, w2 +	mov	8(up,n,8), %rax +	mov	%rdx, w3 +	xor	R32(w0), R32(w0) +	add	$-2, n +	jmp	L(lo2) + +L(b1):	mov	%rax, w3 +	mov	8(up,n,8), %rax +	mov	%rdx, w0 +	xor	R32(w1), R32(w1) +	inc	n +	jmp	L(lo1) + +L(b0):	mov	$0, R32(w3) +	mov	%rax, w0 +	mov	8(up,n,8), %rax +	mov	%rdx, w1 +	xor	R32(w2), R32(w2) +	jmp	L(lo0) + +	ALIGN(32) +L(top):	mov	$0, R32(w1) +	mul	v0 +	add	%rax, w3 +	mov	(up,n,8), %rax +	adc	%rdx, w0 +	adc	$0, R32(w1) +L(lo1):	mul	v1 +	add	w3, (rp,n,8) +	mov	$0, R32(w3) +	adc	%rax, w0 +	mov	$0, R32(w2) +	mov	8(up,n,8), %rax +	adc	%rdx, w1 +	mul	v0 +	add	%rax, w0 +	mov	8(up,n,8), %rax +	adc	%rdx, w1 +	adc	$0, R32(w2) +L(lo0):	mul	v1 +	add	w0, 8(rp,n,8) +	adc	%rax, w1 +	adc	%rdx, w2 +	mov	16(up,n,8), %rax +	mul	v0 +	add	%rax, w1 +	adc	%rdx, w2 +	adc	$0, R32(w3) +	mov	16(up,n,8), %rax +L(lo3):	mul	v1 +	add	w1, 16(rp,n,8) +	adc	%rax, w2 +	adc	%rdx, w3 +	xor	R32(w0), R32(w0) +	mov	24(up,n,8), %rax +	mul	v0 +	add	%rax, w2 +	mov	24(up,n,8), %rax +	adc	%rdx, w3 +	adc	$0, R32(w0) +L(lo2):	mul	v1 +	add	w2, 24(rp,n,8) +	adc	%rax, w3 +	adc	%rdx, w0 +	mov	32(up,n,8), %rax +	add	$4, n +	js	L(top) + +L(end):	xor	R32(w1), R32(w1) +	mul	v0 +	add	%rax, w3 +	mov	(up), %rax +	adc	%rdx, w0 +	adc	R32(w1), R32(w1) +	mul	v1 +	add	w3, (rp) +	adc	%rax, w0 +	adc	%rdx, w1 +	mov	w0, 8(rp) +	mov	w1, %rax + +	pop	%rbp +	pop	%rbx +	FUNC_EXIT() +	ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm new file mode 100644 index 0000000..ff3a184 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm @@ -0,0 +1,217 @@ +dnl  AMD64 mpn_addlsh_n and mpn_rsblsh_n.  R = V2^k +- U. + +dnl  Copyright 2006, 2010-2012 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  
If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C	     cycles/limb +C AMD K8,K9	 2.87	< 3.85 for lshift + add_n +C AMD K10	 2.75	< 3.85 for lshift + add_n +C Intel P4	22	> 7.33 for lshift + add_n +C Intel core2	 4.1	> 3.27 for lshift + add_n +C Intel NHM	 4.4	> 3.75 for lshift + add_n +C Intel SBR	 3.17	< 3.46 for lshift + add_n +C Intel atom	 ?	? 8.75 for lshift + add_n +C VIA nano	 4.7	< 6.25 for lshift + add_n + +C TODO +C  * Can we propagate carry into rdx instead of using a special carry register? +C    That could save enough insns to get to 10 cycles/iteration. + +define(`rp',       `%rdi') +define(`up',       `%rsi') +define(`vp_param', `%rdx') +define(`n_param',  `%rcx') +define(`cnt',      `%r8') + +define(`vp',    `%r12') +define(`n',     `%rbp') + +ifdef(`OPERATION_addlsh_n',` +  define(ADDSUB,       `add') +  define(ADCSBB,       `adc') +  define(func, mpn_addlsh_n) +') +ifdef(`OPERATION_rsblsh_n',` +  define(ADDSUB,       `sub') +  define(ADCSBB,       `sbb') +  define(func, mpn_rsblsh_n) +') + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() +	TEXT +	ALIGN(16) +PROLOGUE(func) +	FUNC_ENTRY(4) +IFDOS(`	mov	56(%rsp), %r8d	') +	push	%r12 +	push	%rbp +	push	%rbx + +	mov	(vp_param), %rax	C load first V limb early + +	mov	$0, R32(n) +	sub	n_param, n + +	lea	-16(up,n_param,8), up +	lea	-16(rp,n_param,8), rp +	lea	16(vp_param,n_param,8), vp + +	mov	n_param, %r9 + +	mov	%r8, %rcx +	mov	$1, R32(%r8) +	shl	R8(%rcx), %r8 + +	mul	%r8			C initial multiply + +	and	$3, R32(%r9) +	jz	L(b0) +	cmp	$2, R32(%r9) +	jc	L(b1) +	jz	L(b2) + +L(b3):	mov	%rax, %r11 +	ADDSUB	16(up,n,8), %r11 +	mov	-8(vp,n,8), %rax +	sbb	R32(%rcx), R32(%rcx) +	mov	%rdx, %rbx +	mul	%r8 +	or	%rax, %rbx +	mov	(vp,n,8), %rax +	mov	%rdx, %r9 +	mul	%r8 +	or	%rax, %r9 +	add	$3, n +	jnz	L(lo3) +	jmp	L(cj3) + +L(b2):	mov	%rax, %rbx +	mov	-8(vp,n,8), %rax +	mov	%rdx, %r9 +	mul	%r8 +	or	%rax, %r9 +	add	$2, n +	jz	L(cj2) +	mov	%rdx, %r10 +	mov	-16(vp,n,8), %rax +	mul	%r8 +	or	%rax, %r10 +	xor	R32(%rcx), R32(%rcx)	C clear carry register +	jmp	L(lo2) + +L(b1):	mov	%rax, %r9 +	mov	%rdx, %r10 +	add	$1, n +	jnz	L(gt1) +	ADDSUB	8(up,n,8), %r9 +	jmp	L(cj1) +L(gt1):	mov	-16(vp,n,8), %rax +	mul	%r8 +	or	%rax, %r10 +	mov	%rdx, %r11 +	mov	-8(vp,n,8), %rax +	mul	%r8 +	or	%rax, %r11 +	ADDSUB	8(up,n,8), %r9 +	ADCSBB	16(up,n,8), %r10 +	ADCSBB	24(up,n,8), %r11 +	mov	(vp,n,8), %rax +	sbb	R32(%rcx), R32(%rcx) +	jmp	L(lo1) + +L(b0):	mov	%rax, %r10 +	mov	%rdx, %r11 +	mov	-8(vp,n,8), %rax +	mul	%r8 +	or	%rax, %r11 +	ADDSUB	16(up,n,8), %r10 +	ADCSBB	24(up,n,8), %r11 +	mov	(vp,n,8), %rax +	sbb	R32(%rcx), R32(%rcx) +	mov	%rdx, %rbx +	mul	%r8 +	or	%rax, %rbx +	mov	8(vp,n,8), %rax +	add	$4, n +	jz	L(end) + +	ALIGN(8) +L(top):	mov	%rdx, %r9 +	mul	%r8 +	or	%rax, %r9 +	mov	%r10, -16(rp,n,8) +L(lo3):	mov	%rdx, %r10 +	mov	-16(vp,n,8), %rax +	mul	%r8 +	or	%rax, %r10 +	mov	%r11, -8(rp,n,8) +L(lo2):	mov	%rdx, %r11 +	mov	-8(vp,n,8), %rax +	mul	%r8 +	or	%rax, %r11 +	add	R32(%rcx), R32(%rcx) +	ADCSBB	(up,n,8), %rbx +	ADCSBB	8(up,n,8), %r9 +	ADCSBB	16(up,n,8), %r10 +	ADCSBB	24(up,n,8), %r11 +	mov	(vp,n,8), %rax +	sbb	R32(%rcx), R32(%rcx) +	mov	%rbx, (rp,n,8) +L(lo1):	mov	%rdx, %rbx +	mul	%r8 +	or	%rax, %rbx +	mov	%r9, 8(rp,n,8) +L(lo0):	mov	8(vp,n,8), %rax +	add	$4, n +	jnz	L(top) + +L(end):	mov	%rdx, %r9 +	mul	%r8 +	or	%rax, %r9 +	mov	%r10, -16(rp,n,8) +L(cj3):	mov	%r11, -8(rp,n,8) +L(cj2):	add	R32(%rcx), R32(%rcx) +	ADCSBB	(up,n,8), %rbx +	ADCSBB	8(up,n,8), %r9 +	mov	%rbx, (rp,n,8) 
+L(cj1):	mov	%r9, 8(rp,n,8) +	mov	%rdx, %rax +	ADCSBB	$0, %rax +	pop	%rbx +	pop	%rbp +	pop	%r12 +	FUNC_EXIT() +	ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm new file mode 100644 index 0000000..1172b0d --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm @@ -0,0 +1,179 @@ +dnl  AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor, +dnl  returning quotient only. + +dnl  Copyright 2001, 2002, 2004-2006, 2009, 2011, 2012, 2017 Free Software +dnl  Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C	    cycles/limb +C	     norm/unorm +C AMD K8,K9	10	+ +C AMD K10	10	+ +C AMD bull	13.7	- +C AMD pile	13.7	+ +C AMD steam +C AMD excavator +C AMD bobcat	15	- +C AMD jaguar	16	- +C Intel P4	33	= +C Intel core2	13.25	= +C Intel NHM	14	= +C Intel SBR	8.5	- +C Intel IBR	8.5	- +C Intel HWL	8	= +C Intel BWL	8	= +C Intel SKL	8	= +C Intel atom	42	-- +C Intel SLM	20.4	-- +C VIA nano + +C INPUT PARAMETERS +define(`rp',		`%rdi') +define(`up',		`%rsi') +define(`n',		`%rdx') +define(`d',		`%rcx') +define(`di',		`%r8')		C	just mpn_pi1_bdiv_q_1 +define(`ncnt',		`%r9')		C	just mpn_pi1_bdiv_q_1 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() +	TEXT +	ALIGN(16) +PROLOGUE(mpn_bdiv_q_1) +	FUNC_ENTRY(4) +	push	%rbx + +	mov	%rcx, %rax +	xor	R32(%rcx), R32(%rcx)	C ncnt count +	mov	%rdx, %r10 + +	bt	$0, R32(%rax) +	jnc	L(evn)			C skip bsf unless divisor is even + +L(odd):	mov	%rax, %rbx +	shr	R32(%rax) +	and	$127, R32(%rax)		C d/2, 7 bits + +	LEA(	binvert_limb_table, %rdx) + +	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits + +	mov	%rbx, %r11		C d without twos + +	lea	(%rax,%rax), R32(%rdx)	C 2*inv +	imul	R32(%rax), R32(%rax)	C inv*inv +	imul	R32(%rbx), R32(%rax)	C inv*inv*d +	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits + +	lea	(%rdx,%rdx), R32(%rax)	C 2*inv +	imul	R32(%rdx), R32(%rdx)	C inv*inv +	imul	R32(%rbx), R32(%rdx)	C inv*inv*d +	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits + +	lea	(%rax,%rax), %r8	C 2*inv +	imul	%rax, %rax		C inv*inv +	imul	%rbx, %rax		C inv*inv*d +	sub	%rax, %r8		C inv = 2*inv - inv*inv*d, 64 bits + +	jmp	L(pi1) + +L(evn):	bsf	%rax, %rcx +	shr	R8(%rcx), %rax +	jmp	L(odd) +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) +	FUNC_ENTRY(4) +IFDOS(`	mov	56(%rsp), %r8	') +IFDOS(`	mov	64(%rsp), %r9	') +	push	%rbx + +	mov	%rcx, %r11		C d +	mov	%rdx, %r10		C n +	mov	%r9, %rcx		C ncnt + +L(pi1):	mov	(up), %rax		C up[0] + +	dec	
%r10 +	jz	L(one) + +	mov	8(up), %rdx		C up[1] +	lea	(up,%r10,8), up		C up end +	lea	(rp,%r10,8), rp		C rp end +	neg	%r10			C -n + +	shrd	R8(%rcx), %rdx, %rax + +	xor	R32(%rbx), R32(%rbx) +	jmp	L(ent) + +	ALIGN(8) +L(top): +	C rax	q +	C rbx	carry bit, 0 or 1 +	C rcx	ncnt +	C rdx +	C r10	counter, limbs, negative +	C r11	d + +	mul	%r11			C carry limb in rdx +	mov	(up,%r10,8), %rax +	mov	8(up,%r10,8), %r9 +	shrd	R8(%rcx), %r9, %rax +	nop +	sub	%rbx, %rax		C apply carry bit +	setc	R8(%rbx) +	sub	%rdx, %rax		C apply carry limb +	adc	$0, R32(%rbx) +L(ent):	imul	%r8, %rax +	mov	%rax, (rp,%r10,8) +	inc	%r10 +	jnz	L(top) + +	mul	%r11			C carry limb in rdx +	mov	(up), %rax		C up high limb +	shr	R8(%rcx), %rax +	sub	%rbx, %rax		C apply carry bit +	sub	%rdx, %rax		C apply carry limb +	imul	%r8, %rax +	mov	%rax, (rp) +	pop	%rbx +	FUNC_EXIT() +	ret + +L(one):	shr	R8(%rcx), %rax +	imul	%r8, %rax +	mov	%rax, (rp) +	pop	%rbx +	FUNC_EXIT() +	ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm new file mode 100644 index 0000000..86de08c --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm @@ -0,0 +1,249 @@ +dnl  x86-64 mpn_div_qr_1n_pi1 +dnl  -- Divide an mpn number by a normalized single-limb number, +dnl     using a single-limb inverse. + +dnl  Contributed to the GNU project by Niels Möller + +dnl  Copyright 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C		c/l +C AMD K8,K9	11 +C AMD K10	11 +C AMD bull	16 +C AMD pile	14.25 +C AMD steam	 ? +C AMD bobcat	16 +C AMD jaguar	 ? +C Intel P4	47.5	poor +C Intel core	28.5	very poor +C Intel NHM	29	very poor +C Intel SBR	16	poor +C Intel IBR	13.5 +C Intel HWL	12 +C Intel BWL	 ? +C Intel atom	53	very poor +C VIA nano	19 + + +C INPUT Parameters +define(`QP', `%rdi') +define(`UP', `%rsi') +define(`UN_INPUT', `%rdx') +define(`U1', `%rcx')	C Also in %rax +define(`D', `%r8') +define(`DINV', `%r9') + +C Invariants +define(`B2', `%rbp') +define(`B2md', `%rbx') + +C Variables +define(`UN', `%r8')	C Overlaps D input +define(`T', `%r10') +define(`U0', `%r11') +define(`U2', `%r12') +define(`Q0', `%r13') +define(`Q1', `%r14') +define(`Q2', `%r15') + +ABI_SUPPORT(STD64) + +	ASM_START() +	TEXT +	ALIGN(16) +PROLOGUE(mpn_div_qr_1n_pi1) +	FUNC_ENTRY(4) +IFDOS(`	mov	56(%rsp), %r8	') +IFDOS(`	mov	64(%rsp), %r9	') +	dec	UN_INPUT +	jnz	L(first) + +	C Just a single 2/1 division. 
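The instruction sequence that follows is the standard 2/1 division-by-reciprocal step (Möller-Granlund): for a normalized divisor D and reciprocal DINV, the candidate quotient is the high limb of DINV*U1 + {U1+1, U0}, and at most two adjustments correct it. A C sketch of that step, assuming u1 < d and dinv = floor((2^128 - 1)/d) - 2^64 (it follows the published algorithm, not this file's branchless register usage):

#include <stdint.h>

typedef uint64_t limb;
typedef __uint128_t dlimb;      /* GCC/Clang 128-bit extension */

/* Divide u1*2^64 + u0 by the normalized divisor d using the precomputed
   reciprocal dinv.  Returns the quotient and stores the remainder. */
static limb div_2by1_preinv(limb *rem, limb u1, limb u0, limb d, limb dinv)
{
    dlimb p = (dlimb) dinv * u1;
    limb q0 = (limb) p + u0;
    limb q1 = (limb) (p >> 64) + u1 + 1 + (q0 < (limb) p);  /* carry in */
    limb r = u0 - q1 * d;               /* computed mod 2^64 */
    if (r > q0) {                       /* estimate was one too high */
        q1--;
        r += d;
    }
    if (r >= d) {                       /* rare second adjustment */
        q1++;
        r -= d;
    }
    *rem = r;
    return q1;
}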
+	C T, U0 are allocated in scratch registers +	lea	1(U1), T +	mov	U1, %rax +	mul	DINV +	mov	(UP), U0 +	add	U0, %rax +	adc	T, %rdx +	mov	%rdx, T +	imul	D, %rdx +	sub	%rdx, U0 +	cmp	U0, %rax +	lea	(U0, D), %rax +	cmovnc	U0, %rax +	sbb	$0, T +	cmp	D, %rax +	jc	L(single_div_done) +	sub	D, %rax +	add	$1, T +L(single_div_done): +	mov	T, (QP) +	FUNC_EXIT() +	ret +L(first): +	C FIXME: Could delay some of these until we enter the loop. +	push	%r15 +	push	%r14 +	push	%r13 +	push	%r12 +	push	%rbx +	push	%rbp + +	mov	D, B2 +	imul	DINV, B2 +	neg	B2 +	mov	B2, B2md +	sub	D, B2md + +	C D not needed until final reduction +	push	D +	mov	UN_INPUT, UN	C Clobbers D + +	mov	DINV, %rax +	mul	U1 +	mov	%rax, Q0 +	add	U1, %rdx +	mov	%rdx, T + +	mov	B2, %rax +	mul	U1 +	mov	-8(UP, UN, 8), U0 +	mov	(UP, UN, 8), U1 +	mov	T, (QP, UN, 8) +	add	%rax, U0 +	adc	%rdx, U1 +	sbb	U2, U2 +	dec	UN +	mov	U1, %rax +	jz	L(final) +	mov	$0, R32(Q1) + +	ALIGN(16) + +	C Loop is 28 instructions, 30 K8/K10 decoder slots, should run +	C in 10 cycles. At entry, %rax holds an extra copy of U1, Q1 +	C is zero, and carry holds an extra copy of U2. +L(loop): +	C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2 +	C Remains to add in B (U1 + c) +	cmovc	DINV, Q1 +	mov	U2, Q2 +	neg	Q2 +	mul	DINV +	add	%rdx, Q1 +	adc	$0, Q2 +	add	Q0, Q1 +	mov	%rax, Q0 +	mov	B2, %rax +	lea	(B2md, U0), T +	adc	$0, Q2 + +	C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u +	mul	U1 +	and	B2, U2 +	add	U2, U0 +	cmovnc	U0, T + +	C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c +	adc	U1, Q1 +	mov	-8(UP, UN, 8), U0 +	adc	Q2, 8(QP, UN, 8) +	jc	L(q_incr) +L(q_incr_done): +	add	%rax, U0 +	mov	T, %rax +	adc	%rdx, %rax +	mov	Q1, (QP, UN, 8) +	mov	$0, R32(Q1) +	sbb	U2, U2 +	dec	UN +	mov	%rax, U1 +	jnz	L(loop) + +L(final): +	pop	D + +	mov	U2, Q1 +	and	D, U2 +	sub	U2, %rax +	neg	Q1 + +	mov	%rax, U1 +	sub	D, %rax +	cmovc	U1, %rax +	sbb	$-1, Q1 + +	lea	1(%rax), T +	mul	DINV +	add	U0, %rax +	adc	T, %rdx +	mov	%rdx, T +	imul	D, %rdx +	sub	%rdx, U0 +	cmp	U0, %rax +	lea	(U0, D), %rax +	cmovnc	U0, %rax +	sbb	$0, T +	cmp	D, %rax +	jc	L(div_done) +	sub	D, %rax +	add	$1, T +L(div_done): +	add	T, Q0 +	mov	Q0, (QP) +	adc	Q1, 8(QP) +	jnc	L(done) +L(final_q_incr): +	addq	$1, 16(QP) +	lea	8(QP), QP +	jc	L(final_q_incr) + +L(done): +	pop	%rbp +	pop	%rbx +	pop	%r12 +	pop	%r13 +	pop	%r14 +	pop	%r15 +	FUNC_EXIT() +	ret + +L(q_incr): +	C U1 is not live, so use it for indexing +	lea	16(QP, UN, 8), U1 +L(q_incr_loop): +	addq	$1, (U1) +	jnc	L(q_incr_done) +	lea	8(U1), U1 +	jmp	L(q_incr_loop) +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h new file mode 100644 index 0000000..d87cc3b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h @@ -0,0 +1,237 @@ +/* AMD K8 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + +  * the GNU Lesser General Public License as published by the Free +    Software Foundation; either version 3 of the License, or (at your +    option) any later version. + +or + +  * the GNU General Public License as published by the Free Software +    Foundation; either version 2 of the License, or (at your option) any +    later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library.  If not, +see https://www.gnu.org/licenses/.  */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +#if 0 +#undef mpn_sublsh_n +#define mpn_sublsh_n(rp,up,vp,n,c)					\ +  (((rp) == (up)) ? mpn_submul_1 (rp, vp, n, CNST_LIMB(1) << (c))	\ +   : MPN(mpn_sublsh_n)(rp,up,vp,n,c)) +#endif + +/* 2500 MHz K8 Brisbane */ +/* FFT tuning limit = 115,768,433 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD                 0  /* always */ +#define MOD_1_UNNORM_THRESHOLD               0  /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD          5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD          2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD        14 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD        35 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9 +#define USE_PREINV_DIVREM_1                  1  /* native */ +#define DIV_QR_1_NORM_THRESHOLD              1 +#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */ +#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */ +#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD           16 + +#define DIV_1_VS_MUL_1_PERCENT             309 + +#define MUL_TOOM22_THRESHOLD                28 +#define MUL_TOOM33_THRESHOLD                81 +#define MUL_TOOM44_THRESHOLD               232 +#define MUL_TOOM6H_THRESHOLD               324 +#define MUL_TOOM8H_THRESHOLD               478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD     153 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD     154 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD     160 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD     226 + +#define SQR_BASECASE_THRESHOLD               0  /* always (native) */ +#define SQR_TOOM2_THRESHOLD                 34 +#define SQR_TOOM3_THRESHOLD                114 +#define SQR_TOOM4_THRESHOLD                336 +#define SQR_TOOM6_THRESHOLD                430 +#define SQR_TOOM8_THRESHOLD                  0  /* always */ + +#define MULMID_TOOM42_THRESHOLD             36 + +#define MULMOD_BNM1_THRESHOLD               17 +#define SQRMOD_BNM1_THRESHOLD               19 + +#define MUL_FFT_MODF_THRESHOLD             654  /* k = 5 */ +#define MUL_FFT_TABLE3                                      \ +  { {    654, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \ +    {     12, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \ +    {     27, 7}, {     14, 6}, {     29, 7}, {     15, 6}, \ +    {     31, 7}, {     29, 8}, {     15, 7}, {     32, 8}, \ +    {     17, 7}, {     37, 8}, {     19, 7}, {     39, 8}, \ +    {     21, 7}, {     44, 8}, {     23, 7}, {     47, 8}, \ +    {     25, 7}, {     51, 8}, {     31, 7}, {     63, 8}, \ +    {     37, 9}, {     19, 8}, {     43, 9}, {     23, 8}, \ +    {     53, 9}, {     27, 8}, {     57, 9}, {     31, 8}, \ +    {     67, 9}, {     35, 8}, {     71, 9}, {     39, 8}, \ +    {     81, 9}, {     43,10}, {     23, 9}, {     55, 8}, \ +    {    111,10}, {     31, 9}, {     71,10}, {     39, 9}, \ +    {     87,10}, {     47, 9}, {     99,10}, {     55, 9}, \ +    {    111,11}, {     31,10}, {     63, 9}, {    131,10}, \ +    { 
    71, 9}, {    147,10}, {     87,11}, {     47,10}, \ +    {    111,11}, {     63,10}, {    143,11}, {     79,10}, \ +    {    167,11}, {     95,10}, {    199,11}, {    111,12}, \ +    {     63,11}, {    143,10}, {    287,11}, {    159,12}, \ +    {     95,11}, {    191,10}, {    383,11}, {    207,10}, \ +    {    415,13}, {     63,12}, {    127,11}, {    255,10}, \ +    {    511,11}, {    271,10}, {    543,11}, {    287,12}, \ +    {    159,11}, {    319,10}, {    639,11}, {    335,10}, \ +    {    671,11}, {    351,12}, {    191,11}, {    415,12}, \ +    {    223,11}, {    447,13}, {    127,12}, {    255,11}, \ +    {    543,12}, {    287,11}, {    575,10}, {   1151,11}, \ +    {    607,12}, {    319,11}, {    671,12}, {    351,11}, \ +    {    703,13}, {    191,12}, {    383,11}, {    767,12}, \ +    {    415,11}, {    831,12}, {    447,11}, {    895,12}, \ +    {    479,14}, {    127,13}, {    255,12}, {    543,11}, \ +    {   1087,12}, {    575,11}, {   1151,12}, {    607,13}, \ +    {    319,12}, {    735,13}, {    383,12}, {    831,13}, \ +    {    447,12}, {    959,14}, {    255,13}, {    511,12}, \ +    {   1087,13}, {    575,12}, {   1215,13}, {    639,12}, \ +    {   1279,13}, {    703,12}, {   1407,14}, {    383,13}, \ +    {    767,12}, {   1535,13}, {    831,12}, {   1663,13}, \ +    {    959,15}, {    255,14}, {    511,13}, {   1215,14}, \ +    {    639,13}, {   1471,14}, {    767,13}, {   1663,14}, \ +    {    895,13}, {   1855,15}, {    511,14}, {   1023,13}, \ +    {   2047,14}, {   1151,13}, {   2367,14}, {   1407,15}, \ +    {    767,14}, {   1791,16}, {    511,15}, {   1023,14}, \ +    {   2303,15}, {   1279,14}, {   2687,15}, {   1535,14}, \ +    {   3199,15}, {   1791,16}, {   1023,15}, {   2047,14}, \ +    {   4223,15}, {   2303,14}, {   4735,15}, {   2559,16}, \ +    {   1535,15}, {   3071,14}, {   6271,15}, {   3327,17}, \ +    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ +    {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 183 +#define MUL_FFT_THRESHOLD                11520 + +#define SQR_FFT_MODF_THRESHOLD             540  /* k = 5 */ +#define SQR_FFT_TABLE3                                      \ +  { {    540, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \ +    {     12, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \ +    {     16, 5}, {     33, 6}, {     29, 7}, {     15, 6}, \ +    {     31, 7}, {     16, 6}, {     33, 7}, {     33, 8}, \ +    {     17, 7}, {     37, 8}, {     19, 7}, {     39, 8}, \ +    {     21, 7}, {     43, 8}, {     23, 7}, {     47, 8}, \ +    {     25, 7}, {     51, 8}, {     29, 9}, {     15, 8}, \ +    {     37, 9}, {     19, 8}, {     43, 9}, {     23, 8}, \ +    {     51, 9}, {     27, 8}, {     55, 9}, {     31, 8}, \ +    {     65, 9}, {     35, 8}, {     71, 9}, {     43,10}, \ +    {     23, 9}, {     55,10}, {     31, 9}, {     71,10}, \ +    {     39, 9}, {     83,10}, {     47, 9}, {     99,10}, \ +    {     55, 9}, {    111,11}, {     31,10}, {     63, 9}, \ +    {    127,10}, {     87,11}, {     47,10}, {    111,12}, \ +    {     31,11}, {     63,10}, {    143,11}, {     79,10}, \ +    {    167,11}, {     95,10}, {    191,11}, {    111,12}, \ +    {     63,11}, {    127, 9}, {    511,11}, {    143,10}, \ +    {    287, 9}, {    575,11}, {    159,12}, {     95,11}, \ +    {    191,10}, {    383, 9}, {    767,11}, {    207,10}, \ +    {    415,13}, {     63,12}, {    127,10}, {    511, 9}, \ +    {   1023,11}, {    271,10}, {    543, 9}, {   1087,11}, \ +    {    287,10}, {    
575,12}, {    159,11}, {    319,10}, \ +    {    639,11}, {    335,10}, {    671,11}, {    351,10}, \ +    {    703,12}, {    191,11}, {    383,10}, {    767,11}, \ +    {    415,10}, {    831,12}, {    223,11}, {    447,13}, \ +    {    127,11}, {    511,10}, {   1023,11}, {    543,10}, \ +    {   1087,12}, {    287,11}, {    575,10}, {   1151,11}, \ +    {    607,12}, {    319,11}, {    639,10}, {   1279,11}, \ +    {    671,12}, {    351,11}, {    703,13}, {    191,12}, \ +    {    383,11}, {    767,12}, {    415,11}, {    831,12}, \ +    {    447,11}, {    895,14}, {    127,12}, {    511,11}, \ +    {   1023,12}, {    543,11}, {   1087,12}, {    575,11}, \ +    {   1151,12}, {    607,11}, {   1215,13}, {    319,12}, \ +    {    639,11}, {   1279,12}, {    671,11}, {   1343,12}, \ +    {    703,11}, {   1407,12}, {    735,13}, {    383,12}, \ +    {    767,11}, {   1535,12}, {    831,13}, {    447,12}, \ +    {    959,13}, {    511,12}, {   1087,13}, {    575,12}, \ +    {   1215,13}, {    639,12}, {   1343,13}, {    703,12}, \ +    {   1407,14}, {    383,13}, {    767,12}, {   1535,13}, \ +    {    831,12}, {   1663,13}, {    895,12}, {   1791,13}, \ +    {    959,14}, {    511,13}, {   1215,14}, {    639,13}, \ +    {   1471,14}, {    767,13}, {   1663,14}, {    895,13}, \ +    {   1791,15}, {    511,14}, {   1023,13}, {   2111,14}, \ +    {   1151,13}, {   2303,14}, {   1407,15}, {    767,14}, \ +    {   1791,16}, {    511,15}, {   1023,14}, {   2303,15}, \ +    {   1279,14}, {   2687,15}, {   1535,14}, {   3199,15}, \ +    {   1791,16}, {   1023,15}, {   2047,14}, {   4223,15}, \ +    {   2303,14}, {   4863,15}, {   2559,16}, {   1535,15}, \ +    {   3071,14}, {   6271,15}, {   3327,17}, { 131072,18}, \ +    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ +    {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 202 +#define SQR_FFT_THRESHOLD                 7296 + +#define MULLO_BASECASE_THRESHOLD             0  /* always */ +#define MULLO_DC_THRESHOLD                  61 +#define MULLO_MUL_N_THRESHOLD            22239 +#define SQRLO_BASECASE_THRESHOLD             8 +#define SQRLO_DC_THRESHOLD                   0  /* never mpn_sqrlo_basecase */ +#define SQRLO_SQR_THRESHOLD              14281 + +#define DC_DIV_QR_THRESHOLD                 47 +#define DC_DIVAPPR_Q_THRESHOLD             266 +#define DC_BDIV_QR_THRESHOLD                38 +#define DC_BDIV_Q_THRESHOLD                104 + +#define INV_MULMOD_BNM1_THRESHOLD           54 +#define INV_NEWTON_THRESHOLD               252 +#define INV_APPR_THRESHOLD                 250 + +#define BINV_NEWTON_THRESHOLD              258 +#define REDC_1_TO_REDC_2_THRESHOLD          35 +#define REDC_2_TO_REDC_N_THRESHOLD          79 + +#define MU_DIV_QR_THRESHOLD               2089 +#define MU_DIVAPPR_Q_THRESHOLD            1895 +#define MUPI_DIV_QR_THRESHOLD               99 +#define MU_BDIV_QR_THRESHOLD              1787 +#define MU_BDIV_Q_THRESHOLD               1895 + +#define POWM_SEC_TABLE  1,16,194,960,2825 + +#define GET_STR_DC_THRESHOLD                16 +#define GET_STR_PRECOMPUTE_THRESHOLD        26 +#define SET_STR_DC_THRESHOLD               248 +#define SET_STR_PRECOMPUTE_THRESHOLD      1747 + +#define FAC_DSC_THRESHOLD                 1240 +#define FAC_ODD_THRESHOLD                   27 + +#define MATRIX22_STRASSEN_THRESHOLD         21 +#define HGCD2_DIV1_METHOD                    3  /* 4.10% faster than 5 */ +#define HGCD_THRESHOLD                     141 +#define HGCD_APPR_THRESHOLD                181 +#define 
HGCD_REDUCE_THRESHOLD             4633 +#define GCD_DC_THRESHOLD                   622 +#define GCDEXT_DC_THRESHOLD                496 +#define JACOBI_BASE_METHOD                   1  /* 0.97% faster than 3 */ + +/* Tuneup completed successfully, took 131832 seconds */ diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm new file mode 100644 index 0000000..ca2efb9 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm @@ -0,0 +1,469 @@ +dnl  AMD64 mpn_mul_basecase. + +dnl  Contributed to the GNU project by Torbjorn Granlund and David Harvey. + +dnl  Copyright 2008, 2012 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C	     cycles/limb +C AMD K8,K9	 2.375 +C AMD K10	 2.375 +C Intel P4	15-16 +C Intel core2	 4.45 +C Intel corei	 4.35 +C Intel atom	 ? +C VIA nano	 4.5 + +C The inner loops of this code are the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C  * Use fewer registers.  (how??? I can't see it -- david) +C  * Avoid some "mov $0,r" and instead use "xor r,r". +C  * Can the top of each L(addmul_outer_n) prologue be folded into the +C    mul_1/mul_2 prologues, saving a LEA (%rip)? It would slow down the +C    case where vn = 1 or 2; is it worth it? 
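The layout above, one mul_1 or mul_2 pass for the first vp limb(s) followed by addmul_2 passes for the rest, is schoolbook multiplication arranged to consume two vp limbs per sweep. A plain C reference of the same arithmetic, one vp limb per pass (it zeroes rp first instead of writing the first row directly; the types and helper are local to the sketch):

#include <stdint.h>

typedef uint64_t limb;
typedef __uint128_t dlimb;      /* GCC/Clang extension */

/* rp[0..n-1] += {up,n} * v; returns the carry-out limb. */
static limb addmul_1(limb *rp, const limb *up, long n, limb v)
{
    limb cy = 0;
    for (long i = 0; i < n; i++) {
        dlimb t = (dlimb) up[i] * v + rp[i] + cy;
        rp[i] = (limb) t;
        cy = (limb) (t >> 64);
    }
    return cy;
}

/* Schoolbook reference: {rp, un+vn} = {up,un} * {vp,vn}, un >= vn >= 1. */
static void mul_basecase_ref(limb *rp, const limb *up, long un,
                             const limb *vp, long vn)
{
    for (long i = 0; i < un + vn; i++)
        rp[i] = 0;
    for (long j = 0; j < vn; j++)
        rp[un + j] = addmul_1(rp + j, up, un, vp[j]);
}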
+ +C INPUT PARAMETERS +define(`rp',      `%rdi') +define(`up',      `%rsi') +define(`un_param',`%rdx') +define(`vp',      `%rcx') +define(`vn',      `%r8') + +define(`v0', `%r12') +define(`v1', `%r9') + +define(`w0', `%rbx') +define(`w1', `%r15') +define(`w2', `%rbp') +define(`w3', `%r10') + +define(`n',  `%r11') +define(`outer_addr', `%r14') +define(`un',  `%r13') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() +	TEXT +	ALIGN(16) +PROLOGUE(mpn_mul_basecase) +	FUNC_ENTRY(4) +IFDOS(`	mov	56(%rsp), %r8d	') +	push	%rbx +	push	%rbp +	push	%r12 +	push	%r13 +	push	%r14 +	push	%r15 + +	xor	R32(un), R32(un) +	mov	(up), %rax +	mov	(vp), v0 + +	sub	un_param, un		C rdx used by mul +	mov	un, n +	mov	R32(un_param), R32(w0) + +	lea	(rp,un_param,8), rp +	lea	(up,un_param,8), up + +	mul	v0 + +	test	$1, R8(vn) +	jz	L(mul_2) + +C =========================================================== +C     mul_1 for vp[0] if vn is odd + +L(mul_1): +	and	$3, R32(w0) +	jz	L(mul_1_prologue_0) +	cmp	$2, R32(w0) +	jc	L(mul_1_prologue_1) +	jz	L(mul_1_prologue_2) + +L(mul_1_prologue_3): +	add	$-1, n +	lea	L(addmul_outer_3)(%rip), outer_addr +	mov	%rax, w3 +	mov	%rdx, w0 +	jmp	L(mul_1_entry_3) + +L(mul_1_prologue_0): +	mov	%rax, w2 +	mov	%rdx, w3		C note: already w0 == 0 +	lea	L(addmul_outer_0)(%rip), outer_addr +	jmp	L(mul_1_entry_0) + +L(mul_1_prologue_1): +	cmp	$-1, un +	jne	2f +	mov	%rax, -8(rp) +	mov	%rdx, (rp) +	jmp	L(ret) +2:	add	$1, n +	lea	L(addmul_outer_1)(%rip), outer_addr +	mov	%rax, w1 +	mov	%rdx, w2 +	xor	R32(w3), R32(w3) +	mov	(up,n,8), %rax +	jmp	L(mul_1_entry_1) + +L(mul_1_prologue_2): +	add	$-2, n +	lea	L(addmul_outer_2)(%rip), outer_addr +	mov	%rax, w0 +	mov	%rdx, w1 +	mov	24(up,n,8), %rax +	xor	R32(w2), R32(w2) +	xor	R32(w3), R32(w3) +	jmp	L(mul_1_entry_2) + + +	C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments + +	ALIGN(16) +L(mul_1_top): +	mov	w0, -16(rp,n,8) +	add	%rax, w1 +	mov	(up,n,8), %rax +	adc	%rdx, w2 +L(mul_1_entry_1): +	xor	R32(w0), R32(w0) +	mul	v0 +	mov	w1, -8(rp,n,8) +	add	%rax, w2 +	adc	%rdx, w3 +L(mul_1_entry_0): +	mov	8(up,n,8), %rax +	mul	v0 +	mov	w2, (rp,n,8) +	add	%rax, w3 +	adc	%rdx, w0 +L(mul_1_entry_3): +	mov	16(up,n,8), %rax +	mul	v0 +	mov	w3, 8(rp,n,8) +	xor	R32(w2), R32(w2)	C zero +	mov	w2, w3			C zero +	add	%rax, w0 +	mov	24(up,n,8), %rax +	mov	w2, w1			C zero +	adc	%rdx, w1 +L(mul_1_entry_2): +	mul	v0 +	add	$4, n +	js	L(mul_1_top) + +	mov	w0, -16(rp) +	add	%rax, w1 +	mov	w1, -8(rp) +	adc	%rdx, w2 +	mov	w2, (rp) + +	add	$-1, vn			C vn -= 1 +	jz	L(ret) + +	mov	8(vp), v0 +	mov	16(vp), v1 + +	lea	8(vp), vp		C vp += 1 +	lea	8(rp), rp		C rp += 1 + +	jmp	*outer_addr + +C =========================================================== +C     mul_2 for vp[0], vp[1] if vn is even + +	ALIGN(16) +L(mul_2): +	mov	8(vp), v1 + +	and	$3, R32(w0) +	jz	L(mul_2_prologue_0) +	cmp	$2, R32(w0) +	jz	L(mul_2_prologue_2) +	jc	L(mul_2_prologue_1) + +L(mul_2_prologue_3): +	lea	L(addmul_outer_3)(%rip), outer_addr +	add	$2, n +	mov	%rax, -16(rp,n,8) +	mov	%rdx, w2 +	xor	R32(w3), R32(w3) +	xor	R32(w0), R32(w0) +	mov	-16(up,n,8), %rax +	jmp	L(mul_2_entry_3) + +	ALIGN(16) +L(mul_2_prologue_0): +	add	$3, n +	mov	%rax, w0 +	mov	%rdx, w1 +	xor	R32(w2), R32(w2) +	mov	-24(up,n,8), %rax +	lea	L(addmul_outer_0)(%rip), outer_addr +	jmp	L(mul_2_entry_0) + +	ALIGN(16) +L(mul_2_prologue_1): +	mov	%rax, w3 +	mov	%rdx, w0 +	xor	R32(w1), R32(w1) +	lea	L(addmul_outer_1)(%rip), outer_addr +	jmp	L(mul_2_entry_1) + +	ALIGN(16) +L(mul_2_prologue_2): +	add	$1, n +	lea	L(addmul_outer_2)(%rip), outer_addr +	
mov	$0, R32(w0) +	mov	$0, R32(w1) +	mov	%rax, w2 +	mov	-8(up,n,8), %rax +	mov	%rdx, w3 +	jmp	L(mul_2_entry_2) + +	C this loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments + +	ALIGN(16) +L(mul_2_top): +	mov	-32(up,n,8), %rax +	mul	v1 +	add	%rax, w0 +	adc	%rdx, w1 +	mov	-24(up,n,8), %rax +	xor	R32(w2), R32(w2) +	mul	v0 +	add	%rax, w0 +	mov	-24(up,n,8), %rax +	adc	%rdx, w1 +	adc	$0, R32(w2) +L(mul_2_entry_0): +	mul	v1 +	add	%rax, w1 +	mov	w0, -24(rp,n,8) +	adc	%rdx, w2 +	mov	-16(up,n,8), %rax +	mul	v0 +	mov	$0, R32(w3) +	add	%rax, w1 +	adc	%rdx, w2 +	mov	-16(up,n,8), %rax +	adc	$0, R32(w3) +	mov	$0, R32(w0) +	mov	w1, -16(rp,n,8) +L(mul_2_entry_3): +	mul	v1 +	add	%rax, w2 +	mov	-8(up,n,8), %rax +	adc	%rdx, w3 +	mov	$0, R32(w1) +	mul	v0 +	add	%rax, w2 +	mov	-8(up,n,8), %rax +	adc	%rdx, w3 +	adc	R32(w1), R32(w0)	C adc $0, w0 +L(mul_2_entry_2): +	mul	v1 +	add	%rax, w3 +	mov	w2, -8(rp,n,8) +	adc	%rdx, w0 +	mov	(up,n,8), %rax +	mul	v0 +	add	%rax, w3 +	adc	%rdx, w0 +	adc	$0, R32(w1) +L(mul_2_entry_1): +	add	$4, n +	mov	w3, -32(rp,n,8) +	js	L(mul_2_top) + +	mov	-32(up,n,8), %rax	C FIXME: n is constant +	mul	v1 +	add	%rax, w0 +	mov	w0, (rp) +	adc	%rdx, w1 +	mov	w1, 8(rp) + +	add	$-2, vn			C vn -= 2 +	jz	L(ret) + +	mov	16(vp), v0 +	mov	24(vp), v1 + +	lea	16(vp), vp		C vp += 2 +	lea	16(rp), rp		C rp += 2 + +	jmp	*outer_addr + + +C =========================================================== +C     addmul_2 for remaining vp's + +	C in the following prologues, we reuse un to store the +	C adjusted value of n that is reloaded on each iteration + +L(addmul_outer_0): +	add	$3, un +	lea	0(%rip), outer_addr + +	mov	un, n +	mov	-24(up,un,8), %rax +	mul	v0 +	mov	%rax, w0 +	mov	-24(up,un,8), %rax +	mov	%rdx, w1 +	xor	R32(w2), R32(w2) +	jmp	L(addmul_entry_0) + +L(addmul_outer_1): +	mov	un, n +	mov	(up,un,8), %rax +	mul	v0 +	mov	%rax, w3 +	mov	(up,un,8), %rax +	mov	%rdx, w0 +	xor	R32(w1), R32(w1) +	jmp	L(addmul_entry_1) + +L(addmul_outer_2): +	add	$1, un +	lea	0(%rip), outer_addr + +	mov	un, n +	mov	-8(up,un,8), %rax +	mul	v0 +	xor	R32(w0), R32(w0) +	mov	%rax, w2 +	xor	R32(w1), R32(w1) +	mov	%rdx, w3 +	mov	-8(up,un,8), %rax +	jmp	L(addmul_entry_2) + +L(addmul_outer_3): +	add	$2, un +	lea	0(%rip), outer_addr + +	mov	un, n +	mov	-16(up,un,8), %rax +	xor	R32(w3), R32(w3) +	mul	v0 +	mov	%rax, w1 +	mov	-16(up,un,8), %rax +	mov	%rdx, w2 +	jmp	L(addmul_entry_3) + +	C this loop is 19 c/loop = 2.375 c/l on K8, for all up/rp alignments + +	ALIGN(16) +L(addmul_top): +	add	w3, -32(rp,n,8) +	adc	%rax, w0 +	mov	-24(up,n,8), %rax +	adc	%rdx, w1 +	xor	R32(w2), R32(w2) +	mul	v0 +	add	%rax, w0 +	mov	-24(up,n,8), %rax +	adc	%rdx, w1 +	adc	R32(w2), R32(w2)	C adc $0, w2 +L(addmul_entry_0): +	mul	v1 +	xor	R32(w3), R32(w3) +	add	w0, -24(rp,n,8) +	adc	%rax, w1 +	mov	-16(up,n,8), %rax +	adc	%rdx, w2 +	mul	v0 +	add	%rax, w1 +	mov	-16(up,n,8), %rax +	adc	%rdx, w2 +	adc	$0, R32(w3) +L(addmul_entry_3): +	mul	v1 +	add	w1, -16(rp,n,8) +	adc	%rax, w2 +	mov	-8(up,n,8), %rax +	adc	%rdx, w3 +	mul	v0 +	xor	R32(w0), R32(w0) +	add	%rax, w2 +	adc	%rdx, w3 +	mov	$0, R32(w1) +	mov	-8(up,n,8), %rax +	adc	R32(w1), R32(w0)	C adc $0, w0 +L(addmul_entry_2): +	mul	v1 +	add	w2, -8(rp,n,8) +	adc	%rax, w3 +	adc	%rdx, w0 +	mov	(up,n,8), %rax +	mul	v0 +	add	%rax, w3 +	mov	(up,n,8), %rax +	adc	%rdx, w0 +	adc	$0, R32(w1) +L(addmul_entry_1): +	mul	v1 +	add	$4, n +	js	L(addmul_top) + +	add	w3, -8(rp) +	adc	%rax, w0 +	mov	w0, (rp) +	adc	%rdx, w1 +	mov	w1, 8(rp) + +	add	$-2, vn			C vn -= 2 +	jz	L(ret) + +	lea	16(rp), rp		C rp += 2 +	lea	16(vp), vp		C vp += 2 + +	
mov	(vp), v0 +	mov	8(vp), v1 + +	jmp	*outer_addr + +	ALIGN(16) +L(ret):	pop	%r15 +	pop	%r14 +	pop	%r13 +	pop	%r12 +	pop	%rbp +	pop	%rbx +	FUNC_EXIT() +	ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm new file mode 100644 index 0000000..fa00f42 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm @@ -0,0 +1,436 @@ +dnl  AMD64 mpn_mullo_basecase. + +dnl  Contributed to the GNU project by Torbjorn Granlund. + +dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C NOTES +C   * There is a major stupidity in that we call mpn_mul_1 initially, for a +C     large trip count.  Instead, we should start with mul_2 for any operand +C     size congruence class. +C   * Stop iterating addmul_2 earlier, falling into straight-line triangle code +C     for the last 2-3 iterations. +C   * Perhaps implement n=4 special code. +C   * The reload of the outer loop jump address hurts branch prediction. +C   * The addmul_2 loop ends with an MUL whose high part is not used upon loop +C     exit. 
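mpn_mullo_basecase produces only the n low limbs of the product of two n-limb operands, so partial products whose column index reaches n are skipped and carries out of column n-1 are dropped, roughly halving the limb multiplies of a full basecase product. A C sketch of that semantics (types local to the sketch):

#include <stdint.h>

typedef uint64_t limb;
typedef __uint128_t dlimb;      /* GCC/Clang extension */

/* Low half only: {rp,n} = the n low limbs of {up,n} * {vp,n}. */
static void mullo_basecase_ref(limb *rp, const limb *up,
                               const limb *vp, long n)
{
    for (long i = 0; i < n; i++)
        rp[i] = 0;
    for (long j = 0; j < n; j++) {
        limb cy = 0;
        for (long i = 0; i + j < n; i++) {
            dlimb t = (dlimb) up[i] * vp[j] + rp[i + j] + cy;
            rp[i + j] = (limb) t;
            cy = (limb) (t >> 64);
        }
        /* the carry out of limb n-1 belongs to the discarded high half */
    }
}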
+ +C INPUT PARAMETERS +define(`rp',	   `%rdi') +define(`up',	   `%rsi') +define(`vp_param', `%rdx') +define(`n',	   `%rcx') + +define(`vp',	`%r11') +define(`outer_addr', `%r8') +define(`j',	`%r9') +define(`v0',	`%r13') +define(`v1',	`%r14') +define(`w0',	`%rbx') +define(`w1',	`%r15') +define(`w2',	`%rbp') +define(`w3',	`%r10') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() +	TEXT +	ALIGN(16) +PROLOGUE(mpn_mullo_basecase) +	FUNC_ENTRY(4) +	cmp	$4, n +	jge	L(gen) +	mov	(up), %rax		C u0 +	mov	(vp_param), %r8		C v0 + +	lea	L(tab)(%rip), %r9 +ifdef(`PIC', +`	movslq	(%r9,%rcx,4), %r10 +	add	%r10, %r9 +	jmp	*%r9 +',` +	jmp	*(%r9,n,8) +') +	JUMPTABSECT +	ALIGN(8) +L(tab):	JMPENT(	L(tab), L(tab))			C not allowed +	JMPENT(	L(1), L(tab))			C 1 +	JMPENT(	L(2), L(tab))			C 2 +	JMPENT(	L(3), L(tab))			C 3 +dnl	JMPENT(	L(0m4), L(tab))			C 4 +dnl	JMPENT(	L(1m4), L(tab))			C 5 +dnl	JMPENT(	L(2m4), L(tab))			C 6 +dnl	JMPENT(	L(3m4), L(tab))			C 7 +dnl	JMPENT(	L(0m4), L(tab))			C 8 +dnl	JMPENT(	L(1m4), L(tab))			C 9 +dnl	JMPENT(	L(2m4), L(tab))			C 10 +dnl	JMPENT(	L(3m4), L(tab))			C 11 +	TEXT + +L(1):	imul	%r8, %rax +	mov	%rax, (rp) +	FUNC_EXIT() +	ret + +L(2):	mov	8(vp_param), %r11 +	imul	%rax, %r11		C u0 x v1 +	mul	%r8			C u0 x v0 +	mov	%rax, (rp) +	imul	8(up), %r8		C u1 x v0 +	lea	(%r11, %rdx), %rax +	add	%r8, %rax +	mov	%rax, 8(rp) +	FUNC_EXIT() +	ret + +L(3):	mov	8(vp_param), %r9	C v1 +	mov	16(vp_param), %r11 +	mul	%r8			C u0 x v0 -> <r1,r0> +	mov	%rax, (rp)		C r0 +	mov	(up), %rax		C u0 +	mov	%rdx, %rcx		C r1 +	mul	%r9			C u0 x v1 -> <r2,r1> +	imul	8(up), %r9		C u1 x v1 -> r2 +	mov	16(up), %r10 +	imul	%r8, %r10		C u2 x v0 -> r2 +	add	%rax, %rcx +	adc	%rdx, %r9 +	add	%r10, %r9 +	mov	8(up), %rax		C u1 +	mul	%r8			C u1 x v0 -> <r2,r1> +	add	%rax, %rcx +	adc	%rdx, %r9 +	mov	%r11, %rax +	imul	(up), %rax		C u0 x v2 -> r2 +	add	%rax, %r9 +	mov	%rcx, 8(rp) +	mov	%r9, 16(rp) +	FUNC_EXIT() +	ret + +L(0m4): +L(1m4): +L(2m4): +L(3m4): +L(gen):	push	%rbx +	push	%rbp +	push	%r13 +	push	%r14 +	push	%r15 + +	mov	(up), %rax +	mov	(vp_param), v0 +	mov	vp_param, vp + +	lea	(rp,n,8), rp +	lea	(up,n,8), up +	neg	n + +	mul	v0 + +	test	$1, R8(n) +	jz	L(mul_2) + +L(mul_1): +	lea	-8(rp), rp +	lea	-8(up), up +	test	$2, R8(n) +	jnz	L(mul_1_prologue_3) + +L(mul_1_prologue_2):		C n = 7, 11, 15, ... +	lea	-1(n), j +	lea	L(addmul_outer_1)(%rip), outer_addr +	mov	%rax, w0 +	mov	%rdx, w1 +	xor	R32(w2), R32(w2) +	xor	R32(w3), R32(w3) +	mov	16(up,n,8), %rax +	jmp	L(mul_1_entry_2) + +L(mul_1_prologue_3):		C n = 5, 9, 13, ... 
+	lea	1(n), j +	lea	L(addmul_outer_3)(%rip), outer_addr +	mov	%rax, w2 +	mov	%rdx, w3 +	xor	R32(w0), R32(w0) +	jmp	L(mul_1_entry_0) + +	ALIGN(16) +L(mul_1_top): +	mov	w0, -16(rp,j,8) +	add	%rax, w1 +	mov	(up,j,8), %rax +	adc	%rdx, w2 +	xor	R32(w0), R32(w0) +	mul	v0 +	mov	w1, -8(rp,j,8) +	add	%rax, w2 +	adc	%rdx, w3 +L(mul_1_entry_0): +	mov	8(up,j,8), %rax +	mul	v0 +	mov	w2, (rp,j,8) +	add	%rax, w3 +	adc	%rdx, w0 +	mov	16(up,j,8), %rax +	mul	v0 +	mov	w3, 8(rp,j,8) +	xor	R32(w2), R32(w2)	C zero +	mov	w2, w3			C zero +	add	%rax, w0 +	mov	24(up,j,8), %rax +	mov	w2, w1			C zero +	adc	%rdx, w1 +L(mul_1_entry_2): +	mul	v0 +	add	$4, j +	js	L(mul_1_top) + +	mov	w0, -16(rp) +	add	%rax, w1 +	mov	w1, -8(rp) +	adc	%rdx, w2 + +	imul	(up), v0 +	add	v0, w2 +	mov	w2, (rp) + +	add	$1, n +	jz	L(ret) + +	mov	8(vp), v0 +	mov	16(vp), v1 + +	lea	16(up), up +	lea	8(vp), vp +	lea	24(rp), rp + +	jmp	*outer_addr + + +L(mul_2): +	mov	8(vp), v1 +	test	$2, R8(n) +	jz	L(mul_2_prologue_3) + +	ALIGN(16) +L(mul_2_prologue_1): +	lea	0(n), j +	mov	%rax, w3 +	mov	%rdx, w0 +	xor	R32(w1), R32(w1) +	mov	(up,n,8), %rax +	lea	L(addmul_outer_3)(%rip), outer_addr +	jmp	L(mul_2_entry_1) + +	ALIGN(16) +L(mul_2_prologue_3): +	lea	2(n), j +	mov	$0, R32(w3) +	mov	%rax, w1 +	mov	(up,n,8), %rax +	mov	%rdx, w2 +	lea	L(addmul_outer_1)(%rip), outer_addr +	jmp	L(mul_2_entry_3) + +	ALIGN(16) +L(mul_2_top): +	mov	-32(up,j,8), %rax +	mul	v1 +	add	%rax, w0 +	adc	%rdx, w1 +	mov	-24(up,j,8), %rax +	xor	R32(w2), R32(w2) +	mul	v0 +	add	%rax, w0 +	mov	-24(up,j,8), %rax +	adc	%rdx, w1 +	adc	$0, R32(w2) +	mul	v1 +	add	%rax, w1 +	mov	w0, -24(rp,j,8) +	adc	%rdx, w2 +	mov	-16(up,j,8), %rax +	mul	v0 +	mov	$0, R32(w3) +	add	%rax, w1 +	adc	%rdx, w2 +	mov	-16(up,j,8), %rax +	adc	$0, R32(w3) +L(mul_2_entry_3): +	mov	$0, R32(w0) +	mov	w1, -16(rp,j,8) +	mul	v1 +	add	%rax, w2 +	mov	-8(up,j,8), %rax +	adc	%rdx, w3 +	mov	$0, R32(w1) +	mul	v0 +	add	%rax, w2 +	mov	-8(up,j,8), %rax +	adc	%rdx, w3 +	adc	R32(w1), R32(w0) +	mul	v1 +	add	%rax, w3 +	mov	w2, -8(rp,j,8) +	adc	%rdx, w0 +	mov	(up,j,8), %rax +	mul	v0 +	add	%rax, w3 +	adc	%rdx, w0 +	adc	$0, R32(w1) +L(mul_2_entry_1): +	add	$4, j +	mov	w3, -32(rp,j,8) +	js	L(mul_2_top) + +	imul	-16(up), v1 +	add	v1, w0 +	imul	-8(up), v0 +	add	v0, w0 +	mov	w0, -8(rp) + +	add	$2, n +	jz	L(ret) + +	mov	16(vp), v0 +	mov	24(vp), v1 + +	lea	16(vp), vp +	lea	16(rp), rp + +	jmp	*outer_addr + + +L(addmul_outer_1): +	lea	-2(n), j +	mov	-16(up,n,8), %rax +	mul	v0 +	mov	%rax, w3 +	mov	-16(up,n,8), %rax +	mov	%rdx, w0 +	xor	R32(w1), R32(w1) +	lea	L(addmul_outer_3)(%rip), outer_addr +	jmp	L(addmul_entry_1) + +L(addmul_outer_3): +	lea	0(n), j +	mov	-16(up,n,8), %rax +	xor	R32(w3), R32(w3) +	mul	v0 +	mov	%rax, w1 +	mov	-16(up,n,8), %rax +	mov	%rdx, w2 +	lea	L(addmul_outer_1)(%rip), outer_addr +	jmp	L(addmul_entry_3) + +	ALIGN(16) +L(addmul_top): +	add	w3, -32(rp,j,8) +	adc	%rax, w0 +	mov	-24(up,j,8), %rax +	adc	%rdx, w1 +	xor	R32(w2), R32(w2) +	mul	v0 +	add	%rax, w0 +	mov	-24(up,j,8), %rax +	adc	%rdx, w1 +	adc	R32(w2), R32(w2) +	mul	v1 +	xor	R32(w3), R32(w3) +	add	w0, -24(rp,j,8) +	adc	%rax, w1 +	mov	-16(up,j,8), %rax +	adc	%rdx, w2 +	mul	v0 +	add	%rax, w1 +	mov	-16(up,j,8), %rax +	adc	%rdx, w2 +	adc	$0, R32(w3) +L(addmul_entry_3): +	mul	v1 +	add	w1, -16(rp,j,8) +	adc	%rax, w2 +	mov	-8(up,j,8), %rax +	adc	%rdx, w3 +	mul	v0 +	xor	R32(w0), R32(w0) +	add	%rax, w2 +	adc	%rdx, w3 +	mov	$0, R32(w1) +	mov	-8(up,j,8), %rax +	adc	R32(w1), R32(w0) +	mul	v1 +	add	w2, -8(rp,j,8) +	adc	%rax, w3 +	adc	%rdx, w0 +	mov	(up,j,8), %rax +	mul	v0 +	add	%rax, w3 +	mov	
(up,j,8), %rax +	adc	%rdx, w0 +	adc	$0, R32(w1) +L(addmul_entry_1): +	mul	v1 +	add	$4, j +	js	L(addmul_top) + +	add	w3, -32(rp) +	adc	%rax, w0 + +	imul	-24(up), v0 +	add	v0, w0 +	add	w0, -24(rp) + +	add	$2, n +	jns	L(ret) + +	lea	16(vp), vp + +	mov	(vp), v0 +	mov	8(vp), v1 + +	lea	-16(up), up + +	jmp	*outer_addr + +L(ret):	pop	%r15 +	pop	%r14 +	pop	%r13 +	pop	%rbp +	pop	%rbx +	FUNC_EXIT() +	ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm new file mode 100644 index 0000000..86f1414 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm @@ -0,0 +1,559 @@ +dnl  AMD64 mpn_mulmid_basecase + +dnl  Contributed by David Harvey. + +dnl  Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C	     cycles/limb +C K8,K9:	 2.375  (2.5 when un - vn is "small") +C K10:		 ? +C P4:		 ? +C P6-15:	 ? 
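mpn_mulmid_basecase computes a middle product: the partial products up[i]*vp[j] whose column i+j falls in [vn-1, un-1], accumulated and shifted down by vn-1 limbs into an (un-vn+3)-limb result. The "accumulate along diagonals" loop further down keeps a three-limb window (w0, w1, w2) per diagonal; the C sketch below mirrors that idea (the boundary convention is my reading of the code, not a drop-in replacement):

#include <stdint.h>

typedef uint64_t limb;
typedef __uint128_t dlimb;      /* GCC/Clang extension */

/* Middle product: accumulate up[i]*vp[j] for vn-1 <= i+j <= un-1, shifted
   down by vn-1 limbs, into {rp, un-vn+3}.  Requires un >= vn >= 1. */
static void mulmid_ref(limb *rp, const limb *up, long un,
                       const limb *vp, long vn)
{
    long rows = un - vn + 1;            /* number of diagonals */
    limb w0 = 0, w1 = 0, w2 = 0;        /* three-limb accumulator window */
    for (long r = 0; r < rows; r++) {
        for (long j = 0; j < vn; j++) { /* diagonal r: column r + vn - 1 */
            dlimb t = (dlimb) up[r + vn - 1 - j] * vp[j];
            limb lo = (limb) t, hi = (limb) (t >> 64);
            w0 += lo;
            hi += (w0 < lo);            /* cannot overflow: hi <= 2^64 - 2 */
            w1 += hi;
            w2 += (w1 < hi);
        }
        rp[r] = w0;                     /* one output limb per diagonal */
        w0 = w1;                        /* slide the window one limb down */
        w1 = w2;
        w2 = 0;
    }
    rp[rows] = w0;                      /* two leftover carry limbs */
    rp[rows + 1] = w1;
}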
+ +C INPUT PARAMETERS +define(`rp',      `%rdi') +define(`up',      `%rsi') +define(`un_param',`%rdx') +define(`vp_param',`%rcx') +define(`vn',      `%r8') + +define(`v0', `%r12') +define(`v1', `%r9') + +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') + +define(`n',  `%r11') +define(`outer_addr', `%r14') +define(`un',  `%r13') +define(`vp',  `%r15') + +define(`vp_inner', `%r10') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() +	TEXT +	ALIGN(16) +PROLOGUE(mpn_mulmid_basecase) +	FUNC_ENTRY(4) +IFDOS(`	mov	56(%rsp), %r8d	') +	push	%rbx +	push	%rbp +	push	%r12 +	push	%r13 +	push	%r14 +	push	%r15 + +	mov	vp_param, vp + +	C use un for row length (= un_param - vn + 1) +	lea	1(un_param), un +	sub	vn, un + +	lea	(rp,un,8), rp + +	cmp	$4, un		C TODO: needs tuning +	jc	L(diagonal) + +	lea	(up,un_param,8), up + +	test	$1, vn +	jz	L(mul_2) + +C =========================================================== +C     mul_1 for vp[0] if vn is odd + +L(mul_1): +	mov	R32(un), R32(w0) + +	neg	un +	mov	(up,un,8), %rax +	mov	(vp), v0 +	mul	v0 + +	and	$-4, un		C round down to multiple of 4 +	mov	un, n + +	and	$3, R32(w0) +	jz	L(mul_1_prologue_0) +	cmp	$2, R32(w0) +	jc	L(mul_1_prologue_1) +	jz	L(mul_1_prologue_2) + +L(mul_1_prologue_3): +	mov	%rax, w3 +	mov	%rdx, w0 +	lea	L(addmul_prologue_3)(%rip), outer_addr +	jmp	L(mul_1_entry_3) + +	ALIGN(16) +L(mul_1_prologue_0): +	mov	%rax, w2 +	mov	%rdx, w3		C note already w0 == 0 +	lea	L(addmul_prologue_0)(%rip), outer_addr +	jmp	L(mul_1_entry_0) + +	ALIGN(16) +L(mul_1_prologue_1): +	add	$4, n +	mov	%rax, w1 +	mov	%rdx, w2 +	mov	$0, R32(w3) +	mov	(up,n,8), %rax +	lea	L(addmul_prologue_1)(%rip), outer_addr +	jmp	L(mul_1_entry_1) + +	ALIGN(16) +L(mul_1_prologue_2): +	mov	%rax, w0 +	mov	%rdx, w1 +	mov	24(up,n,8), %rax +	mov	$0, R32(w2) +	mov	$0, R32(w3) +	lea	L(addmul_prologue_2)(%rip), outer_addr +	jmp	L(mul_1_entry_2) + + +	C this loop is 10 c/loop = 2.5 c/l on K8 + +	ALIGN(16) +L(mul_1_top): +	mov	w0, -16(rp,n,8) +	add	%rax, w1 +	mov	(up,n,8), %rax +	adc	%rdx, w2 +L(mul_1_entry_1): +	mov	$0, R32(w0) +	mul	v0 +	mov	w1, -8(rp,n,8) +	add	%rax, w2 +	adc	%rdx, w3 +L(mul_1_entry_0): +	mov	8(up,n,8), %rax +	mul	v0 +	mov	w2, (rp,n,8) +	add	%rax, w3 +	adc	%rdx, w0 +L(mul_1_entry_3): +	mov	16(up,n,8), %rax +	mul	v0 +	mov	w3, 8(rp,n,8) +	mov	$0, R32(w2)		C zero +	mov	w2, w3			C zero +	add	%rax, w0 +	mov	24(up,n,8), %rax +	mov	w2, w1			C zero +	adc	%rdx, w1 +L(mul_1_entry_2): +	mul	v0 +	add	$4, n +	js	L(mul_1_top) + +	mov	w0, -16(rp) +	add	%rax, w1 +	mov	w1, -8(rp) +	mov	w2, 8(rp)		C zero last limb of output +	adc	%rdx, w2 +	mov	w2, (rp) + +	dec	vn +	jz	L(ret) + +	lea	-8(up), up +	lea	8(vp), vp + +	mov	un, n +	mov	(vp), v0 +	mov	8(vp), v1 + +	jmp	*outer_addr + +C =========================================================== +C     mul_2 for vp[0], vp[1] if vn is even + +	ALIGN(16) +L(mul_2): +	mov	R32(un), R32(w0) + +	neg	un +	mov	-8(up,un,8), %rax +	mov	(vp), v0 +	mov	8(vp), v1 +	mul	v1 + +	and	$-4, un		C round down to multiple of 4 +	mov	un, n + +	and	$3, R32(w0) +	jz	L(mul_2_prologue_0) +	cmp	$2, R32(w0) +	jc	L(mul_2_prologue_1) +	jz	L(mul_2_prologue_2) + +L(mul_2_prologue_3): +	mov	%rax, w1 +	mov	%rdx, w2 +	lea	L(addmul_prologue_3)(%rip), outer_addr +	jmp	L(mul_2_entry_3) + +	ALIGN(16) +L(mul_2_prologue_0): +	mov	%rax, w0 +	mov	%rdx, w1 +	lea	L(addmul_prologue_0)(%rip), outer_addr +	jmp	L(mul_2_entry_0) + +	ALIGN(16) +L(mul_2_prologue_1): +	mov	%rax, w3 +	mov	%rdx, w0 +	mov	$0, R32(w1) +	lea	L(addmul_prologue_1)(%rip), outer_addr +	jmp	
L(mul_2_entry_1) + +	ALIGN(16) +L(mul_2_prologue_2): +	mov	%rax, w2 +	mov	%rdx, w3 +	mov	$0, R32(w0) +	mov	16(up,n,8), %rax +	lea	L(addmul_prologue_2)(%rip), outer_addr +	jmp	L(mul_2_entry_2) + + +	C this loop is 18 c/loop = 2.25 c/l on K8 + +	ALIGN(16) +L(mul_2_top): +	mov     -8(up,n,8), %rax +	mul     v1 +	add     %rax, w0 +	adc     %rdx, w1 +L(mul_2_entry_0): +	mov     $0, R32(w2) +	mov     (up,n,8), %rax +	mul     v0 +	add     %rax, w0 +	mov     (up,n,8), %rax +	adc     %rdx, w1 +	adc     $0, R32(w2) +	mul     v1 +	add     %rax, w1 +	mov     w0, (rp,n,8) +	adc     %rdx, w2 +L(mul_2_entry_3): +	mov     8(up,n,8), %rax +	mul     v0 +	mov     $0, R32(w3) +	add     %rax, w1 +	adc     %rdx, w2 +	mov     $0, R32(w0) +	adc     $0, R32(w3) +	mov     8(up,n,8), %rax +	mov     w1, 8(rp,n,8) +	mul     v1 +	add     %rax, w2 +	mov     16(up,n,8), %rax +	adc     %rdx, w3 +L(mul_2_entry_2): +	mov     $0, R32(w1) +	mul     v0 +	add     %rax, w2 +	mov     16(up,n,8), %rax +	adc     %rdx, w3 +	adc     $0, R32(w0) +	mul     v1 +	add     %rax, w3 +	mov     w2, 16(rp,n,8) +	adc     %rdx, w0 +L(mul_2_entry_1): +	mov     24(up,n,8), %rax +	mul     v0 +	add     %rax, w3 +	adc     %rdx, w0 +	adc     $0, R32(w1) +	add     $4, n +	mov     w3, -8(rp,n,8) +	jnz     L(mul_2_top) + +	mov	w0, (rp) +	mov	w1, 8(rp) + +	sub	$2, vn +	jz	L(ret) + +	lea	16(vp), vp +	lea	-16(up), up + +	mov	un, n +	mov	(vp), v0 +	mov	8(vp), v1 + +	jmp	*outer_addr + +C =========================================================== +C     addmul_2 for remaining vp's + +	ALIGN(16) +L(addmul_prologue_0): +	mov	-8(up,n,8), %rax +	mul	v1 +	mov	%rax, w1 +	mov	%rdx, w2 +	mov	$0, R32(w3) +	jmp	L(addmul_entry_0) + +	ALIGN(16) +L(addmul_prologue_1): +	mov	16(up,n,8), %rax +	mul	v1 +	mov	%rax, w0 +	mov	%rdx, w1 +	mov	$0, R32(w2) +	mov	24(up,n,8), %rax +	jmp	L(addmul_entry_1) + +	ALIGN(16) +L(addmul_prologue_2): +	mov	8(up,n,8), %rax +	mul	v1 +	mov	%rax, w3 +	mov	%rdx, w0 +	mov	$0, R32(w1) +	jmp	L(addmul_entry_2) + +	ALIGN(16) +L(addmul_prologue_3): +	mov	(up,n,8), %rax +	mul	v1 +	mov	%rax, w2 +	mov	%rdx, w3 +	mov	$0, R32(w0) +	mov	$0, R32(w1) +	jmp	L(addmul_entry_3) + +	C this loop is 19 c/loop = 2.375 c/l on K8 + +	ALIGN(16) +L(addmul_top): +	mov	$0, R32(w3) +	add	%rax, w0 +	mov	-8(up,n,8), %rax +	adc	%rdx, w1 +	adc	$0, R32(w2) +	mul	v1 +	add	w0, -8(rp,n,8) +	adc	%rax, w1 +	adc	%rdx, w2 +L(addmul_entry_0): +	mov	(up,n,8), %rax +	mul	v0 +	add	%rax, w1 +	mov	(up,n,8), %rax +	adc	%rdx, w2 +	adc	$0, R32(w3) +	mul	v1 +	add	w1, (rp,n,8) +	mov	$0, R32(w1) +	adc	%rax, w2 +	mov	$0, R32(w0) +	adc	%rdx, w3 +L(addmul_entry_3): +	mov	8(up,n,8), %rax +	mul	v0 +	add	%rax, w2 +	mov	8(up,n,8), %rax +	adc	%rdx, w3 +	adc	$0, R32(w0) +	mul	v1 +	add	w2, 8(rp,n,8) +	adc	%rax, w3 +	adc	%rdx, w0 +L(addmul_entry_2): +	mov	16(up,n,8), %rax +	mul	v0 +	add	%rax, w3 +	mov	16(up,n,8), %rax +	adc	%rdx, w0 +	adc	$0, R32(w1) +	mul	v1 +	add	w3, 16(rp,n,8) +	nop			C don't ask... 
+	adc	%rax, w0 +	mov	$0, R32(w2) +	mov	24(up,n,8), %rax +	adc	%rdx, w1 +L(addmul_entry_1): +	mul	v0 +	add	$4, n +	jnz	L(addmul_top) + +	add	%rax, w0 +	adc	%rdx, w1 +	adc	$0, R32(w2) + +	add	w0, -8(rp) +	adc	w1, (rp) +	adc	w2, 8(rp) + +	sub	$2, vn +	jz	L(ret) + +	lea	16(vp), vp +	lea	-16(up), up + +	mov	un, n +	mov	(vp), v0 +	mov	8(vp), v1 + +	jmp	*outer_addr + +C =========================================================== +C     accumulate along diagonals if un - vn is small + +	ALIGN(16) +L(diagonal): +	xor	R32(w0), R32(w0) +	xor	R32(w1), R32(w1) +	xor	R32(w2), R32(w2) + +	neg	un + +	mov	R32(vn), %eax +	and	$3, %eax +	jz	L(diag_prologue_0) +	cmp	$2, %eax +	jc	L(diag_prologue_1) +	jz	L(diag_prologue_2) + +L(diag_prologue_3): +	lea	-8(vp), vp +	mov	vp, vp_inner +	add	$1, vn +	mov	vn, n +	lea	L(diag_entry_3)(%rip), outer_addr +	jmp	L(diag_entry_3) + +L(diag_prologue_0): +	mov	vp, vp_inner +	mov	vn, n +	lea	0(%rip), outer_addr +	mov     -8(up,n,8), %rax +	jmp	L(diag_entry_0) + +L(diag_prologue_1): +	lea	8(vp), vp +	mov	vp, vp_inner +	add	$3, vn +	mov	vn, n +	lea	0(%rip), outer_addr +	mov     -8(vp_inner), %rax +	jmp	L(diag_entry_1) + +L(diag_prologue_2): +	lea	-16(vp), vp +	mov	vp, vp_inner +	add	$2, vn +	mov	vn, n +	lea	0(%rip), outer_addr +	mov	16(vp_inner), %rax +	jmp	L(diag_entry_2) + + +	C this loop is 10 c/loop = 2.5 c/l on K8 + +	ALIGN(16) +L(diag_top): +	add     %rax, w0 +	adc     %rdx, w1 +	mov     -8(up,n,8), %rax +	adc     $0, w2 +L(diag_entry_0): +	mulq    (vp_inner) +	add     %rax, w0 +	adc     %rdx, w1 +	adc     $0, w2 +L(diag_entry_3): +	mov     -16(up,n,8), %rax +	mulq    8(vp_inner) +	add     %rax, w0 +	mov     16(vp_inner), %rax +	adc     %rdx, w1 +	adc     $0, w2 +L(diag_entry_2): +	mulq    -24(up,n,8) +	add     %rax, w0 +	mov     24(vp_inner), %rax +	adc     %rdx, w1 +	lea     32(vp_inner), vp_inner +	adc     $0, w2 +L(diag_entry_1): +	mulq    -32(up,n,8) +	sub     $4, n +	jnz	L(diag_top) + +	add	%rax, w0 +	adc	%rdx, w1 +	adc	$0, w2 + +	mov	w0, (rp,un,8) + +	inc	un +	jz	L(diag_end) + +	mov	vn, n +	mov	vp, vp_inner + +	lea	8(up), up +	mov	w1, w0 +	mov	w2, w1 +	xor	R32(w2), R32(w2) + +	jmp	*outer_addr + +L(diag_end): +	mov	w1, (rp) +	mov	w2, 8(rp) + +L(ret):	pop	%r15 +	pop	%r14 +	pop	%r13 +	pop	%r12 +	pop	%rbp +	pop	%rbx +	FUNC_EXIT() +	ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm new file mode 100644 index 0000000..9327b21 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm @@ -0,0 +1,591 @@ +dnl  X86-64 mpn_redc_1 optimised for AMD K8-K10. + +dnl  Contributed to the GNU project by Torbjörn Granlund. + +dnl  Copyright 2004, 2008, 2013 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  
See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C	     cycles/limb +C AMD K8,K9	 ? +C AMD K10	 ? +C AMD bull	 ? +C AMD pile	 ? +C AMD steam	 ? +C AMD bobcat	 ? +C AMD jaguar	 ? +C Intel P4	 ? +C Intel core	 ? +C Intel NHM	 ? +C Intel SBR	 ? +C Intel IBR	 ? +C Intel HWL	 ? +C Intel BWL	 ? +C Intel atom	 ? +C VIA nano	 ? + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C  * Micro-optimise, none performed thus far. +C  * This looks different from other current redc_1.asm variants.  Consider +C    adapting this to the mainstream style. +C  * Is this code really faster than more approaches which compute q0 later? +C    Is the use of a jump jump table faster?  Or is the edge of this due to the +C    inlined add_n code? +C  * Put initial m[0] x q0 computation in header. +C  * Put basecases at the file's end, single them out before the pushes. + +define(`rp',          `%rdi')   C rcx +define(`up',          `%rsi')   C rdx +define(`mp_param',    `%rdx')   C r8 +define(`n',           `%rcx')   C r9 +define(`u0inv',       `%r8')    C stack + +define(`i',           `%r11') +define(`nneg',        `%r12') +define(`mp',          `%r13') +define(`q0',          `%rbp') +define(`vp',          `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() +	TEXT +	ALIGN(32) +PROLOGUE(mpn_redc_1) +	FUNC_ENTRY(4) +IFDOS(`	mov	56(%rsp), %r8	') +	push	%rbp +	mov	(up), q0		C up[0] +	push	%rbx +	imul	u0inv, q0		C first q0, for all execution paths +	push	%r12 +	push	%r13 +	push	%r14 +	push	%r15 + +	mov	n, nneg +	neg	nneg +	lea	(mp_param,n,8), mp	C mp += n +	lea	-16(up,n,8), up		C up += n + +	mov	R32(n), R32(%rax) +	and	$3, R32(%rax) +	lea	4(%rax), %r9 +	cmp	$4, R32(n) +	cmovg	%r9, %rax +	lea	L(tab)(%rip), %r9 +ifdef(`PIC',` +	movslq	(%r9,%rax,4), %rax +	add	%r9, %rax +	jmp	*%rax +',` +	jmp	*(%r9,%rax,8) +') + +	JUMPTABSECT +	ALIGN(8) +L(tab):	JMPENT(	L(0), L(tab)) +	JMPENT(	L(1), L(tab)) +	JMPENT(	L(2), L(tab)) +	JMPENT(	L(3), L(tab)) +	JMPENT(	L(0m4), L(tab)) +	JMPENT(	L(1m4), L(tab)) +	JMPENT(	L(2m4), L(tab)) +	JMPENT(	L(3m4), L(tab)) +	TEXT + +	ALIGN(16) +L(1):	mov	(mp_param), %rax +	mul	q0 +	add	8(up), %rax +	adc	16(up), %rdx +	mov	%rdx, (rp) +	mov	$0, R32(%rax) +	adc	R32(%rax), R32(%rax) +	jmp	L(ret) + + +	ALIGN(16) +L(2):	mov	(mp_param), %rax +	mul	q0 +	xor	R32(%r14), R32(%r14) +	mov	%rax, %r10 +	mov	-8(mp), %rax +	mov	%rdx, %r9 +	mul	q0 +	add	(up), %r10 +	adc	%rax, %r9 +	adc	%rdx, %r14 +	add	8(up), %r9 +	adc	$0, %r14 +	mov	%r9, q0 +	imul	u0inv, q0 +	mov	-16(mp), %rax +	mul	q0 +	xor	R32(%rbx), R32(%rbx) +	mov	%rax, %r10 +	mov	-8(mp), %rax +	mov	%rdx, %r11 +	mul	q0 +	add	%r9, %r10 +	adc	%rax, %r11 +	adc	%rdx, %rbx +	add	16(up), %r11 +	adc	$0, %rbx +	xor	R32(%rax), R32(%rax) +	add	%r11, %r14 +	adc	24(up), %rbx +	mov	%r14, (rp) +	mov	%rbx, 8(rp) +	adc	R32(%rax), R32(%rax) +	jmp	L(ret) + + +L(3):	mov	(mp_param), %rax +	mul	q0 +	mov	%rax, %rbx +	mov	%rdx, %r10 +	mov	-16(mp), %rax +	mul	q0 +	xor	R32(%r9), R32(%r9) +	xor	R32(%r14), R32(%r14) +	add	-8(up), %rbx +	adc	%rax, %r10 +	mov	-8(mp), %rax +	adc	%rdx, %r9 +	mul	q0 +	add	(up), %r10 +	mov	%r10, (up) +	adc	%rax, %r9 +	adc	%rdx, %r14 +	mov	%r10, q0 +	imul	u0inv, q0 +	add	%r9, 8(up) +	adc	$0, %r14 +	mov	
%r14, -8(up) + +	mov	-24(mp), %rax +	mul	q0 +	mov	%rax, %rbx +	mov	%rdx, %r10 +	mov	-16(mp), %rax +	mul	q0 +	xor	R32(%r9), R32(%r9) +	xor	R32(%r14), R32(%r14) +	add	(up), %rbx +	adc	%rax, %r10 +	mov	-8(mp), %rax +	adc	%rdx, %r9 +	mul	q0 +	add	8(up), %r10 +	mov	%r10, 8(up) +	adc	%rax, %r9 +	adc	%rdx, %r14 +	mov	%r10, q0 +	imul	u0inv, q0 +	add	%r9, 16(up) +	adc	$0, %r14 +	mov	%r14, (up) + +	mov	-24(mp), %rax +	mul	q0 +	mov	%rax, %rbx +	mov	%rdx, %r10 +	mov	-16(mp), %rax +	mul	q0 +	xor	R32(%r9), R32(%r9) +	xor	R32(%r14), R32(%r14) +	add	8(up), %rbx +	adc	%rax, %r10 +	mov	-8(mp), %rax +	adc	%rdx, %r9 +	mul	q0 +	add	16(up), %r10 +	adc	%rax, %r9 +	adc	%rdx, %r14 +	add	24(up), %r9 +	adc	$0, %r14 + +	xor	R32(%rax), R32(%rax) +	add	-8(up), %r10 +	adc	(up), %r9 +	adc	32(up), %r14 +	mov	%r10, (rp) +	mov	%r9, 8(rp) +	mov	%r14, 16(rp) +	adc	R32(%rax), R32(%rax) +	jmp	L(ret) + + +	ALIGN(16) +L(2m4): +L(lo2):	mov	(mp,nneg,8), %rax +	mul	q0 +	xor	R32(%r14), R32(%r14) +	xor	R32(%rbx), R32(%rbx) +	mov	%rax, %r10 +	mov	8(mp,nneg,8), %rax +	mov	24(up,nneg,8), %r15 +	mov	%rdx, %r9 +	mul	q0 +	add	16(up,nneg,8), %r10 +	adc	%rax, %r9 +	mov	16(mp,nneg,8), %rax +	adc	%rdx, %r14 +	mul	q0 +	mov	$0, R32(%r10)		C xor? +	lea	2(nneg), i +	add	%r9, %r15 +	imul	u0inv, %r15 +	jmp	 L(e2) + +	ALIGN(16) +L(li2):	add	%r10, (up,i,8) +	adc	%rax, %r9 +	mov	(mp,i,8), %rax +	adc	%rdx, %r14 +	xor	R32(%r10), R32(%r10) +	mul	q0 +L(e2):	add	%r9, 8(up,i,8) +	adc	%rax, %r14 +	adc	%rdx, %rbx +	mov	8(mp,i,8), %rax +	mul	q0 +	add	%r14, 16(up,i,8) +	adc	%rax, %rbx +	adc	%rdx, %r10 +	mov	16(mp,i,8), %rax +	mul	q0 +	add	%rbx, 24(up,i,8) +	mov	$0, R32(%r14)		C zero +	mov	%r14, %rbx		C zero +	adc	%rax, %r10 +	mov	24(mp,i,8), %rax +	mov	%r14, %r9		C zero +	adc	%rdx, %r9 +	mul	q0 +	add	$4, i +	js	 L(li2) + +L(le2):	add	%r10, (up) +	adc	%rax, %r9 +	adc	%r14, %rdx +	add	%r9, 8(up) +	adc	$0, %rdx +	mov	%rdx, 16(up,nneg,8)	C up[0] +	add	$8, up +	mov	%r15, q0 +	dec	n +	jnz	L(lo2) + +	mov	nneg, n +	sar	$2, n +	lea	32(up,nneg,8), up +	lea	(up,nneg,8), vp + +	mov	-16(up), %r8 +	mov	-8(up), %r9 +	add	-16(vp), %r8 +	adc	-8(vp), %r9 +	mov	%r8, (rp) +	mov	%r9, 8(rp) +	lea	16(rp), rp +	jmp	L(addx) + + +	ALIGN(16) +L(1m4): +L(lo1):	mov	(mp,nneg,8), %rax +	xor	%r9, %r9 +	xor	R32(%rbx), R32(%rbx) +	mul	q0 +	mov	%rax, %r9 +	mov	8(mp,nneg,8), %rax +	mov	24(up,nneg,8), %r15 +	mov	%rdx, %r14 +	mov	$0, R32(%r10)		C xor? 
+	mul	q0 +	add	16(up,nneg,8), %r9 +	adc	%rax, %r14 +	adc	%rdx, %rbx +	mov	16(mp,nneg,8), %rax +	mul	q0 +	lea	1(nneg), i +	add	%r14, %r15 +	imul	u0inv, %r15 +	jmp	 L(e1) + +	ALIGN(16) +L(li1):	add	%r10, (up,i,8) +	adc	%rax, %r9 +	mov	(mp,i,8), %rax +	adc	%rdx, %r14 +	xor	R32(%r10), R32(%r10) +	mul	q0 +	add	%r9, 8(up,i,8) +	adc	%rax, %r14 +	adc	%rdx, %rbx +	mov	8(mp,i,8), %rax +	mul	q0 +L(e1):	add	%r14, 16(up,i,8) +	adc	%rax, %rbx +	adc	%rdx, %r10 +	mov	16(mp,i,8), %rax +	mul	q0 +	add	%rbx, 24(up,i,8) +	mov	$0, R32(%r14)		C zero +	mov	%r14, %rbx		C zero +	adc	%rax, %r10 +	mov	24(mp,i,8), %rax +	mov	%r14, %r9		C zero +	adc	%rdx, %r9 +	mul	q0 +	add	$4, i +	js	 L(li1) + +L(le1):	add	%r10, (up) +	adc	%rax, %r9 +	adc	%r14, %rdx +	add	%r9, 8(up) +	adc	$0, %rdx +	mov	%rdx, 16(up,nneg,8)	C up[0] +	add	$8, up +	mov	%r15, q0 +	dec	n +	jnz	L(lo1) + +	mov	nneg, n +	sar	$2, n +	lea	24(up,nneg,8), up +	lea	(up,nneg,8), vp + +	mov	-8(up), %r8 +	add	-8(vp), %r8 +	mov	%r8, (rp) +	lea	8(rp), rp +	jmp	L(addx) + + +	ALIGN(16) +L(0): +L(0m4): +L(lo0):	mov	(mp,nneg,8), %rax +	mov	nneg, i +	mul	q0 +	xor	R32(%r10), R32(%r10) +	mov	%rax, %r14 +	mov	%rdx, %rbx +	mov	8(mp,nneg,8), %rax +	mov	24(up,nneg,8), %r15 +	mul	q0 +	add	16(up,nneg,8), %r14 +	adc	%rax, %rbx +	adc	%rdx, %r10 +	add	%rbx, %r15 +	imul	u0inv, %r15 +	jmp	L(e0) + +	ALIGN(16) +L(li0):	add	%r10, (up,i,8) +	adc	%rax, %r9 +	mov	(mp,i,8), %rax +	adc	%rdx, %r14 +	xor	R32(%r10), R32(%r10) +	mul	q0 +	add	%r9, 8(up,i,8) +	adc	%rax, %r14 +	adc	%rdx, %rbx +	mov	8(mp,i,8), %rax +	mul	q0 +	add	%r14, 16(up,i,8) +	adc	%rax, %rbx +	adc	%rdx, %r10 +L(e0):	mov	16(mp,i,8), %rax +	mul	q0 +	add	%rbx, 24(up,i,8) +	mov	$0, R32(%r14)		C zero +	mov	%r14, %rbx		C zero +	adc	%rax, %r10 +	mov	24(mp,i,8), %rax +	mov	%r14, %r9		C zero +	adc	%rdx, %r9 +	mul	q0 +	add	$4, i +	js	 L(li0) + +L(le0):	add	%r10, (up) +	adc	%rax, %r9 +	adc	%r14, %rdx +	add	%r9, 8(up) +	adc	$0, %rdx +	mov	%rdx, 16(up,nneg,8)	C up[0] +	add	$8, up +	mov	%r15, q0 +	dec	n +	jnz	L(lo0) + +	mov	nneg, n +	sar	$2, n +	clc +	lea	16(up,nneg,8), up +	lea	(up,nneg,8), vp +	jmp	L(addy) + + +	ALIGN(16) +L(3m4): +L(lo3):	mov	(mp,nneg,8), %rax +	mul	q0 +	mov	%rax, %rbx +	mov	%rdx, %r10 +	mov	8(mp,nneg,8), %rax +	mov	24(up,nneg,8), %r15 +	mul	q0 +	add	16(up,nneg,8), %rbx	C result is zero, might carry +	mov	$0, R32(%rbx)		C zero +	mov	%rbx, %r14		C zero +	adc	%rax, %r10 +	mov	16(mp,nneg,8), %rax +	mov	%r14, %r9		C zero +	adc	%rdx, %r9 +	add	%r10, %r15 +	mul	q0 +	lea	3(nneg), i +	imul	u0inv, %r15 +C	jmp	L(li3) + +	ALIGN(16) +L(li3):	add	%r10, (up,i,8) +	adc	%rax, %r9 +	mov	(mp,i,8), %rax +	adc	%rdx, %r14 +	xor	R32(%r10), R32(%r10) +	mul	q0 +	add	%r9, 8(up,i,8) +	adc	%rax, %r14 +	adc	%rdx, %rbx +	mov	8(mp,i,8), %rax +	mul	q0 +	add	%r14, 16(up,i,8) +	adc	%rax, %rbx +	adc	%rdx, %r10 +	mov	16(mp,i,8), %rax +	mul	q0 +	add	%rbx, 24(up,i,8) +	mov	$0, R32(%r14)		C zero +	mov	%r14, %rbx		C zero +	adc	%rax, %r10 +	mov	24(mp,i,8), %rax +	mov	%r14, %r9		C zero +	adc	%rdx, %r9 +	mul	q0 +	add	$4, i +	js	 L(li3) + +L(le3):	add	%r10, (up) +	adc	%rax, %r9 +	adc	%r14, %rdx +	add	%r9, 8(up) +	adc	$0, %rdx +	mov	%rdx, 16(up,nneg,8)	C up[0] +	mov	%r15, q0 +	lea	8(up), up +	dec	n +	jnz	L(lo3) + + +C ==== Addition code ==== +	mov	nneg, n +	sar	$2, n +	lea	40(up,nneg,8), up +	lea	(up,nneg,8), vp + +	mov	-24(up), %r8 +	mov	-16(up), %r9 +	mov	-8(up), %r10 +	add	-24(vp), %r8 +	adc	-16(vp), %r9 +	adc	-8(vp), %r10 +	mov	%r8, (rp) +	mov	%r9, 8(rp) +	mov	%r10, 16(rp) +	lea	24(rp), rp + +L(addx):inc	n +	jz	L(ad3) + +L(addy):mov	(up), %r8 +	mov	8(up), %r9 +	
inc	n +	jmp	L(mid) + +C	ALIGN(16) +L(al3):	adc	(vp), %r8 +	adc	8(vp), %r9 +	adc	16(vp), %r10 +	adc	24(vp), %r11 +	mov	%r8, (rp) +	lea	32(up), up +	mov	%r9, 8(rp) +	mov	%r10, 16(rp) +	inc	n +	mov	%r11, 24(rp) +	lea	32(vp), vp +	mov	(up), %r8 +	mov	8(up), %r9 +	lea	32(rp), rp +L(mid):	mov	16(up), %r10 +	mov	24(up), %r11 +	jnz	L(al3) + +L(ae3):	adc	(vp), %r8 +	adc	8(vp), %r9 +	adc	16(vp), %r10 +	adc	24(vp), %r11 +	mov	%r8, (rp) +	mov	%r9, 8(rp) +	mov	%r10, 16(rp) +	mov	%r11, 24(rp) + +L(ad3):	mov	R32(n), R32(%rax)	C zero +	adc	R32(%rax), R32(%rax) + +L(ret):	pop	%r15 +	pop	%r14 +	pop	%r13 +	pop	%r12 +	pop	%rbx +	pop	%rbp +	FUNC_EXIT() +	ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm new file mode 100644 index 0000000..60cf945 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm @@ -0,0 +1,807 @@ +dnl  AMD64 mpn_sqr_basecase. + +dnl  Contributed to the GNU project by Torbjorn Granlund. + +dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl  This file is part of the GNU MP Library. +dnl +dnl  The GNU MP Library is free software; you can redistribute it and/or modify +dnl  it under the terms of either: +dnl +dnl    * the GNU Lesser General Public License as published by the Free +dnl      Software Foundation; either version 3 of the License, or (at your +dnl      option) any later version. +dnl +dnl  or +dnl +dnl    * the GNU General Public License as published by the Free Software +dnl      Foundation; either version 2 of the License, or (at your option) any +dnl      later version. +dnl +dnl  or both in parallel, as here. +dnl +dnl  The GNU MP Library is distributed in the hope that it will be useful, but +dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License +dnl  for more details. +dnl +dnl  You should have received copies of the GNU General Public License and the +dnl  GNU Lesser General Public License along with the GNU MP Library.  If not, +dnl  see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C The inner loops of this code are the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C NOTES +C   * There is a major stupidity in that we call mpn_mul_1 initially, for a +C     large trip count.  Instead, we should follow the generic/sqr_basecase.c +C     code which uses addmul_2s from the start, conditionally leaving a 1x1 +C     multiply to the end.  (In assembly code, one would stop invoking +C     addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.) +C   * Another stupidity is in the sqr_diag_addlsh1 code.  It does not need to +C     save/restore carry, instead it can propagate into the high product word. +C   * Align more labels, should shave off a few cycles. +C   * We can safely use 32-bit size operations, since operands with (2^32) +C     limbs will lead to non-termination in practice. +C   * The jump table could probably be optimized, at least for non-pic. +C   * The special code for n <= 4 was quickly written.  It is probably too +C     large and unnecessarily slow. +C   * Consider combining small cases code so that the n=k-1 code jumps into the +C     middle of the n=k code. +C   * Avoid saving registers for small cases code. 
+C   * Needed variables: +C    n   r11  input size +C    i   r8   work left, initially n +C    j   r9   inner loop count +C        r15  unused +C    v0  r13 +C    v1  r14 +C    rp  rdi +C    up  rsi +C    w0  rbx +C    w1  rcx +C    w2  rbp +C    w3  r10 +C    tp  r12 +C    lo  rax +C    hi  rdx +C        rsp + +C INPUT PARAMETERS +define(`rp',	  `%rdi') +define(`up',	  `%rsi') +define(`n_param', `%rdx') + +define(`n',	`%r11') +define(`tp',	`%r12') +define(`i',	`%r8') +define(`j',	`%r9') +define(`v0',	`%r13') +define(`v1',	`%r14') +define(`w0',	`%rbx') +define(`w1',	`%rcx') +define(`w2',	`%rbp') +define(`w3',	`%r10') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() +	TEXT +	ALIGN(16) +PROLOGUE(mpn_sqr_basecase) +	FUNC_ENTRY(3) +	mov	R32(n_param), R32(%rcx) +	mov	R32(n_param), R32(n)		C free original n register (rdx) + +	add	$-40, %rsp + +	and	$3, R32(%rcx) +	cmp	$4, R32(n_param) +	lea	4(%rcx), %r8 + +	mov	%rbx, 32(%rsp) +	mov	%rbp, 24(%rsp) +	mov	%r12, 16(%rsp) +	mov	%r13, 8(%rsp) +	mov	%r14, (%rsp) + +	cmovg	%r8, %rcx + +	lea	L(tab)(%rip), %rax +ifdef(`PIC', +`	movslq	(%rax,%rcx,4), %r10 +	add	%r10, %rax +	jmp	*%rax +',` +	jmp	*(%rax,%rcx,8) +') +	JUMPTABSECT +	ALIGN(8) +L(tab):	JMPENT(	L(4), L(tab)) +	JMPENT(	L(1), L(tab)) +	JMPENT(	L(2), L(tab)) +	JMPENT(	L(3), L(tab)) +	JMPENT(	L(0m4), L(tab)) +	JMPENT(	L(1m4), L(tab)) +	JMPENT(	L(2m4), L(tab)) +	JMPENT(	L(3m4), L(tab)) +	TEXT + +L(1):	mov	(up), %rax +	mul	%rax +	add	$40, %rsp +	mov	%rax, (rp) +	mov	%rdx, 8(rp) +	FUNC_EXIT() +	ret + +L(2):	mov	(up), %rax +	mov	%rax, %r8 +	mul	%rax +	mov	8(up), %r11 +	mov	%rax, (rp) +	mov	%r11, %rax +	mov	%rdx, %r9 +	mul	%rax +	add	$40, %rsp +	mov	%rax, %r10 +	mov	%r11, %rax +	mov	%rdx, %r11 +	mul	%r8 +	xor	%r8, %r8 +	add	%rax, %r9 +	adc	%rdx, %r10 +	adc	%r8, %r11 +	add	%rax, %r9 +	mov	%r9, 8(rp) +	adc	%rdx, %r10 +	mov	%r10, 16(rp) +	adc	%r8, %r11 +	mov	%r11, 24(rp) +	FUNC_EXIT() +	ret + +L(3):	mov	(up), %rax +	mov	%rax, %r10 +	mul	%rax +	mov	8(up), %r11 +	mov	%rax, (rp) +	mov	%r11, %rax +	mov	%rdx, 8(rp) +	mul	%rax +	mov	16(up), %rcx +	mov	%rax, 16(rp) +	mov	%rcx, %rax +	mov	%rdx, 24(rp) +	mul	%rax +	mov	%rax, 32(rp) +	mov	%rdx, 40(rp) + +	mov	%r11, %rax +	mul	%r10 +	mov	%rax, %r8 +	mov	%rcx, %rax +	mov	%rdx, %r9 +	mul	%r10 +	xor	%r10, %r10 +	add	%rax, %r9 +	mov	%r11, %rax +	mov	%r10, %r11 +	adc	%rdx, %r10 + +	mul	%rcx +	add	$40, %rsp +	add	%rax, %r10 +	adc	%r11, %rdx +	add	%r8, %r8 +	adc	%r9, %r9 +	adc	%r10, %r10 +	adc	%rdx, %rdx +	adc	%r11, %r11 +	add	%r8, 8(rp) +	adc	%r9, 16(rp) +	adc	%r10, 24(rp) +	adc	%rdx, 32(rp) +	adc	%r11, 40(rp) +	FUNC_EXIT() +	ret + +L(4):	mov	(up), %rax +	mov	%rax, %r11 +	mul	%rax +	mov	8(up), %rbx +	mov	%rax, (rp) +	mov	%rbx, %rax +	mov	%rdx, 8(rp) +	mul	%rax +	mov	%rax, 16(rp) +	mov	%rdx, 24(rp) +	mov	16(up), %rax +	mul	%rax +	mov	%rax, 32(rp) +	mov	%rdx, 40(rp) +	mov	24(up), %rax +	mul	%rax +	mov	%rax, 48(rp) +	mov	%rbx, %rax +	mov	%rdx, 56(rp) + +	mul	%r11 +	add	$32, %rsp +	mov	%rax, %r8 +	mov	%rdx, %r9 +	mov	16(up), %rax +	mul	%r11 +	xor	%r10, %r10 +	add	%rax, %r9 +	adc	%rdx, %r10 +	mov	24(up), %rax +	mul	%r11 +	xor	%r11, %r11 +	add	%rax, %r10 +	adc	%rdx, %r11 +	mov	16(up), %rax +	mul	%rbx +	xor	%rcx, %rcx +	add	%rax, %r10 +	adc	%rdx, %r11 +	adc	$0, %rcx +	mov	24(up), %rax +	mul	%rbx +	pop	%rbx +	add	%rax, %r11 +	adc	%rdx, %rcx +	mov	16(up), %rdx +	mov	24(up), %rax +	mul	%rdx +	add	%rax, %rcx +	adc	$0, %rdx + +	add	%r8, %r8 +	adc	%r9, %r9 +	adc	%r10, %r10 +	adc	%r11, %r11 +	adc	%rcx, %rcx +	mov	$0, R32(%rax) +	adc	%rdx, %rdx + +	adc	%rax, %rax +	add	%r8, 8(rp) +	
adc	%r9, 16(rp) +	adc	%r10, 24(rp) +	adc	%r11, 32(rp) +	adc	%rcx, 40(rp) +	adc	%rdx, 48(rp) +	adc	%rax, 56(rp) +	FUNC_EXIT() +	ret + + +L(0m4): +	lea	-16(rp,n,8), tp		C point tp in middle of result operand +	mov	(up), v0 +	mov	8(up), %rax +	lea	(up,n,8), up		C point up at end of input operand + +	lea	-4(n), i +C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1]) +	xor	R32(j), R32(j) +	sub	n, j + +	mul	v0 +	xor	R32(w2), R32(w2) +	mov	%rax, w0 +	mov	16(up,j,8), %rax +	mov	%rdx, w3 +	jmp	L(L3) + +	ALIGN(16) +L(mul_1_m3_top): +	add	%rax, w2 +	mov	w3, (tp,j,8) +	mov	(up,j,8), %rax +	adc	%rdx, w1 +	xor	R32(w0), R32(w0) +	mul	v0 +	xor	R32(w3), R32(w3) +	mov	w2, 8(tp,j,8) +	add	%rax, w1 +	adc	%rdx, w0 +	mov	8(up,j,8), %rax +	mov	w1, 16(tp,j,8) +	xor	R32(w2), R32(w2) +	mul	v0 +	add	%rax, w0 +	mov	16(up,j,8), %rax +	adc	%rdx, w3 +L(L3):	xor	R32(w1), R32(w1) +	mul	v0 +	add	%rax, w3 +	mov	24(up,j,8), %rax +	adc	%rdx, w2 +	mov	w0, 24(tp,j,8) +	mul	v0 +	add	$4, j +	js	L(mul_1_m3_top) + +	add	%rax, w2 +	mov	w3, (tp) +	adc	%rdx, w1 +	mov	w2, 8(tp) +	mov	w1, 16(tp) + +	lea	eval(2*8)(tp), tp	C tp += 2 +	lea	-8(up), up +	jmp	L(dowhile) + + +L(1m4): +	lea	8(rp,n,8), tp		C point tp in middle of result operand +	mov	(up), v0		C u0 +	mov	8(up), %rax		C u1 +	lea	8(up,n,8), up		C point up at end of input operand + +	lea	-3(n), i +C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1) +	lea	-3(n), j +	neg	j + +	mov	%rax, v1		C u1 +	mul	v0			C u0 * u1 +	mov	%rdx, w1 +	xor	R32(w2), R32(w2) +	mov	%rax, 8(rp) +	jmp	L(m0) + +	ALIGN(16) +L(mul_2_m0_top): +	mul	v1 +	add	%rax, w0 +	adc	%rdx, w1 +	mov	-24(up,j,8), %rax +	mov	$0, R32(w2) +	mul	v0 +	add	%rax, w0 +	mov	-24(up,j,8), %rax +	adc	%rdx, w1 +	adc	$0, R32(w2) +	mul	v1			C v1 * u0 +	add	%rax, w1 +	mov	w0, -24(tp,j,8) +	adc	%rdx, w2 +L(m0):	mov	-16(up,j,8), %rax	C u2, u6 ... 
+	mul	v0			C u0 * u2 +	mov	$0, R32(w3) +	add	%rax, w1 +	adc	%rdx, w2 +	mov	-16(up,j,8), %rax +	adc	$0, R32(w3) +	mov	$0, R32(w0) +	mov	w1, -16(tp,j,8) +	mul	v1 +	add	%rax, w2 +	mov	-8(up,j,8), %rax +	adc	%rdx, w3 +	mov	$0, R32(w1) +	mul	v0 +	add	%rax, w2 +	mov	-8(up,j,8), %rax +	adc	%rdx, w3 +	adc	$0, R32(w0) +	mul	v1 +	add	%rax, w3 +	mov	w2, -8(tp,j,8) +	adc	%rdx, w0 +L(m2x):	mov	(up,j,8), %rax +	mul	v0 +	add	%rax, w3 +	adc	%rdx, w0 +	adc	$0, R32(w1) +	add	$4, j +	mov	-32(up,j,8), %rax +	mov	w3, -32(tp,j,8) +	js	L(mul_2_m0_top) + +	mul	v1 +	add	%rax, w0 +	adc	%rdx, w1 +	mov	w0, -8(tp) +	mov	w1, (tp) + +	lea	-16(up), up +	lea	eval(3*8-24)(tp), tp	C tp += 3 +	jmp	L(dowhile_end) + + +L(2m4): +	lea	-16(rp,n,8), tp		C point tp in middle of result operand +	mov	(up), v0 +	mov	8(up), %rax +	lea	(up,n,8), up		C point up at end of input operand + +	lea	-4(n), i +C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i]) +	lea	-2(n), j +	neg	j + +	mul	v0 +	mov	%rax, w2 +	mov	(up,j,8), %rax +	mov	%rdx, w1 +	jmp	L(L1) + +	ALIGN(16) +L(mul_1_m1_top): +	add	%rax, w2 +	mov	w3, (tp,j,8) +	mov	(up,j,8), %rax +	adc	%rdx, w1 +L(L1):	xor	R32(w0), R32(w0) +	mul	v0 +	xor	R32(w3), R32(w3) +	mov	w2, 8(tp,j,8) +	add	%rax, w1 +	adc	%rdx, w0 +	mov	8(up,j,8), %rax +	mov	w1, 16(tp,j,8) +	xor	R32(w2), R32(w2) +	mul	v0 +	add	%rax, w0 +	mov	16(up,j,8), %rax +	adc	%rdx, w3 +	xor	R32(w1), R32(w1) +	mul	v0 +	add	%rax, w3 +	mov	24(up,j,8), %rax +	adc	%rdx, w2 +	mov	w0, 24(tp,j,8) +	mul	v0 +	add	$4, j +	js	L(mul_1_m1_top) + +	add	%rax, w2 +	mov	w3, (tp) +	adc	%rdx, w1 +	mov	w2, 8(tp) +	mov	w1, 16(tp) + +	lea	eval(2*8)(tp), tp	C tp += 2 +	lea	-8(up), up +	jmp	L(dowhile_mid) + + +L(3m4): +	lea	8(rp,n,8), tp		C point tp in middle of result operand +	mov	(up), v0		C u0 +	mov	8(up), %rax		C u1 +	lea	8(up,n,8), up		C point up at end of input operand + +	lea	-5(n), i +C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i) +	lea	-1(n), j +	neg	j + +	mov	%rax, v1		C u1 +	mul	v0			C u0 * u1 +	mov	%rdx, w3 +	xor	R32(w0), R32(w0) +	xor	R32(w1), R32(w1) +	mov	%rax, 8(rp) +	jmp	L(m2) + +	ALIGN(16) +L(mul_2_m2_top): +	mul	v1 +	add	%rax, w0 +	adc	%rdx, w1 +	mov	-24(up,j,8), %rax +	mov	$0, R32(w2) +	mul	v0 +	add	%rax, w0 +	mov	-24(up,j,8), %rax +	adc	%rdx, w1 +	adc	$0, R32(w2) +	mul	v1			C v1 * u0 +	add	%rax, w1 +	mov	w0, -24(tp,j,8) +	adc	%rdx, w2 +	mov	-16(up,j,8), %rax +	mul	v0 +	mov	$0, R32(w3) +	add	%rax, w1 +	adc	%rdx, w2 +	mov	-16(up,j,8), %rax +	adc	$0, R32(w3) +	mov	$0, R32(w0) +	mov	w1, -16(tp,j,8) +	mul	v1 +	add	%rax, w2 +	mov	-8(up,j,8), %rax +	adc	%rdx, w3 +	mov	$0, R32(w1) +	mul	v0 +	add	%rax, w2 +	mov	-8(up,j,8), %rax +	adc	%rdx, w3 +	adc	$0, R32(w0) +	mul	v1 +	add	%rax, w3 +	mov	w2, -8(tp,j,8) +	adc	%rdx, w0 +L(m2):	mov	(up,j,8), %rax +	mul	v0 +	add	%rax, w3 +	adc	%rdx, w0 +	adc	$0, R32(w1) +	add	$4, j +	mov	-32(up,j,8), %rax +	mov	w3, -32(tp,j,8) +	js	L(mul_2_m2_top) + +	mul	v1 +	add	%rax, w0 +	adc	%rdx, w1 +	mov	w0, -8(tp) +	mov	w1, (tp) + +	lea	-16(up), up +	jmp	L(dowhile_mid) + +L(dowhile): +C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i) +	lea	4(i), j +	neg	j + +	mov	16(up,j,8), v0 +	mov	24(up,j,8), v1 +	mov	24(up,j,8), %rax +	mul	v0 +	xor	R32(w3), R32(w3) +	add	%rax, 24(tp,j,8) +	adc	%rdx, w3 +	xor	R32(w0), R32(w0) +	xor	R32(w1), R32(w1) +	jmp	L(am2) + +	ALIGN(16) +L(addmul_2_m2_top): +	add	w3, (tp,j,8) +	adc	%rax, w0 +	mov	8(up,j,8), %rax +	adc	%rdx, w1 +	mov	$0, R32(w2) +	mul	v0 +	add	%rax, w0 +	mov	8(up,j,8), %rax +	adc	%rdx, w1 +	adc	$0, R32(w2) +	mul	v1				C v1 * u0 +	add	w0, 8(tp,j,8) +	adc	%rax, w1 +	
adc	%rdx, w2 +	mov	16(up,j,8), %rax +	mov	$0, R32(w3) +	mul	v0				C v0 * u1 +	add	%rax, w1 +	mov	16(up,j,8), %rax +	adc	%rdx, w2 +	adc	$0, R32(w3) +	mul	v1				C v1 * u1 +	add	w1, 16(tp,j,8) +	adc	%rax, w2 +	mov	24(up,j,8), %rax +	adc	%rdx, w3 +	mul	v0 +	mov	$0, R32(w0) +	add	%rax, w2 +	adc	%rdx, w3 +	mov	$0, R32(w1) +	mov	24(up,j,8), %rax +	adc	$0, R32(w0) +	mul	v1 +	add	w2, 24(tp,j,8) +	adc	%rax, w3 +	adc	%rdx, w0 +L(am2):	mov	32(up,j,8), %rax +	mul	v0 +	add	%rax, w3 +	mov	32(up,j,8), %rax +	adc	%rdx, w0 +	adc	$0, R32(w1) +	mul	v1 +	add	$4, j +	js	L(addmul_2_m2_top) + +	add	w3, (tp) +	adc	%rax, w0 +	adc	%rdx, w1 +	mov	w0, 8(tp) +	mov	w1, 16(tp) + +	lea	eval(2*8)(tp), tp	C tp += 2 + +	add	$-2, R32(i)		C i -= 2 + +L(dowhile_mid): +C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i) +	lea	2(i), j +	neg	j + +	mov	(up,j,8), v0 +	mov	8(up,j,8), v1 +	mov	8(up,j,8), %rax +	mul	v0 +	xor	R32(w1), R32(w1) +	add	%rax, 8(tp,j,8) +	adc	%rdx, w1 +	xor	R32(w2), R32(w2) +	jmp	L(20) + +	ALIGN(16) +L(addmul_2_m0_top): +	add	w3, (tp,j,8) +	adc	%rax, w0 +	mov	8(up,j,8), %rax +	adc	%rdx, w1 +	mov	$0, R32(w2) +	mul	v0 +	add	%rax, w0 +	mov	8(up,j,8), %rax +	adc	%rdx, w1 +	adc	$0, R32(w2) +	mul	v1				C v1 * u0 +	add	w0, 8(tp,j,8) +	adc	%rax, w1 +	adc	%rdx, w2 +L(20):	mov	16(up,j,8), %rax +	mov	$0, R32(w3) +	mul	v0				C v0 * u1 +	add	%rax, w1 +	mov	16(up,j,8), %rax +	adc	%rdx, w2 +	adc	$0, R32(w3) +	mul	v1				C v1 * u1 +	add	w1, 16(tp,j,8) +	adc	%rax, w2 +	mov	24(up,j,8), %rax +	adc	%rdx, w3 +	mul	v0 +	mov	$0, R32(w0) +	add	%rax, w2 +	adc	%rdx, w3 +	mov	$0, R32(w1) +	mov	24(up,j,8), %rax +	adc	$0, R32(w0) +	mul	v1 +	add	w2, 24(tp,j,8) +	adc	%rax, w3 +	adc	%rdx, w0 +	mov	32(up,j,8), %rax +	mul	v0 +	add	%rax, w3 +	mov	32(up,j,8), %rax +	adc	%rdx, w0 +	adc	$0, R32(w1) +	mul	v1 +	add	$4, j +	js	L(addmul_2_m0_top) + +	add	w3, (tp) +	adc	%rax, w0 +	adc	%rdx, w1 +	mov	w0, 8(tp) +	mov	w1, 16(tp) + +	lea	eval(2*8)(tp), tp	C tp += 2 +L(dowhile_end): + +	add	$-2, R32(i)		C i -= 2 +	jne	L(dowhile) + +C Function mpn_addmul_2s_2 +	mov	-16(up), v0 +	mov	-8(up), v1 +	mov	-8(up), %rax +	mul	v0 +	xor	R32(w3), R32(w3) +	add	%rax, -8(tp) +	adc	%rdx, w3 +	xor	R32(w0), R32(w0) +	xor	R32(w1), R32(w1) +	mov	(up), %rax +	mul	v0 +	add	%rax, w3 +	mov	(up), %rax +	adc	%rdx, w0 +	mul	v1 +	add	w3, (tp) +	adc	%rax, w0 +	adc	%rdx, w1 +	mov	w0, 8(tp) +	mov	w1, 16(tp) + +C Function mpn_sqr_diag_addlsh1 +	lea	-4(n,n), j + +	mov	8(rp), %r11 +	lea	-8(up), up +	lea	(rp,j,8), rp +	neg	j +	mov	(up,j,4), %rax +	mul	%rax +	test	$2, R8(j) +	jnz	L(odd) + +L(evn):	add	%r11, %r11 +	sbb	R32(%rbx), R32(%rbx)		C save CF +	add	%rdx, %r11 +	mov	%rax, (rp,j,8) +	jmp	L(d0) + +L(odd):	add	%r11, %r11 +	sbb	R32(%rbp), R32(%rbp)		C save CF +	add	%rdx, %r11 +	mov	%rax, (rp,j,8) +	lea	-2(j), j +	jmp	L(d1) + +	ALIGN(16) +L(top):	mov	(up,j,4), %rax +	mul	%rax +	add	R32(%rbp), R32(%rbp)		C restore carry +	adc	%rax, %r10 +	adc	%rdx, %r11 +	mov	%r10, (rp,j,8) +L(d0):	mov	%r11, 8(rp,j,8) +	mov	16(rp,j,8), %r10 +	adc	%r10, %r10 +	mov	24(rp,j,8), %r11 +	adc	%r11, %r11 +	nop +	sbb	R32(%rbp), R32(%rbp)		C save CF +	mov	8(up,j,4), %rax +	mul	%rax +	add	R32(%rbx), R32(%rbx)		C restore carry +	adc	%rax, %r10 +	adc	%rdx, %r11 +	mov	%r10, 16(rp,j,8) +L(d1):	mov	%r11, 24(rp,j,8) +	mov	32(rp,j,8), %r10 +	adc	%r10, %r10 +	mov	40(rp,j,8), %r11 +	adc	%r11, %r11 +	sbb	R32(%rbx), R32(%rbx)		C save CF +	add	$4, j +	js	L(top) + +	mov	(up), %rax +	mul	%rax +	add	R32(%rbp), R32(%rbp)		C restore carry +	adc	%rax, %r10 +	adc	%rdx, %r11 +	mov	%r10, (rp) +	mov	%r11, 8(rp) +	mov	16(rp), %r10 +	
adc	%r10, %r10 +	sbb	R32(%rbp), R32(%rbp)		C save CF +	neg	R32(%rbp) +	mov	8(up), %rax +	mul	%rax +	add	R32(%rbx), R32(%rbx)		C restore carry +	adc	%rax, %r10 +	adc	%rbp, %rdx +	mov	%r10, 16(rp) +	mov	%rdx, 24(rp) + +	pop	%r14 +	pop	%r13 +	pop	%r12 +	pop	%rbp +	pop	%rbx +	FUNC_EXIT() +	ret +EPILOGUE()
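To help read k8/sqr_basecase.asm above: as its NOTES describe, the file builds the off-diagonal triangle, the sum over i < j of up[i]*up[j]*B^(i+j), with one mul_1 or mul_2 pass followed by addmul_2 passes, and the final mpn_sqr_diag_addlsh1 section doubles that triangle while adding the diagonal squares up[i]^2 in a single carry-juggling pass. The sketch below is a simplified C model of that decomposition using only public mpn calls: it uses addmul_1 rows where the assembly uses addmul_2, and a separate shift-and-add where the assembly fuses the doubling with the diagonal additions; ref_sqr_basecase is an illustrative name, not GMP code.

/* Illustrative C model of the basecase squaring decomposition:
   up^2 = 2 * (off-diagonal triangle) + (diagonal squares).
   rp gets 2*n limbs; assumes n >= 1 and rp does not overlap up.  */
#include <gmp.h>

static void
ref_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
{
  mp_size_t i;

  rp[0] = 0;
  rp[2 * n - 1] = 0;

  if (n > 1)
    {
      /* Off-diagonal triangle into rp[1 .. 2n-2].  */
      rp[n] = mpn_mul_1 (rp + 1, up + 1, n - 1, up[0]);
      for (i = 1; i < n - 1; i++)
        rp[n + i] = mpn_addmul_1 (rp + 2 * i + 1, up + i + 1, n - 1 - i, up[i]);

      /* Double it; this is the "lsh1" half of sqr_diag_addlsh1.  */
      mpn_lshift (rp, rp, 2 * n, 1);
    }

  /* Add the diagonal squares up[i]^2 at limb offsets 2*i.  */
  for (i = 0; i < n; i++)
    {
      mp_limb_t sq[2], cy;
      sq[1] = mpn_mul_1 (sq, up + i, 1, up[i]);   /* up[i]^2 as two limbs */
      cy = mpn_add_n (rp + 2 * i, rp + 2 * i, sq, 2);
      if (cy)
        mpn_add_1 (rp + 2 * i + 2, rp + 2 * i + 2, 2 * n - 2 * i - 2, cy);
    }
}

The work matches what the NOTES claim: the triangle takes about n*(n-1)/2 limb multiplies and the diagonal another n, roughly half the multiplies of mul_basecase applied to equal operands.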