diff options
author | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:36:36 +0200 |
---|---|---|
committer | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:42:26 +0200 |
commit | a89a14ef5da44684a16b204e7a70460cc8c4922a (patch) | |
tree | b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/x86/p6/mmx | |
parent | 1db63fcedab0b288820d66e100b1877b1a5a8851 (diff) |
Basic constant folding implementation
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/x86/p6/mmx')
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm | 767 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h | 218 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm | 38 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86/p6/mmx/popham.asm | 39 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm | 38 |
5 files changed, 1100 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm new file mode 100644 index 0000000..5300616 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm @@ -0,0 +1,767 @@ +dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part. 
+ + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t inverse, +C unsigned shift); +C +C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm, +C see that file for some comments. It's possible what's here can be improved. + + +dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by +dnl inverse method is used, rather than plain "divl"s. Minimum value 1. +dnl +dnl The different speeds of the integer and fraction parts means that using +dnl xsize+size isn't quite right. The threshold wants to be a bit higher +dnl for the integer part and a bit lower for the fraction part. (Or what's +dnl really wanted is to speed up the integer part!) +dnl +dnl The threshold is set to make the integer part right. At 4 limbs the +dnl div and mul are about the same there, but on the fractional part the +dnl mul is much faster. 
+ +deflit(MUL_THRESHOLD, 4) + + +defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1 +defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1 +defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC, -28) +defframe(VAR_DST, -32) +defframe(VAR_DST_STOP,-36) + +deflit(STACK_SPACE, 36) + + TEXT + ALIGN(16) + +PROLOGUE(mpn_preinv_divrem_1) +deflit(`FRAME',0) + movl PARAM_XSIZE, %ecx + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebx, SAVE_EBX + movl PARAM_SIZE, %ebx + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %edi, SAVE_EDI + movl PARAM_DST, %edx + + movl -4(%esi,%ebx,4), %eax C src high limb + xorl %edi, %edi C initial carry (if can't skip a div) + + C + + leal 8(%edx,%ecx,4), %edx C &dst[xsize+2] + xor %ecx, %ecx + + movl %edx, VAR_DST_STOP C &dst[xsize+2] + cmpl %ebp, %eax C high cmp divisor + + cmovc( %eax, %edi) C high is carry if high<divisor + + cmovnc( %eax, %ecx) C 0 if skip div, src high if not + C (the latter in case src==dst) + + movl %ecx, -12(%edx,%ebx,4) C dst high limb + + sbbl $0, %ebx C skip one division if high<divisor + movl PARAM_PREINV_SHIFT, %ecx + + leal -8(%edx,%ebx,4), %edx C &dst[xsize+size] + movl $32, %eax + + movl %edx, VAR_DST C &dst[xsize+size] + + shll %cl, %ebp C d normalized + subl %ecx, %eax + movl %ecx, VAR_NORM + + movd %eax, %mm7 C rshift + movl PARAM_PREINV_INVERSE, %eax + jmp L(start_preinv) + +EPILOGUE() + + + + ALIGN(16) + +PROLOGUE(mpn_divrem_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %edi, SAVE_EDI + movl 
PARAM_DST, %edi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + leal -4(%edi,%ebx,4), %edi + jmp L(start_1c) + +EPILOGUE() + + + C offset 0x31, close enough to aligned +PROLOGUE(mpn_divrem_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl $0, %edx C initial carry (if can't skip a div) + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + orl %ecx, %ecx C size + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + leal -4(%edi,%ebx,4), %edi C &dst[xsize-1] + jz L(no_skip_div) C if size==0 + + movl -4(%esi,%ecx,4), %eax C src high limb + xorl %esi, %esi + cmpl %ebp, %eax C high cmp divisor + + cmovc( %eax, %edx) C high is carry if high<divisor + + cmovnc( %eax, %esi) C 0 if skip div, src high if not + C (the latter in case src==dst) + + movl %esi, (%edi,%ecx,4) C dst high limb + + sbbl $0, %ecx C size-1 if high<divisor + movl PARAM_SRC, %esi C reload +L(no_skip_div): + + +L(start_1c): + C eax + C ebx xsize + C ecx size + C edx carry + C esi src + C edi &dst[xsize-1] + C ebp divisor + + leal (%ebx,%ecx), %eax C size+xsize + cmpl $MUL_THRESHOLD, %eax + jae L(mul_by_inverse) + + orl %ecx, %ecx + jz L(divide_no_integer) + +L(divide_integer): + C eax scratch (quotient) + C ebx xsize + C ecx counter + C edx scratch (remainder) + C esi src + C edi &dst[xsize-1] + C ebp divisor + + movl -4(%esi,%ecx,4), %eax + + divl %ebp + + movl %eax, (%edi,%ecx,4) + decl %ecx + jnz L(divide_integer) + + +L(divide_no_integer): + movl PARAM_DST, %edi + orl %ebx, %ebx + jnz L(divide_fraction) + +L(divide_done): + movl SAVE_ESI, %esi + + movl SAVE_EDI, %edi + + movl SAVE_EBX, %ebx + movl %edx, %eax + + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + + +L(divide_fraction): + C eax scratch (quotient) + C ebx counter + C ecx + C edx scratch (remainder) + C esi + C edi dst + C ebp 
divisor + + movl $0, %eax + + divl %ebp + + movl %eax, -4(%edi,%ebx,4) + decl %ebx + jnz L(divide_fraction) + + jmp L(divide_done) + + + +C ----------------------------------------------------------------------------- + +L(mul_by_inverse): + C eax + C ebx xsize + C ecx size + C edx carry + C esi src + C edi &dst[xsize-1] + C ebp divisor + + leal 12(%edi), %ebx C &dst[xsize+2], loop dst stop + + movl %ebx, VAR_DST_STOP + leal 4(%edi,%ecx,4), %edi C &dst[xsize+size] + + movl %edi, VAR_DST + movl %ecx, %ebx C size + + bsrl %ebp, %ecx C 31-l + movl %edx, %edi C carry + + leal 1(%ecx), %eax C 32-l + xorl $31, %ecx C l + + movl %ecx, VAR_NORM + movl $-1, %edx + + shll %cl, %ebp C d normalized + movd %eax, %mm7 + + movl $-1, %eax + subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1 + + divl %ebp C floor (b*(b-d)-1) / d + +L(start_preinv): + C eax inverse + C ebx size + C ecx shift + C edx + C esi src + C edi carry + C ebp divisor + C + C mm7 rshift + + movl %eax, VAR_INVERSE + orl %ebx, %ebx C size + leal -12(%esi,%ebx,4), %eax C &src[size-3] + + movl %eax, VAR_SRC + jz L(start_zero) + + movl 8(%eax), %esi C src high limb + cmpl $1, %ebx + jz L(start_one) + +L(start_two_or_more): + movl 4(%eax), %edx C src second highest limb + + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shldl( %cl, %edx, %esi) C n10 = high,second << l + + cmpl $2, %ebx + je L(integer_two_left) + jmp L(integer_top) + + +L(start_one): + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shll %cl, %esi C n10 = high << l + jmp L(integer_one_left) + + +L(start_zero): + C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and + C skipped a division. 
+ + shll %cl, %edi C n2 = carry << l + movl %edi, %eax C return value for zero_done + cmpl $0, PARAM_XSIZE + + je L(zero_done) + jmp L(fraction_some) + + + +C ----------------------------------------------------------------------------- +C +C This loop runs at about 25 cycles, which is probably sub-optimal, and +C certainly more than the dependent chain would suggest. A better loop, or +C a better rough analysis of what's possible, would be welcomed. +C +C In the current implementation, the following successively dependent +C micro-ops seem to exist. +C +C uops +C n2+n1 1 (addl) +C mul 5 +C q1+1 3 (addl/adcl) +C mul 5 +C sub 3 (subl/sbbl) +C addback 2 (cmov) +C --- +C 19 +C +C Lack of registers hinders explicit scheduling and it might be that the +C normal out of order execution isn't able to hide enough under the mul +C latencies. +C +C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than +C cmov (and takes one uop off the dependent chain). A sarl/andl/addl +C combination was tried for the addback (despite the fact it would lengthen +C the dependent chain) but found to be no faster. 
+ + + ALIGN(16) +L(integer_top): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp d + C + C mm0 scratch (src qword) + C mm7 rshift for normalization + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl VAR_SRC, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + movq (%ecx), %mm0 C next src limb and the one below it + + mull VAR_INVERSE C m*(n2+n1) + + subl $4, %ecx + + movl %ecx, VAR_SRC + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + movl %ebp, %eax C d + leal 1(%edi), %ebx C n2+1 + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + jz L(q1_ff) + + mull %ebx C (q1+1)*d + + movl VAR_DST, %ecx + psrlq %mm7, %mm0 + + C + + C + + C + + subl %eax, %esi + movl VAR_DST_STOP, %eax + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + sbbl $0, %ebx C q + subl $4, %ecx + + movl %ebx, (%ecx) + cmpl %eax, %ecx + + movl %ecx, VAR_DST + jne L(integer_top) + + +L(integer_loop_done): + + +C ----------------------------------------------------------------------------- +C +C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz +C q1_ff special case. This make the code a bit smaller and simpler, and +C costs only 2 cycles (each). 
+ +L(integer_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm7 rshift + + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl PARAM_SRC, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd (%ecx), %mm0 C src low limb + + movl VAR_DST_STOP, %ecx + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2+1 + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + sbbl $0, %ebx C q + + movl %ebx, -4(%ecx) + + +C ----------------------------------------------------------------------------- +L(integer_one_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm7 rshift + + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl VAR_DST_STOP, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2+1 + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx C q1 if q1+1 overflowed + + mull %ebx + + C + + C + + C + + C + + subl %eax, %esi + movl PARAM_XSIZE, %eax + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from 
using q1+1 + + sbbl $0, %ebx C q + + movl %ebx, -8(%ecx) + subl $8, %ecx + + + + orl %eax, %eax C xsize + jnz L(fraction_some) + + movl %edi, %eax +L(fraction_done): + movl VAR_NORM, %ecx +L(zero_done): + movl SAVE_EBP, %ebp + + movl SAVE_EDI, %edi + + movl SAVE_ESI, %esi + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + shrl %cl, %eax + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx + C edx + C esi n10 + C edi n2 + C ebp divisor + + movl VAR_DST, %ecx + movl VAR_DST_STOP, %edx + subl $4, %ecx + + movl %ecx, VAR_DST + psrlq %mm7, %mm0 + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + + movl $-1, (%ecx) + movd %mm0, %esi C next n10 + + cmpl %ecx, %edx + jne L(integer_top) + + jmp L(integer_loop_done) + + + +C ----------------------------------------------------------------------------- +C +C In the current implementation, the following successively dependent +C micro-ops seem to exist. +C +C uops +C mul 5 +C q1+1 1 (addl) +C mul 5 +C sub 3 (negl/sbbl) +C addback 2 (cmov) +C --- +C 16 +C +C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for +C the addback was found to be a touch slower. 
+ + + ALIGN(16) +L(fraction_some): + C eax + C ebx + C ecx + C edx + C esi + C edi carry + C ebp divisor + + movl PARAM_DST, %esi + movl VAR_DST_STOP, %ecx C &dst[xsize+2] + movl %edi, %eax + + subl $8, %ecx C &dst[xsize] + + + ALIGN(16) +L(fraction_top): + C eax n2, then scratch + C ebx scratch (nadj, q1) + C ecx dst, decrementing + C edx scratch + C esi dst stop point + C edi n2 + C ebp divisor + + mull VAR_INVERSE C m*n2 + + movl %ebp, %eax C d + subl $4, %ecx C dst + leal 1(%edi), %ebx + + C + + C + + C + + addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1 + + mull %ebx C (q1+1)*d + + C + + C + + C + + C + + negl %eax C low of n - (q1+1)*d + + sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry + leal (%ebp,%eax), %edx + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + + sbbl $0, %ebx C q + movl %eax, %edi C remainder->n2 + cmpl %esi, %ecx + + movl %ebx, (%ecx) C previous q + jne L(fraction_top) + + + jmp L(fraction_done) + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h new file mode 100644 index 0000000..ef29061 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h @@ -0,0 +1,218 @@ +/* Intel P6/mmx gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991-2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the + value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard limit in + mpn/x86/p6/sqr_basecase.asm. */ + + +/* 800 MHz P6 model 8 */ +/* Generated by tuneup.c, 2017-02-03, gcc 4.8 */ + +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 30 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 14 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 62 + +#define DIV_1_VS_MUL_1_PERCENT 168 + +#define MUL_TOOM22_THRESHOLD 22 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 195 +#define MUL_TOOM6H_THRESHOLD 254 +#define MUL_TOOM8H_THRESHOLD 381 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 100 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 /* WRONG value, see comment above */ +#define SQR_TOOM3_THRESHOLD 83 +#define SQR_TOOM4_THRESHOLD 
196 +#define SQR_TOOM6_THRESHOLD 214 +#define SQR_TOOM8_THRESHOLD 381 + +#define MULMID_TOOM42_THRESHOLD 56 + +#define MULMOD_BNM1_THRESHOLD 16 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 476, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \ + { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 167,10}, { 95, 9}, { 199,10}, \ + { 111,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511,10}, { 143, 9}, { 287, 8}, { 575,10}, \ + { 159,11}, { 95,10}, { 191, 9}, { 383,10}, \ + { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543, 8}, { 1087,10}, \ + { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 351, 9}, { 703,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 415, 9}, { 831,11}, \ + { 223,10}, { 447,12}, { 127,11}, { 255,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 671,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \ + { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \ + { 703,10}, { 1407,11}, { 735,12}, { 383,11}, \ + { 831,12}, { 447,11}, { 959,10}, { 1919,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 831,11}, { 1727,12}, { 959,11}, \ + { 1919,14}, { 255,13}, { 511,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1727,13}, { 895,12}, { 1919,11}, \ + { 3839,14}, { 511,13}, { 1023,12}, { 
2111,13}, \ + { 1151,12}, { 2431,13}, { 1279,12}, { 2559,13}, \ + { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \ + { 3327,13}, { 1919,12}, { 3839,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 160 +#define MUL_FFT_THRESHOLD 7040 + +#define SQR_FFT_MODF_THRESHOLD 376 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 376, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 135,10}, { 79, 9}, { 167,10}, \ + { 95, 9}, { 191, 8}, { 383,10}, { 111,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511, 9}, \ + { 271,10}, { 143, 9}, { 287, 8}, { 575, 9}, \ + { 303, 8}, { 607,10}, { 159, 9}, { 319,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287, 9}, { 575,10}, \ + { 303,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 479,12}, { 127,11}, { 255,10}, { 543, 9}, \ + { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \ + { 319,10}, { 671,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,11}, { 479,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 703,10}, \ + { 1407,11}, { 735,12}, { 383,11}, { 831,12}, \ + { 447,11}, { 959,10}, { 1919,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1215,10}, \ + { 2431,12}, { 639,11}, { 1343,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 831,11}, { 1727,12}, \ + { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1471,11}, \ + { 2943,13}, { 767,12}, { 
1727,13}, { 895,12}, \ + { 1919,11}, { 3839,14}, { 511,13}, { 1023,12}, \ + { 2111,13}, { 1151,12}, { 2431,13}, { 1407,12}, \ + { 2943,14}, { 767,13}, { 1535,12}, { 3071,13}, \ + { 1663,12}, { 3455,13}, { 1919,12}, { 3839,15}, \ + { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 161 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 62 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 177 +#define SQRLO_SQR_THRESHOLD 8937 + +#define DC_DIV_QR_THRESHOLD 80 +#define DC_DIVAPPR_Q_THRESHOLD 240 +#define DC_BDIV_QR_THRESHOLD 76 +#define DC_BDIV_Q_THRESHOLD 166 + +#define INV_MULMOD_BNM1_THRESHOLD 42 +#define INV_NEWTON_THRESHOLD 262 +#define INV_APPR_THRESHOLD 250 + +#define BINV_NEWTON_THRESHOLD 272 +#define REDC_1_TO_REDC_N_THRESHOLD 72 + +#define MU_DIV_QR_THRESHOLD 1499 +#define MU_DIVAPPR_Q_THRESHOLD 1470 +#define MUPI_DIV_QR_THRESHOLD 124 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1341 + +#define POWM_SEC_TABLE 1,16,96,416,1259 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define SET_STR_DC_THRESHOLD 270 +#define SET_STR_PRECOMPUTE_THRESHOLD 1084 + +#define FAC_DSC_THRESHOLD 194 +#define FAC_ODD_THRESHOLD 25 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD_THRESHOLD 124 +#define HGCD_APPR_THRESHOLD 152 +#define HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 474 +#define GCDEXT_DC_THRESHOLD 321 +#define JACOBI_BASE_METHOD 1 diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm new file mode 100644 index 0000000..febd1c0 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm @@ -0,0 +1,38 @@ +dnl Intel Pentium-II mpn_lshift -- mpn left shift. + +dnl Copyright 2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl The P55 code runs well on P-II/III, but could stand some minor tweaks +dnl at some stage probably. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86/pentium/mmx/lshift.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mmx/popham.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/popham.asm new file mode 100644 index 0000000..fd340e4 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/popham.asm @@ -0,0 +1,39 @@ +dnl Intel Pentium-II mpn_popcount, mpn_hamdist -- population count and +dnl hamming distance. + +dnl Copyright 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P6MMX: popcount 11 cycles/limb (approx), hamdist 11.5 cycles/limb (approx) + + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) +include_mpn(`x86/k6/mmx/popham.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm new file mode 100644 index 0000000..77aa190 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm @@ -0,0 +1,38 @@ +dnl Intel Pentium-II mpn_rshift -- mpn right shift. + +dnl Copyright 2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl The P55 code runs well on P-II/III, but could stand some minor tweaks +dnl at some stage probably. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_rshift) +include_mpn(`x86/pentium/mmx/rshift.asm') |