Basic constant folding implementation

author: Thomas Voss <mail@thomasvoss.com> 2024-06-21 23:36:36 +0200
committer: Thomas Voss <mail@thomasvoss.com> 2024-06-21 23:42:26 +0200
commit: a89a14ef5da44684a16b204e7a70460cc8c4922a (patch)
tree: b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm
parent: 1db63fcedab0b288820d66e100b1877b1a5a8851 (diff)
1 files changed, 371 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm
new file mode 100644
index 0000000..4ced577
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm
@@ -0,0 +1,371 @@
+dnl  Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C    cycles/limb
+C P5:   12.0   for 32-bit multiplier
+C        7.0   for 16-bit multiplier
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t multiplier);
+C
+C When the multiplier is 16 bits some special case MMX code is used.  Small
+C multipliers might arise reasonably often from mpz_mul_ui etc.  If the size
+C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
+C size==8 end up being quite close.  If src isn't aligned to an 8 byte
+C boundary then one limb is processed separately with roughly a 5 cycle
+C penalty, so in that case it's say size==8 and size==9 which are close.
+C
+C Alternatives:
+C
+C MMX is not believed to be of any use for 32-bit multipliers, since for
+C instance the current method would just have to be more or less duplicated
+C for the high and low halves of the multiplier, and would probably
+C therefore run at about 14 cycles, which is slower than the plain integer
+C at 12.
+C
+C Adding the high and low MMX products using integer code seems best.  An
+C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
+C any joy.  Perhaps something could be done keeping the values signed and
+C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
+C perhaps not.
+C
+C Future:
+C
+C An mpn_mul_1c entrypoint would need a double carry out of the low result
+C limb in the 16-bit code, unless it could be assumed the carry fits in 16
+C bits, possibly as carry<multiplier, this being true of a big calculation
+C done piece by piece.  But let's worry about that if/when mul_1c is
+C actually used.
+
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+
+	ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %edx
+
+	cmpl	$1, %ecx
+	jne	L(two_or_more)
+
+	C one limb only
+
+	movl	PARAM_MULTIPLIER, %eax
+	movl	PARAM_DST, %ecx
+
+	mull	(%edx)
+
+	movl	%eax, (%ecx)
+	movl	%edx, %eax
+
+	ret
+
+
+L(two_or_more):
+	C eax	size
+	C ebx
+	C ecx	carry
+	C edx
+	C esi	src
+	C edi
+	C ebp
+
+	pushl	%esi		FRAME_pushl()
+	pushl	%edi		FRAME_pushl()
+
+	movl	%edx, %esi		C src
+	movl	PARAM_DST, %edi
+
+	movl	PARAM_MULTIPLIER, %eax
+	pushl	%ebx		FRAME_pushl()
+
+	leal	(%esi,%ecx,4), %esi	C src end
+	leal	(%edi,%ecx,4), %edi	C dst end
+
+	negl	%ecx			C -size
+
+	pushl	%ebp		FRAME_pushl()
+	cmpl	$65536, %eax
+
+	jb	L(small)
+
+
+L(big):
+	xorl	%ebx, %ebx		C carry limb
+	sarl	%ecx			C -size/2
+
+	jnc	L(top)			C with carry flag clear
+
+
+	C size was odd, process one limb separately
+
+	mull	4(%esi,%ecx,8)		C m * src[0]
+
+	movl	%eax, 4(%edi,%ecx,8)
+	incl	%ecx
+
+	orl	%edx, %ebx		C carry limb, and clear carry flag
+
+
+L(top):
+	C eax
+	C ebx	carry
+	C ecx	counter, negative
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	(scratch carry)
+
+	adcl	$0, %ebx
+	movl	(%esi,%ecx,8), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	movl	%edx, %ebp
+	addl	%eax, %ebx
+
+	adcl	$0, %ebp
+	movl	4(%esi,%ecx,8), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	movl	%ebx, (%edi,%ecx,8)
+	addl	%ebp, %eax
+
+	movl	%eax, 4(%edi,%ecx,8)
+	incl	%ecx
+
+	movl	%edx, %ebx
+	jnz	L(top)
+
+
+	adcl	$0, %ebx
+	popl	%ebp
+
+	movl	%ebx, %eax
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+
+L(small):
+	C Special case for 16-bit multiplier.
+	C
+	C eax	multiplier
+	C ebx
+	C ecx	-size
+	C edx	src
+	C esi	src end
+	C edi	dst end
+	C ebp	multiplier
+
+	C size<3 not supported here.  At size==3 we're already a couple of
+	C cycles faster, so there's no threshold as such, just use the MMX
+	C as soon as possible.
+
+	cmpl	$-3, %ecx
+	ja	L(big)
+
+	movd	%eax, %mm7		C m
+	pxor	%mm6, %mm6		C initial carry word
+
+	punpcklwd %mm7, %mm7		C m replicated 2 times
+	addl	$2, %ecx		C -size+2
+
+	punpckldq %mm7, %mm7		C m replicated 4 times
+	andl	$4, %edx		C test alignment, clear carry flag
+
+	movq	%mm7, %mm0		C m
+	jz	L(small_entry)
+
+
+	C Source is unaligned, process one limb separately.
+	C
+	C Plain integer code is used here, since it's smaller and is about
+	C the same 13 cycles as an mmx block would be.
+	C
+	C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
+	C the use of separate incl and orl.
+
+	mull	-8(%esi,%ecx,4)		C m * src[0]
+
+	movl	%eax, -8(%edi,%ecx,4)	C dst[0]
+	incl	%ecx			C one limb processed
+
+	movd	%edx, %mm6		C initial carry
+
+	orl	%eax, %eax		C clear carry flag
+	jmp	L(small_entry)
+
+
+C The scheduling here is quite tricky, since so many instructions have
+C pairing restrictions.  In particular the js won't pair with a movd, and
+C can't be paired with an adc since it wants flags from the inc, so
+C instructions are rotated to the top of the loop to find somewhere useful
+C for it.
+C
+C Trouble has been taken to avoid overlapping successive loop iterations,
+C since that would greatly increase the size of the startup and finishup
+C code.  Actually there's probably not much advantage to be had from
+C overlapping anyway, since the difficulties are mostly with pairing, not
+C with latencies as such.
+C
+C In the comments x represents the src data and m the multiplier (16
+C bits, but replicated 4 times).
+C
+C The m signs calculated in %mm3 are a loop invariant and could be held in
+C say %mm5, but that would save only one instruction and hence be no faster.
+
+L(small_top):
+	C eax	l.low, then l.high
+	C ebx	(h.low)
+	C ecx	counter, -size+2 to 0 or 1
+	C edx	(h.high)
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp
+	C
+	C %mm0	(high products)
+	C %mm1	(low products)
+	C %mm2	(adjust for m using x signs)
+	C %mm3	(adjust for x using m signs)
+	C %mm4
+	C %mm5
+	C %mm6	h.low, then carry
+	C %mm7	m replicated 4 times
+
+	movd	%mm6, %ebx		C h.low
+	psrlq	$32, %mm1		C l.high
+
+	movd	%mm0, %edx		C h.high
+	movq	%mm0, %mm6		C new c
+
+	adcl	%eax, %ebx
+	incl	%ecx
+
+	movd	%mm1, %eax		C l.high
+	movq	%mm7, %mm0
+
+	adcl	%eax, %edx
+	movl	%ebx, -16(%edi,%ecx,4)
+
+	movl	%edx, -12(%edi,%ecx,4)
+	psrlq	$32, %mm6		C c
+
+L(small_entry):
+	pmulhw	-8(%esi,%ecx,4), %mm0	C h = (x*m).high
+	movq	%mm7, %mm1
+
+	pmullw	-8(%esi,%ecx,4), %mm1	C l = (x*m).low
+	movq	%mm7, %mm3
+
+	movq	-8(%esi,%ecx,4), %mm2	C x
+	psraw	$15, %mm3		C m signs
+
+	pand	-8(%esi,%ecx,4), %mm3	C x selected by m signs
+	psraw	$15, %mm2		C x signs
+
+	paddw	%mm3, %mm0		C add x to h if m neg
+	pand	%mm7, %mm2		C m selected by x signs
+
+	paddw	%mm2, %mm0		C add m to h if x neg
+	incl	%ecx
+
+	movd	%mm1, %eax		C l.low
+	punpcklwd %mm0, %mm6		C c + h.low << 16
+
+	psrlq	$16, %mm0		C h.high
+	js	L(small_top)
+
+
+
+
+	movd	%mm6, %ebx		C h.low
+	psrlq	$32, %mm1		C l.high
+
+	adcl	%eax, %ebx
+	popl	%ebp		FRAME_popl()
+
+	movd	%mm0, %edx		C h.high
+	psrlq	$32, %mm0		C l.high
+
+	movd	%mm1, %eax		C l.high
+
+	adcl	%eax, %edx
+	movl	%ebx, -12(%edi,%ecx,4)
+
+	movd	%mm0, %eax		C c
+
+	adcl	$0, %eax
+	movl	%edx, -8(%edi,%ecx,4)
+
+	orl	%ecx, %ecx
+	jnz	L(small_done)		C final %ecx==1 means even, ==0 odd
+
+
+	C Size odd, one extra limb to process.
+	C Plain integer code is used here, since it's smaller and is about
+	C the same speed as another mmx block would be.
+
+	movl	%eax, %ecx
+	movl	PARAM_MULTIPLIER, %eax
+
+	mull	-4(%esi)
+
+	addl	%ecx, %eax
+
+	adcl	$0, %edx
+	movl	%eax, -4(%edi)
+
+	movl	%edx, %eax
+L(small_done):
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	emms
+
+	ret
+
+EPILOGUE()
author	Thomas Voss <mail@thomasvoss.com>	2024-06-21 23:36:36 +0200
committer	Thomas Voss <mail@thomasvoss.com>	2024-06-21 23:42:26 +0200
commit	a89a14ef5da44684a16b204e7a70460cc8c4922a (patch)
tree	b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm
parent	1db63fcedab0b288820d66e100b1877b1a5a8851 (diff)