1 files changed, 237 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm
new file mode 100644
index 0000000..7789117
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm
@@ -0,0 +1,237 @@
+dnl  IA-64 mpn_mod_34lsub1
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2003-2005, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      ?
+C Itanium 2:    1
+
+
+C INPUT PARAMETERS
+define(`up', `r32')
+define(`n',  `r33')
+
+C Some useful aliases for registers we use
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16')
+define(`a0',`r17') define(`a1',`r18') define(`a2',`r19')
+define(`c0',`r20') define(`c1',`r21') define(`c2',`r22')
+
+C This is a fairly simple-minded implementation.  One could approach 0.67 c/l
+C with a more sophisticated implementation.  If we're really crazy, we could
+C super-unroll, storing carries just in predicate registers, then copy them to
+C a general register, and population count them from there.  That'd bring us
+C close to 3 insn/limb, for nearly 0.5 c/l.
+
+C Computing n/3 needs 16 cycles, which is a lot of startup overhead.
+C We therefore use a plain while-style loop:
+C	add		n = -3, n
+C	cmp.le		p9, p0 = 3, n
+C  (p9)	br.cond		.Loop
+C Alternatively, we could table n/3 for, say, n < 256, and predicate the
+C 16-cycle code.
+
+C The summing-up code at the end was written quickly, and could surely be
+C vastly improved.
+
+ASM_START()
+PROLOGUE(mpn_mod_34lsub1)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+	addp4		up = 0, up		C			M I
+	nop.m		0
+	zxt4		n = n			C			I
+	;;
+')
+
+ifelse(0,1,`
+	movl		r14 = 0xAAAAAAAAAAAAAAAB
+	;;
+	setf.sig	f6 = r14
+	setf.sig	f7 = r33
+	;;
+	xmpy.hu		f6 = f6, f7
+	;;
+	getf.sig	r8 = f6
+	;;
+	shr.u		r8 = r8, 1		C Loop count
+	;;
+	mov.i		ar.lc = r8
+')
+
+	ld8	u0 = [up], 8
+	cmp.ne	p9, p0 = 1, n
+  (p9)	br	L(gt1)
+	;;
+	shr.u	r8 = u0, 48
+	dep.z	r27 = u0, 0, 48
+	;;
+	add	r8 = r8, r27
+	br.ret.sptk.many b0
+
+
+L(gt1):
+ {.mmi;	nop.m	0
+	mov	a0 = 0
+	add	n = -2, n
+}{.mmi;	mov	c0 = 0
+	mov	c1 = 0
+	mov	c2 = 0
+	;;
+}{.mmi;	ld8	u1 = [up], 8
+	mov	a1 = 0
+	cmp.ltu	p6, p0 = r0, r0		C clear p6
+}{.mmb;	cmp.gt	p9, p0 = 3, n
+	mov	a2 = 0
+  (p9)	br.cond.dptk	L(end)
+	;;
+}
+	ALIGN(32)
+L(top):
+ {.mmi;	ld8	u2 = [up], 8
+  (p6)	add	c0 = 1, c0
+	cmp.ltu	p7, p0 = a0, u0
+}{.mmb;	sub	a0 = a0, u0
+	add	n = -3, n
+	nop.b	0
+	;;
+}{.mmi;	ld8	u0 = [up], 8
+  (p7)	add	c1 = 1, c1
+	cmp.ltu	p8, p0 = a1, u1
+}{.mmb;	sub	a1 = a1, u1
+	cmp.le	p9, p0 = 3, n
+	nop.b	0
+	;;
+}{.mmi;	ld8	u1 = [up], 8
+  (p8)	add	c2 = 1, c2
+	cmp.ltu	p6, p0 = a2, u2
+}{.mmb;	sub	a2 = a2, u2
+	nop.m	0
+dnl	br.cloop.dptk	L(top)
+  (p9)	br.cond.dptk	L(top)
+	;;
+}
+L(end):
+	cmp.eq	p10, p0 = 0, n
+	cmp.eq	p11, p0 = 1, n
+  (p10)	br	L(0)
+
+L(2):
+ {.mmi;	ld8	u2 = [up], 8
+  (p6)	add	c0 = 1, c0
+	cmp.ltu	p7, p0 = a0, u0
+}{.mmb;	sub	a0 = a0, u0
+	nop.m	0
+  (p11)	br	L(1)
+	;;
+}	ld8	u0 = [up], 8
+  (p7)	add	c1 = 1, c1
+	cmp.ltu	p8, p0 = a1, u1
+	sub	a1 = a1, u1
+	;;
+  (p8)	add	c2 = 1, c2
+	cmp.ltu	p6, p0 = a2, u2
+	sub	a2 = a2, u2
+	;;
+  (p6)	add	c0 = 1, c0
+	cmp.ltu	p7, p0 = a0, u0
+	sub	a0 = a0, u0
+	;;
+  (p7)	add	c1 = 1, c1
+	br	L(com)
+
+
+L(1):
+  (p7)	add	c1 = 1, c1
+	cmp.ltu	p8, p0 = a1, u1
+	sub	a1 = a1, u1
+	;;
+  (p8)	add	c2 = 1, c2
+	cmp.ltu	p6, p0 = a2, u2
+	sub	a2 = a2, u2
+	;;
+  (p6)	add	c0 = 1, c0
+	br	L(com)
+
+
+L(0):
+  (p6)	add	c0 = 1, c0
+	cmp.ltu	p7, p0 = a0, u0
+	sub	a0 = a0, u0
+	;;
+  (p7)	add	c1 = 1, c1
+	cmp.ltu	p8, p0 = a1, u1
+	sub	a1 = a1, u1
+	;;
+  (p8)	add	c2 = 1, c2
+
+L(com):
+C |     a2    |     a1    |     a0    |
+C |        |        |        |        |
+	shr.u	r24 = a0, 48		C 16 bits
+	shr.u	r25 = a1, 32		C 32 bits
+	shr.u	r26 = a2, 16		C 48 bits
+	;;
+	shr.u	r10 = c0, 48		C 16 bits, always zero
+	shr.u	r11 = c1, 32		C 32 bits
+	shr.u	r30 = c2, 16		C 48 bits
+	;;
+	dep.z	r27 = a0,  0, 48	C 48 bits
+	dep.z	r28 = a1, 16, 32	C 48 bits
+	dep.z	r29 = a2, 32, 16	C 48 bits
+	dep.z	r31 = c0,  0, 48	C 48 bits
+	dep.z	r14 = c1, 16, 32	C 48 bits
+	dep.z	r15 = c2, 32, 16	C 48 bits
+	;;
+ {.mmi;	add	r24 = r24, r25
+	add	r26 = r26, r27
+	add	r28 = r28, r29
+}{.mmi;	add	r10 = r10, r11
+	add	r30 = r30, r31
+	add	r14 = r14, r15
+	;;
+}
+	movl	r8 = 0xffffffffffff0
+	add	r24 = r24, r26
+	add	r10 = r10, r30
+	;;
+	add	r24 = r24, r28
+	add	r10 = r10, r14
+	;;
+	sub	r8 = r8, r24
+	;;
+	add	r8 = r8, r10
+	br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()