diff options
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm')
| -rw-r--r-- | vendor/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm | 237 | 
1 files changed, 237 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm
new file mode 100644
index 0000000..7789117
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm
@@ -0,0 +1,237 @@
+dnl  IA-64 mpn_mod_34lsub1
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2003-2005, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      ?
+C Itanium 2:    1
+
+
+C INPUT PARAMETERS
+define(`up', `r32')
+define(`n',  `r33')
+
+C Some useful aliases for registers we use
+C   u0-u2  limbs loaded from [up]
+C   a0-a2  three running accumulators, one per limb position within a group
+C          of three; each limb is SUBTRACTED from its accumulator
+C   c0-c2  counts of the borrows produced by those subtractions
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16')
+define(`a0',`r17') define(`a1',`r18') define(`a2',`r19')
+define(`c0',`r20') define(`c1',`r21') define(`c2',`r22')
+
+C This is a fairly simple-minded implementation.  One could approach 0.67 c/l
+C with a more sophisticated implementation.  If we're really crazy, we could
+C super-unroll, storing carries just in predicate registers, then copy them to
+C a general register, and population count them from there.  That'd bring us
+C close to 3 insn/limb, for nearly 0.5 c/l.
+
+C Computing n/3 needs 16 cycles, which is a lot of startup overhead.
+C We therefore use a plain while-style loop:
+C	add		n = -3, n
+C	cmp.le		p9, p0 = 3, n
+C  (p9)	br.cond		.Loop
+C Alternatively, we could table n/3 for, say, n < 256, and predicate the
+C 16-cycle code.
+
+C The summing-up code at the end was written quickly, and could surely be
+C vastly improved.
+
+C Overview of the routine below.
+C   Input:  up (r32) = address of the first limb, n (r33) = limb count.
+C   Output: r8, written immediately before each br.ret.
+C   The limbs are consumed in groups of three.  Limb k of a group is
+C   subtracted from accumulator a<k>; a borrow out of a0 is counted in c1,
+C   out of a1 in c2, and out of a2 in c0.  At L(com) the accumulators and the
+C   borrow counts are each folded at 48-bit boundaries with weights 1, 2^16
+C   and 2^32, and the result is 0xffffffffffff0 - fold(a) + fold(c).
+C   NOTE(review): presumably the returned value is congruent to the operand
+C   modulo 2^48-1, as the function name and the 48-bit folding suggest; it is
+C   not a fully reduced residue -- confirm against the generic C version.
+C   NOTE(review): the code loads a second limb whenever n is not 1, so it
+C   appears to assume n >= 1 -- confirm with callers.
+
+ASM_START()
+PROLOGUE(mpn_mod_34lsub1)
+C NOTE(review): no instruction below copies ar.lc into r2; the only write to
+C ar.lc sits in the disabled ifelse block and the br.cloop is commented out
+C with dnl.  The .save directive looks like a leftover from that variant --
+C confirm it is harmless for the unwinder.
+	.prologue
+	.save	ar.lc, r2
+	.body
+C Only assembled when HAVE_ABI_32 is defined: addp4 rewrites up and zxt4
+C zero-extends the low 32 bits of n.
+C NOTE(review): presumably this widens 32-bit pointer and count arguments.
+ifdef(`HAVE_ABI_32',`
+	addp4		up = 0, up		C			M I
+	nop.m		0
+	zxt4		n = n			C			I
+	;;
+')
+
+C The ifelse condition 0,1 never matches, so this block is not assembled.
+C It would compute a loop count from n with a multiply by 0xAAAAAAAAAAAAAAAB
+C and a shift, and put it in ar.lc.
+ifelse(0,1,`
+	movl		r14 = 0xAAAAAAAAAAAAAAAB
+	;;
+	setf.sig	f6 = r14
+	setf.sig	f7 = r33
+	;;
+	xmpy.hu		f6 = f6, f7
+	;;
+	getf.sig	r8 = f6
+	;;
+	shr.u		r8 = r8, 1		C Loop count
+	;;
+	mov.i		ar.lc = r8
+')
+
+C Load the first limb, then branch away unless n == 1.
+	ld8	u0 = [up], 8
+	cmp.ne	p9, p0 = 1, n
+  (p9)	br	L(gt1)
+	;;
+C n == 1: return (u0 >> 48) + (low 48 bits of u0).
+	shr.u	r8 = u0, 48
+	dep.z	r27 = u0, 0, 48
+	;;
+	add	r8 = r8, r27
+	br.ret.sptk.many b0
+
+
+C n != 1: zero a0-a2 and c0-c2, load the second limb into u1, and make n the
+C number of limbs not yet loaded (n - 2).  The loop is skipped when fewer
+C than three limbs remain to be loaded.
+L(gt1):
+ {.mmi;	nop.m	0
+	mov	a0 = 0
+	add	n = -2, n
+}{.mmi;	mov	c0 = 0
+	mov	c1 = 0
+	mov	c2 = 0
+	;;
+}{.mmi;	ld8	u1 = [up], 8
+	mov	a1 = 0
+	cmp.ltu	p6, p0 = r0, r0		C clear p6
+}{.mmb;	cmp.gt	p9, p0 = 3, n
+	mov	a2 = 0
+  (p9)	br.cond.dptk	L(end)
+	;;
+}
+	ALIGN(32)
+C Main loop, three limbs per pass.  On entry u0 and u1 are loaded but not yet
+C accumulated, and p6 holds the borrow of the previous a2 update (clear on
+C first entry).  Each pass loads u2 and the next u0 and u1, subtracts the old
+C u0, u1 and the new u2 from a0, a1, a2, and lowers n by 3.  Each borrow
+C predicate set by a cmp.ltu is consumed by the predicated add in the
+C following instruction group; p6 carries over into the next pass or a tail.
+L(top):
+ {.mmi;	ld8	u2 = [up], 8
+  (p6)	add	c0 = 1, c0
+	cmp.ltu	p7, p0 = a0, u0
+}{.mmb;	sub	a0 = a0, u0
+	add	n = -3, n
+	nop.b	0
+	;;
+}{.mmi;	ld8	u0 = [up], 8
+  (p7)	add	c1 = 1, c1
+	cmp.ltu	p8, p0 = a1, u1
+}{.mmb;	sub	a1 = a1, u1
+	cmp.le	p9, p0 = 3, n
+	nop.b	0
+	;;
+}{.mmi;	ld8	u1 = [up], 8
+  (p8)	add	c2 = 1, c2
+	cmp.ltu	p6, p0 = a2, u2
+}{.mmb;	sub	a2 = a2, u2
+	nop.m	0
+dnl	br.cloop.dptk	L(top)
+  (p9)	br.cond.dptk	L(top)
+	;;
+}
+C Here n is the number of limbs still to be loaded (0, 1 or 2 when the entry
+C count was at least 2).  u0 and u1 are still pending in every case.
+L(end):
+	cmp.eq	p10, p0 = 0, n
+	cmp.eq	p11, p0 = 1, n
+  (p10)	br	L(0)
+
+C n is 1 or 2: load one more limb into u2 and accumulate the pending u0.
+C For n == 1 continue at L(1).  Otherwise load the last limb into u0 and
+C accumulate u1, u2 and that last limb.
+L(2):
+ {.mmi;	ld8	u2 = [up], 8
+  (p6)	add	c0 = 1, c0
+	cmp.ltu	p7, p0 = a0, u0
+}{.mmb;	sub	a0 = a0, u0
+	nop.m	0
+  (p11)	br	L(1)
+	;;
+}	ld8	u0 = [up], 8
+  (p7)	add	c1 = 1, c1
+	cmp.ltu	p8, p0 = a1, u1
+	sub	a1 = a1, u1
+	;;
+  (p8)	add	c2 = 1, c2
+	cmp.ltu	p6, p0 = a2, u2
+	sub	a2 = a2, u2
+	;;
+  (p6)	add	c0 = 1, c0
+	cmp.ltu	p7, p0 = a0, u0
+	sub	a0 = a0, u0
+	;;
+  (p7)	add	c1 = 1, c1
+	br	L(com)
+
+
+C Tail for n == 1: u0 was already accumulated in the first bundle of L(2)
+C and p7 holds its borrow; accumulate u1 and u2.
+L(1):
+  (p7)	add	c1 = 1, c1
+	cmp.ltu	p8, p0 = a1, u1
+	sub	a1 = a1, u1
+	;;
+  (p8)	add	c2 = 1, c2
+	cmp.ltu	p6, p0 = a2, u2
+	sub	a2 = a2, u2
+	;;
+  (p6)	add	c0 = 1, c0
+	br	L(com)
+
+
+C Tail for n == 0: count the borrow left in p6, then accumulate the pending
+C u0 and u1.  Falls through into L(com).
+L(0):
+  (p6)	add	c0 = 1, c0
+	cmp.ltu	p7, p0 = a0, u0
+	sub	a0 = a0, u0
+	;;
+  (p7)	add	c1 = 1, c1
+	cmp.ltu	p8, p0 = a1, u1
+	sub	a1 = a1, u1
+	;;
+  (p8)	add	c2 = 1, c2
+
+C Final fold.  a0, a1, a2 carry weights 1, 2^16, 2^32, and so do c0, c1, c2.
+C Each register is split at the bit where its weighted value crosses 2^48:
+C the upper part is shifted down (shr.u) and the lower part is deposited at
+C its weight (dep.z).  The six a pieces are summed into r24 and the six c
+C pieces into r10.  r14 and r15 (the u0 and u1 aliases) are reused as scratch.
+C The result is r8 = 0xffffffffffff0 - r24 + r10; the constant is
+C 16 * (2^48 - 1).
+L(com):
+C |     a2    |     a1    |     a0    |
+C |        |        |        |        |
+	shr.u	r24 = a0, 48		C 16 bits
+	shr.u	r25 = a1, 32		C 32 bits
+	shr.u	r26 = a2, 16		C 48 bits
+	;;
+	shr.u	r10 = c0, 48		C 16 bits, always zero
+	shr.u	r11 = c1, 32		C 32 bits
+	shr.u	r30 = c2, 16		C 48 bits
+	;;
+	dep.z	r27 = a0,  0, 48	C 48 bits
+	dep.z	r28 = a1, 16, 32	C 48 bits
+	dep.z	r29 = a2, 32, 16	C 48 bits
+	dep.z	r31 = c0,  0, 48	C 48 bits
+	dep.z	r14 = c1, 16, 32	C 48 bits
+	dep.z	r15 = c2, 32, 16	C 48 bits
+	;;
+ {.mmi;	add	r24 = r24, r25
+	add	r26 = r26, r27
+	add	r28 = r28, r29
+}{.mmi;	add	r10 = r10, r11
+	add	r30 = r30, r31
+	add	r14 = r14, r15
+	;;
+}
+	movl	r8 = 0xffffffffffff0
+	add	r24 = r24, r26
+	add	r10 = r10, r30
+	;;
+	add	r24 = r24, r28
+	add	r10 = r10, r14
+	;;
+	sub	r8 = r8, r24
+	;;
+	add	r8 = r8, r10
+	br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()  |