diff options
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/arm/neon/lshiftc.asm')
| -rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/neon/lshiftc.asm | 242 | 
1 files changed, 242 insertions, 0 deletions
| diff --git a/vendor/gmp-6.3.0/mpn/arm/neon/lshiftc.asm b/vendor/gmp-6.3.0/mpn/arm/neon/lshiftc.asm
new file mode 100644
index 0000000..f1bf0de
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/arm/neon/lshiftc.asm
@@ -0,0 +1,242 @@
+dnl  ARM Neon mpn_lshiftc.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb     cycles/limb      good
+C              aligned	      unaligned	      best seen	     for cpu?
+C StrongARM	 -		 -
+C XScale	 -		 -
+C Cortex-A7	 ?		 ?
+C Cortex-A8	 ?		 ?
+C Cortex-A9	 3.5		 3.5				Y
+C Cortex-A15	 1.75		 1.75				Y
+
+C Store the complement of {ap,n} << cnt at {rp,n}; return the shifted-out bits.
+C We read 64 bits at a time at 32-bit aligned addresses, and except for the
+C first and last store, we write using 64-bit aligned addresses.  All shifting
+C is done on 64-bit words in 'extension' registers.
+C
+C It should be possible to read also using 64-bit alignment, by manipulating
+C the shift count for unaligned operands.  Not done, since it does not seem to
+C matter for A9 or A15.
+C
+C This will not work in big-endian mode.
+
+C TODO
+C  * Try using 128-bit operations.  Note that Neon lacks pure 128-bit shifts,
+C    which might make it tricky.
+C  * Clean up and simplify.
+C  * Consider sharing most of the code for lshift and rshift, since the feed-in
+C    code, the loop, and most of the wind-down code are identical.
+C  * Replace the basecase code with code using 'extension' registers.
+C  * Optimise.  It is not clear that this loop insn permutation is optimal for
+C    either A9 or A15.
+
+C INPUT PARAMETERS
+define(`rp',  `r0')			C destination limb pointer
+define(`ap',  `r1')			C source limb pointer
+define(`n',   `r2')			C number of 32-bit limbs
+define(`cnt', `r3')			C shift count in bits
+
+ASM_START(neon)
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_lshiftc)
+	mov	r12, n, lsl #2		C r12 = 4*n, operand size in bytes
+	add	rp, rp, r12		C point rp just past the top limb
+	add	ap, ap, r12		C point ap just past the top limb
+
+	cmp	n, #4			C SIMD code n limit
+	ble	L(base)			C n <= 4, use the scalar code
+
+	vdup.32	d6, r3			C left shift count is positive
+	sub	r3, r3, #64		C right shift count is negative
+	vdup.32	d7, r3			C both lanes of d7 = cnt - 64
+	mov	r12, #-8		C lshift pointer update offset
+
+	sub	ap, ap, #8		C step down to the top 2 limbs
+	vld1.32	{d19}, [ap], r12	C load initial 2 limbs
+	vshl.u64 d18, d19, d7		C retval, taken before complementing
+
+	tst	rp, #4			C is rp 64-bit aligned already?
+	beq	L(rp_aligned)		C yes, skip
+	vmvn	 d19, d19		C complement the top 2 limbs
+	add	ap, ap, #4		C move back ap pointer
+	vshl.u64 d4, d19, d6		C shift left, top limb ends up in d4[1]
+	sub	n, n, #1		C first limb handled
+	sub	 rp, rp, #4
+	vst1.32	 {d4[1]}, [rp]		C store first limb, rp gets aligned
+	vld1.32	 {d19}, [ap], r12	C load ap[1] and ap[2]
+
+L(rp_aligned):
+	sub	rp, rp, #8		C rp -> first 64-bit store slot
+	subs	n, n, #6
+	vmvn	 d19, d19		C complement the 2 limbs held in d19
+	blt	L(two_or_three_more)	C fewer than 6 limbs left
+	tst	n, #2
+	beq	L(2)
+
+L(1):	vld1.32	 {d17}, [ap], r12	C feed-in, entering the loop at L(mid)
+	vshl.u64 d5, d19, d6
+	vmvn	 d17, d17
+	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	sub	n, n, #2
+	b	 L(mid)
+
+L(2):	vld1.32	 {d16}, [ap], r12	C feed-in, entering the loop at L(top)
+	vshl.u64 d4, d19, d6
+	vmvn	 d16, d16
+	vld1.32	 {d17}, [ap], r12
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	subs	n, n, #4
+	blt	L(end)
+
+L(top):	vmvn	 d17, d17		C complement the limbs loaded last round
+	vld1.32	 {d16}, [ap], r12
+	vorr	 d2, d4, d1		C merge left- and right-shifted parts
+	vshl.u64 d0, d17, d7		C right shift by 64-cnt
+	vshl.u64 d4, d17, d6		C left shift by cnt
+	vst1.32	 {d2}, [rp:64], r12
+L(mid):	vmvn	 d16, d16
+	vld1.32	 {d17}, [ap], r12
+	vorr	 d3, d5, d0		C merge left- and right-shifted parts
+	vshl.u64 d1, d16, d7		C right shift by 64-cnt
+	vshl.u64 d5, d16, d6		C left shift by cnt
+	vst1.32	 {d3}, [rp:64], r12
+	subs	n, n, #4		C 4 limbs per loop iteration
+	bge	L(top)
+
+L(end):	tst	 n, #1
+	beq	 L(evn)
+
+	vorr	 d2, d4, d1
+	vst1.32	 {d2}, [rp:64], r12
+	b	 L(cj1)
+
+L(evn):	vmvn	 d17, d17
+	vorr	 d2, d4, d1
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vst1.32	 {d2}, [rp:64], r12
+	vmov.u8	 d17, #255		C all ones, fills the vacated low bits
+	vorr	 d2, d5, d0
+	vshl.u64 d0, d17, d7		C ones shifted right by 64-cnt
+	vorr	 d3, d4, d0
+	b	 L(cj2)
+
+C Load last 2 - 3 limbs, store last 4 - 5 limbs
+L(two_or_three_more):
+	tst	n, #1
+	beq	L(l2)
+
+L(l3):	vshl.u64 d5, d19, d6
+	vld1.32	 {d17}, [ap], r12
+L(cj1):	vmov.u8	 d16, #0		C low lane 0, all ones after the vmvn
+	add	 ap, ap, #4
+	vmvn	 d17, d17
+	vld1.32	 {d16[1]}, [ap], r12	C load the single remaining limb
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vmvn	 d16, d16
+	vorr	 d3, d5, d0
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vst1.32	 {d3}, [rp:64], r12
+	vorr	 d2, d4, d1
+	vst1.32	 {d2}, [rp:64], r12
+	add	 rp, rp, #4
+	vst1.32	 {d5[1]}, [rp]		C store the final single limb
+	vmov.32	 r0, d18[0]		C return value
+	bx	lr
+
+L(l2):	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d4, d19, d6
+	vmvn	 d16, d16
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vmov.u8	 d17, #255		C all ones, fills the vacated low bits
+	vorr	 d2, d4, d1
+	vshl.u64 d0, d17, d7		C ones shifted right by 64-cnt
+	vorr	 d3, d5, d0
+L(cj2):	vst1.32	 {d2}, [rp:64], r12
+	vst1.32	 {d3}, [rp]
+	vmov.32	 r0, d18[0]		C return value
+	bx	lr
+
+
+define(`tnc', `r12')
+L(base):
+	push	{r4, r6, r7, r8}
+	ldr	r4, [ap, #-4]!		C r4 = top source limb, kept for retval
+	rsb	tnc, cnt, #32		C tnc = 32 - cnt
+	mvn	r6, r4			C r6 = complemented top limb
+
+	mov	r7, r6, lsl cnt
+	tst	n, #1
+	beq	L(ev)			C n even
+
+L(od):	subs	n, n, #2
+	bcc	L(ed1)			C n = 1
+	ldr	r8, [ap, #-4]!
+	mvn	r8, r8
+	b	L(md)			C n = 3
+
+L(ev):	ldr	r6, [ap, #-4]!
+	mvn	r6, r6
+	subs	n, n, #2
+	beq	L(ed)			C n = 2
+					C n = 4
+L(tp):	ldr	r8, [ap, #-4]!
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mvn	r8, r8
+	mov	r7, r6, lsl cnt
+L(md):	ldr	r6, [ap, #-4]!
+	orr	r7, r7, r8, lsr tnc
+	str	r7, [rp, #-4]!
+	mvn	r6, r6
+	mov	r7, r8, lsl cnt
+
+L(ed):	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(ed1):	mvn	r6, #0			C all ones
+	orr	r7, r7, r6, lsr tnc	C fill the vacated low bits with ones
+	str	r7, [rp, #-4]
+	mov	r0, r4, lsr tnc		C return the bits shifted out at the top
+	pop	{r4, r6, r7, r8}
+	bx	r14
+EPILOGUE() |