author    | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:36:36 +0200
committer | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:42:26 +0200
commit    | a89a14ef5da44684a16b204e7a70460cc8c4922a (patch)
tree      | b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/arm/neon
parent    | 1db63fcedab0b288820d66e100b1877b1a5a8851 (diff)
Basic constant folding implementation
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/arm/neon')
-rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/neon/README            |   2
-rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/neon/hamdist.asm       | 194
-rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/neon/lorrshift.asm     | 279
-rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/neon/lshiftc.asm       | 242
-rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/neon/popcount.asm      | 166
-rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm | 140
6 files changed, 1023 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/arm/neon/README b/vendor/gmp-6.3.0/mpn/arm/neon/README new file mode 100644 index 0000000..79e3b48 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/arm/neon/README @@ -0,0 +1,2 @@ +This directory contains Neon code which runs and is efficient on all +ARM CPUs which support Neon. diff --git a/vendor/gmp-6.3.0/mpn/arm/neon/hamdist.asm b/vendor/gmp-6.3.0/mpn/arm/neon/hamdist.asm new file mode 100644 index 0000000..2320896 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/arm/neon/hamdist.asm @@ -0,0 +1,194 @@ +dnl ARM Neon mpn_hamdist -- mpn bit hamming distance. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM: - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 1.89 +C Cortex-A15 0.95 + +C TODO +C * Explore using vldr and vldm. Does it help on A9? (These loads do +C 64-bits-at-a-time, which will mess up in big-endian mode. Except not for +C popcount. Except perhaps also for popcount for the edge loads.) +C * Arrange to align the pointer, if that helps performance. Use the same +C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry +C valgrind!) +C * Explore if explicit align directives, e.g., "[ptr:128]" help. +C * See rth's gmp-devel 2013-02/03 messages about final summation tricks. + +C INPUT PARAMETERS +define(`ap', r0) +define(`bp', r1) +define(`n', r2) + +C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end +C up with 8 16-bit counters. Therefore, we can sum to 8(2^16-1) bits, or +C (8*2^16-1)/32 = 0x3fff limbs. We use a chunksize close to that, but which +C can be represented as a 8-bit ARM constant. +C +define(`chunksize',0x3f80) + +ASM_START() +PROLOGUE(mpn_hamdist) + + cmp n, #chunksize + bhi L(gt16k) + +L(lt16k): + vmov.i64 q8, #0 C clear summation register + vmov.i64 q9, #0 C clear summation register + + tst n, #1 + beq L(xxx0) + vmov.i64 d0, #0 + vmov.i64 d20, #0 + sub n, n, #1 + vld1.32 {d0[0]}, [ap]! C load 1 limb + vld1.32 {d20[0]}, [bp]! C load 1 limb + veor d0, d0, d20 + vcnt.8 d24, d0 + vpadal.u8 d16, d24 C d16/q8 = 0; could just splat + +L(xxx0):tst n, #2 + beq L(xx00) + sub n, n, #2 + vld1.32 {d0}, [ap]! C load 2 limbs + vld1.32 {d20}, [bp]! C load 2 limbs + veor d0, d0, d20 + vcnt.8 d24, d0 + vpadal.u8 d16, d24 + +L(xx00):tst n, #4 + beq L(x000) + sub n, n, #4 + vld1.32 {q0}, [ap]! C load 4 limbs + vld1.32 {q10}, [bp]! 
C load 4 limbs + veor q0, q0, q10 + vcnt.8 q12, q0 + vpadal.u8 q8, q12 + +L(x000):tst n, #8 + beq L(0000) + + subs n, n, #8 + vld1.32 {q0,q1}, [ap]! C load 8 limbs + vld1.32 {q10,q11}, [bp]! C load 8 limbs + bls L(sum) + +L(gt8): vld1.32 {q2,q3}, [ap]! C load 8 limbs + vld1.32 {q14,q15}, [bp]! C load 8 limbs + veor q0, q0, q10 + veor q1, q1, q11 + sub n, n, #8 + vcnt.8 q12, q0 + vcnt.8 q13, q1 + b L(mid) + +L(0000):subs n, n, #16 + blo L(e0) + + vld1.32 {q2,q3}, [ap]! C load 8 limbs + vld1.32 {q0,q1}, [ap]! C load 8 limbs + vld1.32 {q14,q15}, [bp]! C load 8 limbs + vld1.32 {q10,q11}, [bp]! C load 8 limbs + veor q2, q2, q14 + veor q3, q3, q15 + vcnt.8 q12, q2 + vcnt.8 q13, q3 + subs n, n, #16 + blo L(end) + +L(top): vld1.32 {q2,q3}, [ap]! C load 8 limbs + vld1.32 {q14,q15}, [bp]! C load 8 limbs + veor q0, q0, q10 + veor q1, q1, q11 + vpadal.u8 q8, q12 + vcnt.8 q12, q0 + vpadal.u8 q9, q13 + vcnt.8 q13, q1 +L(mid): vld1.32 {q0,q1}, [ap]! C load 8 limbs + vld1.32 {q10,q11}, [bp]! C load 8 limbs + veor q2, q2, q14 + veor q3, q3, q15 + subs n, n, #16 + vpadal.u8 q8, q12 + vcnt.8 q12, q2 + vpadal.u8 q9, q13 + vcnt.8 q13, q3 + bhs L(top) + +L(end): vpadal.u8 q8, q12 + vpadal.u8 q9, q13 +L(sum): veor q0, q0, q10 + veor q1, q1, q11 + vcnt.8 q12, q0 + vcnt.8 q13, q1 + vpadal.u8 q8, q12 + vpadal.u8 q9, q13 + vadd.i16 q8, q8, q9 + C we have 8 16-bit counts +L(e0): vpaddl.u16 q8, q8 C we have 4 32-bit counts + vpaddl.u32 q8, q8 C we have 2 64-bit counts + vmov.32 r0, d16[0] + vmov.32 r1, d17[0] + add r0, r0, r1 + bx lr + +C Code for large count. Splits operand and calls above code. +define(`ap2', r5) +define(`bp2', r6) +L(gt16k): + push {r4,r5,r6,r14} + mov ap2, ap + mov bp2, bp + mov r3, n C full count + mov r4, #0 C total sum + +1: mov n, #chunksize C count for this invocation + bl L(lt16k) C could jump deep inside code + add ap2, ap2, #chunksize*4 C point at next chunk + add bp2, bp2, #chunksize*4 C point at next chunk + add r4, r4, r0 + mov ap, ap2 C put chunk pointer in place for call + mov bp, bp2 C put chunk pointer in place for call + sub r3, r3, #chunksize + cmp r3, #chunksize + bhi 1b + + mov n, r3 C count for final invocation + bl L(lt16k) + add r0, r4, r0 + pop {r4,r5,r6,pc} +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/arm/neon/lorrshift.asm b/vendor/gmp-6.3.0/mpn/arm/neon/lorrshift.asm new file mode 100644 index 0000000..7ebc780 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/arm/neon/lorrshift.asm @@ -0,0 +1,279 @@ +dnl ARM Neon mpn_lshift and mpn_rshift. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C StrongARM - - +C XScale - - +C Cortex-A7 ? ? +C Cortex-A8 ? ? +C Cortex-A9 3 3 Y +C Cortex-A15 1.5 1.5 Y + + +C We read 64 bits at a time at 32-bit aligned addresses, and except for the +C first and last store, we write using 64-bit aligned addresses. All shifting +C is done on 64-bit words in 'extension' registers. +C +C It should be possible to read also using 64-bit alignment, by manipulating +C the shift count for unaligned operands. Not done, since it does not seem to +C matter for A9 or A15. +C +C This will not work in big-endian mode. + +C TODO +C * Try using 128-bit operations. Note that Neon lacks pure 128-bit shifts, +C which might make it tricky. +C * Clean up and simplify. +C * Consider sharing most of the code for lshift and rshift, since the feed-in +C code, the loop, and most of the wind-down code are identical. +C * Replace the basecase code with code using 'extension' registers. +C * Optimise. It is not clear that this loop insn permutation is optimal for +C either A9 or A15. + +C INPUT PARAMETERS +define(`rp', `r0') +define(`ap', `r1') +define(`n', `r2') +define(`cnt', `r3') + +ifdef(`OPERATION_lshift',` + define(`IFLSH', `$1') + define(`IFRSH', `') + define(`X',`0') + define(`Y',`1') + define(`func',`mpn_lshift') +') +ifdef(`OPERATION_rshift',` + define(`IFLSH', `') + define(`IFRSH', `$1') + define(`X',`1') + define(`Y',`0') + define(`func',`mpn_rshift') +') + +MULFUNC_PROLOGUE(mpn_lshift mpn_rshift) + +ASM_START(neon) + TEXT + ALIGN(64) +PROLOGUE(func) +IFLSH(` mov r12, n, lsl #2 ') +IFLSH(` add rp, rp, r12 ') +IFLSH(` add ap, ap, r12 ') + + cmp n, #4 C SIMD code n limit + ble L(base) + +ifdef(`OPERATION_lshift',` + vdup.32 d6, r3 C left shift count is positive + sub r3, r3, #64 C right shift count is negative + vdup.32 d7, r3 + mov r12, #-8') C lshift pointer update offset +ifdef(`OPERATION_rshift',` + rsb r3, r3, #0 C right shift count is negative + vdup.32 d6, r3 + add r3, r3, #64 C left shift count is positive + vdup.32 d7, r3 + mov r12, #8') C rshift pointer update offset + +IFLSH(` sub ap, ap, #8 ') + vld1.32 {d19}, [ap], r12 C load initial 2 limbs + vshl.u64 d18, d19, d7 C retval + + tst rp, #4 C is rp 64-bit aligned already? + beq L(rp_aligned) C yes, skip +IFLSH(` add ap, ap, #4 ') C move back ap pointer +IFRSH(` sub ap, ap, #4 ') C move back ap pointer + vshl.u64 d4, d19, d6 + sub n, n, #1 C first limb handled +IFLSH(` sub rp, rp, #4 ') + vst1.32 {d4[Y]}, [rp]IFRSH(!) 
C store first limb, rp gets aligned + vld1.32 {d19}, [ap], r12 C load ap[1] and ap[2] + +L(rp_aligned): +IFLSH(` sub rp, rp, #8 ') + subs n, n, #6 + blt L(two_or_three_more) + tst n, #2 + beq L(2) + +L(1): vld1.32 {d17}, [ap], r12 + vshl.u64 d5, d19, d6 + vld1.32 {d16}, [ap], r12 + vshl.u64 d0, d17, d7 + vshl.u64 d4, d17, d6 + sub n, n, #2 + b L(mid) + +L(2): vld1.32 {d16}, [ap], r12 + vshl.u64 d4, d19, d6 + vld1.32 {d17}, [ap], r12 + vshl.u64 d1, d16, d7 + vshl.u64 d5, d16, d6 + subs n, n, #4 + blt L(end) + +L(top): vld1.32 {d16}, [ap], r12 + vorr d2, d4, d1 + vshl.u64 d0, d17, d7 + vshl.u64 d4, d17, d6 + vst1.32 {d2}, [rp:64], r12 +L(mid): vld1.32 {d17}, [ap], r12 + vorr d3, d5, d0 + vshl.u64 d1, d16, d7 + vshl.u64 d5, d16, d6 + vst1.32 {d3}, [rp:64], r12 + subs n, n, #4 + bge L(top) + +L(end): tst n, #1 + beq L(evn) + + vorr d2, d4, d1 + vst1.32 {d2}, [rp:64], r12 + b L(cj1) + +L(evn): vorr d2, d4, d1 + vshl.u64 d0, d17, d7 + vshl.u64 d16, d17, d6 + vst1.32 {d2}, [rp:64], r12 + vorr d2, d5, d0 + b L(cj2) + +C Load last 2 - 3 limbs, store last 4 - 5 limbs +L(two_or_three_more): + tst n, #1 + beq L(l2) + +L(l3): vshl.u64 d5, d19, d6 + vld1.32 {d17}, [ap], r12 +L(cj1): veor d16, d16, d16 +IFLSH(` add ap, ap, #4 ') + vld1.32 {d16[Y]}, [ap], r12 + vshl.u64 d0, d17, d7 + vshl.u64 d4, d17, d6 + vorr d3, d5, d0 + vshl.u64 d1, d16, d7 + vshl.u64 d5, d16, d6 + vst1.32 {d3}, [rp:64], r12 + vorr d2, d4, d1 + vst1.32 {d2}, [rp:64], r12 +IFLSH(` add rp, rp, #4 ') + vst1.32 {d5[Y]}, [rp] + vmov.32 r0, d18[X] + bx lr + +L(l2): vld1.32 {d16}, [ap], r12 + vshl.u64 d4, d19, d6 + vshl.u64 d1, d16, d7 + vshl.u64 d16, d16, d6 + vorr d2, d4, d1 +L(cj2): vst1.32 {d2}, [rp:64], r12 + vst1.32 {d16}, [rp] + vmov.32 r0, d18[X] + bx lr + + +define(`tnc', `r12') +L(base): + push {r4, r6, r7, r8} +ifdef(`OPERATION_lshift',` + ldr r4, [ap, #-4]! + rsb tnc, cnt, #32 + + mov r7, r4, lsl cnt + tst n, #1 + beq L(ev) C n even + +L(od): subs n, n, #2 + bcc L(ed1) C n = 1 + ldr r8, [ap, #-4]! + b L(md) C n = 3 + +L(ev): ldr r6, [ap, #-4]! + subs n, n, #2 + beq L(ed) C n = 3 + C n = 4 +L(tp): ldr r8, [ap, #-4]! + orr r7, r7, r6, lsr tnc + str r7, [rp, #-4]! + mov r7, r6, lsl cnt +L(md): ldr r6, [ap, #-4]! + orr r7, r7, r8, lsr tnc + str r7, [rp, #-4]! + mov r7, r8, lsl cnt + +L(ed): orr r7, r7, r6, lsr tnc + str r7, [rp, #-4]! + mov r7, r6, lsl cnt +L(ed1): str r7, [rp, #-4] + mov r0, r4, lsr tnc +') +ifdef(`OPERATION_rshift',` + ldr r4, [ap] + rsb tnc, cnt, #32 + + mov r7, r4, lsr cnt + tst n, #1 + beq L(ev) C n even + +L(od): subs n, n, #2 + bcc L(ed1) C n = 1 + ldr r8, [ap, #4]! + b L(md) C n = 3 + +L(ev): ldr r6, [ap, #4]! + subs n, n, #2 + beq L(ed) C n = 2 + C n = 4 + +L(tp): ldr r8, [ap, #4]! + orr r7, r7, r6, lsl tnc + str r7, [rp], #4 + mov r7, r6, lsr cnt +L(md): ldr r6, [ap, #4]! + orr r7, r7, r8, lsl tnc + str r7, [rp], #4 + mov r7, r8, lsr cnt + +L(ed): orr r7, r7, r6, lsl tnc + str r7, [rp], #4 + mov r7, r6, lsr cnt +L(ed1): str r7, [rp], #4 + mov r0, r4, lsl tnc +') + pop {r4, r6, r7, r8} + bx r14 +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/arm/neon/lshiftc.asm b/vendor/gmp-6.3.0/mpn/arm/neon/lshiftc.asm new file mode 100644 index 0000000..f1bf0de --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/arm/neon/lshiftc.asm @@ -0,0 +1,242 @@ +dnl ARM Neon mpn_lshiftc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C StrongARM - - +C XScale - - +C Cortex-A7 ? ? +C Cortex-A8 ? ? +C Cortex-A9 3.5 3.5 Y +C Cortex-A15 1.75 1.75 Y + + +C We read 64 bits at a time at 32-bit aligned addresses, and except for the +C first and last store, we write using 64-bit aligned addresses. All shifting +C is done on 64-bit words in 'extension' registers. +C +C It should be possible to read also using 64-bit alignment, by manipulating +C the shift count for unaligned operands. Not done, since it does not seem to +C matter for A9 or A15. +C +C This will not work in big-endian mode. + +C TODO +C * Try using 128-bit operations. Note that Neon lacks pure 128-bit shifts, +C which might make it tricky. +C * Clean up and simplify. +C * Consider sharing most of the code for lshift and rshift, since the feed-in +C code, the loop, and most of the wind-down code are identical. +C * Replace the basecase code with code using 'extension' registers. +C * Optimise. It is not clear that this loop insn permutation is optimal for +C either A9 or A15. + +C INPUT PARAMETERS +define(`rp', `r0') +define(`ap', `r1') +define(`n', `r2') +define(`cnt', `r3') + +ASM_START(neon) + TEXT + ALIGN(64) +PROLOGUE(mpn_lshiftc) + mov r12, n, lsl #2 + add rp, rp, r12 + add ap, ap, r12 + + cmp n, #4 C SIMD code n limit + ble L(base) + + vdup.32 d6, r3 C left shift count is positive + sub r3, r3, #64 C right shift count is negative + vdup.32 d7, r3 + mov r12, #-8 C lshift pointer update offset + + sub ap, ap, #8 + vld1.32 {d19}, [ap], r12 C load initial 2 limbs + vshl.u64 d18, d19, d7 C retval + + tst rp, #4 C is rp 64-bit aligned already? 
+ beq L(rp_aligned) C yes, skip + vmvn d19, d19 + add ap, ap, #4 C move back ap pointer + vshl.u64 d4, d19, d6 + sub n, n, #1 C first limb handled + sub rp, rp, #4 + vst1.32 {d4[1]}, [rp] C store first limb, rp gets aligned + vld1.32 {d19}, [ap], r12 C load ap[1] and ap[2] + +L(rp_aligned): + sub rp, rp, #8 + subs n, n, #6 + vmvn d19, d19 + blt L(two_or_three_more) + tst n, #2 + beq L(2) + +L(1): vld1.32 {d17}, [ap], r12 + vshl.u64 d5, d19, d6 + vmvn d17, d17 + vld1.32 {d16}, [ap], r12 + vshl.u64 d0, d17, d7 + vshl.u64 d4, d17, d6 + sub n, n, #2 + b L(mid) + +L(2): vld1.32 {d16}, [ap], r12 + vshl.u64 d4, d19, d6 + vmvn d16, d16 + vld1.32 {d17}, [ap], r12 + vshl.u64 d1, d16, d7 + vshl.u64 d5, d16, d6 + subs n, n, #4 + blt L(end) + +L(top): vmvn d17, d17 + vld1.32 {d16}, [ap], r12 + vorr d2, d4, d1 + vshl.u64 d0, d17, d7 + vshl.u64 d4, d17, d6 + vst1.32 {d2}, [rp:64], r12 +L(mid): vmvn d16, d16 + vld1.32 {d17}, [ap], r12 + vorr d3, d5, d0 + vshl.u64 d1, d16, d7 + vshl.u64 d5, d16, d6 + vst1.32 {d3}, [rp:64], r12 + subs n, n, #4 + bge L(top) + +L(end): tst n, #1 + beq L(evn) + + vorr d2, d4, d1 + vst1.32 {d2}, [rp:64], r12 + b L(cj1) + +L(evn): vmvn d17, d17 + vorr d2, d4, d1 + vshl.u64 d0, d17, d7 + vshl.u64 d4, d17, d6 + vst1.32 {d2}, [rp:64], r12 + vmov.u8 d17, #255 + vorr d2, d5, d0 + vshl.u64 d0, d17, d7 + vorr d3, d4, d0 + b L(cj2) + +C Load last 2 - 3 limbs, store last 4 - 5 limbs +L(two_or_three_more): + tst n, #1 + beq L(l2) + +L(l3): vshl.u64 d5, d19, d6 + vld1.32 {d17}, [ap], r12 +L(cj1): vmov.u8 d16, #0 + add ap, ap, #4 + vmvn d17, d17 + vld1.32 {d16[1]}, [ap], r12 + vshl.u64 d0, d17, d7 + vshl.u64 d4, d17, d6 + vmvn d16, d16 + vorr d3, d5, d0 + vshl.u64 d1, d16, d7 + vshl.u64 d5, d16, d6 + vst1.32 {d3}, [rp:64], r12 + vorr d2, d4, d1 + vst1.32 {d2}, [rp:64], r12 + add rp, rp, #4 + vst1.32 {d5[1]}, [rp] + vmov.32 r0, d18[0] + bx lr + +L(l2): vld1.32 {d16}, [ap], r12 + vshl.u64 d4, d19, d6 + vmvn d16, d16 + vshl.u64 d1, d16, d7 + vshl.u64 d5, d16, d6 + vmov.u8 d17, #255 + vorr d2, d4, d1 + vshl.u64 d0, d17, d7 + vorr d3, d5, d0 +L(cj2): vst1.32 {d2}, [rp:64], r12 + vst1.32 {d3}, [rp] + vmov.32 r0, d18[0] + bx lr + + +define(`tnc', `r12') +L(base): + push {r4, r6, r7, r8} + ldr r4, [ap, #-4]! + rsb tnc, cnt, #32 + mvn r6, r4 + + mov r7, r6, lsl cnt + tst n, #1 + beq L(ev) C n even + +L(od): subs n, n, #2 + bcc L(ed1) C n = 1 + ldr r8, [ap, #-4]! + mvn r8, r8 + b L(md) C n = 3 + +L(ev): ldr r6, [ap, #-4]! + mvn r6, r6 + subs n, n, #2 + beq L(ed) C n = 3 + C n = 4 +L(tp): ldr r8, [ap, #-4]! + orr r7, r7, r6, lsr tnc + str r7, [rp, #-4]! + mvn r8, r8 + mov r7, r6, lsl cnt +L(md): ldr r6, [ap, #-4]! + orr r7, r7, r8, lsr tnc + str r7, [rp, #-4]! + mvn r6, r6 + mov r7, r8, lsl cnt + +L(ed): orr r7, r7, r6, lsr tnc + str r7, [rp, #-4]! + mov r7, r6, lsl cnt +L(ed1): mvn r6, #0 + orr r7, r7, r6, lsr tnc + str r7, [rp, #-4] + mov r0, r4, lsr tnc + pop {r4, r6, r7, r8} + bx r14 +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/arm/neon/popcount.asm b/vendor/gmp-6.3.0/mpn/arm/neon/popcount.asm new file mode 100644 index 0000000..2f8f9af --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/arm/neon/popcount.asm @@ -0,0 +1,166 @@ +dnl ARM Neon mpn_popcount -- mpn bit population count. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM: - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 1.125 +C Cortex-A15 0.56 + +C TODO +C * Explore using vldr and vldm. Does it help on A9? (These loads do +C 64-bits-at-a-time, which will mess up in big-endian mode. Except not for +C popcount. Except perhaps also for popcount for the edge loads.) +C * Arrange to align the pointer, if that helps performance. Use the same +C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry +C valgrind!) +C * Explore if explicit align directives, e.g., "[ptr:128]" help. +C * See rth's gmp-devel 2013-02/03 messages about final summation tricks. + +C INPUT PARAMETERS +define(`ap', r0) +define(`n', r1) + +C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end +C up with 8 16-bit counters. Therefore, we can sum to 8(2^16-1) bits, or +C (8*2^16-1)/32 = 0x3fff limbs. We use a chunksize close to that, but which +C can be represented as a 8-bit ARM constant. +C +define(`chunksize',0x3f80) + +ASM_START() +PROLOGUE(mpn_popcount) + + cmp n, #chunksize + bhi L(gt16k) + +L(lt16k): + vmov.i64 q8, #0 C clear summation register + vmov.i64 q9, #0 C clear summation register + + tst n, #1 + beq L(xxx0) + vmov.i64 d0, #0 + sub n, n, #1 + vld1.32 {d0[0]}, [ap]! C load 1 limb + vcnt.8 d24, d0 + vpadal.u8 d16, d24 C d16/q8 = 0; could just splat + +L(xxx0):tst n, #2 + beq L(xx00) + sub n, n, #2 + vld1.32 {d0}, [ap]! C load 2 limbs + vcnt.8 d24, d0 + vpadal.u8 d16, d24 + +L(xx00):tst n, #4 + beq L(x000) + sub n, n, #4 + vld1.32 {q0}, [ap]! C load 4 limbs + vcnt.8 q12, q0 + vpadal.u8 q8, q12 + +L(x000):tst n, #8 + beq L(0000) + + subs n, n, #8 + vld1.32 {q0,q1}, [ap]! C load 8 limbs + bls L(sum) + +L(gt8): vld1.32 {q2,q3}, [ap]! C load 8 limbs + sub n, n, #8 + vcnt.8 q12, q0 + vcnt.8 q13, q1 + b L(mid) + +L(0000):subs n, n, #16 + blo L(e0) + + vld1.32 {q2,q3}, [ap]! C load 8 limbs + vld1.32 {q0,q1}, [ap]! C load 8 limbs + vcnt.8 q12, q2 + vcnt.8 q13, q3 + subs n, n, #16 + blo L(end) + +L(top): vld1.32 {q2,q3}, [ap]! C load 8 limbs + vpadal.u8 q8, q12 + vcnt.8 q12, q0 + vpadal.u8 q9, q13 + vcnt.8 q13, q1 +L(mid): vld1.32 {q0,q1}, [ap]! 
C load 8 limbs + subs n, n, #16 + vpadal.u8 q8, q12 + vcnt.8 q12, q2 + vpadal.u8 q9, q13 + vcnt.8 q13, q3 + bhs L(top) + +L(end): vpadal.u8 q8, q12 + vpadal.u8 q9, q13 +L(sum): vcnt.8 q12, q0 + vcnt.8 q13, q1 + vpadal.u8 q8, q12 + vpadal.u8 q9, q13 + vadd.i16 q8, q8, q9 + C we have 8 16-bit counts +L(e0): vpaddl.u16 q8, q8 C we have 4 32-bit counts + vpaddl.u32 q8, q8 C we have 2 64-bit counts + vmov.32 r0, d16[0] + vmov.32 r1, d17[0] + add r0, r0, r1 + bx lr + +C Code for large count. Splits operand and calls above code. +define(`ap2', r2) C caller-saves reg not used above +L(gt16k): + push {r4,r14} + mov ap2, ap + mov r3, n C full count + mov r4, #0 C total sum + +1: mov n, #chunksize C count for this invocation + bl L(lt16k) C could jump deep inside code + add ap2, ap2, #chunksize*4 C point at next chunk + add r4, r4, r0 + mov ap, ap2 C put chunk pointer in place for call + sub r3, r3, #chunksize + cmp r3, #chunksize + bhi 1b + + mov n, r3 C count for final invocation + bl L(lt16k) + add r0, r4, r0 + pop {r4,pc} +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm b/vendor/gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm new file mode 100644 index 0000000..69fceb0 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm @@ -0,0 +1,140 @@ +dnl ARM Neon mpn_sec_tabselect. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 1.15 +C Cortex-A15 0.65 + +define(`rp', `r0') +define(`tp', `r1') +define(`n', `r2') +define(`nents', `r3') +C define(`which', on stack) + +define(`i', `r4') +define(`j', `r5') + +define(`maskq', `q10') +define(`maskd', `d20') + +ASM_START() +PROLOGUE(mpn_sec_tabselect) + push {r4-r5} + + add r4, sp, #8 + vld1.32 {d30[], d31[]}, [r4] C 4 `which' copies + vmov.i32 q14, #1 C 4 copies of 1 + + subs j, n, #8 + bmi L(outer_end) + +L(outer_top): + mov i, nents + mov r12, tp C preserve tp + veor q13, q13, q13 C 4 counter copies + veor q2, q2, q2 + veor q3, q3, q3 + ALIGN(16) +L(top): vceq.i32 maskq, q13, q15 C compare idx copies to `which' copies + vld1.32 {q0,q1}, [tp] + vadd.i32 q13, q13, q14 + vbit q2, q0, maskq + vbit q3, q1, maskq + add tp, tp, n, lsl #2 + subs i, i, #1 + bne L(top) + vst1.32 {q2,q3}, [rp]! 
+ add tp, r12, #32 C restore tp, point to next slice + subs j, j, #8 + bpl L(outer_top) +L(outer_end): + + tst n, #4 + beq L(b0xx) +L(b1xx):mov i, nents + mov r12, tp + veor q13, q13, q13 + veor q2, q2, q2 + ALIGN(16) +L(tp4): vceq.i32 maskq, q13, q15 + vld1.32 {q0}, [tp] + vadd.i32 q13, q13, q14 + vbit q2, q0, maskq + add tp, tp, n, lsl #2 + subs i, i, #1 + bne L(tp4) + vst1.32 {q2}, [rp]! + add tp, r12, #16 + +L(b0xx):tst n, #2 + beq L(b00x) +L(b01x):mov i, nents + mov r12, tp + veor d26, d26, d26 + veor d4, d4, d4 + ALIGN(16) +L(tp2): vceq.i32 maskd, d26, d30 + vld1.32 {d0}, [tp] + vadd.i32 d26, d26, d28 + vbit d4, d0, maskd + add tp, tp, n, lsl #2 + subs i, i, #1 + bne L(tp2) + vst1.32 {d4}, [rp]! + add tp, r12, #8 + +L(b00x):tst n, #1 + beq L(b000) +L(b001):mov i, nents + mov r12, tp + veor d26, d26, d26 + veor d4, d4, d4 + ALIGN(16) +L(tp1): vceq.i32 maskd, d26, d30 + vld1.32 {d0[0]}, [tp] + vadd.i32 d26, d26, d28 + vbit d4, d0, maskd + add tp, tp, n, lsl #2 + subs i, i, #1 + bne L(tp1) + vst1.32 {d4[0]}, [rp] + +L(b000):pop {r4-r5} + bx r14 +EPILOGUE() |
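
Both hamdist.asm and popcount.asm above spell out the same size constraint: the vcnt.8 byte counts are accumulated into 16-bit lanes, so a single pass may cover at most about 8*(2^16-1) bits, roughly 0x3fff 32-bit limbs, and larger operands are therefore processed in chunks of 0x3f80 limbs (a value chosen because it is encodable as an 8-bit ARM immediate). The C sketch below only illustrates that chunking logic under those assumptions; popcount_chunk and popcount_chunked are hypothetical names, not GMP's API, and __builtin_popcount assumes GCC or Clang.

    #include <stddef.h>
    #include <stdint.h>

    typedef uint32_t limb_t;        /* 32-bit limbs, matching the ARM code above */

    #define CHUNKSIZE 0x3f80        /* same chunk limit as the asm (`chunksize') */

    /* Hypothetical stand-in for the L(lt16k) path: count the set bits in at
       most CHUNKSIZE limbs.  __builtin_popcount is a GCC/Clang builtin. */
    static uint64_t popcount_chunk(const limb_t *ap, size_t n)
    {
        uint64_t sum = 0;
        for (size_t i = 0; i < n; i++)
            sum += (uint64_t)__builtin_popcount(ap[i]);
        return sum;
    }

    /* Sketch of the L(gt16k) splitting: walk the operand in CHUNKSIZE pieces so
       each inner pass stays below the 16-bit lane-counter overflow limit. */
    uint64_t popcount_chunked(const limb_t *ap, size_t n)
    {
        uint64_t total = 0;
        while (n > CHUNKSIZE) {
            total += popcount_chunk(ap, CHUNKSIZE);
            ap += CHUNKSIZE;
            n -= CHUNKSIZE;
        }
        return total + popcount_chunk(ap, n);
    }

The hamming-distance variant follows the same outer structure; it merely XORs two operands limb by limb before counting, as the veor instructions in hamdist.asm do.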