diff options
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/arm/v6')
-rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/v6/addmul_1.asm | 112 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/v6/addmul_2.asm | 125 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/v6/addmul_3.asm | 191 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/v6/dive_1.asm | 149 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/v6/gmp-mparam.h | 187 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/v6/mode1o.asm | 95 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/v6/mul_1.asm | 115 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/v6/mul_2.asm | 135 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/v6/popham.asm | 139 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm | 544 | ||||
-rw-r--r-- | vendor/gmp-6.3.0/mpn/arm/v6/submul_1.asm | 125 |
11 files changed, 1917 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/arm/v6/addmul_1.asm b/vendor/gmp-6.3.0/mpn/arm/v6/addmul_1.asm new file mode 100644 index 0000000..a38af58 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/arm/v6/addmul_1.asm @@ -0,0 +1,112 @@ +dnl ARM mpn_addmul_1. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM: - +C XScale - +C ARM11 6.4 +C Cortex-A7 5.25 +C Cortex-A8 7 +C Cortex-A9 3.25 +C Cortex-A15 4 + +C TODO +C * Micro-optimise feed-in code. +C * Optimise for n=1,2 by delaying register saving. +C * Try using ldm/stm. 
+
+define(`rp',`r0')	C limb vector that is loaded, accumulated into and stored back
+define(`up',`r1')	C source limb vector, only loaded from
+define(`n', `r2')	C limb count, consumed as the loop counter
+define(`v0',`r3')	C the single multiplier limb
+C rp[] += up[] * v0 limb by limb; the carry limb left in r12 is returned in r0.
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	stmfd	sp!, { r4, r5, r6, r7 }	C r4-r7 serve as scratch below
+
+	ands	r6, n, #3		C n mod 4 selects the entry into the 4-way unrolled loop
+	mov	r12, #0			C r12 = running carry limb
+	beq	L(fi0)
+	cmp	r6, #2
+	bcc	L(fi1)
+	beq	L(fi2)
+
+L(fi3):	ldr	r4, [up], #4		C feed-in for n mod 4 = 3
+	ldr	r6, [rp, #0]
+	ldr	r5, [up], #4
+	b	L(lo3)
+
+L(fi0):	ldr	r5, [up], #4		C feed-in for n mod 4 = 0
+	ldr	r7, [rp], #4
+	ldr	r4, [up], #4
+	b	L(lo0)
+
+L(fi1):	ldr	r4, [up], #4		C feed-in for n mod 4 = 1
+	ldr	r6, [rp], #8
+	subs	n, n, #1
+	beq	L(1)			C n was 1: only the wind-down multiply is needed
+	ldr	r5, [up], #4
+	b	L(lo1)
+
+L(fi2):	ldr	r5, [up], #4		C feed-in for n mod 4 = 2
+	ldr	r7, [rp], #12
+	ldr	r4, [up], #4
+	b	L(lo2)
+
+	ALIGN(16)
+L(top):	ldr	r6, [rp, #-8]
+	ldr	r5, [up], #4
+	str	r7, [rp, #-12]
+L(lo1):	umaal	r6, r12, r4, v0		C r12:r6 = r4 * v0 + r6 + r12
+	ldr	r7, [rp, #-4]
+	ldr	r4, [up], #4
+	str	r6, [rp, #-8]
+L(lo0):	umaal	r7, r12, r5, v0		C r12:r7 = r5 * v0 + r7 + r12
+	ldr	r6, [rp, #0]
+	ldr	r5, [up], #4
+	str	r7, [rp, #-4]
+L(lo3):	umaal	r6, r12, r4, v0
+	ldr	r7, [rp, #4]
+	ldr	r4, [up], #4
+	str	r6, [rp], #16		C rp advances by four limbs per iteration
+L(lo2):	umaal	r7, r12, r5, v0
+	subs	n, n, #4
+	bhi	L(top)			C loop while more than four limbs were left before the subtraction
+C Wind-down: store the pending limb, then handle the last limb already held in r4.
+	ldr	r6, [rp, #-8]
+	str	r7, [rp, #-12]
+L(1):	umaal	r6, r12, r4, v0
+	str	r6, [rp, #-8]
+	mov	r0, r12			C return value = final carry limb
+	ldmfd	sp!, { r4, r5, r6, r7 }
+	bx	lr
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/arm/v6/addmul_2.asm b/vendor/gmp-6.3.0/mpn/arm/v6/addmul_2.asm
new file mode 100644
index 0000000..69d0b8f
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/arm/v6/addmul_2.asm
@@ -0,0 +1,125 @@
+dnl  ARM mpn_addmul_2.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:	 -
+C XScale	 -
+C ARM11		 4.68
+C Cortex-A5	 3.63
+C Cortex-A7	 3.65
+C Cortex-A8	 4.0
+C Cortex-A9	 2.25
+C Cortex-A15	 2.5
+C Cortex-A17	 2.13
+C Cortex-A53	 3.5
+
+define(`rp',`r0')	C limb vector that is accumulated into
+define(`up',`r1')	C source limb vector
+define(`n', `r2')	C limb count of up[], consumed as the loop counter
+define(`vp',`r3')	C points at the two multiplier limbs
+
+define(`v0',`r6')	C first multiplier limb, loaded from vp
+define(`v1',`r7')	C second multiplier limb, loaded from vp
+define(`u0',`r3')	C shares r3 with vp, which is only read by the initial ldrd
+define(`u1',`r9')
+
+define(`cya',`r8')	C carry limb of the v0 product chain
+define(`cyb',`r12')	C carry limb of the v1 product chain
+
+C rp[] += up[] * {v0,v1}; one extra limb is stored above the accumulated ones and the top carry is returned in r0.
+ASM_START()
+PROLOGUE(mpn_addmul_2)
+	push	{ r4-r9 }
+
+	ldrd	v0, v1, [vp, #0]	C last use of vp; r3 is reused as u0 afterwards
+	mov	cya, #0
+	mov	cyb, #0
+
+	tst	n, #1			C bits 0 and 1 of n select one of four loop entry points
+	beq	L(evn)
+
+L(odd):	ldr	u1, [up, #0]
+	ldr	r4, [rp, #0]
+	tst	n, #2
+	beq	L(fi1)
+L(fi3):	sub	up, up, #8		C bias the pointers to match the offsets used at L(lo3)
+	sub	rp, rp, #8
+	b	L(lo3)
+L(fi1):	sub	n, n, #1
+	b	L(top)
+
+L(evn):	ldr	u0, [up, #0]
+	ldr	r5, [rp, #0]
+	tst	n, #2
+	bne	L(fi2)
+L(fi0):	sub	up, up, #4		C bias the pointers to match the offsets used at L(lo0)
+	sub	rp, rp, #4
+	b	L(lo0)
+L(fi2):	sub	up, up, #12		C bias the pointers to match the offsets used at L(lo2)
+	sub	rp, rp, #12
+	b	L(lo2)
+
+	ALIGN(16)
+L(top):	ldr	r5, [rp, #4]
+	umaal	r4, cya, u1, v0		C cya:r4 = u1 * v0 + r4 + cya
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1		C cyb:r5 = u1 * v1 + r5 + cyb
+	str	r4, [rp, #0]
+L(lo0):	ldr	r4, [rp, #8]
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #8]
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #4]
+L(lo3):	ldr	r5, [rp, #12]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #12]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #8]
+L(lo2):	ldr	r4, [rp, #16]!		C writeback form: rp advances by four limbs
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #16]!		C writeback form: up advances by four limbs
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #-4]
+	subs	n, n, #4
+	bhi	L(top)
+
+L(end):	umaal	r4, cya, u1, v0
+	umaal	cya, cyb, u1, v1	C merges both carry chains; cya is stored, cyb is returned
+	str	r4, [rp, #0]
+	str	cya, [rp, #4]		C stored without reading the previous contents of that limb
+	mov	r0, cyb
+
+	pop	{ r4-r9 }
+	bx	r14
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/arm/v6/addmul_3.asm b/vendor/gmp-6.3.0/mpn/arm/v6/addmul_3.asm
new file mode 100644
index 0000000..d1490cd
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/arm/v6/addmul_3.asm
@@ -0,0 +1,191 @@
+dnl  ARM mpn_addmul_3.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:	 -
+C XScale	 -
+C ARM11		 4.33
+C Cortex-A5	 3.28
+C Cortex-A7	 3.25
+C Cortex-A8	 3.17
+C Cortex-A9	 2.125
+C Cortex-A15	 2
+C Cortex-A17	 2.11
+C Cortex-A53	 4.18
+
+C TODO
+C  * Use a fast path for n <= KARATSUBA_MUL_THRESHOLD using a jump table,
+C    avoiding the current multiply.
+C  * Start the first multiply or multiplies early.
+
+define(`rp',`r0')	C limb vector that is accumulated into
+define(`up',`r1')	C source limb vector
+define(`n', `r2')	C limb count of up[], biased by -3 before the loop
+define(`vp',`r3')	C points at the three multiplier limbs
+
+define(`v0',`r4')  define(`v1',`r5')  define(`v2',`r6')
+define(`u0',`r3')  define(`u1',`r14')
+define(`w0',`r7')  define(`w1',`r8')  define(`w2',`r9')
+define(`cy0',`r10')  define(`cy1',`r11')  define(`cy2',`r12')
+
+C rp[] += up[] * {v0,v1,v2}; two extra limbs are stored above the accumulated ones and the top limb is returned in r0.
+ASM_START()
+PROLOGUE(mpn_addmul_3)
+	push	{ r4-r11, r14 }		C r14 is saved because it doubles as u1
+
+	ldr	w0, =0xaaaaaaab		C 3^{-1} mod 2^32
+	ldm	vp, { v0,v1,v2 }	C last use of vp; r3 is reused as u0 afterwards
+	mov	cy0, #0
+	mov	cy1, #0
+	mov	cy2, #0
+
+C Tricky n mod 6: w0 selects one of the six entry points L(b0)..L(b5) below
+	mul	w0, w0, n		C n * 3^{-1} mod 2^32
+	and	w0, w0, #0xc0000001	C pseudo-CRT mod 3,2
+	sub	n, n, #3
+ifdef(`PIC',`
+	add	pc, pc, w0, ror $28
+	nop
+	b	L(b0)
+	b	L(b2)
+	b	L(b4)
+	.word	0xe7f000f0	C udf
+	b	L(b3)
+	b	L(b5)
+	b	L(b1)
+',`
+	ldr	pc, [pc, w0, ror $28]
+	nop
+	.word	L(b0), L(b2), L(b4), 0, L(b3), L(b5), L(b1)
+')
+
+L(b5):	add	up, up, #-8		C each L(bN) biases rp/up and preloads what its loop entry expects
+	ldr	w1, [rp, #0]
+	ldr	w2, [rp, #4]
+	ldr	u1, [up, #8]
+	b	L(lo5)
+
+L(b4):	add	rp, rp, #-4
+	add	up, up, #-12
+	ldr	w2, [rp, #4]
+	ldr	w0, [rp, #8]
+	ldr	u0, [up, #12]
+	b	L(lo4)
+
+L(b3):	add	rp, rp, #-8
+	add	up, up, #-16
+	ldr	w0, [rp, #8]
+	ldr	w1, [rp, #12]
+	ldr	u1, [up, #16]
+	b	L(lo3)
+
+L(b1):	add	rp, rp, #8
+	ldr	w2, [rp, #-8]
+	ldr	w0, [rp, #-4]
+	ldr	u1, [up, #0]
+	b	L(lo1)
+
+L(b0):	add	rp, rp, #4
+	add	up, up, #-4
+	ldr	w0, [rp, #-4]
+	ldr	w1, [rp, #0]
+	ldr	u0, [up, #4]
+	b	L(lo0)
+
+L(b2):	add	rp, rp, #12
+	add	up, up, #4
+	ldr	w1, [rp, #-12]
+	ldr	w2, [rp, #-8]
+	ldr	u0, [up, #-4]
+C L(b2) falls through into the top of the loop.
+	ALIGN(16)
+L(top):	ldr	w0, [rp, #-4]
+	umaal	w1, cy0, u0, v0		C cy0:w1 = u0 * v0 + w1 + cy0
+	ldr	u1, [up, #0]
+	umaal	w2, cy1, u0, v1		C cy1:w2 = u0 * v1 + w2 + cy1
+	str	w1, [rp, #-12]
+	umaal	w0, cy2, u0, v2		C cy2:w0 = u0 * v2 + w0 + cy2
+L(lo1):	ldr	w1, [rp, #0]
+	umaal	w2, cy0, u1, v0
+	ldr	u0, [up, #4]
+	umaal	w0, cy1, u1, v1
+	str	w2, [rp, #-8]
+	umaal	w1, cy2, u1, v2
+L(lo0):	ldr	w2, [rp, #4]
+	umaal	w0, cy0, u0, v0
+	ldr	u1, [up, #8]
+	umaal	w1, cy1, u0, v1
+	str	w0, [rp, #-4]
+	umaal	w2, cy2, u0, v2
+L(lo5):	ldr	w0, [rp, #8]
+	umaal	w1, cy0, u1, v0
+	ldr	u0, [up, #12]
+	umaal	w2, cy1, u1, v1
+	str	w1, [rp, #0]
+	umaal	w0, cy2, u1, v2
+L(lo4):	ldr	w1, [rp, #12]
+	umaal	w2, cy0, u0, v0
+	ldr	u1, [up, #16]
+	umaal	w0, cy1, u0, v1
+	str	w2, [rp, #4]
+	umaal	w1, cy2, u0, v2
+L(lo3):	ldr	w2, [rp, #16]
+	umaal	w0, cy0, u1, v0
+	ldr	u0, [up, #20]
+	umaal	w1, cy1, u1, v1
+	str	w0, [rp, #8]
+	umaal	w2, cy2, u1, v2
+L(lo2):	subs	n, n, #6		C six limbs of up[] are consumed per iteration
+	add	up, up, #24
+	add	rp, rp, #24
+	bge	L(top)
+
+L(end):	umaal	w1, cy0, u0, v0		C wind-down: the last two limbs of up[] are in u0 and loaded into u1
+	ldr	u1, [up, #0]
+	umaal	w2, cy1, u0, v1
+	str	w1, [rp, #-12]
+	mov	w0, #0			C no rp limb is added into this position
+	umaal	w0, cy2, u0, v2
+	umaal	w2, cy0, u1, v0
+	umaal	w0, cy1, u1, v1
+	str	w2, [rp, #-8]
+	umaal	cy1, cy2, u1, v2
+	adds	w0, w0, cy0		C fold the three carry limbs into the top of the result
+	str	w0, [rp, #-4]
+	adcs	w1, cy1, #0
+	str	w1, [rp, #0]
+	adc	r0, cy2, #0		C return value = most significant limb
+
+	pop	{ r4-r11, pc }		C restores the saved registers and returns by loading the saved r14 into pc
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/arm/v6/dive_1.asm b/vendor/gmp-6.3.0/mpn/arm/v6/dive_1.asm
new file mode 100644
index 0000000..92de814
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/arm/v6/dive_1.asm
@@ -0,0 +1,149 @@
+dnl  ARM v6 mpn_divexact_1
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		cycles/limb	cycles/limb
+C		norm	unorm	modexact_1c_odd
+C StrongARM	 -	 -
+C XScale	 -	 -
+C Cortex-A7	 ?	 ?
+C Cortex-A8	 ?	 ?
+C Cortex-A9	 9	10		 9
+C Cortex-A15	 7	 7		 7
+
+C Architecture requirements:
+C v5	-
+C v5t	clz
+C v5te	-
+C v6	umaal
+C v6t2	-
+C v7a	-
+
+define(`rp', `r0')	C quotient limbs are stored here
+define(`up', `r1')	C dividend limbs are loaded from here
+define(`n',  `r2')	C limb count, consumed as the loop counter
+define(`d',  `r3')	C divisor limb; shifted right in place until it is odd
+
+define(`cy',  `r7')	C carry limb propagated between quotient limbs
+define(`cnt', `r6')	C number of trailing zero bits stripped from d
+define(`tnc', `r10')	C 32 - cnt, only set up on the L(unnorm) path
+C Writes n quotient limbs of up[] / d to rp[], computed by multiplying with the inverse of the odd part of d modulo 2^32.
+ASM_START()
+PROLOGUE(mpn_divexact_1)
+	push	{r4,r5,r6,r7,r8,r9}
+
+	tst	d, #1			C flags are consumed by beq L(unnorm) below; presumably nothing in between, LEA included, alters them -- TODO confirm
+
+	rsb	r4, d, #0
+	and	r4, r4, d		C r4 = d AND -d, the lowest set bit of d
+	clz	r4, r4
+	rsb	cnt, r4, #31		C count_trailing_zeros
+	mov	d, d, lsr cnt		C drop the trailing zero bits of d
+
+C binvert limb
+	LEA(	r4, binvert_limb_table)
+	and	r12, d, #254
+	ldrb	r4, [r4, r12, lsr #1]	C r4 = table byte indexed by bits 7..1 of d
+	mul	r12, r4, r4
+	mul	r12, d, r12
+	rsb	r12, r12, r4, lsl #1	C r12 = 2*r4 - d*r4*r4, first refinement step
+	mul	r4, r12, r12
+	mul	r4, d, r4
+	rsb	r4, r4, r12, lsl #1	C r4 = inverse
+
+	ldr	r5, [up], #4		C up[0]
+	mov	cy, #0
+	rsb	r8, r4, #0		C r8 = -inverse
+	beq	L(unnorm)		C taken when the original d was even
+
+L(norm):
+	subs	n, n, #1
+	mul	r5, r5, r4		C first quotient limb = up[0] * inverse
+	beq	L(end)
+
+	ALIGN(16)
+L(top):	ldr	r9, [up], #4
+	mov	r12, #0
+	str	r5, [rp], #4
+	umaal	r12, cy, r5, d		C cy = high limb of r5 * d + cy; the low limb in r12 is not used
+	mul	r5, r9, r4
+	mla	r5, cy, r8, r5		C r5 = r9 * inverse - cy * inverse
+	subs	n, n, #1
+	bne	L(top)
+
+L(end):	str	r5, [rp]
+	pop	{r4,r5,r6,r7,r8,r9}
+	bx	r14
+
+L(unnorm):
+	push	{r10,r11}		C two more scratch registers are needed on this path
+	rsb	tnc, cnt, #32
+	mov	r11, r5, lsr cnt	C r11 = bits of the current limb that remain after the shift
+	subs	n, n, #1
+	beq	L(edx)
+
+	ldr	r12, [up], #4
+	orr	r9, r11, r12, lsl tnc	C r9 = next limb of up[] >> cnt, assembled from two adjacent limbs
+	mov	r11, r12, lsr cnt
+	mul	r5, r9, r4
+	subs	n, n, #1
+	beq	L(edu)
+
+	ALIGN(16)
+L(tpu):	ldr	r12, [up], #4
+	orr	r9, r11, r12, lsl tnc
+	mov	r11, r12, lsr cnt
+	mov	r12, #0
+	str	r5, [rp], #4
+	umaal	r12, cy, r5, d
+	mul	r5, r9, r4
+	mla	r5, cy, r8, r5
+	subs	n, n, #1
+	bne	L(tpu)
+
+L(edu):	str	r5, [rp], #4
+	mov	r12, #0
+	umaal	r12, cy, r5, d
+	mul	r5, r11, r4		C the top limb has no higher neighbour to shift bits in from
+	mla	r5, cy, r8, r5
+	str	r5, [rp]
+	pop	{r10,r11}
+	pop	{r4,r5,r6,r7,r8,r9}
+	bx	r14
+
+L(edx):	mul	r5, r11, r4		C single-limb case on the L(unnorm) path
+	str	r5, [rp]
+	pop	{r10,r11}
+	pop	{r4,r5,r6,r7,r8,r9}
+	bx	r14
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/arm/v6/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/arm/v6/gmp-mparam.h
new file mode 100644
index 0000000..35a7c55
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/arm/v6/gmp-mparam.h
@@ -0,0 +1,187 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 700 MHz ARM11 (raspberry pi) */ +/* FFT tuning limit = 8,088,775 */ +/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 19 +#define USE_PREINV_DIVREM_1 1 /* preinv always */ +#define DIV_QR_1N_PI1_METHOD 1 /* 71.61% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 38 + +#define DIV_1_VS_MUL_1_PERCENT 251 + +#define MUL_TOOM22_THRESHOLD 38 +#define MUL_TOOM33_THRESHOLD 134 +#define MUL_TOOM44_THRESHOLD 512 +#define MUL_TOOM6H_THRESHOLD 0 /* always */ +#define MUL_TOOM8H_THRESHOLD 620 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 209 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 625 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 209 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 211 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 300 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 55 +#define SQR_TOOM3_THRESHOLD 200 +#define SQR_TOOM4_THRESHOLD 470 +#define SQR_TOOM6_THRESHOLD 614 +#define SQR_TOOM8_THRESHOLD 882 + +#define MULMID_TOOM42_THRESHOLD 62 + +#define MULMOD_BNM1_THRESHOLD 23 +#define SQRMOD_BNM1_THRESHOLD 26 + +#define MUL_FFT_MODF_THRESHOLD 565 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 565, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 15, 5}, { 31, 6}, { 28, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 21, 
6}, { 43, 7}, { 23, 6}, \ + { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \ + { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 71, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \ + { 99, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ + { 191,10}, { 111,11}, { 63,10}, { 159,11}, \ + { 95,10}, { 207,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271,11}, { 159,10}, \ + { 351,11}, { 191,10}, { 399,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 287,10}, \ + { 607,11}, { 319,10}, { 639,11}, { 351,12}, \ + { 191,11}, { 415,13}, { 127,12}, { 255,11}, \ + { 575,12}, { 319,11}, { 671,12}, { 383,11}, \ + { 799,12}, { 447,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 703,13}, { 383,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 1151,13}, { 639,12}, \ + { 1343,13}, { 767,12}, { 1599,13}, { 895,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 98 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 530 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 530, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 28, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 21, 6}, \ + { 43, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 49, 8}, { 27, 7}, { 55, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 71, 9}, { 39, 8}, \ + { 83, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 167,10}, { 95, 9}, { 191,10}, { 111,11}, \ + { 63,10}, { 143, 9}, { 287,10}, { 159,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287,11}, { 159,10}, { 351,11}, \ + { 191,10}, { 415,11}, { 223,12}, { 127,11}, \ + { 255,10}, { 543,11}, { 287,10}, { 607,11}, \ + { 319,10}, { 639,11}, { 351,12}, { 191,11}, \ + { 
383,10}, { 767,11}, { 415,13}, { 127,12}, \ + { 255,11}, { 607,12}, { 319,11}, { 703,12}, \ + { 383,11}, { 799,12}, { 447,11}, { 895,13}, \ + { 255,12}, { 511,11}, { 1023,12}, { 703,13}, \ + { 383,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1151,13}, { 639,12}, { 1343,13}, { 767,12}, \ + { 1599,13}, { 895,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 104 +#define SQR_FFT_THRESHOLD 4416 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 51 +#define MULLO_MUL_N_THRESHOLD 11278 +#define SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 55 +#define SQRLO_SQR_THRESHOLD 8648 + +#define DC_DIV_QR_THRESHOLD 36 +#define DC_DIVAPPR_Q_THRESHOLD 146 +#define DC_BDIV_QR_THRESHOLD 46 +#define DC_BDIV_Q_THRESHOLD 160 + +#define INV_MULMOD_BNM1_THRESHOLD 74 +#define INV_NEWTON_THRESHOLD 145 +#define INV_APPR_THRESHOLD 147 + +#define BINV_NEWTON_THRESHOLD 372 +#define REDC_1_TO_REDC_2_THRESHOLD 6 +#define REDC_2_TO_REDC_N_THRESHOLD 140 + +#define MU_DIV_QR_THRESHOLD 2801 +#define MU_DIVAPPR_Q_THRESHOLD 2801 +#define MUPI_DIV_QR_THRESHOLD 79 +#define MU_BDIV_QR_THRESHOLD 2541 +#define MU_BDIV_Q_THRESHOLD 2764 + +#define POWM_SEC_TABLE 3,20,139,734 + +#define GET_STR_DC_THRESHOLD 27 +#define GET_STR_PRECOMPUTE_THRESHOLD 45 +#define SET_STR_DC_THRESHOLD 342 +#define SET_STR_PRECOMPUTE_THRESHOLD 1290 + +#define FAC_DSC_THRESHOLD 390 +#define FAC_ODD_THRESHOLD 438 + +#define MATRIX22_STRASSEN_THRESHOLD 25 +#define HGCD2_DIV1_METHOD 5 /* 1.32% faster than 3 */ +#define HGCD_THRESHOLD 82 +#define HGCD_APPR_THRESHOLD 81 +#define HGCD_REDUCE_THRESHOLD 4633 +#define GCD_DC_THRESHOLD 345 +#define GCDEXT_DC_THRESHOLD 268 +#define JACOBI_BASE_METHOD 1 /* 3.30% faster than 2 */ + +/* Tuneup completed successfully, took 45018 seconds */ diff --git a/vendor/gmp-6.3.0/mpn/arm/v6/mode1o.asm b/vendor/gmp-6.3.0/mpn/arm/v6/mode1o.asm new file mode 100644 index 0000000..a2f77a6 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/arm/v6/mode1o.asm @@ -0,0 +1,95 @@ 
+dnl ARM v6 mpn_modexact_1c_odd + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? 
+C Cortex-A9	 9
+C Cortex-A15	 7
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	smulbb
+C v6	umaal
+C v6t2	-
+C v7a	-
+
+define(`up', `r0')	C source limb vector, only loaded from
+define(`n',  `r1')	C limb count, consumed as the loop counter
+define(`d',  `r2')	C divisor limb; NOTE(review): the function name implies d is odd, nothing here checks it
+define(`cy', `r3')	C incoming carry, also the running carry and the result
+C Runs the inverse-multiply recurrence over up[] starting from cy, stores nothing, and returns the final carry in r0.
+	.protected	binvert_limb_table
+ASM_START()
+PROLOGUE(mpn_modexact_1c_odd)
+	stmfd	sp!, {r4, r5, r6, r7}
+
+	LEA(	r4, binvert_limb_table)
+
+	ldr	r6, [up], #4		C up[0]
+
+	and	r12, d, #254
+	ldrb	r4, [r4, r12, lsr #1]	C r4 = table byte indexed by bits 7..1 of d
+	smulbb	r12, r4, r4		C halfword multiply suffices since r4 was loaded with ldrb
+	mul	r12, d, r12
+	rsb	r12, r12, r4, asl #1	C r12 = 2*r4 - d*r4*r4, first refinement step
+	mul	r4, r12, r12
+	mul	r4, d, r4
+	rsb	r4, r4, r12, asl #1	C r4 = inverse
+
+	subs	n, n, #1
+	sub	r6, r6, cy		C apply the incoming carry to the first limb
+	mul	r6, r6, r4
+	beq	L(end)			C flags still come from the subs above
+
+	rsb	r5, r4, #0		C r5 = -inverse
+
+L(top):	ldr	r7, [up], #4
+	mov	r12, #0
+	umaal	r12, cy, r6, d		C cy = high limb of r6 * d + cy; the low limb in r12 is not used
+	mul	r6, r7, r4
+	mla	r6, cy, r5, r6		C r6 = r7 * inverse - cy * inverse
+	subs	n, n, #1
+	bne	L(top)
+
+L(end):	mov	r12, #0
+	umaal	r12, cy, r6, d
+	mov	r0, cy			C return value = final carry
+
+	ldmfd	sp!, {r4, r5, r6, r7}
+	bx	r14
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/arm/v6/mul_1.asm b/vendor/gmp-6.3.0/mpn/arm/v6/mul_1.asm
new file mode 100644
index 0000000..3c6ef99
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/arm/v6/mul_1.asm
@@ -0,0 +1,115 @@
+dnl  ARM mpn_mul_1.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:	 -
+C XScale	 -
+C ARM11		 6.4
+C Cortex-A7	 5.25
+C Cortex-A8	 7
+C Cortex-A9	 3.25
+C Cortex-A15	 4
+
+C TODO
+C  * Micro-optimise feed-in code.
+C  * Optimise for n=1,2 by delaying register saving.
+C  * Try using ldm/stm.
+
+define(`rp',`r0')	C destination limb vector, only stored to
+define(`up',`r1')	C source limb vector, only loaded from
+define(`n', `r2')	C limb count, consumed as the loop counter
+define(`v0',`r3')	C the single multiplier limb
+C rp[] = up[] * v0 limb by limb; the carry limb left in r12 is returned in r0.
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	stmfd	sp!, { r4, r5, r6, r7 }	C r4-r7 serve as scratch below
+
+	ands	r6, n, #3		C n mod 4 selects the entry into the 4-way unrolled loop
+	mov	r12, #0			C r12 = running carry limb
+	beq	L(fi0)
+	cmp	r6, #2
+	bcc	L(fi1)
+	beq	L(fi2)
+
+L(fi3):	ldr	r4, [up], #4		C feed-in for n mod 4 = 3
+	mov	r6, #0
+	ldr	r5, [up], #4
+	b	L(lo3)
+
+L(fi0):	ldr	r5, [up], #4		C feed-in for n mod 4 = 0
+	add	rp, rp, #4
+	mov	r7, #0
+	ldr	r4, [up], #4
+	b	L(lo0)
+
+L(fi1):	ldr	r4, [up], #4		C feed-in for n mod 4 = 1
+	mov	r6, #0
+	add	rp, rp, #8
+	subs	n, n, #1
+	beq	L(1)			C n was 1: only the wind-down multiply is needed
+	ldr	r5, [up], #4
+	b	L(lo1)
+
+L(fi2):	ldr	r5, [up], #4		C feed-in for n mod 4 = 2
+	add	rp, rp, #12
+	mov	r7, #0
+	ldr	r4, [up], #4
+	b	L(lo2)
+
+	ALIGN(16)
+L(top):	mov	r6, #0			C the addend register is cleared instead of being loaded from rp
+	ldr	r5, [up], #4
+	str	r7, [rp, #-12]
+L(lo1):	umaal	r6, r12, r4, v0		C r12:r6 = r4 * v0 + r6 + r12, with r6 = 0 on entry
+	mov	r7, #0
+	ldr	r4, [up], #4
+	str	r6, [rp, #-8]
+L(lo0):	umaal	r7, r12, r5, v0
+	mov	r6, #0
+	ldr	r5, [up], #4
+	str	r7, [rp, #-4]
+L(lo3):	umaal	r6, r12, r4, v0
+	mov	r7, #0
+	ldr	r4, [up], #4
+	str	r6, [rp], #16		C rp advances by four limbs per iteration
+L(lo2):	umaal	r7, r12, r5, v0
+	subs	n, n, #4
+	bhi	L(top)			C loop while more than four limbs were left before the subtraction
+C Wind-down: store the pending limb, then handle the last limb already held in r4.
+	mov	r6, #0
+	str	r7, [rp, #-12]
+L(1):	umaal	r6, r12, r4, v0
+	str	r6, [rp, #-8]
+	mov	r0, r12			C return value = final carry limb
+	ldmfd	sp!, { r4, r5, r6, r7 }
+	bx	lr
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/arm/v6/mul_2.asm b/vendor/gmp-6.3.0/mpn/arm/v6/mul_2.asm
new file mode 100644
index 0000000..edd27f3
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/arm/v6/mul_2.asm
@@ -0,0 +1,135 @@
+dnl  ARM mpn_mul_2.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+ +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM: - +C XScale - +C ARM11 5.25 +C Cortex-A5 3.63 +C Cortex-A7 3.15 +C Cortex-A8 5.0 +C Cortex-A9 2.25 +C Cortex-A15 2.5 +C Cortex-A17 2.13 +C Cortex-A53 3.5 + +C TODO +C * This is a trivial edit of the addmul_2 code. Check for simplifications, +C and possible speedups to 2.0 c/l. 
+
+define(`rp',`r0')	C destination limb vector, only stored to
+define(`up',`r1')	C source limb vector
+define(`n', `r2')	C limb count of up[], consumed as the loop counter
+define(`vp',`r3')	C points at the two multiplier limbs
+
+define(`v0',`r6')	C first multiplier limb, loaded from vp
+define(`v1',`r7')	C second multiplier limb, loaded from vp
+define(`u0',`r3')	C shares r3 with vp, which is only read by the initial ldm
+define(`u1',`r9')
+
+define(`cya',`r8')	C carry limb of the v0 product chain
+define(`cyb',`r12')	C carry limb of the v1 product chain
+
+C rp[] = up[] * {v0,v1}; one limb more than the loop handles is stored and the top carry is returned in r0.
+ASM_START()
+PROLOGUE(mpn_mul_2)
+	push	{ r4, r5, r6, r7, r8, r9 }
+
+	ldm	vp, { v0, v1 }		C last use of vp; r3 is reused as u0 afterwards
+	mov	cya, #0
+	mov	cyb, #0
+
+	tst	n, #1			C bits 0 and 1 of n select one of four entry points
+	beq	L(evn)
+L(odd):	mov	r5, #0
+	ldr	u0, [up, #0]
+	mov	r4, #0
+	tst	n, #2
+	beq	L(fi1)
+L(fi3):	sub	up, up, #12		C bias the pointers to match the offsets used at L(lo3)
+	sub	rp, rp, #16
+	b	L(lo3)
+L(fi1):	sub	n, n, #1
+	sub	up, up, #4
+	sub	rp, rp, #8
+	b	L(lo1)
+L(evn):	mov	r4, #0
+	ldr	u1, [up, #0]
+	mov	r5, #0
+	tst	n, #2
+	bne	L(fi2)
+L(fi0):	sub	up, up, #8		C bias the pointers to match the offsets used at L(lo0)
+	sub	rp, rp, #12
+	b	L(lo0)
+L(fi2):	subs	n, n, #2
+	sub	rp, rp, #4
+	bls	L(end)			C flags come from the subs above: n was exactly 2
+
+	ALIGN(16)
+L(top):	ldr	u0, [up, #4]
+	umaal	r4, cya, u1, v0		C cya:r4 = u1 * v0 + r4 + cya
+	str	r4, [rp, #4]
+	mov	r4, #0			C accumulator is cleared after each store instead of loading from rp
+	umaal	r5, cyb, u1, v1		C cyb:r5 = u1 * v1 + r5 + cyb
+L(lo1):	ldr	u1, [up, #8]
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #8]
+	mov	r5, #0
+	umaal	r4, cyb, u0, v1
+L(lo0):	ldr	u0, [up, #12]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #12]
+	mov	r4, #0
+	umaal	r5, cyb, u1, v1
+L(lo3):	ldr	u1, [up, #16]!		C writeback form: up advances by four limbs
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #16]!		C writeback form: rp advances by four limbs
+	mov	r5, #0
+	umaal	r4, cyb, u0, v1
+	subs	n, n, #4
+	bhi	L(top)
+
+L(end):	umaal	r4, cya, u1, v0		C wind-down for the last two limbs of up[]
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #4]
+	umaal	r5, cya, u0, v0
+	umaal	cya, cyb, u0, v1	C merges both carry chains; cya is stored, cyb is returned
+	str	r5, [rp, #8]
+	str	cya, [rp, #12]
+	mov	r0, cyb
+
+	pop	{ r4, r5, r6, r7, r8, r9 }
+	bx	r14
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/arm/v6/popham.asm b/vendor/gmp-6.3.0/mpn/arm/v6/popham.asm
new file mode 100644
index 0000000..c254368
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/arm/v6/popham.asm
@@ -0,0 +1,139 @@
+dnl  ARM mpn_popcount and mpn_hamdist.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C popcount hamdist +C cycles/limb cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? 
+C Cortex-A9	 8.94	 9.47
+C Cortex-A15	 5.67	 6.44
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	usada8
+C v6t2	-
+C v7a	-
+
+ifdef(`OPERATION_popcount',`
+  define(`func',`mpn_popcount')
+  define(`ap',		`r0')
+  define(`n',		`r1')
+  define(`a0',		`r2')
+  define(`a1',		`r3')
+  define(`s',		`r5')
+  define(`b_01010101',	`r6')
+  define(`b_00110011',	`r7')
+  define(`b_00001111',	`r8')
+  define(`zero',	`r9')
+  define(`POPC',	`$1')
+  define(`HAMD',	`dnl')
+')
+ifdef(`OPERATION_hamdist',`
+  define(`func',`mpn_hamdist')
+  define(`ap',		`r0')
+  define(`bp',		`r1')
+  define(`n',		`r2')
+  define(`a0',		`r6')
+  define(`a1',		`r7')
+  define(`b0',		`r4')
+  define(`b1',		`r5')
+  define(`s',		`r11')
+  define(`b_01010101',	`r8')
+  define(`b_00110011',	`r9')
+  define(`b_00001111',	`r10')
+  define(`zero',	`r3')
+  define(`POPC',	`dnl')
+  define(`HAMD',	`$1')
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+C Counts the set bits of ap[0..n-1], or of ap[] XOR bp[] when built as hamdist; the total is returned in r0.
+ASM_START()
+PROLOGUE(func)
+POPC(`	push	{ r4-r9 }	')
+HAMD(`	push	{ r4-r11 }	')
+
+	ldr	b_01010101, =0x55555555	C mask used to form 2-bit counts
+	mov	r12, #0			C r12 = running total
+	ldr	b_00110011, =0x33333333	C mask used to form 4-bit counts
+	mov	zero, #0		C constant zero operand for usada8
+	ldr	b_00001111, =0x0f0f0f0f	C mask used to form 8-bit counts
+
+	tst	n, #1
+	beq	L(evn)
+
+L(odd):	ldr	a1, [ap], #4		C 1 x 32 1-bit accumulators, 0-1
+HAMD(`	ldr	b1, [bp], #4	')	C 1 x 32 1-bit accumulators, 0-1
+HAMD(`	eor	a1, a1, b1	')
+	and	r4, b_01010101, a1, lsr #1
+	sub	a1, a1, r4		C 16 2-bit accumulators, 0-2
+	and	r4, a1, b_00110011
+	bic	r5, a1, b_00110011
+	add	r5, r4, r5, lsr #2	C 8 4-bit accumulators, 0-4
+	subs	n, n, #1		C sets the flags tested by bne L(top) after L(mid)
+	b	L(mid)
+
+L(evn):	mov	s, #0			C so the first usada8 in the loop adds nothing
+
+L(top):	ldrd	a0, a1, [ap], #8	C 2 x 32 1-bit accumulators, 0-1
+HAMD(`	ldrd	b0, b1, [bp], #8')
+HAMD(`	eor	a0, a0, b0	')
+HAMD(`	eor	a1, a1, b1	')
+	subs	n, n, #2		C sets the flags tested by bne L(top) below
+	usada8	r12, s, zero, r12	C r12 += sum of the four byte counters left in s by the previous step
+	and	r4, b_01010101, a0, lsr #1
+	sub	a0, a0, r4
+	and	r4, b_01010101, a1, lsr #1
+	sub	a1, a1, r4
+	and	r4, a0, b_00110011
+	bic	r5, a0, b_00110011
+	add	a0, r4, r5, lsr #2	C 8 4-bit accumulators, 0-4
+	and	r4, a1, b_00110011
+	bic	r5, a1, b_00110011
+	add	a1, r4, r5, lsr #2	C 8 4-bit accumulators, 0-4
+	add	r5, a0, a1		C 8 4-bit accumulators, 0-8
+L(mid):	and	r4, r5, b_00001111
+	bic	r5, r5, b_00001111
+	add	s, r4, r5, lsr #4	C 4 8-bit accumulators
+	bne	L(top)
+
+	usada8	r0, s, zero, r12	C result = r12 + sum of the four byte counters in s
+POPC(`	pop	{ r4-r9 }	')
+HAMD(`	pop	{ r4-r11 }	')
+	bx	r14
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm
new file mode 100644
index 0000000..0fc4f13
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm
@@ -0,0 +1,544 @@
+dnl  ARM v6 mpn_sqr_basecase.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Code structure:
+C
+C
+C        m_2(0m4)        m_2(2m4)        m_2(1m4)        m_2(3m4)
+C           |               |               |               |
+C           |               |               |               |
+C           |               |               |               |
+C          \|/             \|/             \|/             \|/
+C              ____________                   ____________
+C             /            \                 /            \
+C            \|/            \               \|/            \
+C         am_2(3m4)       am_2(1m4)       am_2(0m4)       am_2(2m4)
+C            \            /|\                \            /|\
+C             \____________/                  \____________/
+C                       \                        /
+C                        \                      /
+C                         \                    /
+C                       cor3             cor2
+C                            \              /
+C                             \            /
+C                          sqr_diag_addlsh1
+
+C TODO
+C  * Align more labels.
+C  * Further tweak counter and updates in outer loops.  (This could save
+C    perhaps 5n cycles).
+C  * Avoid sub-with-lsl in outer loops.  We could keep n up-shifted, then
+C    initialise loop counter i with a right shift.
+C  * Try to use fewer registers.  Perhaps coalesce r9 branch target and n_saved.
+C    (This could save 2-3 cycles for n > 4.)
+C    NOTE(review): the code below keeps the branch target in r10 and u1 in
+C    r9, so the register named in this item may be stale -- confirm.
+C  * Optimise sqr_diag_addlsh1 loop.  The current code uses old-style carry
+C    propagation.
+C  * Stop loops earlier suppressing writes of upper-most rp[] values.
+C  * The addmul_2 loops here run well on all cores, but mul_2 runs poorly
+C    particularly on Cortex-A8.
+
+
+define(`rp',      r0)
+define(`up',      r1)
+define(`n',       r2)
+
+define(`v0',      r3)
+define(`v1',      r6)
+define(`i',       r8)
+define(`n_saved', r14)
+define(`cya',     r11)
+define(`cyb',     r12)
+define(`u0',      r7)
+define(`u1',      r9)
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+C Computed dispatch.  r12 = (n & 3), plus 4 when n > 4, indexes the branch
+C table that follows the nop.  This relies on pc reading as the address of
+C the current instruction plus 8 (ARM state), so index 0 lands on the first
+C table entry.  Indices 0-3 select the straight line code for n = 4, 1, 2, 3;
+C indices 4-7 select the general code by n mod 4.
+C NOTE(review): n = 0 would select L(4); presumably callers guarantee
+C n >= 1 -- confirm.
+	and	r12, n, #3
+	cmp	n, #4
+	addgt	r12, r12, #4
+	add	pc, pc, r12, lsl #2
+	nop
+	b	L(4)
+	b	L(1)
+	b	L(2)
+	b	L(3)
+	b	L(0m4)
+	b	L(1m4)
+	b	L(2m4)
+	b	L(3m4)
+
+
+C First pass for n > 4, one entry point per n mod 4.  Each entry saves r4-r11
+C and r14 (r14 is then reused as n_saved), puts in r10 the address of the
+C label to continue at after the pass (see the bx r10 below), forms the first
+C product v1 * v0 with umull, and enters the 4-way unrolled loop at the
+C matching point.
+L(1m4):	push	{r4-r11, r14}
+	mov	n_saved, n
+	sub	i, n, #4
+	sub	n, n, #2
+	add	r10, pc, #L(am2_2m4)-.-8
+	ldm	up, {v0,v1,u0}
+	sub	up, up, #4
+	mov	cyb, #0
+	mov	r5, #0
+	umull	r4, cya, v1, v0
+	str	r4, [rp], #-12
+	mov	r4, #0
+	b	L(ko0)
+
+L(3m4):	push	{r4-r11, r14}
+	mov	n_saved, n
+	sub	i, n, #4
+	sub	n, n, #2
+	add	r10, pc, #L(am2_0m4)-.-8
+	ldm	up, {v0,v1,u0}
+	add	up, up, #4
+	mov	cyb, #0
+	mov	r5, #0
+	umull	r4, cya, v1, v0
+	str	r4, [rp], #-4
+	mov	r4, #0
+	b	L(ko2)
+
+L(2m4):	push	{r4-r11, r14}
+	mov	n_saved, n
+	sub	i, n, #4
+	sub	n, n, #2
+	add	r10, pc, #L(am2_3m4)-.-8
+	ldm	up, {v0,v1,u1}
+	mov	cyb, #0
+	mov	r4, #0
+	umull	r5, cya, v1, v0
+	str	r5, [rp], #-8
+	mov	r5, #0
+	b	L(ko1)
+
+L(0m4):	push	{r4-r11, r14}
+	mov	n_saved, n
+	sub	i, n, #4
+	sub	n, n, #2
+	add	r10, pc, #L(am2_1m4)-.-8
+	ldm	up, {v0,v1,u1}
+	mov	cyb, #0
+	mov	r4, #0
+	add	up, up, #8
+	umull	r5, cya, v1, v0
+	str	r5, [rp, #0]
+	mov	r5, #0
+
+C Two interleaved umaal chains: products by v0 carry in cya, products by v1
+C carry in cyb.  r4 and r5 are zeroed after each store and rp[] is never
+C loaded, so this pass writes its results rather than adding to rp[].
+L(top):	ldr	u0, [up, #4]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #4]
+	mov	r4, #0
+	umaal	r5, cyb, u1, v1
+L(ko2):	ldr	u1, [up, #8]
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #8]
+	mov	r5, #0
+	umaal	r4, cyb, u0, v1
+L(ko1):	ldr	u0, [up, #12]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #12]
+	mov	r4, #0
+	umaal	r5, cyb, u1, v1
+L(ko0):	ldr	u1, [up, #16]!
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #16]!
+	mov	r5, #0
+	umaal	r4, cyb, u0, v1
+	subs	i, i, #4
+	bhi	L(top)
+
+C Wind down the first pass, store both carry limbs, then continue at the
+C label recorded in r10 by the entry code.
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #4]
+	umaal	r5, cya, u0, v0
+	umaal	cya, cyb, u0, v1
+	str	r5, [rp, #8]
+	str	cya, [rp, #12]
+	str	cyb, [rp, #16]
+
+	add	up, up, #4
+	sub	n, n, #1
+	add	rp, rp, #8
+	bx	r10
+
+C Outer loops.  Unlike L(top), the inner loops below load limbs from rp[] and
+C add the products into them.  Each outer pass lowers n by 2 and rewinds
+C rp and up by n limbs before re-biasing rp.
+C L(evnloop) leaves for L(cor2) when n < 6 (signed compare) on entry.
+L(evnloop):
+	subs	i, n, #6
+	sub	n, n, #2
+	blt	L(cor2)
+	ldm	up, {v0,v1,u1}
+	add	up, up, #8
+	mov	cya, #0
+	mov	cyb, #0
+	ldr	r4, [rp, #-4]
+	umaal	r4, cya, v1, v0
+	str	r4, [rp, #-4]
+	ldr	r4, [rp, #0]
+
+	ALIGN(16)
+L(ua2):	ldr	r5, [rp, #4]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #0]
+	ldr	r4, [rp, #8]
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #8]
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #4]
+	ldr	r5, [rp, #12]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #12]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #8]
+	ldr	r4, [rp, #16]!
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #16]!
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #-4]
+	subs	i, i, #4
+	bhs	L(ua2)
+
+	umaal	r4, cya, u1, v0
+	umaal	cya, cyb, u1, v1
+	str	r4, [rp, #0]
+	str	cya, [rp, #4]
+	str	cyb, [rp, #8]
+L(am2_0m4):
+	sub	rp, rp, n, lsl #2
+	sub	up, up, n, lsl #2
+	add	rp, rp, #8
+
+	sub	i, n, #4
+	sub	n, n, #2
+	ldm	up, {v0,v1,u1}
+	mov	cya, #0
+	mov	cyb, #0
+	ldr	r4, [rp, #4]
+	umaal	r4, cya, v1, v0
+	str	r4, [rp, #4]
+	ldr	r4, [rp, #8]
+	b	L(lo0)
+
+	ALIGN(16)
+L(ua0):	ldr	r5, [rp, #4]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #0]
+	ldr	r4, [rp, #8]
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #8]
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #4]
+L(lo0):	ldr	r5, [rp, #12]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #12]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #8]
+	ldr	r4, [rp, #16]!
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #16]!
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #-4]
+	subs	i, i, #4
+	bhs	L(ua0)
+
+	umaal	r4, cya, u1, v0
+	umaal	cya, cyb, u1, v1
+	str	r4, [rp, #0]
+	str	cya, [rp, #4]
+	str	cyb, [rp, #8]
+L(am2_2m4):
+	sub	rp, rp, n, lsl #2
+	sub	up, up, n, lsl #2
+	add	rp, rp, #16
+	b	L(evnloop)
+
+
+L(oddloop):
+	sub	i, n, #5
+	sub	n, n, #2
+	ldm	up, {v0,v1,u0}
+	mov	cya, #0
+	mov	cyb, #0
+	ldr	r5, [rp, #0]
+	umaal	r5, cya, v1, v0
+	str	r5, [rp, #0]
+	ldr	r5, [rp, #4]
+	add	up, up, #4
+	b	L(lo1)
+
+	ALIGN(16)
+L(ua1):	ldr	r5, [rp, #4]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #0]
+L(lo1):	ldr	r4, [rp, #8]
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #8]
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #4]
+	ldr	r5, [rp, #12]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #12]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #8]
+	ldr	r4, [rp, #16]!
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #16]!
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #-4]
+	subs	i, i, #4
+	bhs	L(ua1)
+
+	umaal	r4, cya, u1, v0
+	umaal	cya, cyb, u1, v1
+	str	r4, [rp, #0]
+	str	cya, [rp, #4]
+	str	cyb, [rp, #8]
+C This half of the odd outer loop leaves for L(cor3) once n has come down
+C to exactly 3.
+L(am2_3m4):
+	sub	rp, rp, n, lsl #2
+	sub	up, up, n, lsl #2
+	add	rp, rp, #4
+
+	subs	i, n, #3
+	beq	L(cor3)
+	sub	n, n, #2
+	ldm	up, {v0,v1,u0}
+	mov	cya, #0
+	mov	cyb, #0
+	ldr	r5, [rp, #8]
+	sub	up, up, #4
+	umaal	r5, cya, v1, v0
+	str	r5, [rp, #8]
+	ldr	r5, [rp, #12]
+	b	L(lo3)
+
+	ALIGN(16)
+L(ua3):	ldr	r5, [rp, #4]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #0]
+	ldr	r4, [rp, #8]
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #8]
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #4]
+	ldr	r5, [rp, #12]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #12]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #8]
+L(lo3):	ldr	r4, [rp, #16]!
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #16]!
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #-4]
+	subs	i, i, #4
+	bhs	L(ua3)
+
+	umaal	r4, cya, u1, v0
+	umaal	cya, cyb, u1, v1
+	str	r4, [rp, #0]
+	str	cya, [rp, #4]
+	str	cyb, [rp, #8]
+L(am2_1m4):
+	sub	rp, rp, n, lsl #2
+	sub	up, up, n, lsl #2
+	add	rp, rp, #12
+	b	L(oddloop)
+
+
+C Final corner reached from the odd outer loop.  It ends with the product
+C u1 * u0 held in cya/cyb, which L(sqr_diag_addlsh1) stores first.
+L(cor3):ldm	up, {v0,v1,u0}
+	ldr	r5, [rp, #8]
+	mov	cya, #0
+	mov	cyb, #0
+	umaal	r5, cya, v1, v0
+	str	r5, [rp, #8]
+	ldr	r5, [rp, #12]
+	ldr	r4, [rp, #16]
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #12]
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #12]
+	umaal	r4, cya, u1, v0
+	umaal	cya, cyb, u1, v1
+	str	r4, [rp, #16]
+	str	cya, [rp, #20]
+	str	cyb, [rp, #24]
+	add	up, up, #16
+	mov	cya, cyb
+	adds	rp, rp, #36		C clear cy
+	mov	cyb, #0
+	umaal	cya, cyb, u1, u0
+	b	L(sqr_diag_addlsh1)
+
+C Final corner reached from L(evnloop).  It starts from the values still held
+C in cya and cyb instead of loading limbs from rp[], and falls through into
+C L(sqr_diag_addlsh1).
+L(cor2):
+	ldm	up!, {v0,v1,u0}
+	mov	r4, cya
+	mov	r5, cyb
+	mov	cya, #0
+	umaal	r4, cya, v1, v0
+	mov	cyb, #0
+	umaal	r5, cya, u0, v0
+	strd	r4, r5, [rp, #-4]
+	umaal	cya, cyb, u0, v1
+	add	rp, rp, #16
+C	b	L(sqr_diag_addlsh1)
+
+
+C New names for r6-r9, which were v1, u0, i and u1 in the code above.
+define(`w0',  r6)
+define(`w1',  r7)
+define(`w2',  r8)
+define(`rbx', r9)
+
+C Final pass: store the last two product limbs, rewind up by n_saved limbs
+C and rp by twice that, then per input limb double two limbs of rp[] with
+C adcs and add in the square of the limb (umull r3, r3).  The carry out of
+C each doubling is kept in w2 because the subs on the counter overwrites the
+C flags.  Returns by popping the saved r14 into pc.
+L(sqr_diag_addlsh1):
+	str	cya, [rp, #-12]
+	str	cyb, [rp, #-8]
+	sub	n, n_saved, #1
+	sub	up, up, n_saved, lsl #2
+	sub	rp, rp, n_saved, lsl #3
+	ldr	r3, [up], #4
+	umull	w1, r5, r3, r3
+	mov	w2, #0
+	mov	r10, #0
+C	cmn	r0, #0			C clear cy (already clear)
+	b	L(lm)
+
+L(tsd):	adds	w0, w0, rbx
+	adcs	w1, w1, r4
+	str	w0, [rp, #0]
+L(lm):	ldr	w0, [rp, #4]
+	str	w1, [rp, #4]
+	ldr	w1, [rp, #8]!
+	add	rbx, r5, w2
+	adcs	w0, w0, w0
+	ldr	r3, [up], #4
+	adcs	w1, w1, w1
+	adc	w2, r10, r10
+	umull	r4, r5, r3, r3
+	subs	n, n, #1
+	bne	L(tsd)
+
+	adds	w0, w0, rbx
+	adcs	w1, w1, r4
+	adc	w2, r5, w2
+	stm	rp, {w0,w1,w2}
+
+	pop	{r4-r11, pc}
+
+
+C Straight line code for n <= 4
+
+C n = 1: rp[0..1] = up[0]^2.  No registers need saving.
+L(1):	ldr	r3, [up, #0]
+	umull	r1, r2, r3, r3
+	stm	rp, {r1,r2}
+	bx	r14
+
+C n = 2: two squares in r1..r4, plus the doubled cross product added in at
+C limb 1.
+L(2):	push	{r4-r5}
+	ldm	up, {r5,r12}
+	umull	r1, r2, r5, r5
+	umull	r3, r4, r12, r12
+	umull	r5, r12, r5, r12
+	adds	r5, r5, r5
+	adcs	r12, r12, r12
+	adc	r4, r4, #0
+	adds	r2, r2, r5
+	adcs	r3, r3, r12
+	adc	r4, r4, #0
+	stm	rp, {r1,r2,r3,r4}
+	pop	{r4-r5}
+	bx	r14
+
+C n = 3: three squares in r1..r6; the cross products are summed in
+C r10, r11, r12, r7, doubled, then added in starting at limb 1.
+L(3):	push	{r4-r11}
+	ldm	up, {r7,r8,r9}
+	umull	r1, r2, r7, r7
+	umull	r3, r4, r8, r8
+	umull	r5, r6, r9, r9
+	umull	r10, r11, r7, r8
+	mov	r12, #0
+	umlal	r11, r12, r7, r9
+	mov	r7, #0
+	umlal	r12, r7, r8, r9
+	adds	r10, r10, r10
+	adcs	r11, r11, r11
+	adcs	r12, r12, r12
+	adcs	r7, r7, r7
+	adc	r6, r6, #0
+	adds	r2, r2, r10
+	adcs	r3, r3, r11
+	adcs	r4, r4, r12
+	adcs	r5, r5, r7
+	adc	r6, r6, #0
+	stm	rp, {r1,r2,r3,r4,r5,r6}
+	pop	{r4-r11}
+	bx	r14
+
+C n = 4: the low seven limbs of the four squares are stored to rp[] first
+C and the top limb stays in r8, which frees registers for the cross
+C products.  After doubling, the stored limbs 1..6 are loaded back (r14 is
+C used as a data register here, hence it is saved) and added in.
+L(4):	push	{r4-r11, r14}
+	ldm	up, {r9,r10,r11,r12}
+	umull	r1, r2, r9, r9
+	umull	r3, r4, r10, r10
+	umull	r5, r6, r11, r11
+	umull	r7, r8, r12, r12
+	stm	rp, {r1,r2,r3,r4,r5,r6,r7}
+	umull	r1, r2, r9, r10
+	mov	r3, #0
+	umlal	r2, r3, r9, r11
+	mov	r4, #0
+	umlal	r3, r4, r9, r12
+	mov	r5, #0
+	umlal	r3, r5, r10, r11
+	umaal	r4, r5, r10, r12
+	mov	r6, #0
+	umlal	r5, r6, r11, r12
+	adds	r1, r1, r1
+	adcs	r2, r2, r2
+	adcs	r3, r3, r3
+	adcs	r4, r4, r4
+	adcs	r5, r5, r5
+	adcs	r6, r6, r6
+	add	rp, rp, #4
+	adc	r7, r8, #0
+	ldm	rp, {r8,r9,r10,r11,r12,r14}
+	adds	r1, r1, r8
+	adcs	r2, r2, r9
+	adcs	r3, r3, r10
+	adcs	r4, r4, r11
+	adcs	r5, r5, r12
+	adcs	r6, r6, r14
+	adc	r7, r7, #0
+	stm	rp, {r1,r2,r3,r4,r5,r6,r7}
+	pop	{r4-r11, pc}
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/arm/v6/submul_1.asm b/vendor/gmp-6.3.0/mpn/arm/v6/submul_1.asm
new file mode 100644
index 0000000..8a21733
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/arm/v6/submul_1.asm
@@ -0,0 +1,125 @@
+dnl  ARM mpn_submul_1.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3.75
+C Cortex-A15	 4.0
+
+C This loop complements U on the fly,
+C   U' = B^n - 1 - U
+C and then uses that
+C   R - U*v = R + U'*v + v - B^n v
+
+C TODO
+C  * Micro-optimise feed-in code.
+C  * Optimise for n=1,2 by delaying register saving.
+C  * Try using ldm/stm.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`v0',`r3')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+C In:  rp (r0), up (r1), n (r2), v0 (r3).  Out: r0.  Scratch: r4-r7 (saved
+C and restored here) and r12.
+C NOTE(review): n = 0 would take the L(fi0) path and load limbs; presumably
+C callers guarantee n >= 1 -- confirm.
+	stmfd	sp!, { r4, r5, r6, r7 }
+
+C Pick the feed-in sequence from n mod 4.  The mov does not update the
+C flags, so the beq still tests the ands result.  r12 is the carry limb of
+C the umaal chain and starts at v0 (presumably the + v term of the identity
+C in the file header).
+	ands	r6, n, #3
+	mov	r12, v0
+	beq	L(fi0)
+	cmp	r6, #2
+	bcc	L(fi1)
+	beq	L(fi2)
+
+C Each feed-in loads and complements (mvn) the first up[] limb, loads the
+C first rp[] limb, preloads the next up[] limb, and jumps into the unrolled
+C loop at the matching point.
+L(fi3):	ldr	r4, [up], #12
+	mvn	r4, r4
+	ldr	r6, [rp, #0]
+	ldr	r5, [up, #-8]
+	b	L(lo3)
+
+L(fi0):	ldr	r5, [up], #16
+	mvn	r5, r5
+	ldr	r7, [rp], #4
+	ldr	r4, [up, #-12]
+	b	L(lo0)
+
+C n mod 4 = 1: when this is the only limb, go straight to the final
+C multiply-accumulate at L(1).
+L(fi1):	ldr	r4, [up], #4
+	mvn	r4, r4
+	ldr	r6, [rp], #8
+	subs	n, n, #1
+	beq	L(1)
+	ldr	r5, [up]
+	b	L(lo1)
+
+L(fi2):	ldr	r5, [up], #8
+	mvn	r5, r5
+	ldr	r7, [rp], #12
+	ldr	r4, [up, #-4]
+	b	L(lo2)
+
+C Main loop, four limbs per pass.  Each umaal adds a complemented up[] limb
+C times v0, plus the carry in r12, into an rp[] limb held in r6 or r7; the
+C loads and stores for neighbouring limbs are interleaved between them.
+	ALIGN(16)
+L(top):	ldr	r6, [rp, #-8]
+	ldr	r5, [up]
+	str	r7, [rp, #-12]
+L(lo1):	umaal	r6, r12, r4, v0
+	add	up, up, #16
+	mvn	r5, r5
+	ldr	r7, [rp, #-4]
+	ldr	r4, [up, #-12]
+	str	r6, [rp, #-8]
+L(lo0):	umaal	r7, r12, r5, v0
+	mvn	r4, r4
+	ldr	r6, [rp, #0]
+	ldr	r5, [up, #-8]
+	str	r7, [rp, #-4]
+L(lo3):	umaal	r6, r12, r4, v0
+	mvn	r5, r5
+	ldr	r7, [rp, #4]
+	ldr	r4, [up, #-4]
+	str	r6, [rp], #16
+L(lo2):	umaal	r7, r12, r5, v0
+	mvn	r4, r4
+	subs	n, n, #4
+	bhi	L(top)
+
+C Last limb, then return v0 minus the final carry limb in r0 (presumably the
+C - B^n v term of the header identity expressed as the borrow -- confirm).
+	ldr	r6, [rp, #-8]
+	str	r7, [rp, #-12]
+L(1):	umaal	r6, r12, r4, v0
+	str	r6, [rp, #-8]
+	sub	r0, v0, r12
+	ldmfd	sp!, { r4, r5, r6, r7 }
+	bx	lr
+EPILOGUE() |