diff options
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm')
-rw-r--r-- | vendor/gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm | 156 |
1 files changed, 156 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm b/vendor/gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm
new file mode 100644
index 0000000..727f489
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm
@@ -0,0 +1,156 @@
+dnl  IA-64 mpn_sqr_diag_addlsh1
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      ?
+C Itanium 2:    2       Unrolling could bring it to 1.5 + epsilon
+
+C Exact performance table.  The 2nd line is this code, the 3rd line is ctop-
+C less code.  In an assembly sqr_basecase, the ctop-full numbers will become a
+C few cycles better since we can mitigate the many I0 instructions.
+C
+C         1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
+C         - 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 Needs updating
+C         - 13 16 17 18 20 21 23 25 26 30 31 31 33 34 36 38 39 42 43
+
+C We should keep in mind that this code takes linear time in a O(n^2) context
+C and that it will only be used under SQR_TOOM2_THRESHOLD, which might become
+C around 60.  Keeping overhead down for smallish operands (< 10) is more
+C important than optimal cycle counts.
+
+C TODO
+C  * Make sure we don't depend on uninitialised r-registers, f-registers, or
+C    p-registers.
+C  * Optimise by doing first two loop iterations in function header.
+
+C INPUT PARAMETERS (arrive in r32-r35; sizes are in limbs)
+define(`rp_param', `r32')  define(`rp', `r14')          C size: 2n
+define(`tp_param', `r33')  define(`tp', `r15')          C size: 2n - 2
+define(`up_param', `r34')  define(`up', `r31')          C size: n
+define(`n',        `r35')
+C The three pointers are copied to r14, r15 and r31 in the function header; r32-r55 rotate in the loop.
+ifdef(`HAVE_ABI_32',`
+        define(`ABI64', `')
+        define(`ABI32', `$1')
+',`
+        define(`ABI64', `$1')
+        define(`ABI32', `')
+')
+C NOTE(review): judging by the name and the loop, rp[] = squares of the up[] limbs + 2*tp[]; confirm vs. generic C.
+ASM_START()
+PROLOGUE(mpn_sqr_diag_addlsh1)
+C Saved state: ar.pfs in r2, ar.lc in r3, ar.ec in r9, pr in r10; all four are restored before br.ret.
+        .prologue                                       C unwind annotations for the two saves declared below
+        .save   ar.pfs, r2
+        .save   ar.lc, r3
+        .body
+C Function header: save state, copy the pointers, preload two tp limbs, zero r16/r44/r33, set ar.lc/ar.ec/pr.rot.
+ {.mii; alloc   r2 = ar.pfs, 4,24,0,24                  C M    4 in, 24 local, 0 out, 24 rotating (r32-r55)
+        mov     r3 = ar.lc                              C I0   save ar.lc
+        ABI64(` nop     4711            ')              C 64-bit ABI: slot filler
+        ABI32(` zxt4    n = n           ')              C 32-bit ABI: zero-extend the 32-bit count
+}{.mmi; ABI64(` mov     tp = tp_param   ')              C M I  64-bit ABI: plain copy of the pointer
+        ABI32(` addp4   tp = 0, tp_param')              C M I  32-bit ABI: addp4 forms the 64-bit address
+        ABI64(` mov     up = up_param   ')              C M I
+        ABI32(` addp4   up = 0, up_param')              C M I
+        ABI64(` mov     rp = rp_param   ')              C M I
+        ABI32(` addp4   rp = 0, rp_param')              C M I
+        ;;
+}{.mmi; ld8     r36 = [tp], 8                           C M    preload a tp limb, post-increment tp
+        add     r20 = -2, n                             C M I  r20 = n - 2, moved to ar.lc below
+        mov     r9 = ar.ec                              C I0   save ar.ec
+        ;;
+}{.mmi; ld8     r32 = [tp], 8                           C M    preload the next tp limb
+        mov     r16 = 0                                 C M I  r16 is read by the unpredicated setfsig in the loop
+        mov     ar.ec = 7                               C I0   epilogue count for the pipelined loop
+        ;;
+}{.mmi; nop     4711                                    C slot filler
+        mov     r44 = 0                                 C M I  zero initial value
+        mov     ar.lc = r20                             C I0   loop count = n - 2
+        ;;
+}{.mii; mov     r33 = 0                                 C zero initial value
+        mov     r10 = pr                                C I0   save the predicate registers
+        mov     pr.rot = 0x30000                        C I0   bits 16 and 17: p16 = p17 = 1, other rotating predicates 0
+        ;;
+}       br.cexit.spnt.few.clr L(end)                    C B    NOTE(review): counted-loop branch taken once before L(top); confirm exit case
+C Modulo-scheduled loop: the (pNN) qualifying predicates select pipeline stages; br.ctop rotates the registers.
+dnl *** MAIN LOOP START ***
+        ALIGN(32)
+L(top):
+ {.mfi; (p18)   ldf8    f33 = [up], 8                   C M    load an up limb into an FP register
+        (p20)   xma.l   f36 = f35, f35, f42             C F    low word of f35*f35 + f42
+        (p41)   cmpequc p50, p0 = -1, r44               C M I  will the increment of r44 below wrap?
+}{.mfi;         setfsig f40 = r16                       C M23  shifted tp word (r16) to FP; read as f42 by xma
+        (p20)   xma.hu  f38 = f35, f35, f42             C F    high word of f35*f35 + f42
+        (p23)   add     r50 = r41, r49                  C M I  r41/r49 are r39/r45 from earlier stages, after rotation
+        ;;
+}{.mmi; (p16)   ld8     r36 = [tp], 8                   C M
+        (p23)   cmpltu  p40, p0 = r50, r41              C cyout hi  M I
+        (p19)   shrp    r45 = r38, r35, 63              C non-critical  I0  (r38 << 1) | (r35 >> 63)
+}{.mmi; (p21)   getfsig r39 = f39                       C hi  M2
+        (p24)   st8     [rp] = r51, 8                   C hi  M23
+        (p41)   add     r44 = 1, r44                    C M I  p41 is p40 one rotation later: add the carry
+        ;;
+}{.mmi; (p16)   ld8     r32 = [tp], 8                   C M
+        (p50)   cmpeqor p40, p0 = -1, r50               C cyout hi  M I
+        (p17)   shrp    r16 = r33, r37, 63              C critical  I0  (r33 << 1) | (r37 >> 63)
+}{.mmi; (p21)   getfsig r42 = f37                       C lo  M2
+        (p23)   st8     [rp] = r44, 8                   C lo  M23
+        (p50)   add     r50 = 1, r50                    C M I  carry from the low word into the high word
+        ;;
+}       br.ctop.sptk.few.clr L(top)                     C B
+dnl *** MAIN LOOP END ***
+        ;;
+L(end):
+ {.mmi; nop     4711                                    C slot filler
+        (p41)   add     r44 = 1, r44                    C M I  add the pending carry
+        shr.u   r48 = r39, 63                           C I0   r48 = bit 63 of r39
+        ;;
+}{.mmi; st8     [rp] = r51, 8                           C M23
+        (p41)   cmpequc p6, p0 = 0, r44                 C M I  p6: the incremented r44 wrapped to 0
+        add     r50 = r41, r48                          C M I
+        ;;
+}{.mmi; st8     [rp] = r44, 8                           C M23
+        (p6)    add     r50 = 1, r50                    C M I  carry into the last word
+        mov     ar.lc = r3                              C I0   restore ar.lc
+        ;;
+}{.mii; st8     [rp] = r50                              C M23  last store, no post-increment
+        mov     ar.ec = r9                              C I0   restore ar.ec
+        mov     pr = r10                                C I0   restore the predicate registers
+        ;;
+}{.mib; nop     4711                                    C slot filler
+        mov     ar.pfs = r2                             C I0   restore ar.pfs
+        br.ret.sptk.many b0                             C B
+}
+EPILOGUE()
|