diff options
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm')
-rw-r--r-- | vendor/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm | 237 |
1 files changed, 237 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm
new file mode 100644
index 0000000..7789117
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm
@@ -0,0 +1,237 @@
+dnl IA-64 mpn_mod_34lsub1
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2003-2005, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C Itanium:       ?
+C Itanium 2:     1
+
+C Returns in r8 a value congruent modulo 2^48-1 to the n 64-bit limbs at up.
+C INPUT PARAMETERS
+define(`up', `r32')                     C address of the next limb to load
+define(`n', `r33')                      C limb count, counted down as limbs are consumed
+
+C Some useful aliases for registers we use
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16')    C limbs loaded from [up]
+define(`a0',`r17') define(`a1',`r18') define(`a2',`r19')    C accumulators; limbs are subtracted from them
+define(`c0',`r20') define(`c1',`r21') define(`c2',`r22')    C borrow counts: c1 from a0, c2 from a1, c0 from a2
+
+C This is a fairly simple-minded implementation. One could approach 0.67 c/l
+C with a more sophisticated implementation. If we're really crazy, we could
+C super-unroll, storing carries just in predicate registers, then copy them to
+C a general register, and population count them from there. That'd bring us
+C close to 3 insn/limb, for nearly 0.5 c/l.
+
+C Computing n/3 needs 16 cycles, which is a lot of startup overhead.
+C We therefore use a plain while-style loop:
+C       add n = -3, n
+C       cmp.le p9, p0 = 3, n
+C       (p9) br.cond .Loop
+C Alternatively, we could table n/3 for, say, n < 256, and predicate the
+C 16-cycle code.
+
+C The summing-up code at the end was written quickly, and could surely be
+C vastly improved.
+
+ASM_START()
+PROLOGUE(mpn_mod_34lsub1)
+        .prologue
+        .save   ar.lc, r2               C NOTE(review): ar.lc is written only inside the disabled ifelse block below
+        .body
+ifdef(`HAVE_ABI_32',`                   C 32-bit ABI only: widen the pointer and zero-extend the count
+        addp4   up = 0, up              C M I
+        nop.m   0
+        zxt4    n = n                   C I
+        ;;
+')
+
+ifelse(0,1,`                            C disabled since 0 and 1 differ: would put the loop count n/3 in ar.lc
+        movl    r14 = 0xAAAAAAAAAAAAAAAB
+        ;;
+        setf.sig f6 = r14
+        setf.sig f7 = r33
+        ;;
+        xmpy.hu f6 = f6, f7
+        ;;
+        getf.sig r8 = f6
+        ;;
+        shr.u   r8 = r8, 1              C Loop count
+        ;;
+        mov.i   ar.lc = r8
+')
+C NOTE(review): n >= 1 looks assumed; the first load is unconditional and n = 0 has no path of its own.
+        ld8     u0 = [up], 8            C u0 = first limb; up steps to the next limb
+        cmp.ne  p9, p0 = 1, n           C p9 = (n != 1)
+  (p9)  br      L(gt1)
+        ;;
+        shr.u   r8 = u0, 48             C n = 1: bits 48..63 of the limb
+        dep.z   r27 = u0, 0, 48         C n = 1: bits 0..47 of the limb
+        ;;
+        add     r8 = r8, r27            C fold at bit 48, since 2^48 = 1 modulo 2^48-1
+        br.ret.sptk.many b0
+
+C n > 1: clear the accumulators and borrow counters and preload u1.
+L(gt1):
+ {.mmi; nop.m   0
+        mov     a0 = 0
+        add     n = -2, n               C n now counts the limbs beyond u0 and u1
+}{.mmi; mov     c0 = 0
+        mov     c1 = 0
+        mov     c2 = 0
+        ;;
+}{.mmi; ld8     u1 = [up], 8
+        mov     a1 = 0
+        cmp.ltu p6, p0 = r0, r0         C clear p6
+}{.mmb; cmp.gt  p9, p0 = 3, n           C p9 = (3 > n): fewer than three limbs left, skip the loop
+        mov     a2 = 0
+  (p9)  br.cond.dptk L(end)
+        ;;
+}
+        ALIGN(32)
+L(top):
+ {.mmi; ld8     u2 = [up], 8
+  (p6)  add     c0 = 1, c0              C count the borrow left pending by the last a2 subtract
+        cmp.ltu p7, p0 = a0, u0         C p7 = (a0 < u0): the subtract in this group borrows
+}{.mmb; sub     a0 = a0, u0
+        add     n = -3, n               C each pass loads three limbs
+        nop.b   0
+        ;;
+}{.mmi; ld8     u0 = [up], 8            C u0 for the next pass or for the tail code
+  (p7)  add     c1 = 1, c1
+        cmp.ltu p8, p0 = a1, u1         C p8 = (a1 < u1)
+}{.mmb; sub     a1 = a1, u1
+        cmp.le  p9, p0 = 3, n           C p9 = (3 <= n): room for another full pass
+        nop.b   0
+        ;;
+}{.mmi; ld8     u1 = [up], 8            C u1 for the next pass or for the tail code
+  (p8)  add     c2 = 1, c2
+        cmp.ltu p6, p0 = a2, u2         C p6 is consumed by the next pass or by the tail code
+}{.mmb; sub     a2 = a2, u2
+        nop.m   0
+dnl     br.cloop.dptk L(top)
+  (p9)  br.cond.dptk L(top)
+        ;;
+}
+L(end):
+        cmp.eq  p10, p0 = 0, n          C p10 = no limb left to load
+        cmp.eq  p11, p0 = 1, n          C p11 = exactly one limb left to load
+  (p10) br      L(0)
+
+L(2):
+ {.mmi; ld8     u2 = [up], 8
+  (p6)  add     c0 = 1, c0
+        cmp.ltu p7, p0 = a0, u0
+}{.mmb; sub     a0 = a0, u0
+        nop.m   0
+  (p11) br      L(1)                    C u2 was the last limb
+        ;;
+}       ld8     u0 = [up], 8            C otherwise one more limb is loaded and handled as the last
+  (p7)  add     c1 = 1, c1
+        cmp.ltu p8, p0 = a1, u1
+        sub     a1 = a1, u1
+        ;;
+  (p8)  add     c2 = 1, c2
+        cmp.ltu p6, p0 = a2, u2
+        sub     a2 = a2, u2
+        ;;
+  (p6)  add     c0 = 1, c0
+        cmp.ltu p7, p0 = a0, u0
+        sub     a0 = a0, u0             C a0 takes the final limb
+        ;;
+  (p7)  add     c1 = 1, c1
+        br      L(com)
+
+C Reached with p11 set: a0 is already updated, u1 and u2 are still to be subtracted.
+L(1):
+  (p7)  add     c1 = 1, c1
+        cmp.ltu p8, p0 = a1, u1
+        sub     a1 = a1, u1
+        ;;
+  (p8)  add     c2 = 1, c2
+        cmp.ltu p6, p0 = a2, u2
+        sub     a2 = a2, u2
+        ;;
+  (p6)  add     c0 = 1, c0
+        br      L(com)
+
+C Reached with p10 set: only the already loaded u0 and u1 are still to be subtracted.
+L(0):
+  (p6)  add     c0 = 1, c0
+        cmp.ltu p7, p0 = a0, u0
+        sub     a0 = a0, u0
+        ;;
+  (p7)  add     c1 = 1, c1
+        cmp.ltu p8, p0 = a1, u1
+        sub     a1 = a1, u1
+        ;;
+  (p8)  add     c2 = 1, c2
+C Modulo 2^48-1 the a1 and c1 lane weighs 2^64 = 2^16 and the a2 and c2 lane weighs 2^128 = 2^32.
+L(com):
+C |     a2    |     a1    |     a0    |
+C |        |        |        |        |
+        shr.u   r24 = a0, 48            C 16 bits
+        shr.u   r25 = a1, 32            C 32 bits
+        shr.u   r26 = a2, 16            C 48 bits
+        ;;
+        shr.u   r10 = c0, 48            C 16 bits, always zero
+        shr.u   r11 = c1, 32            C 32 bits
+        shr.u   r30 = c2, 16            C 48 bits
+        ;;
+        dep.z   r27 = a0, 0, 48         C 48 bits
+        dep.z   r28 = a1, 16, 32        C 48 bits
+        dep.z   r29 = a2, 32, 16        C 48 bits
+        dep.z   r31 = c0, 0, 48         C 48 bits
+        dep.z   r14 = c1, 16, 32        C 48 bits
+        dep.z   r15 = c2, 32, 16        C 48 bits
+        ;;
+ {.mmi; add     r24 = r24, r25          C r24, r26, r28: pairwise sums of the six accumulator pieces
+        add     r26 = r26, r27
+        add     r28 = r28, r29
+}{.mmi; add     r10 = r10, r11          C r10, r30, r14: pairwise sums of the six borrow-count pieces
+        add     r30 = r30, r31
+        add     r14 = r14, r15
+        ;;
+}
+        movl    r8 = 0xffffffffffff0    C 16*(2^48-1): zero modulo 2^48-1 and larger than the r24 total
+        add     r24 = r24, r26
+        add     r10 = r10, r30
+        ;;
+        add     r24 = r24, r28          C r24 = all accumulator pieces
+        add     r10 = r10, r14          C r10 = all borrow-count pieces
+        ;;
+        sub     r8 = r8, r24            C the accumulators hold minus the limb sums, so subtract them
+        ;;
+        add     r8 = r8, r10            C each counted borrow adds back one unit of its lane weight
+        br.ret.sptk.many b0             C result in r8; it is not reduced below 2^48-1
+EPILOGUE()
+ASM_END()
|