Diffstat (limited to 'vendor/gmp-6.3.0/mpn/ia64')
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/README                 281
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/add_n_sub_n.asm        307
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/addmul_1.asm           602
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/addmul_2.asm           715
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/aors_n.asm             852
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/aorsorrlsh1_n.asm       48
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/aorsorrlsh2_n.asm       48
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm      412
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/bdiv_dbm1c.asm         516
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/cnd_aors_n.asm         264
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/copyd.asm              186
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/copyi.asm              182
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/dive_1.asm             236
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/divrem_1.asm           477
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/divrem_2.asm           280
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/gcd_11.asm             110
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/gmp-mparam.h           212
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/hamdist.asm            365
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/ia64-defs.m4           147
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/invert_limb.asm        105
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/logops_n.asm           292
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/lorrshift.asm          358
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/lshiftc.asm            463
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm        237
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/mode1o.asm             342
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/mul_1.asm              584
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/mul_2.asm              625
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/popcount.asm           200
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/rsh1aors_n.asm         447
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/sec_tabselect.asm      148
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm   156
-rw-r--r--  vendor/gmp-6.3.0/mpn/ia64/submul_1.asm           647
32 files changed, 10844 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/ia64/README b/vendor/gmp-6.3.0/mpn/ia64/README
new file mode 100644
index 0000000..45c2d63
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/README
@@ -0,0 +1,281 @@
+Copyright 2000-2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+ IA-64 MPN SUBROUTINES
+
+
+This directory contains mpn functions for the IA-64 architecture.
+
+
+CODE ORGANIZATION
+
+	mpn/ia64	itanium-2 and generic ia64
+
+The code here has been optimized primarily for Itanium 2. Very few Itanium 1
+chips were ever sold, and Itanium 2 is more powerful, so the latter is what
+we concentrate on.
+
+
+
+CHIP NOTES
+
+The IA-64 ISA packs instructions three at a time into 128-bit bundles.
+Programmers/compilers need to insert explicit stops `;;' where there are WAW
+or RAW dependencies, with some notable exceptions.  Such "stops" are
+typically placed at the end of a bundle, but can be put between operations
+within some bundle types too.
+
+The Itanium 1 and Itanium 2 implementations can, under ideal conditions,
+execute two bundles per cycle.  Of these 6 instructions, the Itanium 1
+allows 4 to be integer operations, while the Itanium 2 allows all 6 to be
+integer operations.
+
+Taken cloop branches seem to insert a bubble into the pipeline most of the
+time on Itanium 1.
+
+Loads to the fp registers bypass the L1 cache and thus get extremely long
+latencies, 9 cycles on the Itanium 1 and 6 cycles on the Itanium 2.
+
+The software pipelining support using the br.ctop instruction causes
+delays, since many issue slots are taken up by instructions whose
+predicates are zero, and since many extra instructions are needed to set
+things up.  These features are clearly designed for code density, not
+speed.
+
+Misc pipeline limitations (Itanium 1):
+* The getf.sig instruction can only execute in M0.
+* At most four integer instructions/cycle.
+* Nops take up resources just like plain instructions.
+
+Misc pipeline limitations (Itanium 2):
+* The getf.sig instruction can only execute in M0.
+* Nops take up resources just like plain instructions.
+
+
+ASSEMBLY SYNTAX
+
+.align pads with nops in a text segment, but gas 2.14 and earlier
+incorrectly byte-swaps its nop bundle in big-endian mode (e.g. hpux), making
+it come out as break instructions.  We use the ALIGN() macro from
+mpn/ia64/ia64-defs.m4 wherever the padding might be executed across.  That
+macro suppresses any .align if configure detects the problem.  Lack of
+alignment might hurt performance but will at least be correct.
+
+foo:: to create a global symbol is not accepted by gas. Use separate
+".global foo" and "foo:" instead.
+
+.global is the standard global directive. gas accepts .globl, but hpux "as"
+doesn't.
+
+.proc / .endp generates the appropriate .type and .size information for ELF,
+so the latter directives don't need to be given explicitly.
+
+.pred.rel "mutex"... is standard for annotating predicate register
+relationships. gas also accepts .pred.rel.mutex, but hpux "as" doesn't.
+
+.pred directives can't be put on the same line as a label, as in
+".Lfoo: .pred ..."; the HP assembler on HP-UX 11.23 rejects that.  gas is
+happy with it, and past HP versions seemed to accept it.
+
+// is the standard comment sequence, but we prefer "C" since it inhibits m4
+macro expansion. See comments in ia64-defs.m4.
+
+
+REGISTER USAGE
+
+Special:
+ r0: constant 0
+ r1: global pointer (gp)
+ r8: return value
+ r12: stack pointer (sp)
+ r13: thread pointer (tp)
+Caller-saves: r8-r11 r14-r31 f6-f15 f32-f127
+Caller-saves but rotating: r32-
+
+
+================================================================
+mpn_add_n, mpn_sub_n:
+
+The current code runs at 1.25 c/l on Itanium 2.
+
+================================================================
+mpn_mul_1:
+
+The current code runs at 2 c/l on Itanium 2.
+
+Using a blocked approach, working on 4 separate places in the operands, one
+could make use of xma's accumulation and approach 1 c/l:
+
+ ldf8 [up]
+ xma.l
+ xma.hu
+ stf8 [wrp]
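+
+For reference, a portable C sketch of what mpn_mul_1 computes (an
+illustration only; ref_mul_1 is our name, the limb is assumed 64 bits, and
+unsigned __int128 stands in for the xma double-limb product):
+
+	mp_limb_t
+	ref_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
+	{
+	  mp_limb_t cy = 0;
+	  for (mp_size_t i = 0; i < n; i++)
+	    {
+	      unsigned __int128 t = (unsigned __int128) up[i] * vl + cy;
+	      rp[i] = (mp_limb_t) t;		/* low product limb */
+	      cy = (mp_limb_t) (t >> 64);	/* high limb feeds the next step */
+	    }
+	  return cy;				/* most significant limb */
+	}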
+
+================================================================
+mpn_addmul_1:
+
+The current code runs at 2 c/l on Itanium 2.
+
+It seems possible to use a blocked approach, as with mpn_mul_1.  We should
+read rp[] into integer registers, allowing for just one getf.sig per cycle.
+
+ ld8 [rp]
+ ldf8 [up]
+ xma.l
+ xma.hu
+ getf.sig
+ add+add+cmp+cmp
+ st8 [wrp]
+
+These 10 instructions can be scheduled to approach 1.667 cycles (10 insn at
+6 insn/cycle), and with the 4-cycle latency of xma, this means we need at
+least 3 blocks.  Using ldfp8 we could approach 1.583 c/l.
+
+================================================================
+mpn_submul_1:
+
+The current code runs at 2.25 c/l on Itanium 2.  Getting to 2 c/l requires
+ldfp8 with all the alignment headaches that implies.
+
+================================================================
+mpn_addmul_N
+
+For best speed, we need to give up using mpn_addmul_2 as the main multiply
+building block, and instead take multiple v limbs per loop. For the Itanium
+1, we need to take about 8 limbs at a time for full speed. For the Itanium
+2, something like mpn_addmul_4 should be enough.
+
+The add+cmp+cmp+add we use in the other code is optimal for shortening
+recurrences (1 cycle), but the sequence takes up 4 execution slots.  When
+recurrence depth is not critical, a more standard 3-cycle add+cmp+add is
+better.
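+
+In C terms, the 1-cycle scheme folds the incoming carry into the sum and
+selects the carry-out compare by the carry-in (a sketch for illustration;
+cy is 0 or 1):
+
+	w = u + v + cy;
+	cy = cy ? (w <= v) : (w < v);	/* cmp.leu resp. cmp.ltu */
+
+The two predicated adds plus the two predicated compares are what occupy
+the 4 execution slots.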
+
+/* First load the 8 values from v */
+ ldfp8 v0, v1 = [r35], 16;;
+ ldfp8 v2, v3 = [r35], 16;;
+ ldfp8 v4, v5 = [r35], 16;;
+ ldfp8 v6, v7 = [r35], 16;;
+
+/* In the inner loop, get a new U limb and store a result limb. */
+ mov lc = un
+Loop: ldf8 u0 = [r33], 8
+ ld8 r0 = [r32]
+ xma.l lp0 = v0, u0, hp0
+ xma.hu hp0 = v0, u0, hp0
+ xma.l lp1 = v1, u0, hp1
+ xma.hu hp1 = v1, u0, hp1
+ xma.l lp2 = v2, u0, hp2
+ xma.hu hp2 = v2, u0, hp2
+ xma.l lp3 = v3, u0, hp3
+ xma.hu hp3 = v3, u0, hp3
+ xma.l lp4 = v4, u0, hp4
+ xma.hu hp4 = v4, u0, hp4
+ xma.l lp5 = v5, u0, hp5
+ xma.hu hp5 = v5, u0, hp5
+ xma.l lp6 = v6, u0, hp6
+ xma.hu hp6 = v6, u0, hp6
+ xma.l lp7 = v7, u0, hp7
+ xma.hu hp7 = v7, u0, hp7
+ getf.sig l0 = lp0
+ getf.sig l1 = lp1
+ getf.sig l2 = lp2
+ getf.sig l3 = lp3
+ getf.sig l4 = lp4
+ getf.sig l5 = lp5
+ getf.sig l6 = lp6
+ add+cmp+add xx, l0, r0
+ add+cmp+add acc0, acc1, l1
+ add+cmp+add acc1, acc2, l2
+ add+cmp+add acc2, acc3, l3
+ add+cmp+add acc3, acc4, l4
+ add+cmp+add acc4, acc5, l5
+ add+cmp+add acc5, acc6, l6
+ getf.sig acc6 = lp7
+ st8 [r32] = xx, 8
+ br.cloop Loop
+
+ 49 insn at max 6 insn/cycle: 8.167 cycles/limb8
+ 11 memops at max 2 memops/cycle: 5.5 cycles/limb8
+ 16 fpops at max 2 fpops/cycle: 8 cycles/limb8
+ 21 intops at max 4 intops/cycle: 5.25 cycles/limb8
+ 11+21 memops+intops at max 4/cycle 8 cycles/limb8
+
+================================================================
+mpn_lshift, mpn_rshift
+
+The current code runs at 1 cycle/limb on Itanium 2.
+
+Using 63 separate loops, we could use the double-word shrp instruction.
+That instruction has a plain single-cycle latency.  We need 63 loops since
+this instruction only accepts an immediate count.  That would lead to a
+somewhat silly code size, but the speed would be 0.75 c/l on Itanium 2 (by
+using shrp each cycle plus shl/shr going down I1 for a further limb every
+second cycle).
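+
+In C terms, each result limb of an lshift is a double-word extract, which
+is exactly one shrp with immediate count 64-cnt (a sketch, assuming
+0 < cnt < 64):
+
+	rp[i] = (up[i] << cnt) | (up[i-1] >> (64 - cnt));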
+
+================================================================
+mpn_copyi, mpn_copyd
+
+The current code runs at 0.5 c/l on Itanium 2.  But that is just for L1
+cache hits.  The 4-way unrolled loop takes just 2 cycles, and thus load-use
+scheduling isn't great.  It might be best to actually use modulo-scheduled
+loops, since that would allow us to do better load-use scheduling without
+too much unrolling.
+
+Depending on size or operand alignment, we get 1 c/l or 0.5 c/l on Itanium
+2, according to tune/speed. Cache bank conflicts?
+
+
+
+REFERENCES
+
+Intel Itanium Architecture Software Developer's Manual, volumes 1 to 3,
+Intel documents 245317-004, 245318-004 and 245319-004, October 2002.
+Volume 1 includes an Itanium optimization guide.
+
+Intel Itanium Processor-specific Application Binary Interface (ABI), Intel
+document 245370-003, May 2001. Describes C type sizes, dynamic linking,
+etc.
+
+Intel Itanium Architecture Assembly Language Reference Guide, Intel document
+248801-004, 2000-2002. Describes assembly instruction syntax and other
+directives.
+
+Itanium Software Conventions and Runtime Architecture Guide, Intel document
+245358-003, May 2001. Describes calling conventions, including stack
+unwinding requirements.
+
+Intel Itanium Processor Reference Manual for Software Optimization, Intel
+document 245473-003, November 2001.
+
+Intel Itanium-2 Processor Reference Manual for Software Development and
+Optimization, Intel document 251110-003, May 2004.
+
+All the above documents can be found online at
+
+ http://developer.intel.com/design/itanium/manuals.htm
diff --git a/vendor/gmp-6.3.0/mpn/ia64/add_n_sub_n.asm b/vendor/gmp-6.3.0/mpn/ia64/add_n_sub_n.asm
new file mode 100644
index 0000000..c15afaa
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/add_n_sub_n.asm
@@ -0,0 +1,307 @@
+dnl IA-64 mpn_add_n_sub_n -- mpn parallel addition and subtraction.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: ?
+C Itanium 2: 2.25
+
+C INPUT PARAMETERS
+define(`sp', `r32')
+define(`dp', `r33')
+define(`up', `r34')
+define(`vp', `r35')
+define(`n', `r36')
+
+C Some useful aliases for registers we use
+define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
+define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
+define(`s0',`r24') define(`s1',`r25') define(`s2',`r26') define(`s3',`r27')
+define(`d0',`r28') define(`d1',`r29') define(`d2',`r30') define(`d3',`r31')
+define(`up0',`up')
+define(`up1',`r14')
+define(`vp0',`vp')
+define(`vp1',`r15')
+
+
+ASM_START()
+PROLOGUE(mpn_add_n_sub_n)
+ .prologue
+ .save ar.lc, r2
+ .body
+ifdef(`HAVE_ABI_32',`
+ addp4 sp = 0, sp C M I
+ addp4 dp = 0, dp C M I
+ nop.i 0
+ addp4 up = 0, up C M I
+ addp4 vp = 0, vp C M I
+ zxt4 n = n C I
+ ;;
+')
+
+ and r9 = 3, n C M I
+ mov.i r2 = ar.lc C I0
+ add up1 = 8, up0 C M I
+ add vp1 = 8, vp0 C M I
+ add r8 = -2, n C M I
+ add r10 = 256, up C M I
+ ;;
+ shr.u r8 = r8, 2 C I0
+ cmp.eq p10, p0 = 0, r9 C M I
+ cmp.eq p11, p0 = 2, r9 C M I
+ cmp.eq p12, p0 = 3, r9 C M I
+ add r11 = 256, vp C M I
+ ;;
+ mov.i ar.lc = r8 C I0
+ (p10) br L(b0) C B
+ (p11) br L(b2) C B
+ (p12) br L(b3) C B
+
+L(b1): ld8 u3 = [up0], 8 C M01
+ add up1 = 8, up1 C M I
+ cmpltu p14, p15 = 4, n C M I
+ ld8 v3 = [vp0], 8 C M01
+ add vp1 = 8, vp1 C M I
+ ;;
+ add s3 = u3, v3 C M I
+ sub d3 = u3, v3 C M I
+ mov r8 = 0 C M I
+ ;;
+ cmpltu p9, p0 = s3, v3 C carry from add3 M I
+ cmpltu p13, p0 = u3, v3 C borrow from sub3 M I
+ (p15) br L(cj1) C B
+ st8 [sp] = s3, 8 C M23
+ st8 [dp] = d3, 8 C M23
+ br L(c0) C B
+
+L(b0): cmp.ne p9, p0 = r0, r0 C M I
+ cmp.ne p13, p0 = r0, r0 C M I
+L(c0): ld8 u0 = [up0], 16 C M01
+ ld8 u1 = [up1], 16 C M01
+ ;;
+ ld8 v0 = [vp0], 16 C M01
+ ld8 v1 = [vp1], 16 C M01
+ ;;
+ ld8 u2 = [up0], 16 C M01
+ ld8 u3 = [up1], 16 C M01
+ ;;
+ ld8 v2 = [vp0], 16 C M01
+ ld8 v3 = [vp1], 16 C M01
+ ;;
+ add s0 = u0, v0 C M I
+ add s1 = u1, v1 C M I
+ sub d0 = u0, v0 C M I
+ sub d1 = u1, v1 C M I
+ ;;
+ cmpltu p6, p0 = s0, v0 C carry from add0 M I
+ cmpltu p7, p0 = s1, v1 C carry from add1 M I
+ cmpltu p10, p0 = u0, v0 C borrow from sub0 M I
+ cmpltu p11, p0 = u1, v1 C borrow from sub1 M I
+ ;;
+ nop 0 C
+ br.cloop.dptk L(top) C B
+ br L(end) C B
+
+L(b3): ld8 u1 = [up0], 8 C M01
+ add up1 = 8, up1 C M I
+ ld8 v1 = [vp0], 8 C M01
+ ;;
+ add vp1 = 8, vp1 C M I
+ add s1 = u1, v1 C M I
+ sub d1 = u1, v1 C M I
+ ;;
+ cmpltu p7, p0 = s1, v1 C carry from add1 M I
+ cmpltu p11, p0 = u1, v1 C borrow from sub1 M I
+ ;;
+ st8 [sp] = s1, 8 C M23
+ st8 [dp] = d1, 8 C M23
+ br L(c2) C B
+
+ ALIGN(32)
+L(b2): cmp.ne p7, p0 = r0, r0 C M I
+ cmp.ne p11, p0 = r0, r0 C M I
+ nop 0
+L(c2): ld8 u2 = [up0], 16 C M01
+ ld8 u3 = [up1], 16 C M01
+ cmpltu p14, p0 = 4, n C M I
+ ;;
+ ld8 v2 = [vp0], 16 C M01
+ ld8 v3 = [vp1], 16 C M01
+ (p14) br L(gt4) C B
+ ;;
+ add s2 = u2, v2 C M I
+ add s3 = u3, v3 C M I
+ sub d2 = u2, v2 C M I
+ sub d3 = u3, v3 C M I
+ ;;
+	cmpltu	p8, p0 = s2, v2		C carry from add2	M I
+ cmpltu p9, p0 = s3, v3 C carry from add3 M I
+ cmpltu p12, p0 = u2, v2 C borrow from sub2 M I
+ cmpltu p13, p0 = u3, v3 C borrow from sub3 M I
+ br L(cj2) C B
+ ;;
+L(gt4): ld8 u0 = [up0], 16 C M01
+ ld8 u1 = [up1], 16 C M01
+ ;;
+ ld8 v0 = [vp0], 16 C M01
+ ld8 v1 = [vp1], 16 C M01
+ ;;
+ add s2 = u2, v2 C M I
+ add s3 = u3, v3 C M I
+ sub d2 = u2, v2 C M I
+ sub d3 = u3, v3 C M I
+ ;;
+	cmpltu	p8, p0 = s2, v2		C carry from add2	M I
+	cmpltu	p9, p0 = s3, v3		C carry from add3	M I
+	cmpltu	p12, p0 = u2, v2	C borrow from sub2	M I
+	cmpltu	p13, p0 = u3, v3	C borrow from sub3	M I
+ br.cloop.dptk L(mid) C B
+
+ ALIGN(32)
+L(top):
+ ld8 u0 = [up0], 16 C M01
+ ld8 u1 = [up1], 16 C M01
+ (p9) cmpeqor p6, p0 = -1, s0 C M I
+ (p9) add s0 = 1, s0 C M I
+ (p13) cmpeqor p10, p0 = 0, d0 C M I
+ (p13) add d0 = -1, d0 C M I
+ ;;
+ ld8 v0 = [vp0], 16 C M01
+ ld8 v1 = [vp1], 16 C M01
+ (p6) cmpeqor p7, p0 = -1, s1 C M I
+ (p6) add s1 = 1, s1 C M I
+ (p10) cmpeqor p11, p0 = 0, d1 C M I
+ (p10) add d1 = -1, d1 C M I
+ ;;
+ st8 [sp] = s0, 8 C M23
+ st8 [dp] = d0, 8 C M23
+ add s2 = u2, v2 C M I
+ add s3 = u3, v3 C M I
+ sub d2 = u2, v2 C M I
+ sub d3 = u3, v3 C M I
+ ;;
+ st8 [sp] = s1, 8 C M23
+ st8 [dp] = d1, 8 C M23
+ cmpltu p8, p0 = s2, v2 C carry from add2 M I
+ cmpltu p9, p0 = s3, v3 C carry from add3 M I
+ cmpltu p12, p0 = u2, v2 C borrow from sub2 M I
+ cmpltu p13, p0 = u3, v3 C borrow from sub3 M I
+ ;;
+L(mid):
+ ld8 u2 = [up0], 16 C M01
+ ld8 u3 = [up1], 16 C M01
+ (p7) cmpeqor p8, p0 = -1, s2 C M I
+ (p7) add s2 = 1, s2 C M I
+ (p11) cmpeqor p12, p0 = 0, d2 C M I
+ (p11) add d2 = -1, d2 C M I
+ ;;
+ ld8 v2 = [vp0], 16 C M01
+ ld8 v3 = [vp1], 16 C M01
+ (p8) cmpeqor p9, p0 = -1, s3 C M I
+ (p8) add s3 = 1, s3 C M I
+ (p12) cmpeqor p13, p0 = 0, d3 C M I
+ (p12) add d3 = -1, d3 C M I
+ ;;
+ st8 [sp] = s2, 8 C M23
+ st8 [dp] = d2, 8 C M23
+ add s0 = u0, v0 C M I
+ add s1 = u1, v1 C M I
+ sub d0 = u0, v0 C M I
+ sub d1 = u1, v1 C M I
+ ;;
+ st8 [sp] = s3, 8 C M23
+ st8 [dp] = d3, 8 C M23
+ cmpltu p6, p0 = s0, v0 C carry from add0 M I
+ cmpltu p7, p0 = s1, v1 C carry from add1 M I
+ cmpltu p10, p0 = u0, v0 C borrow from sub0 M I
+ cmpltu p11, p0 = u1, v1 C borrow from sub1 M I
+ ;;
+ lfetch [r10], 32 C M?
+ lfetch [r11], 32 C M?
+ br.cloop.dptk L(top) C B
+ ;;
+
+L(end):
+ nop 0
+ nop 0
+ (p9) cmpeqor p6, p0 = -1, s0 C M I
+ (p9) add s0 = 1, s0 C M I
+ (p13) cmpeqor p10, p0 = 0, d0 C M I
+ (p13) add d0 = -1, d0 C M I
+ ;;
+ nop 0
+ nop 0
+ (p6) cmpeqor p7, p0 = -1, s1 C M I
+ (p6) add s1 = 1, s1 C M I
+ (p10) cmpeqor p11, p0 = 0, d1 C M I
+ (p10) add d1 = -1, d1 C M I
+ ;;
+ st8 [sp] = s0, 8 C M23
+ st8 [dp] = d0, 8 C M23
+ add s2 = u2, v2 C M I
+ add s3 = u3, v3 C M I
+ sub d2 = u2, v2 C M I
+ sub d3 = u3, v3 C M I
+ ;;
+ st8 [sp] = s1, 8 C M23
+ st8 [dp] = d1, 8 C M23
+ cmpltu p8, p0 = s2, v2 C carry from add2 M I
+ cmpltu p9, p0 = s3, v3 C carry from add3 M I
+ cmpltu p12, p0 = u2, v2 C borrow from sub2 M I
+ cmpltu p13, p0 = u3, v3 C borrow from sub3 M I
+ ;;
+L(cj2):
+ (p7) cmpeqor p8, p0 = -1, s2 C M I
+ (p7) add s2 = 1, s2 C M I
+ (p11) cmpeqor p12, p0 = 0, d2 C M I
+ (p11) add d2 = -1, d2 C M I
+ mov r8 = 0 C M I
+ nop 0
+ ;;
+ st8 [sp] = s2, 8 C M23
+ st8 [dp] = d2, 8 C M23
+ (p8) cmpeqor p9, p0 = -1, s3 C M I
+ (p8) add s3 = 1, s3 C M I
+ (p12) cmpeqor p13, p0 = 0, d3 C M I
+ (p12) add d3 = -1, d3 C M I
+ ;;
+L(cj1):
+ (p9) mov r8 = 2 C M I
+ ;;
+ mov.i ar.lc = r2 C I0
+ (p13) add r8 = 1, r8 C M I
+ st8 [sp] = s3 C M23
+ st8 [dp] = d3 C M23
+ br.ret.sptk.many b0 C B
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/addmul_1.asm b/vendor/gmp-6.3.0/mpn/ia64/addmul_1.asm
new file mode 100644
index 0000000..ffa3297
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/addmul_1.asm
@@ -0,0 +1,602 @@
+dnl IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
+dnl result to a second limb vector.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2000-2005, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 3.0
+C Itanium 2: 2.0
+
+C TODO
+C * Further optimize feed-in and wind-down code, both for speed and code size.
+C * Handle low limb input and results specially, using a common stf8 in the
+C epilogue.
+C * Use a 1 c/l carry propagation scheme in the wind-down code.
+C * Use extra pointer registers for `up' and `rp' to speed up feed-in loads.
+C * Work out final differences with mul_1.asm. That function is 300 bytes
+C smaller than this due to better loop scheduling and thus simpler feed-in
+C code.
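+C
+C A sketch of how schoolbook multiplication typically uses this primitive
+C (an illustration, not GMP's actual mul_basecase):
+C
+C	rp[n] = mpn_mul_1 (rp, up, n, vp[0]);
+C	for (i = 1; i < vn; i++)
+C	  rp[n + i] = mpn_addmul_1 (rp + i, up, n, vp[i]);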
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n', `r34')
+define(`vl', `r35')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ .prologue
+ .save ar.lc, r2
+ .body
+
+ifdef(`HAVE_ABI_32',
+` addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ zxt4 n = n C I
+ ;;
+')
+{.mmi
+ adds r15 = -1, n C M I
+ mov r20 = rp C M I
+ mov.i r2 = ar.lc C I0
+}
+{.mmi
+ ldf8 f7 = [up], 8 C M
+ ldf8 f8 = [rp], 8 C M
+ and r14 = 3, n C M I
+ ;;
+}
+{.mmi
+ setf.sig f6 = vl C M2 M3
+ cmp.eq p10, p0 = 0, r14 C M I
+ shr.u r31 = r15, 2 C I0
+}
+{.mmi
+ cmp.eq p11, p0 = 2, r14 C M I
+ cmp.eq p12, p0 = 3, r14 C M I
+ nop.i 0 C I
+ ;;
+}
+{.mii
+ cmp.ne p6, p7 = r0, r0 C M I
+ mov.i ar.lc = r31 C I0
+ cmp.ne p8, p9 = r0, r0 C M I
+}
+{.bbb
+ (p10) br.dptk .Lb00 C B
+ (p11) br.dptk .Lb10 C B
+ (p12) br.dptk .Lb11 C B
+ ;;
+}
+
+.Lb01: br.cloop.dptk .grt1 C B
+
+ xma.l f39 = f7, f6, f8 C F
+ xma.hu f43 = f7, f6, f8 C F
+ ;;
+ getf.sig r8 = f43 C M2
+ stf8 [r20] = f39 C M2 M3
+ mov.i ar.lc = r2 C I0
+ br.ret.sptk.many b0 C B
+
+.grt1:
+ ldf8 f32 = [up], 8
+ ldf8 f44 = [rp], 8
+ ;;
+ ldf8 f33 = [up], 8
+ ldf8 f45 = [rp], 8
+ ;;
+ ldf8 f34 = [up], 8
+ xma.l f39 = f7, f6, f8
+ ldf8 f46 = [rp], 8
+ xma.hu f43 = f7, f6, f8
+ ;;
+ ldf8 f35 = [up], 8
+ ldf8 f47 = [rp], 8
+ br.cloop.dptk .grt5
+
+ xma.l f36 = f32, f6, f44
+ xma.hu f40 = f32, f6, f44
+ ;;
+ stf8 [r20] = f39, 8
+ xma.l f37 = f33, f6, f45
+ xma.hu f41 = f33, f6, f45
+ ;;
+ getf.sig r31 = f43
+ getf.sig r24 = f36
+ xma.l f38 = f34, f6, f46
+ xma.hu f42 = f34, f6, f46
+ ;;
+ getf.sig r28 = f40
+ getf.sig r25 = f37
+ xma.l f39 = f35, f6, f47
+ xma.hu f43 = f35, f6, f47
+ ;;
+ getf.sig r29 = f41
+ getf.sig r26 = f38
+ br .Lcj5
+
+.grt5:
+ mov r30 = 0
+ xma.l f36 = f32, f6, f44
+ xma.hu f40 = f32, f6, f44
+ ;;
+ ldf8 f32 = [up], 8
+ xma.l f37 = f33, f6, f45
+ ldf8 f44 = [rp], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ ldf8 f33 = [up], 8
+ getf.sig r27 = f39
+ ;;
+ getf.sig r31 = f43
+ xma.l f38 = f34, f6, f46
+ ldf8 f45 = [rp], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ ldf8 f34 = [up], 8
+ getf.sig r24 = f36
+ ;;
+ getf.sig r28 = f40
+ xma.l f39 = f35, f6, f47
+ ldf8 f46 = [rp], 8
+ xma.hu f43 = f35, f6, f47
+ ;;
+ ldf8 f35 = [up], 8
+ getf.sig r25 = f37
+ br.cloop.dptk .Loop
+ br .Le0
+
+
+.Lb10: ldf8 f35 = [up], 8
+ ldf8 f47 = [rp], 8
+ br.cloop.dptk .grt2
+
+ xma.l f38 = f7, f6, f8
+ xma.hu f42 = f7, f6, f8
+ ;;
+ xma.l f39 = f35, f6, f47
+ xma.hu f43 = f35, f6, f47
+ ;;
+ getf.sig r30 = f42
+ stf8 [r20] = f38, 8
+ getf.sig r27 = f39
+ getf.sig r8 = f43
+ br .Lcj2
+
+.grt2:
+ ldf8 f32 = [up], 8
+ ldf8 f44 = [rp], 8
+ ;;
+ ldf8 f33 = [up], 8
+ xma.l f38 = f7, f6, f8
+ ldf8 f45 = [rp], 8
+ xma.hu f42 = f7, f6, f8
+ ;;
+ ldf8 f34 = [up], 8
+ xma.l f39 = f35, f6, f47
+ ldf8 f46 = [rp], 8
+ xma.hu f43 = f35, f6, f47
+ ;;
+ ldf8 f35 = [up], 8
+ ldf8 f47 = [rp], 8
+ br.cloop.dptk .grt6
+
+ stf8 [r20] = f38, 8
+ xma.l f36 = f32, f6, f44
+ xma.hu f40 = f32, f6, f44
+ ;;
+ getf.sig r30 = f42
+ getf.sig r27 = f39
+ xma.l f37 = f33, f6, f45
+ xma.hu f41 = f33, f6, f45
+ ;;
+ getf.sig r31 = f43
+ getf.sig r24 = f36
+ xma.l f38 = f34, f6, f46
+ xma.hu f42 = f34, f6, f46
+ ;;
+ getf.sig r28 = f40
+ getf.sig r25 = f37
+ xma.l f39 = f35, f6, f47
+ xma.hu f43 = f35, f6, f47
+ br .Lcj6
+
+.grt6:
+ mov r29 = 0
+ xma.l f36 = f32, f6, f44
+ xma.hu f40 = f32, f6, f44
+ ;;
+ ldf8 f32 = [up], 8
+ getf.sig r26 = f38
+ ;;
+ getf.sig r30 = f42
+ xma.l f37 = f33, f6, f45
+ ldf8 f44 = [rp], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ ldf8 f33 = [up], 8
+ getf.sig r27 = f39
+ ;;
+ getf.sig r31 = f43
+ xma.l f38 = f34, f6, f46
+ ldf8 f45 = [rp], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ ldf8 f34 = [up], 8
+ getf.sig r24 = f36
+ br .LL10
+
+
+.Lb11: ldf8 f34 = [up], 8
+ ldf8 f46 = [rp], 8
+ ;;
+ ldf8 f35 = [up], 8
+ ldf8 f47 = [rp], 8
+ br.cloop.dptk .grt3
+ ;;
+
+ xma.l f37 = f7, f6, f8
+ xma.hu f41 = f7, f6, f8
+ xma.l f38 = f34, f6, f46
+ xma.hu f42 = f34, f6, f46
+ xma.l f39 = f35, f6, f47
+ xma.hu f43 = f35, f6, f47
+ ;;
+ getf.sig r29 = f41
+ stf8 [r20] = f37, 8
+ getf.sig r26 = f38
+ getf.sig r30 = f42
+ getf.sig r27 = f39
+ getf.sig r8 = f43
+ br .Lcj3
+
+.grt3:
+ ldf8 f32 = [up], 8
+ xma.l f37 = f7, f6, f8
+ ldf8 f44 = [rp], 8
+ xma.hu f41 = f7, f6, f8
+ ;;
+ ldf8 f33 = [up], 8
+ xma.l f38 = f34, f6, f46
+ ldf8 f45 = [rp], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ ldf8 f34 = [up], 8
+ xma.l f39 = f35, f6, f47
+ ldf8 f46 = [rp], 8
+ xma.hu f43 = f35, f6, f47
+ ;;
+ ldf8 f35 = [up], 8
+ getf.sig r25 = f37 C FIXME
+ ldf8 f47 = [rp], 8
+ br.cloop.dptk .grt7
+
+ getf.sig r29 = f41
+ stf8 [r20] = f37, 8 C FIXME
+ xma.l f36 = f32, f6, f44
+ getf.sig r26 = f38
+ xma.hu f40 = f32, f6, f44
+ ;;
+ getf.sig r30 = f42
+ xma.l f37 = f33, f6, f45
+ getf.sig r27 = f39
+ xma.hu f41 = f33, f6, f45
+ ;;
+ getf.sig r31 = f43
+ xma.l f38 = f34, f6, f46
+ getf.sig r24 = f36
+ xma.hu f42 = f34, f6, f46
+ br .Lcj7
+
+.grt7:
+ getf.sig r29 = f41
+ xma.l f36 = f32, f6, f44
+ mov r28 = 0
+ xma.hu f40 = f32, f6, f44
+ ;;
+ ldf8 f32 = [up], 8
+ getf.sig r26 = f38
+ ;;
+ getf.sig r30 = f42
+ xma.l f37 = f33, f6, f45
+ ldf8 f44 = [rp], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ ldf8 f33 = [up], 8
+ getf.sig r27 = f39
+ br .LL11
+
+
+.Lb00: ldf8 f33 = [up], 8
+ ldf8 f45 = [rp], 8
+ ;;
+ ldf8 f34 = [up], 8
+ ldf8 f46 = [rp], 8
+ ;;
+ ldf8 f35 = [up], 8
+ xma.l f36 = f7, f6, f8
+ ldf8 f47 = [rp], 8
+ xma.hu f40 = f7, f6, f8
+ br.cloop.dptk .grt4
+
+ xma.l f37 = f33, f6, f45
+ xma.hu f41 = f33, f6, f45
+ xma.l f38 = f34, f6, f46
+ xma.hu f42 = f34, f6, f46
+ ;;
+ getf.sig r28 = f40
+ stf8 [r20] = f36, 8
+ xma.l f39 = f35, f6, f47
+ getf.sig r25 = f37
+ xma.hu f43 = f35, f6, f47
+ ;;
+ getf.sig r29 = f41
+ getf.sig r26 = f38
+ getf.sig r30 = f42
+ getf.sig r27 = f39
+ br .Lcj4
+
+.grt4:
+ ldf8 f32 = [up], 8
+ xma.l f37 = f33, f6, f45
+ ldf8 f44 = [rp], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ ldf8 f33 = [up], 8
+ xma.l f38 = f34, f6, f46
+ ldf8 f45 = [rp], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ ldf8 f34 = [up], 8
+ getf.sig r24 = f36 C FIXME
+ xma.l f39 = f35, f6, f47
+ ldf8 f46 = [rp], 8
+ getf.sig r28 = f40
+ xma.hu f43 = f35, f6, f47
+ ;;
+ ldf8 f35 = [up], 8
+ getf.sig r25 = f37
+ ldf8 f47 = [rp], 8
+ br.cloop.dptk .grt8
+
+ getf.sig r29 = f41
+ stf8 [r20] = f36, 8 C FIXME
+ xma.l f36 = f32, f6, f44
+ getf.sig r26 = f38
+ getf.sig r30 = f42
+ xma.hu f40 = f32, f6, f44
+ ;;
+ xma.l f37 = f33, f6, f45
+ getf.sig r27 = f39
+ xma.hu f41 = f33, f6, f45
+ br .Lcj8
+
+.grt8:
+ getf.sig r29 = f41
+ xma.l f36 = f32, f6, f44
+ mov r31 = 0
+ xma.hu f40 = f32, f6, f44
+ ;;
+ ldf8 f32 = [up], 8
+ getf.sig r26 = f38
+ br .LL00
+
+
+C *** MAIN LOOP START ***
+ ALIGN(32) C insn fed cycle #
+.Loop:
+ .pred.rel "mutex", p6, p7 C num by i1 i2
+ getf.sig r29 = f41 C 00 16 0 0
+ xma.l f36 = f32, f6, f44 C 01 06,15 0 0
+ (p6) add r14 = r30, r27, 1 C 02 0 0
+ ldf8 f47 = [rp], 8 C 03 0 0
+ xma.hu f40 = f32, f6, f44 C 04 06,15 0 0
+ (p7) add r14 = r30, r27 C 05 0 0
+ ;;
+ .pred.rel "mutex", p6, p7
+ ldf8 f32 = [up], 8 C 06 1 1
+ (p6) cmp.leu p8, p9 = r14, r27 C 07 1 1
+ (p7) cmp.ltu p8, p9 = r14, r27 C 08 1 1
+ getf.sig r26 = f38 C 09 25 2 1
+ st8 [r20] = r14, 8 C 10 2 1
+ nop.b 0 C 11 2 1
+ ;;
+.LL00:
+ .pred.rel "mutex", p8, p9
+ getf.sig r30 = f42 C 12 28 3 2
+ xma.l f37 = f33, f6, f45 C 13 18,27 3 2
+ (p8) add r16 = r31, r24, 1 C 14 3 2
+ ldf8 f44 = [rp], 8 C 15 3 2
+ xma.hu f41 = f33, f6, f45 C 16 18,27 3 2
+ (p9) add r16 = r31, r24 C 17 3 2
+ ;;
+ .pred.rel "mutex", p8, p9
+ ldf8 f33 = [up], 8 C 18 4 3
+ (p8) cmp.leu p6, p7 = r16, r24 C 19 4 3
+ (p9) cmp.ltu p6, p7 = r16, r24 C 20 4 3
+ getf.sig r27 = f39 C 21 37 5 3
+ st8 [r20] = r16, 8 C 22 5 3
+ nop.b 0 C 23 5 3
+ ;;
+.LL11:
+ .pred.rel "mutex", p6, p7
+ getf.sig r31 = f43 C 24 40 6 4
+ xma.l f38 = f34, f6, f46 C 25 30,39 6 4
+ (p6) add r14 = r28, r25, 1 C 26 6 4
+ ldf8 f45 = [rp], 8 C 27 6 4
+ xma.hu f42 = f34, f6, f46 C 28 30,39 6 4
+ (p7) add r14 = r28, r25 C 29 6 4
+ ;;
+ .pred.rel "mutex", p6, p7
+ ldf8 f34 = [up], 8 C 30 7 5
+ (p6) cmp.leu p8, p9 = r14, r25 C 31 7 5
+ (p7) cmp.ltu p8, p9 = r14, r25 C 32 7 5
+ getf.sig r24 = f36 C 33 01 8 5
+ st8 [r20] = r14, 8 C 34 8 5
+ nop.b 0 C 35 8 5
+ ;;
+.LL10:
+ .pred.rel "mutex", p8, p9
+ getf.sig r28 = f40 C 36 04 9 6
+ xma.l f39 = f35, f6, f47 C 37 42,03 9 6
+ (p8) add r16 = r29, r26, 1 C 38 9 6
+ ldf8 f46 = [rp], 8 C 39 9 6
+ xma.hu f43 = f35, f6, f47 C 40 42,03 9 6
+ (p9) add r16 = r29, r26 C 41 9 6
+ ;;
+ .pred.rel "mutex", p8, p9
+ ldf8 f35 = [up], 8 C 42 10 7
+ (p8) cmp.leu p6, p7 = r16, r26 C 43 10 7
+ (p9) cmp.ltu p6, p7 = r16, r26 C 44 10 7
+ getf.sig r25 = f37 C 45 13 11 7
+ st8 [r20] = r16, 8 C 46 11 7
+ br.cloop.dptk .Loop C 47 11 7
+C *** MAIN LOOP END ***
+ ;;
+.Le0:
+ .pred.rel "mutex", p6, p7
+ getf.sig r29 = f41 C
+ xma.l f36 = f32, f6, f44 C
+ (p6) add r14 = r30, r27, 1 C
+ ldf8 f47 = [rp], 8 C
+ xma.hu f40 = f32, f6, f44 C
+ (p7) add r14 = r30, r27 C
+ ;;
+ .pred.rel "mutex", p6, p7
+ (p6) cmp.leu p8, p9 = r14, r27 C
+ (p7) cmp.ltu p8, p9 = r14, r27 C
+ getf.sig r26 = f38 C
+ st8 [r20] = r14, 8 C
+ ;;
+ .pred.rel "mutex", p8, p9
+ getf.sig r30 = f42 C
+ xma.l f37 = f33, f6, f45 C
+ (p8) add r16 = r31, r24, 1 C
+ xma.hu f41 = f33, f6, f45 C
+ (p9) add r16 = r31, r24 C
+ ;;
+ .pred.rel "mutex", p8, p9
+ (p8) cmp.leu p6, p7 = r16, r24 C
+ (p9) cmp.ltu p6, p7 = r16, r24 C
+ getf.sig r27 = f39 C
+ st8 [r20] = r16, 8 C
+ ;;
+.Lcj8:
+ .pred.rel "mutex", p6, p7
+ getf.sig r31 = f43 C
+ xma.l f38 = f34, f6, f46 C
+ (p6) add r14 = r28, r25, 1 C
+ xma.hu f42 = f34, f6, f46 C
+ (p7) add r14 = r28, r25 C
+ ;;
+ .pred.rel "mutex", p6, p7
+ (p6) cmp.leu p8, p9 = r14, r25 C
+ (p7) cmp.ltu p8, p9 = r14, r25 C
+ getf.sig r24 = f36 C
+ st8 [r20] = r14, 8 C
+ ;;
+.Lcj7:
+ .pred.rel "mutex", p8, p9
+ getf.sig r28 = f40 C
+ xma.l f39 = f35, f6, f47 C
+ (p8) add r16 = r29, r26, 1 C
+ xma.hu f43 = f35, f6, f47 C
+ (p9) add r16 = r29, r26 C
+ ;;
+ .pred.rel "mutex", p8, p9
+ (p8) cmp.leu p6, p7 = r16, r26 C
+ (p9) cmp.ltu p6, p7 = r16, r26 C
+ getf.sig r25 = f37 C
+ st8 [r20] = r16, 8 C
+ ;;
+.Lcj6:
+ .pred.rel "mutex", p6, p7
+ getf.sig r29 = f41 C
+ (p6) add r14 = r30, r27, 1 C
+ (p7) add r14 = r30, r27 C
+ ;;
+ .pred.rel "mutex", p6, p7
+ (p6) cmp.leu p8, p9 = r14, r27 C
+ (p7) cmp.ltu p8, p9 = r14, r27 C
+ getf.sig r26 = f38 C
+ st8 [r20] = r14, 8 C
+ ;;
+.Lcj5:
+ .pred.rel "mutex", p8, p9
+ getf.sig r30 = f42 C
+ (p8) add r16 = r31, r24, 1 C
+ (p9) add r16 = r31, r24 C
+ ;;
+ .pred.rel "mutex", p8, p9
+ (p8) cmp.leu p6, p7 = r16, r24 C
+ (p9) cmp.ltu p6, p7 = r16, r24 C
+ getf.sig r27 = f39 C
+ st8 [r20] = r16, 8 C
+ ;;
+.Lcj4:
+ .pred.rel "mutex", p6, p7
+ getf.sig r8 = f43 C
+ (p6) add r14 = r28, r25, 1 C
+ (p7) add r14 = r28, r25 C
+ ;;
+ .pred.rel "mutex", p6, p7
+ st8 [r20] = r14, 8 C
+ (p6) cmp.leu p8, p9 = r14, r25 C
+ (p7) cmp.ltu p8, p9 = r14, r25 C
+ ;;
+.Lcj3:
+ .pred.rel "mutex", p8, p9
+ (p8) add r16 = r29, r26, 1 C
+ (p9) add r16 = r29, r26 C
+ ;;
+ .pred.rel "mutex", p8, p9
+ st8 [r20] = r16, 8 C
+ (p8) cmp.leu p6, p7 = r16, r26 C
+ (p9) cmp.ltu p6, p7 = r16, r26 C
+ ;;
+.Lcj2:
+ .pred.rel "mutex", p6, p7
+ (p6) add r14 = r30, r27, 1 C
+ (p7) add r14 = r30, r27 C
+ ;;
+ .pred.rel "mutex", p6, p7
+ st8 [r20] = r14 C
+ (p6) cmp.leu p8, p9 = r14, r27 C
+ (p7) cmp.ltu p8, p9 = r14, r27 C
+ ;;
+ (p8) add r8 = 1, r8 C M I
+ mov.i ar.lc = r2 C I0
+ br.ret.sptk.many b0 C B
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/addmul_2.asm b/vendor/gmp-6.3.0/mpn/ia64/addmul_2.asm
new file mode 100644
index 0000000..86e8de4
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/addmul_2.asm
@@ -0,0 +1,715 @@
+dnl IA-64 mpn_addmul_2 -- Multiply an n-limb number with a 2-limb number and
+dnl add the result to an (n+1)-limb number.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2004, 2005, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 3.65
+C Itanium 2: 1.625
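+
+C Semantics, as a portable C sketch built from other mpn primitives (an
+C illustration only; ref_addmul_2 is our name, and this file computes the
+C same thing directly with xma):
+C
+C	mp_limb_t
+C	ref_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
+C	{
+C	  mp_limb_t hi, cy;
+C	  hi = mpn_addmul_1 (rp, up, n, vp[0]);     /* rp[0..n-1] += up[]*v0 */
+C	  cy = mpn_add_1 (rp + n, rp + n, 1, hi);   /* fold carry into rp[n] */
+C	  hi = mpn_addmul_1 (rp + 1, up, n, vp[1]); /* rp[1..n]   += up[]*v1 */
+C	  return hi + cy;     /* top limb; the full sum fits, so no overflow */
+C	}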
+
+C TODO
+C * Clean up variable names, and try to decrease the number of distinct
+C registers used.
+C * Clean up feed-in code to not require zeroing several registers.
+C * Make sure we don't depend on uninitialised predicate registers.
+C * Could perhaps save a few cycles by using 1 c/l carry propagation in
+C wind-down code.
+C * Ultimately rewrite.  The problem with this code is that it first uses a
+C loaded u value in one xma pair, then leaves it live over several unrelated
+C xma pairs before using it again.  It should actually be quite possible
+C to just swap some aligned xma pairs around.  But we should then schedule
+C u loads further from the first use.
+
+C INPUT PARAMETERS
+define(`rp',`r32')
+define(`up',`r33')
+define(`n',`r34')
+define(`vp',`r35')
+
+define(`srp',`r3')
+
+define(`v0',`f6')
+define(`v1',`f7')
+
+define(`s0',`r14')
+define(`acc0',`r15')
+
+define(`pr0_0',`r16') define(`pr0_1',`r17')
+define(`pr0_2',`r18') define(`pr0_3',`r19')
+
+define(`pr1_0',`r20') define(`pr1_1',`r21')
+define(`pr1_2',`r22') define(`pr1_3',`r23')
+
+define(`acc1_0',`r24') define(`acc1_1',`r25')
+define(`acc1_2',`r26') define(`acc1_3',`r27')
+
+dnl define(`',`r28')
+dnl define(`',`r29')
+dnl define(`',`r30')
+dnl define(`',`r31')
+
+define(`fp0b_0',`f8') define(`fp0b_1',`f9')
+define(`fp0b_2',`f10') define(`fp0b_3',`f11')
+
+define(`fp1a_0',`f12') define(`fp1a_1',`f13')
+define(`fp1a_2',`f14') define(`fp1a_3',`f15')
+
+define(`fp1b_0',`f32') define(`fp1b_1',`f33')
+define(`fp1b_2',`f34') define(`fp1b_3',`f35')
+
+define(`fp2a_0',`f36') define(`fp2a_1',`f37')
+define(`fp2a_2',`f38') define(`fp2a_3',`f39')
+
+define(`r_0',`f40') define(`r_1',`f41')
+define(`r_2',`f42') define(`r_3',`f43')
+
+define(`u_0',`f44') define(`u_1',`f45')
+define(`u_2',`f46') define(`u_3',`f47')
+
+define(`rx',`f48')
+define(`ux',`f49')
+define(`ry',`f50')
+define(`uy',`f51')
+
+ASM_START()
+PROLOGUE(mpn_addmul_2s)
+ .prologue
+ .save ar.lc, r2
+ .body
+
+ifdef(`HAVE_ABI_32',`
+ {.mmi; addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ addp4 vp = 0, vp C M I
+}{.mmi; nop 1
+ nop 1
+ zxt4 n = n C I
+ ;;
+}')
+
+ {.mmi; ldf8 ux = [up], 8 C M
+ ldf8 v0 = [vp], 8 C M
+ mov r2 = ar.lc C I0
+}{.mmi; ldf8 rx = [rp], 8 C M
+ and r14 = 3, n C M I
+ add n = -2, n C M I
+ ;;
+}{.mmi; ldf8 uy = [up], 8 C M
+ ldf8 v1 = [vp] C M
+ shr.u n = n, 2 C I0
+}{.mmi; ldf8 ry = [rp], -8 C M
+ cmp.eq p14, p0 = 1, r14 C M I
+ cmp.eq p11, p0 = 2, r14 C M I
+ ;;
+}{.mmi; add srp = 16, rp C M I
+ cmp.eq p15, p0 = 3, r14 C M I
+ mov ar.lc = n C I0
+}{.bbb; (p14) br.dptk L(x01) C B
+ (p11) br.dptk L(x10) C B
+ (p15) br.dptk L(x11) C B
+ ;;
+}
+L(x00): cmp.ne p6, p0 = r0, r0 C suppress initial xma pair
+ mov fp2a_3 = f0
+ br L(b00)
+L(x01): cmp.ne p14, p0 = r0, r0 C suppress initial xma pair
+ mov fp2a_2 = f0
+ br L(b01)
+L(x10): cmp.ne p11, p0 = r0, r0 C suppress initial xma pair
+ mov fp2a_1 = f0
+ br L(b10)
+L(x11): cmp.ne p15, p0 = r0, r0 C suppress initial xma pair
+ mov fp2a_0 = f0
+ br L(b11)
+
+EPILOGUE()
+
+PROLOGUE(mpn_addmul_2)
+ .prologue
+ .save ar.lc, r2
+ .body
+
+ifdef(`HAVE_ABI_32',`
+ {.mmi; addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ addp4 vp = 0, vp C M I
+}{.mmi; nop 1
+ nop 1
+ zxt4 n = n C I
+ ;;
+}')
+
+ {.mmi; ldf8 ux = [up], 8 C M
+ ldf8 v0 = [vp], 8 C M
+ mov r2 = ar.lc C I0
+}{.mmi; ldf8 rx = [rp], 8 C M
+ and r14 = 3, n C M I
+ add n = -2, n C M I
+ ;;
+}{.mmi; ldf8 uy = [up], 8 C M
+ ldf8 v1 = [vp] C M
+ shr.u n = n, 2 C I0
+}{.mmi; ldf8 ry = [rp], -8 C M
+ cmp.eq p14, p0 = 1, r14 C M I
+ cmp.eq p11, p0 = 2, r14 C M I
+ ;;
+}{.mmi; add srp = 16, rp C M I
+ cmp.eq p15, p6 = 3, r14 C M I
+ mov ar.lc = n C I0
+}{.bbb; (p14) br.dptk L(b01) C B
+ (p11) br.dptk L(b10) C B
+ (p15) br.dptk L(b11) C B
+ ;;
+}
+ ALIGN(32)
+L(b00):
+ {.mmi; ldf8 r_1 = [srp], 8
+ ldf8 u_1 = [up], 8
+ mov acc1_2 = 0
+}{.mmi; mov pr1_2 = 0
+ mov pr0_3 = 0
+ cmp.ne p8, p9 = r0, r0
+ ;;
+}{.mfi; ldf8 r_2 = [srp], 8
+ xma.l fp0b_3 = ux, v0, rx
+ cmp.ne p12, p13 = r0, r0
+}{.mfb; ldf8 u_2 = [up], 8
+ xma.hu fp1b_3 = ux, v0, rx
+ br.cloop.dptk L(gt4)
+}
+ xma.l fp0b_0 = uy, v0, ry
+ xma.hu fp1a_0 = uy, v0, ry
+ ;;
+ getfsig acc0 = fp0b_3
+ (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s
+ (p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s
+ ;;
+ xma.l fp0b_1 = u_1, v0, r_1
+ xma.hu fp1a_1 = u_1, v0, r_1
+ ;;
+ getfsig pr0_0 = fp0b_0
+ xma.l fp1b_0 = uy, v1, fp1a_0
+ xma.hu fp2a_0 = uy, v1, fp1a_0
+ ;;
+ getfsig pr1_3 = fp1b_3
+ getfsig acc1_3 = fp2a_3
+ xma.l fp0b_2 = u_2, v0, r_2
+ xma.hu fp1a_2 = u_2, v0, r_2
+ br L(cj4)
+
+L(gt4): xma.l fp0b_0 = uy, v0, ry
+ xma.hu fp1a_0 = uy, v0, ry
+ ;;
+ ldf8 r_3 = [srp], 8
+ getfsig acc0 = fp0b_3
+ (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s
+ ldf8 u_3 = [up], 8
+ (p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s
+ ;;
+ xma.l fp0b_1 = u_1, v0, r_1
+ xma.hu fp1a_1 = u_1, v0, r_1
+ ;;
+ ldf8 r_0 = [srp], 8
+ getfsig pr0_0 = fp0b_0
+ xma.l fp1b_0 = uy, v1, fp1a_0
+ xma.hu fp2a_0 = uy, v1, fp1a_0
+ ;;
+ ldf8 u_0 = [up], 8
+ getfsig pr1_3 = fp1b_3
+ xma.l fp0b_2 = u_2, v0, r_2
+ ;;
+ getfsig acc1_3 = fp2a_3
+ xma.hu fp1a_2 = u_2, v0, r_2
+ br L(00)
+
+
+ ALIGN(32)
+L(b01):
+ {.mmi; ldf8 r_0 = [srp], 8 C M
+ ldf8 u_0 = [up], 8 C M
+ mov acc1_1 = 0 C M I
+}{.mmi; mov pr1_1 = 0 C M I
+ mov pr0_2 = 0 C M I
+ cmp.ne p6, p7 = r0, r0 C M I
+ ;;
+}{.mfi; ldf8 r_1 = [srp], 8 C M
+ xma.l fp0b_2 = ux, v0, rx C F
+ cmp.ne p10, p11 = r0, r0 C M I
+}{.mfi; ldf8 u_1 = [up], 8 C M
+ xma.hu fp1b_2 = ux, v0, rx C F
+ nop 1
+ ;;
+} xma.l fp0b_3 = uy, v0, ry C F
+ xma.hu fp1a_3 = uy, v0, ry C F
+ ;;
+ {.mmf; getfsig acc0 = fp0b_2 C M
+ ldf8 r_2 = [srp], 8 C M
+ (p14) xma.hu fp2a_2 = ux, v1,fp1b_2 C F suppressed for addmul_2s
+}{.mfb; ldf8 u_2 = [up], 8 C M
+ (p14) xma.l fp1b_2 = ux, v1,fp1b_2 C F suppressed for addmul_2s
+ br.cloop.dptk L(gt5)
+}
+ xma.l fp0b_0 = u_0, v0, r_0 C F
+ xma.hu fp1a_0 = u_0, v0, r_0 C F
+ ;;
+ getfsig pr0_3 = fp0b_3 C M
+ xma.l fp1b_3 = uy, v1,fp1a_3 C F
+ xma.hu fp2a_3 = uy, v1,fp1a_3 C F
+ ;;
+ getfsig pr1_2 = fp1b_2 C M
+ getfsig acc1_2 = fp2a_2 C M
+ xma.l fp0b_1 = u_1, v0, r_1 C F
+ xma.hu fp1a_1 = u_1, v0, r_1 C F
+ br L(cj5)
+
+L(gt5): xma.l fp0b_0 = u_0, v0, r_0
+ xma.hu fp1a_0 = u_0, v0, r_0
+ ;;
+ getfsig pr0_3 = fp0b_3
+ ldf8 r_3 = [srp], 8
+ xma.l fp1b_3 = uy, v1, fp1a_3
+ xma.hu fp2a_3 = uy, v1, fp1a_3
+ ;;
+ ldf8 u_3 = [up], 8
+ getfsig pr1_2 = fp1b_2
+ xma.l fp0b_1 = u_1, v0, r_1
+ ;;
+ getfsig acc1_2 = fp2a_2
+ xma.hu fp1a_1 = u_1, v0, r_1
+ br L(01)
+
+
+ ALIGN(32)
+L(b10): br.cloop.dptk L(gt2)
+ xma.l fp0b_1 = ux, v0, rx
+ xma.hu fp1b_1 = ux, v0, rx
+ ;;
+ xma.l fp0b_2 = uy, v0, ry
+ xma.hu fp1a_2 = uy, v0, ry
+ ;;
+ stf8 [rp] = fp0b_1, 8
+ (p11) xma.hu fp2a_1 = ux, v1, fp1b_1 C suppressed for addmul_2s
+ (p11) xma.l fp1b_1 = ux, v1, fp1b_1 C suppressed for addmul_2s
+ ;;
+ getfsig acc0 = fp0b_2
+ xma.l fp1b_2 = uy, v1, fp1a_2
+ xma.hu fp2a_2 = uy, v1, fp1a_2
+ ;;
+ getfsig pr1_1 = fp1b_1
+ getfsig acc1_1 = fp2a_1
+ mov ar.lc = r2
+ getfsig pr1_2 = fp1b_2
+ getfsig r8 = fp2a_2
+ ;;
+ add s0 = pr1_1, acc0
+ ;;
+ st8 [rp] = s0, 8
+ cmp.ltu p8, p9 = s0, pr1_1
+ sub r31 = -1, acc1_1
+ ;;
+ .pred.rel "mutex", p8, p9
+ (p8) add acc0 = pr1_2, acc1_1, 1
+ (p9) add acc0 = pr1_2, acc1_1
+ (p8) cmp.leu p10, p0 = r31, pr1_2
+ (p9) cmp.ltu p10, p0 = r31, pr1_2
+ ;;
+ st8 [rp] = acc0, 8
+ (p10) add r8 = 1, r8
+ br.ret.sptk.many b0
+
+
+L(gt2):
+ {.mmi; ldf8 r_3 = [srp], 8
+ ldf8 u_3 = [up], 8
+ mov acc1_0 = 0
+ ;;
+}{.mfi; ldf8 r_0 = [srp], 8
+ xma.l fp0b_1 = ux, v0, rx
+ mov pr1_0 = 0
+}{.mfi; ldf8 u_0 = [up], 8
+ xma.hu fp1b_1 = ux, v0, rx
+ mov pr0_1 = 0
+ ;;
+} xma.l fp0b_2 = uy, v0, ry
+ xma.hu fp1a_2 = uy, v0, ry
+ ;;
+ getfsig acc0 = fp0b_1
+ ldf8 r_1 = [srp], 8
+ (p11) xma.hu fp2a_1 = ux, v1, fp1b_1 C suppressed for addmul_2s
+ (p11) xma.l fp1b_1 = ux, v1, fp1b_1 C suppressed for addmul_2s
+ ;;
+ ldf8 u_1 = [up], 8
+ xma.l fp0b_3 = u_3, v0, r_3
+ xma.hu fp1a_3 = u_3, v0, r_3
+ ;;
+ getfsig pr0_2 = fp0b_2
+ ldf8 r_2 = [srp], 8
+ xma.l fp1b_2 = uy, v1, fp1a_2
+ xma.hu fp2a_2 = uy, v1, fp1a_2
+ ;;
+ ldf8 u_2 = [up], 8
+ getfsig pr1_1 = fp1b_1
+ ;;
+ {.mfi; getfsig acc1_1 = fp2a_1
+ xma.l fp0b_0 = u_0, v0, r_0
+ cmp.ne p8, p9 = r0, r0
+}{.mfb; cmp.ne p12, p13 = r0, r0
+ xma.hu fp1a_0 = u_0, v0, r_0
+ br.cloop.sptk.clr L(top)
+}
+ br.many L(end)
+
+
+ ALIGN(32)
+L(b11): ldf8 r_2 = [srp], 8
+ mov pr1_3 = 0
+ mov pr0_0 = 0
+ ;;
+ ldf8 u_2 = [up], 8
+ mov acc1_3 = 0
+ br.cloop.dptk L(gt3)
+ ;;
+ cmp.ne p6, p7 = r0, r0
+ xma.l fp0b_0 = ux, v0, rx
+ xma.hu fp1b_0 = ux, v0, rx
+ ;;
+ cmp.ne p10, p11 = r0, r0
+ xma.l fp0b_1 = uy, v0, ry
+ xma.hu fp1a_1 = uy, v0, ry
+ ;;
+ getfsig acc0 = fp0b_0
+ (p15) xma.hu fp2a_0 = ux, v1, fp1b_0 C suppressed for addmul_2s
+ (p15) xma.l fp1b_0 = ux, v1, fp1b_0 C suppressed for addmul_2s
+ ;;
+ xma.l fp0b_2 = uy, v1, r_2
+ xma.hu fp1a_2 = uy, v1, r_2
+ ;;
+ getfsig pr0_1 = fp0b_1
+ xma.l fp1b_1 = u_2, v0, fp1a_1
+ xma.hu fp2a_1 = u_2, v0, fp1a_1
+ ;;
+ getfsig pr1_0 = fp1b_0
+ getfsig acc1_0 = fp2a_0
+ br L(cj3)
+
+L(gt3): ldf8 r_3 = [srp], 8
+ xma.l fp0b_0 = ux, v0, rx
+ cmp.ne p10, p11 = r0, r0
+ ldf8 u_3 = [up], 8
+ xma.hu fp1b_0 = ux, v0, rx
+ cmp.ne p6, p7 = r0, r0
+ ;;
+ xma.l fp0b_1 = uy, v0, ry
+ xma.hu fp1a_1 = uy, v0, ry
+ ;;
+ getfsig acc0 = fp0b_0
+ ldf8 r_0 = [srp], 8
+ (p15) xma.hu fp2a_0 = ux, v1, fp1b_0 C suppressed for addmul_2s
+ ldf8 u_0 = [up], 8
+ (p15) xma.l fp1b_0 = ux, v1, fp1b_0 C suppressed for addmul_2s
+ ;;
+ xma.l fp0b_2 = u_2, v0, r_2
+ xma.hu fp1a_2 = u_2, v0, r_2
+ ;;
+ getfsig pr0_1 = fp0b_1
+ ldf8 r_1 = [srp], 8
+ xma.l fp1b_1 = uy, v1, fp1a_1
+ xma.hu fp2a_1 = uy, v1, fp1a_1
+ ;;
+ ldf8 u_1 = [up], 8
+ getfsig pr1_0 = fp1b_0
+ ;;
+ getfsig acc1_0 = fp2a_0
+ xma.l fp0b_3 = u_3, v0, r_3
+ xma.hu fp1a_3 = u_3, v0, r_3
+ br L(11)
+
+
+C *** MAIN LOOP START ***
+ ALIGN(32)
+L(top): C 00
+ .pred.rel "mutex", p12, p13
+ getfsig pr0_3 = fp0b_3
+ ldf8 r_3 = [srp], 8
+ xma.l fp1b_3 = u_3, v1, fp1a_3
+ (p12) add s0 = pr1_0, acc0, 1
+ (p13) add s0 = pr1_0, acc0
+ xma.hu fp2a_3 = u_3, v1, fp1a_3
+ ;; C 01
+ .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ ldf8 u_3 = [up], 8
+ getfsig pr1_2 = fp1b_2
+ (p8) cmp.leu p6, p7 = acc0, pr0_1
+ (p9) cmp.ltu p6, p7 = acc0, pr0_1
+ (p12) cmp.leu p10, p11 = s0, pr1_0
+ (p13) cmp.ltu p10, p11 = s0, pr1_0
+ ;; C 02
+ .pred.rel "mutex", p6, p7
+ getfsig acc1_2 = fp2a_2
+ st8 [rp] = s0, 8
+ xma.l fp0b_1 = u_1, v0, r_1
+ (p6) add acc0 = pr0_2, acc1_0, 1
+ (p7) add acc0 = pr0_2, acc1_0
+ xma.hu fp1a_1 = u_1, v0, r_1
+ ;; C 03
+L(01):
+ .pred.rel "mutex", p10, p11
+ getfsig pr0_0 = fp0b_0
+ ldf8 r_0 = [srp], 8
+ xma.l fp1b_0 = u_0, v1, fp1a_0
+ (p10) add s0 = pr1_1, acc0, 1
+ (p11) add s0 = pr1_1, acc0
+ xma.hu fp2a_0 = u_0, v1, fp1a_0
+ ;; C 04
+ .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ ldf8 u_0 = [up], 8
+ getfsig pr1_3 = fp1b_3
+ (p6) cmp.leu p8, p9 = acc0, pr0_2
+ (p7) cmp.ltu p8, p9 = acc0, pr0_2
+ (p10) cmp.leu p12, p13 = s0, pr1_1
+ (p11) cmp.ltu p12, p13 = s0, pr1_1
+ ;; C 05
+ .pred.rel "mutex", p8, p9
+ getfsig acc1_3 = fp2a_3
+ st8 [rp] = s0, 8
+ xma.l fp0b_2 = u_2, v0, r_2
+ (p8) add acc0 = pr0_3, acc1_1, 1
+ (p9) add acc0 = pr0_3, acc1_1
+ xma.hu fp1a_2 = u_2, v0, r_2
+ ;; C 06
+L(00):
+ .pred.rel "mutex", p12, p13
+ getfsig pr0_1 = fp0b_1
+ ldf8 r_1 = [srp], 8
+ xma.l fp1b_1 = u_1, v1, fp1a_1
+ (p12) add s0 = pr1_2, acc0, 1
+ (p13) add s0 = pr1_2, acc0
+ xma.hu fp2a_1 = u_1, v1, fp1a_1
+ ;; C 07
+ .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ ldf8 u_1 = [up], 8
+ getfsig pr1_0 = fp1b_0
+ (p8) cmp.leu p6, p7 = acc0, pr0_3
+ (p9) cmp.ltu p6, p7 = acc0, pr0_3
+ (p12) cmp.leu p10, p11 = s0, pr1_2
+ (p13) cmp.ltu p10, p11 = s0, pr1_2
+ ;; C 08
+ .pred.rel "mutex", p6, p7
+ getfsig acc1_0 = fp2a_0
+ st8 [rp] = s0, 8
+ xma.l fp0b_3 = u_3, v0, r_3
+ (p6) add acc0 = pr0_0, acc1_2, 1
+ (p7) add acc0 = pr0_0, acc1_2
+ xma.hu fp1a_3 = u_3, v0, r_3
+ ;; C 09
+L(11):
+ .pred.rel "mutex", p10, p11
+ getfsig pr0_2 = fp0b_2
+ ldf8 r_2 = [srp], 8
+ xma.l fp1b_2 = u_2, v1, fp1a_2
+ (p10) add s0 = pr1_3, acc0, 1
+ (p11) add s0 = pr1_3, acc0
+ xma.hu fp2a_2 = u_2, v1, fp1a_2
+ ;; C 10
+ .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ ldf8 u_2 = [up], 8
+ getfsig pr1_1 = fp1b_1
+ (p6) cmp.leu p8, p9 = acc0, pr0_0
+ (p7) cmp.ltu p8, p9 = acc0, pr0_0
+ (p10) cmp.leu p12, p13 = s0, pr1_3
+ (p11) cmp.ltu p12, p13 = s0, pr1_3
+ ;; C 11
+ .pred.rel "mutex", p8, p9
+ getfsig acc1_1 = fp2a_1
+ st8 [rp] = s0, 8
+ xma.l fp0b_0 = u_0, v0, r_0
+ (p8) add acc0 = pr0_1, acc1_3, 1
+ (p9) add acc0 = pr0_1, acc1_3
+ xma.hu fp1a_0 = u_0, v0, r_0
+L(10): br.cloop.sptk.clr L(top) C 12
+ ;;
+C *** MAIN LOOP END ***
+L(end):
+ .pred.rel "mutex", p12, p13
+ {.mfi; getfsig pr0_3 = fp0b_3
+ xma.l fp1b_3 = u_3, v1, fp1a_3
+ (p12) add s0 = pr1_0, acc0, 1
+}{.mfi; (p13) add s0 = pr1_0, acc0
+ xma.hu fp2a_3 = u_3, v1, fp1a_3
+ nop 1
+ ;;
+} .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ {.mmi; getfsig pr1_2 = fp1b_2
+ st8 [rp] = s0, 8
+ (p8) cmp.leu p6, p7 = acc0, pr0_1
+}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
+ (p12) cmp.leu p10, p11 = s0, pr1_0
+ (p13) cmp.ltu p10, p11 = s0, pr1_0
+ ;;
+} .pred.rel "mutex", p6, p7
+ {.mfi; getfsig acc1_2 = fp2a_2
+ xma.l fp0b_1 = u_1, v0, r_1
+ nop 1
+}{.mmf; (p6) add acc0 = pr0_2, acc1_0, 1
+ (p7) add acc0 = pr0_2, acc1_0
+ xma.hu fp1a_1 = u_1, v0, r_1
+ ;;
+}
+L(cj5):
+ .pred.rel "mutex", p10, p11
+ {.mfi; getfsig pr0_0 = fp0b_0
+ xma.l fp1b_0 = u_0, v1, fp1a_0
+ (p10) add s0 = pr1_1, acc0, 1
+}{.mfi; (p11) add s0 = pr1_1, acc0
+ xma.hu fp2a_0 = u_0, v1, fp1a_0
+ nop 1
+ ;;
+} .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ {.mmi; getfsig pr1_3 = fp1b_3
+ st8 [rp] = s0, 8
+ (p6) cmp.leu p8, p9 = acc0, pr0_2
+}{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
+ (p10) cmp.leu p12, p13 = s0, pr1_1
+ (p11) cmp.ltu p12, p13 = s0, pr1_1
+ ;;
+} .pred.rel "mutex", p8, p9
+ {.mfi; getfsig acc1_3 = fp2a_3
+ xma.l fp0b_2 = u_2, v0, r_2
+ nop 1
+}{.mmf; (p8) add acc0 = pr0_3, acc1_1, 1
+ (p9) add acc0 = pr0_3, acc1_1
+ xma.hu fp1a_2 = u_2, v0, r_2
+ ;;
+}
+L(cj4):
+ .pred.rel "mutex", p12, p13
+ {.mfi; getfsig pr0_1 = fp0b_1
+ xma.l fp1b_1 = u_1, v1, fp1a_1
+ (p12) add s0 = pr1_2, acc0, 1
+}{.mfi; (p13) add s0 = pr1_2, acc0
+ xma.hu fp2a_1 = u_1, v1, fp1a_1
+ nop 1
+ ;;
+} .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ {.mmi; getfsig pr1_0 = fp1b_0
+ st8 [rp] = s0, 8
+ (p8) cmp.leu p6, p7 = acc0, pr0_3
+}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3
+ (p12) cmp.leu p10, p11 = s0, pr1_2
+ (p13) cmp.ltu p10, p11 = s0, pr1_2
+ ;;
+} .pred.rel "mutex", p6, p7
+ {.mmi; getfsig acc1_0 = fp2a_0
+ (p6) add acc0 = pr0_0, acc1_2, 1
+ (p7) add acc0 = pr0_0, acc1_2
+ ;;
+}
+L(cj3):
+ .pred.rel "mutex", p10, p11
+ {.mfi; getfsig pr0_2 = fp0b_2
+ xma.l fp1b_2 = u_2, v1, fp1a_2
+ (p10) add s0 = pr1_3, acc0, 1
+}{.mfi; (p11) add s0 = pr1_3, acc0
+ xma.hu fp2a_2 = u_2, v1, fp1a_2
+ nop 1
+ ;;
+} .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ {.mmi; getfsig pr1_1 = fp1b_1
+ st8 [rp] = s0, 8
+ (p6) cmp.leu p8, p9 = acc0, pr0_0
+}{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0
+ (p10) cmp.leu p12, p13 = s0, pr1_3
+ (p11) cmp.ltu p12, p13 = s0, pr1_3
+ ;;
+} .pred.rel "mutex", p8, p9
+ {.mmi; getfsig acc1_1 = fp2a_1
+ (p8) add acc0 = pr0_1, acc1_3, 1
+ (p9) add acc0 = pr0_1, acc1_3
+ ;;
+} .pred.rel "mutex", p12, p13
+ {.mmi; (p12) add s0 = pr1_0, acc0, 1
+ (p13) add s0 = pr1_0, acc0
+ nop 1
+ ;;
+} .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ {.mmi; getfsig pr1_2 = fp1b_2
+ st8 [rp] = s0, 8
+ (p8) cmp.leu p6, p7 = acc0, pr0_1
+}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
+ (p12) cmp.leu p10, p11 = s0, pr1_0
+ (p13) cmp.ltu p10, p11 = s0, pr1_0
+ ;;
+} .pred.rel "mutex", p6, p7
+ {.mmi; getfsig r8 = fp2a_2
+ (p6) add acc0 = pr0_2, acc1_0, 1
+ (p7) add acc0 = pr0_2, acc1_0
+ ;;
+} .pred.rel "mutex", p10, p11
+ {.mmi; (p10) add s0 = pr1_1, acc0, 1
+ (p11) add s0 = pr1_1, acc0
+ (p6) cmp.leu p8, p9 = acc0, pr0_2
+ ;;
+} .pred.rel "mutex", p10, p11
+ {.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
+ (p10) cmp.leu p12, p13 = s0, pr1_1
+ (p11) cmp.ltu p12, p13 = s0, pr1_1
+ ;;
+} .pred.rel "mutex", p8, p9
+ {.mmi; st8 [rp] = s0, 8
+ (p8) add acc0 = pr1_2, acc1_1, 1
+ (p9) add acc0 = pr1_2, acc1_1
+ ;;
+} .pred.rel "mutex", p8, p9
+ {.mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2
+ (p9) cmp.ltu p10, p11 = acc0, pr1_2
+ (p12) add acc0 = 1, acc0
+ ;;
+}{.mmi; st8 [rp] = acc0, 8
+ (p12) cmpeqor p10, p0 = 0, acc0
+ nop 1
+ ;;
+}{.mib; (p10) add r8 = 1, r8
+ mov ar.lc = r2
+ br.ret.sptk.many b0
+}
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/aors_n.asm b/vendor/gmp-6.3.0/mpn/ia64/aors_n.asm
new file mode 100644
index 0000000..7705ce6
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/aors_n.asm
@@ -0,0 +1,852 @@
+dnl IA-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2003-2005, 2010, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 2.67
+C Itanium 2: 1.25
+
+C TODO
+C * Consider using special code for small n, using something like
+C "switch (8 * (n >= 8) + (n mod 8))" to enter it and feed-in code.
+C * The non-nc code was trimmed cycle for cycle to its current state.  It is
+C probably hard to save more than an odd cycle there.  The nc code is much
+C cruder (since tune/speed doesn't have any applicable direct measurements).
+C * Without the nc entry points, this becomes around 1800 bytes of object
+C code; the nc code adds over 1000 bytes. We should perhaps sacrifice a
+C few cycles for the non-nc code and let it fall into the nc code.
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`vp', `r34')
+define(`n', `r35')
+define(`cy', `r36')
+
+ifdef(`OPERATION_add_n',`
+ define(ADDSUB, add)
+ define(CND, ltu)
+ define(INCR, 1)
+ define(LIM, -1)
+ define(LIM2, 0)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)
+')
+ifdef(`OPERATION_sub_n',`
+ define(ADDSUB, sub)
+ define(CND, gtu)
+ define(INCR, -1)
+ define(LIM, 0)
+ define(LIM2, -1)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)
+')
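+
+C One limb step of the carry scheme these macros build, shown as a C sketch
+C for the add case (an illustration; for sub, CND/INCR/LIM make the same
+C code track a borrow and decrement instead):
+C
+C	w = u + v;			/* ADDSUB */
+C	c = w < u;			/* cmp.CND */
+C	if (cy)				/* predicated on carry-in */
+C	  {
+C	    c |= (w == ~(mp_limb_t) 0);	/* cmpeqor LIM, w */
+C	    w += 1;			/* add INCR, w */
+C	  }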
+
+define(PFDIST, 500)
+
+C Some useful aliases for registers we use
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
+define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27')
+define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31')
+define(`rpx',`r3')
+define(`upadv',`r20') define(`vpadv',`r21')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+ .prologue
+ .save ar.lc, r2
+ .body
+ifdef(`HAVE_ABI_32',`
+ addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ nop.i 0
+ addp4 vp = 0, vp C M I
+ nop.m 0
+ zxt4 n = n C I
+ ;;
+')
+
+ {.mmi; ld8 r11 = [vp], 8 C M01
+ ld8 r10 = [up], 8 C M01
+ mov r2 = ar.lc C I0
+}{.mmi; and r14 = 7, n C M I
+ cmp.lt p15, p14 = 8, n C M I
+ add n = -6, n C M I
+ ;;
+}{.mmi; add upadv = PFDIST, up C Merging these lines into the feed-in
+ add vpadv = PFDIST, vp C code could save a cycle per call at
+ mov r23 = cy C the expense of code size.
+ ;;
+}{.mmi; cmp.eq p6, p0 = 1, r14 C M I
+ cmp.eq p7, p0 = 2, r14 C M I
+ cmp.eq p8, p0 = 3, r14 C M I
+}{.bbb; (p6) br.dptk .Lc001 C B
+ (p7) br.dptk .Lc010 C B
+ (p8) br.dptk .Lc011 C B
+ ;;
+}{.mmi; cmp.eq p9, p0 = 4, r14 C M I
+ cmp.eq p10, p0 = 5, r14 C M I
+ cmp.eq p11, p0 = 6, r14 C M I
+}{.bbb; (p9) br.dptk .Lc100 C B
+ (p10) br.dptk .Lc101 C B
+ (p11) br.dptk .Lc110 C B
+ ;;
+}{.mmi; ld8 r19 = [vp], 8 C M01
+ ld8 r18 = [up], 8 C M01
+ cmp.ne p13, p0 = 0, cy C copy cy to p13 M I
+}{.mmb; cmp.eq p12, p0 = 7, r14 C M I
+ nop 0
+ (p12) br.dptk .Lc111 C B
+ ;;
+}
+
+.Lc000:
+ {.mmi; ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ ;;
+}{.mmi; add vpadv = PFDIST, vp C M I
+ ld8 v0 = [vp], 8 C M01
+ mov ar.lc = n C I0
+}{.mmi; ld8 u0 = [up], 8 C M01
+ ADDSUB w1 = r10, r11 C M I
+ nop 0
+ ;;
+}{.mmi; add upadv = PFDIST, up C M I
+ ld8 v1 = [vp], 8 C M01
+ cmp.CND p7, p0 = w1, r10 C M I
+}{.mmi; ld8 u1 = [up], 8 C M01
+ ADDSUB w2 = r18, r19 C M I
+ add rpx = 8, rp C M I
+ ;;
+}{.mmi; ld8 v2 = [vp], 8 C M01
+ cmp.CND p8, p0 = w2, r18 C M I
+ (p13) cmpeqor p7, p0 = LIM, w1 C M I
+}{.mmi; ld8 u2 = [up], 8 C M01
+ (p13) add w1 = INCR, w1 C M I
+ ADDSUB w3 = u3, v3 C M I
+ ;;
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ cmp.CND p9, p0 = w3, u3 C M I
+ (p7) cmpeqor p8, p0 = LIM, w2 C M I
+}{.mmb; ld8 u3 = [up], 8 C M01
+ (p7) add w2 = INCR, w2 C M I
+ br L(m0)
+}
+
+.Lc001:
+ {.mmi; (p15) ld8 v1 = [vp], 8 C M01
+ (p15) ld8 u1 = [up], 8 C M01
+ ADDSUB w0 = r10, r11 C M I
+}{.mmb; nop 0
+ nop 0
+ (p15) br L(0)
+ ;;
+}{.mmi; cmp.ne p9, p0 = 0, r23 C M I
+ mov r8 = 0
+ cmp.CND p6, p0 = w0, r10 C M I
+ ;;
+}{.mmb; (p9) cmpeqor p6, p0 = LIM, w0 C M I
+ (p9) add w0 = INCR, w0 C M I
+ br L(cj1) C B
+}
+L(0):
+ {.mmi; ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ ;;
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ mov ar.lc = n C I0
+}{.mmi; nop 0
+ cmp.ne p9, p0 = 0, r23 C M I
+ nop 0
+ ;;
+}{.mmi; ld8 v0 = [vp], 8 C M01
+ cmp.CND p6, p0 = w0, r10 C M I
+ add rpx = 16, rp C M I
+}{.mmb; ld8 u0 = [up], 8 C M01
+ ADDSUB w1 = u1, v1 C M I
+ br L(c1) C B
+}
+
+.Lc010:
+ {.mmi; ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ mov r8 = 0 C M I
+}{.mmb; ADDSUB w3 = r10, r11 C M I
+ cmp.ne p8, p0 = 0, r23 C M I
+ (p15) br L(1) C B
+ ;;
+}{.mmi; cmp.CND p9, p0 = w3, r10 C M I
+ ADDSUB w0 = u0, v0 C M I
+ (p8) add w3 = INCR, w3 C M I
+ ;;
+}{.mmb; cmp.CND p6, p0 = w0, u0 C M I
+ (p8) cmpeqor p9, p0 = LIM2, w3 C M I
+ br L(cj2) C B
+}
+L(1):
+ {.mmi; ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ ;;
+}{.mmi; ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ mov ar.lc = n C I0
+ ;;
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ cmp.CND p9, p0 = w3, r10 C M I
+ ;;
+}{.mmi; (p8) cmpeqor p9, p0 = LIM, w3 C M I
+ (p8) add w3 = INCR, w3 C M I
+ ADDSUB w0 = u0, v0 C M I
+}{.mmb; add rpx = 24, rp C M I
+ nop 0
+ br L(m23) C B
+}
+
+.Lc011:
+ {.mmi; ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+}{.mmi; ADDSUB w2 = r10, r11 C M I
+ cmp.ne p7, p0 = 0, r23 C M I
+ nop 0
+ ;;
+}{.mmb; ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ (p15) br L(2) C B
+}{.mmi; cmp.CND p8, p0 = w2, r10 C M I
+ ADDSUB w3 = u3, v3 C M I
+ nop 0
+ ;;
+}{.mmb; (p7) cmpeqor p8, p0 = LIM, w2 C M I
+ (p7) add w2 = INCR, w2 C M I
+ br L(cj3) C B
+}
+L(2):
+ {.mmi; ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ ADDSUB w3 = u3, v3 C M I
+ ;;
+}{.mmi; ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ cmp.CND p8, p0 = w2, r10 C M I
+ ;;
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ cmp.CND p9, p0 = w3, u3 C M I
+ mov ar.lc = n C I0
+}{.mmi; ld8 u3 = [up], 8 C M01
+ (p7) cmpeqor p8, p0 = LIM, w2 C M I
+ (p7) add w2 = INCR, w2 C M I
+ ;;
+}{.mmi; add rpx = 32, rp C M I
+ st8 [rp] = w2, 8 C M23
+ (p8) cmpeqor p9, p0 = LIM, w3 C M I
+}{.mmb; (p8) add w3 = INCR, w3 C M I
+ ADDSUB w0 = u0, v0 C M I
+ br L(m23)
+}
+
+.Lc100:
+ {.mmi; ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+}{.mmi; ADDSUB w1 = r10, r11 C M I
+ nop 0
+ nop 0
+ ;;
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ add rpx = 8, rp C M I
+}{.mmi; cmp.ne p6, p0 = 0, r23 C M I
+ cmp.CND p7, p0 = w1, r10 C M I
+ nop 0
+ ;;
+}{.mmi; ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ ADDSUB w2 = u2, v2 C M I
+}{.mmb; (p6) cmpeqor p7, p0 = LIM, w1 C M I
+ (p6) add w1 = INCR, w1 C M I
+ (p14) br L(cj4)
+ ;;
+}{.mmi; ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ mov ar.lc = n C I0
+ ;;
+}{.mmi; ld8 v2 = [vp], 8 C M01
+ cmp.CND p8, p0 = w2, u2 C M I
+ nop 0
+}{.mmi; ld8 u2 = [up], 8 C M01
+ nop 0
+ ADDSUB w3 = u3, v3 C M I
+ ;;
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ cmp.CND p9, p0 = w3, u3 C M I
+ (p7) cmpeqor p8, p0 = LIM, w2 C M I
+}{.mmb; ld8 u3 = [up], 8 C M01
+ (p7) add w2 = INCR, w2 C M I
+ br L(m4)
+}
+
+.Lc101:
+ {.mmi; ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ ;;
+}{.mmi; ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ mov ar.lc = n C I0
+ ;;
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ ADDSUB w0 = r10, r11 C M I
+}{.mmi; cmp.ne p9, p0 = 0, r23 C M I
+ add rpx = 16, rp C M I
+ nop 0
+ ;;
+}{.mmi; ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ cmp.CND p6, p0 = w0, r10 C M I
+}{.mbb; ADDSUB w1 = u1, v1 C M I
+ (p15) br L(c5) C B
+ br L(end) C B
+}
+
+.Lc110:
+ {.mmi; ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ ;;
+}{.mmi; add upadv = PFDIST, up C M I
+ add vpadv = PFDIST, vp C M I
+ mov ar.lc = n C I0
+}{.mmi; ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ ADDSUB w3 = r10, r11 C M I
+ ;;
+}{.mmi; ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ ADDSUB w0 = u0, v0 C M I
+}{.mmi; cmp.CND p9, p0 = w3, r10 C M I
+ cmp.ne p8, p0 = 0, r23 C M I
+ add rpx = 24, rp C M I
+ ;;
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ nop 0
+}{.mmb; (p8) cmpeqor p9, p0 = LIM, w3 C M I
+ (p8) add w3 = INCR, w3 C M I
+ br L(m67) C B
+}
+
+.Lc111:
+ {.mmi; ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ ;;
+}{.mmi; add upadv = PFDIST, up C M I
+ ld8 v1 = [vp], 8 C M01
+ mov ar.lc = n C I0
+}{.mmi; ld8 u1 = [up], 8 C M01
+ ADDSUB w2 = r10, r11 C M I
+ nop 0
+ ;;
+}{.mmi; add vpadv = PFDIST, vp C M I
+ ld8 v2 = [vp], 8 C M01
+ cmp.CND p8, p0 = w2, r10 C M I
+}{.mmi; ld8 u2 = [up], 8 C M01
+ ADDSUB w3 = r18, r19 C M I
+ nop 0
+ ;;
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ cmp.CND p9, p0 = w3, r18 C M I
+ (p13) cmpeqor p8, p0 = LIM, w2 C M I
+}{.mmi; ld8 u3 = [up], 8 C M01
+ (p13) add w2 = INCR, w2 C M I
+ nop 0
+ ;;
+}{.mmi; add rpx = 32, rp C M I
+ st8 [rp] = w2, 8 C M23
+ (p8) cmpeqor p9, p0 = LIM, w3 C M I
+}{.mmb; (p8) add w3 = INCR, w3 C M I
+ ADDSUB w0 = u0, v0 C M I
+ br L(m67)
+}
+EPILOGUE()
+
+PROLOGUE(func)
+ .prologue
+ .save ar.lc, r2
+ .body
+ifdef(`HAVE_ABI_32',`
+ addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ nop.i 0
+ addp4 vp = 0, vp C M I
+ nop.m 0
+ zxt4 n = n C I
+ ;;
+')
+
+ {.mmi; ld8 r11 = [vp], 8 C M01
+ ld8 r10 = [up], 8 C M01
+ mov r2 = ar.lc C I0
+}{.mmi; and r14 = 7, n C M I
+ cmp.lt p15, p14 = 8, n C M I
+ add n = -6, n C M I
+ ;;
+}{.mmi; cmp.eq p6, p0 = 1, r14 C M I
+ cmp.eq p7, p0 = 2, r14 C M I
+ cmp.eq p8, p0 = 3, r14 C M I
+}{.bbb; (p6) br.dptk .Lb001 C B
+ (p7) br.dptk .Lb010 C B
+ (p8) br.dptk .Lb011 C B
+ ;;
+}{.mmi; cmp.eq p9, p0 = 4, r14 C M I
+ cmp.eq p10, p0 = 5, r14 C M I
+ cmp.eq p11, p0 = 6, r14 C M I
+}{.bbb; (p9) br.dptk .Lb100 C B
+ (p10) br.dptk .Lb101 C B
+ (p11) br.dptk .Lb110 C B
+ ;;
+}{.mmi; ld8 r19 = [vp], 8 C M01
+ ld8 r18 = [up], 8 C M01
+ cmp.ne p13, p0 = r0, r0 C clear "CF" M I
+}{.mmb; cmp.eq p12, p0 = 7, r14 C M I
+ mov r23 = 0 C M I
+ (p12) br.dptk .Lb111 C B
+ ;;
+}
+
+.Lb000:
+ {.mmi; ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ ;;
+}{.mmi; ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ ADDSUB w1 = r10, r11 C M I
+ ;;
+}{.mmi; ld8 v1 = [vp], 8 C M01
+ cmp.CND p7, p0 = w1, r10 C M I
+ mov ar.lc = n C I0
+}{.mmi; ld8 u1 = [up], 8 C M01
+ ADDSUB w2 = r18, r19 C M I
+ add rpx = 8, rp C M I
+ ;;
+}{.mmi; add upadv = PFDIST, up
+ add vpadv = PFDIST, vp
+ cmp.CND p8, p0 = w2, r18 C M I
+}{.mmi; ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ ADDSUB w3 = u3, v3 C M I
+ ;;
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ cmp.CND p9, p0 = w3, u3 C M I
+ (p7) cmpeqor p8, p0 = LIM, w2 C M I
+}{.mmb; ld8 u3 = [up], 8 C M01
+ (p7) add w2 = INCR, w2 C M I
+ br L(m0) C B
+}
+
+ ALIGN(32)
+.Lb001:
+ {.mmi; ADDSUB w0 = r10, r11 C M I
+ (p15) ld8 v1 = [vp], 8 C M01
+ mov r8 = 0 C M I
+ ;;
+}{.mmb; cmp.CND p6, p0 = w0, r10 C M I
+ (p15) ld8 u1 = [up], 8 C M01
+ (p14) br L(cj1) C B
+ ;;
+}{.mmi; add upadv = PFDIST, up
+ add vpadv = PFDIST, vp
+ shr.u n = n, 3 C I0
+}{.mmi; ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ cmp.CND p6, p0 = w0, r10 C M I
+ ;;
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ mov ar.lc = n C I0
+ ;;
+}{.mmi; ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ ADDSUB w1 = u1, v1 C M I
+ ;;
+}{.mmi; ld8 v1 = [vp], 8 C M01
+ cmp.CND p7, p0 = w1, u1 C M I
+ ADDSUB w2 = u2, v2 C M I
+}{.mmb; ld8 u1 = [up], 8 C M01
+ add rpx = 16, rp C M I
+ br L(m1) C B
+}
+
+ ALIGN(32)
+.Lb010:
+ {.mmi; ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+}{.mmb; ADDSUB w3 = r10, r11 C M I
+ nop 0
+ (p15) br L(gt2) C B
+ ;;
+}{.mmi; cmp.CND p9, p0 = w3, r10 C M I
+ ADDSUB w0 = u0, v0 C M I
+ mov r8 = 0 C M I
+ ;;
+}{.mmb; nop 0
+ cmp.CND p6, p0 = w0, u0 C M I
+ br L(cj2) C B
+}
+L(gt2):
+ {.mmi; ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ nop 0
+ ;;
+}{.mmi; add upadv = PFDIST, up
+ add vpadv = PFDIST, vp
+ mov ar.lc = n C I0
+}{.mmi; ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ nop 0
+ ;;
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ cmp.CND p9, p0 = w3, r10 C M I
+ ADDSUB w0 = u0, v0 C M I
+}{.mmb; ld8 u3 = [up], 8 C M01
+ add rpx = 24, rp C M I
+ br L(m23) C B
+}
+
+ ALIGN(32)
+.Lb011:
+ {.mmi; ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ ADDSUB w2 = r10, r11 C M I
+ ;;
+}{.mmb; ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ (p15) br L(3) C B
+}{.mmb; cmp.CND p8, p0 = w2, r10 C M I
+ ADDSUB w3 = u3, v3 C M I
+ br L(cj3) C B
+}
+L(3):
+ {.mmi; ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ ;;
+}{.mmi; add upadv = PFDIST, up
+ add vpadv = PFDIST, vp
+ ADDSUB w3 = u3, v3 C M I
+}{.mmi; ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ cmp.CND p8, p0 = w2, r10 C M I
+ ;;
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ cmp.CND p9, p0 = w3, u3 C M I
+ mov ar.lc = n C I0
+}{.mmi; ld8 u3 = [up], 8 C M01
+ nop 0
+ nop 0
+ ;;
+}{.mmi; add rpx = 32, rp C M I
+ st8 [rp] = w2, 8 C M23
+ (p8) cmpeqor p9, p0 = LIM, w3 C M I
+}{.mmb; (p8) add w3 = INCR, w3 C M I
+ ADDSUB w0 = u0, v0 C M I
+ br L(m23) C B
+}
+
+ ALIGN(32)
+.Lb100:
+ {.mmi; ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ ;;
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ ADDSUB w1 = r10, r11 C M I
+ ;;
+}{.mmi; ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ cmp.CND p7, p0 = w1, r10 C M I
+}{.mmb; nop 0
+ ADDSUB w2 = u2, v2 C M I
+ (p14) br L(cj4) C B
+ ;;
+}
+L(gt4):
+ {.mmi; add upadv = PFDIST, up
+ add vpadv = PFDIST, vp
+ mov ar.lc = n C I0
+}{.mmi; ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ nop 0
+ ;;
+}{.mmi; ld8 v2 = [vp], 8 C M01
+ cmp.CND p8, p0 = w2, u2 C M I
+ nop 0
+}{.mmi; ld8 u2 = [up], 8 C M01
+ ADDSUB w3 = u3, v3 C M I
+ add rpx = 8, rp C M I
+ ;;
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ cmp.CND p9, p0 = w3, u3 C M I
+ (p7) cmpeqor p8, p0 = LIM, w2 C M I
+}{.mmb; ld8 u3 = [up], 8 C M01
+ (p7) add w2 = INCR, w2 C M I
+ br L(m4) C B
+}
+
+ ALIGN(32)
+.Lb101:
+ {.mmi; ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ ;;
+}{.mmi; ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ ADDSUB w0 = r10, r11 C M I
+ ;;
+}{.mmi; add upadv = PFDIST, up
+ add vpadv = PFDIST, vp
+ add rpx = 16, rp C M I
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ nop 0
+ ;;
+}{.mmi; ld8 v0 = [vp], 8 C M01
+ cmp.CND p6, p0 = w0, r10 C M I
+ nop 0
+}{.mmb; ld8 u0 = [up], 8 C M01
+ ADDSUB w1 = u1, v1 C M I
+ (p14) br L(cj5) C B
+ ;;
+}
+L(gt5):
+ {.mmi; ld8 v1 = [vp], 8 C M01
+ cmp.CND p7, p0 = w1, u1 C M I
+ mov ar.lc = n C I0
+}{.mmb; ld8 u1 = [up], 8 C M01
+ ADDSUB w2 = u2, v2 C M I
+ br L(m5) C B
+}
+
+ ALIGN(32)
+.Lb110:
+ {.mmi; ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ ;;
+}{.mmi; ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ ADDSUB w3 = r10, r11 C M I
+ ;;
+}{.mmi; add upadv = PFDIST, up
+ add vpadv = PFDIST, vp
+ mov ar.lc = n C I0
+}{.mmi; ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ nop 0
+ ;;
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ cmp.CND p9, p0 = w3, r10 C M I
+ ADDSUB w0 = u0, v0 C M I
+}{.mmb; ld8 u3 = [up], 8 C M01
+ add rpx = 24, rp C M I
+ br L(m67) C B
+}
+
+ ALIGN(32)
+.Lb111:
+ {.mmi; ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ shr.u n = n, 3 C I0
+ ;;
+}{.mmi; ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ ADDSUB w2 = r10, r11 C M I
+ ;;
+}{.mmi; ld8 v2 = [vp], 8 C M01
+ cmp.CND p8, p0 = w2, r10 C M I
+ mov ar.lc = n C I0
+}{.mmi; ld8 u2 = [up], 8 C M01
+ ADDSUB w3 = r18, r19 C M I
+ nop 0
+ ;;
+}{.mmi; add upadv = PFDIST, up
+ add vpadv = PFDIST, vp
+ nop 0
+}{.mmi; ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ cmp.CND p9, p0 = w3, r18 C M I
+ ;;
+}{.mmi; add rpx = 32, rp C M I
+ st8 [rp] = w2, 8 C M23
+ (p8) cmpeqor p9, p0 = LIM, w3 C M I
+}{.mmb; (p8) add w3 = INCR, w3 C M I
+ ADDSUB w0 = u0, v0 C M I
+ br L(m67) C B
+}
+
+C *** MAIN LOOP START ***
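+C The loop handles eight limbs per iteration.  Stores alternate between
+C rp and the staggered pointer rpx so that two st8 can issue per cycle,
+C and the lfetch on upadv/vpadv prefetches the operands, starting PFDIST
+C bytes ahead.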
+ ALIGN(32)
+L(top):
+L(c5): ld8 v1 = [vp], 8 C M01
+ cmp.CND p7, p0 = w1, u1 C M I
+ (p9) cmpeqor p6, p0 = LIM, w0 C M I
+ ld8 u1 = [up], 8 C M01
+ (p9) add w0 = INCR, w0 C M I
+ ADDSUB w2 = u2, v2 C M I
+ ;;
+L(m5): ld8 v2 = [vp], 8 C M01
+ cmp.CND p8, p0 = w2, u2 C M I
+ (p6) cmpeqor p7, p0 = LIM, w1 C M I
+ ld8 u2 = [up], 8 C M01
+ (p6) add w1 = INCR, w1 C M I
+ ADDSUB w3 = u3, v3 C M I
+ ;;
+ st8 [rp] = w0, 8 C M23
+ ld8 v3 = [vp], 8 C M01
+ cmp.CND p9, p0 = w3, u3 C M I
+ (p7) cmpeqor p8, p0 = LIM, w2 C M I
+ ld8 u3 = [up], 8 C M01
+ (p7) add w2 = INCR, w2 C M I
+ ;;
+L(m4): st8 [rp] = w1, 16 C M23
+ st8 [rpx] = w2, 32 C M23
+ (p8) cmpeqor p9, p0 = LIM, w3 C M I
+ lfetch [upadv], 64
+ (p8) add w3 = INCR, w3 C M I
+ ADDSUB w0 = u0, v0 C M I
+ ;;
+L(m23): st8 [rp] = w3, 8 C M23
+ ld8 v0 = [vp], 8 C M01
+ cmp.CND p6, p0 = w0, u0 C M I
+ ld8 u0 = [up], 8 C M01
+ ADDSUB w1 = u1, v1 C M I
+ nop.b 0
+ ;;
+L(c1): ld8 v1 = [vp], 8 C M01
+ cmp.CND p7, p0 = w1, u1 C M I
+ (p9) cmpeqor p6, p0 = LIM, w0 C M I
+ ld8 u1 = [up], 8 C M01
+ (p9) add w0 = INCR, w0 C M I
+ ADDSUB w2 = u2, v2 C M I
+ ;;
+L(m1): ld8 v2 = [vp], 8 C M01
+ cmp.CND p8, p0 = w2, u2 C M I
+ (p6) cmpeqor p7, p0 = LIM, w1 C M I
+ ld8 u2 = [up], 8 C M01
+ (p6) add w1 = INCR, w1 C M I
+ ADDSUB w3 = u3, v3 C M I
+ ;;
+ st8 [rp] = w0, 8 C M23
+ ld8 v3 = [vp], 8 C M01
+ cmp.CND p9, p0 = w3, u3 C M I
+ (p7) cmpeqor p8, p0 = LIM, w2 C M I
+ ld8 u3 = [up], 8 C M01
+ (p7) add w2 = INCR, w2 C M I
+ ;;
+L(m0): st8 [rp] = w1, 16 C M23
+ st8 [rpx] = w2, 32 C M23
+ (p8) cmpeqor p9, p0 = LIM, w3 C M I
+ lfetch [vpadv], 64
+ (p8) add w3 = INCR, w3 C M I
+ ADDSUB w0 = u0, v0 C M I
+ ;;
+L(m67): st8 [rp] = w3, 8 C M23
+ ld8 v0 = [vp], 8 C M01
+ cmp.CND p6, p0 = w0, u0 C M I
+ ld8 u0 = [up], 8 C M01
+ ADDSUB w1 = u1, v1 C M I
+ br.cloop.dptk L(top) C B
+ ;;
+C *** MAIN LOOP END ***
+
+L(end):
+ {.mmi; (p9) cmpeqor p6, p0 = LIM, w0 C M I
+ (p9) add w0 = INCR, w0 C M I
+ mov ar.lc = r2 C I0
+}
+L(cj5):
+ {.mmi; cmp.CND p7, p0 = w1, u1 C M I
+ ADDSUB w2 = u2, v2 C M I
+ nop 0
+ ;;
+}{.mmi; st8 [rp] = w0, 8 C M23
+ (p6) cmpeqor p7, p0 = LIM, w1 C M I
+ (p6) add w1 = INCR, w1 C M I
+}
+L(cj4):
+ {.mmi; cmp.CND p8, p0 = w2, u2 C M I
+ ADDSUB w3 = u3, v3 C M I
+ nop 0
+ ;;
+}{.mmi; st8 [rp] = w1, 8 C M23
+ (p7) cmpeqor p8, p0 = LIM, w2 C M I
+ (p7) add w2 = INCR, w2 C M I
+}
+L(cj3):
+ {.mmi; cmp.CND p9, p0 = w3, u3 C M I
+ ADDSUB w0 = u0, v0 C M I
+ nop 0
+ ;;
+}{.mmi; st8 [rp] = w2, 8 C M23
+ (p8) cmpeqor p9, p0 = LIM, w3 C M I
+ (p8) add w3 = INCR, w3 C M I
+}{.mmi; cmp.CND p6, p0 = w0, u0 C M I
+ nop 0
+ mov r8 = 0 C M I
+ ;;
+}
+L(cj2):
+ {.mmi; st8 [rp] = w3, 8 C M23
+ (p9) cmpeqor p6, p0 = LIM, w0 C M I
+ (p9) add w0 = INCR, w0 C M I
+ ;;
+}
+L(cj1):
+ {.mmb; st8 [rp] = w0, 8 C M23
+ (p6) mov r8 = 1 C M I
+ br.ret.sptk.many b0 C B
+}
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/aorsorrlsh1_n.asm b/vendor/gmp-6.3.0/mpn/ia64/aorsorrlsh1_n.asm
new file mode 100644
index 0000000..9b58b9e
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/aorsorrlsh1_n.asm
@@ -0,0 +1,48 @@
+dnl IA-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 3.0
+C Itanium 2: 1.5
+
+
+define(LSH, 1)
+
+ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
+
+include_mpn(`ia64/aorsorrlshC_n.asm')
diff --git a/vendor/gmp-6.3.0/mpn/ia64/aorsorrlsh2_n.asm b/vendor/gmp-6.3.0/mpn/ia64/aorsorrlsh2_n.asm
new file mode 100644
index 0000000..39b384a
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/aorsorrlsh2_n.asm
@@ -0,0 +1,48 @@
+dnl IA-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 3.0
+C Itanium 2: 1.5
+
+
+define(LSH, 2)
+
+ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
+
+include_mpn(`ia64/aorsorrlshC_n.asm')
diff --git a/vendor/gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm b/vendor/gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm
new file mode 100644
index 0000000..2703ce2
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm
@@ -0,0 +1,412 @@
+dnl IA-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+C cycles/limb
+C Itanium: ?
+C Itanium 2: 1.5
+
+C TODO
+C * Use shladd in feed-in code (for mpn_addlshC_n).
+C * Rewrite loop to schedule loads closer to use, since we do prefetch.
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`vp', `r34')
+define(`n', `r35')
+
+ifdef(`DO_add', `
+ define(`ADDSUB', `add $1 = $2, $3')
+ define(`CMP', `cmp.ltu $1,p0 = $2, $3')
+ define(`INCR', 1)
+ define(`LIM', -1)
+ define(`func', mpn_addlsh`'LSH`'_n)')
+ifdef(`DO_sub', `
+ define(`ADDSUB', `sub $1 = $2, $3')
+ define(`CMP', `cmp.gtu $1,p0 = $2, $3')
+ define(`INCR', -1)
+ define(`LIM', 0)
+ define(`func', mpn_sublsh`'LSH`'_n)')
+ifdef(`DO_rsb', `
+ define(`ADDSUB', `sub $1 = $3, $2')
+ define(`CMP', `cmp.gtu $1,p0 = $2, $4')
+ define(`INCR', -1)
+ define(`LIM', 0)
+ define(`func', mpn_rsblsh`'LSH`'_n)')
+
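For orientation, all three entry points compute rp[] = up[] +- (vp[] << LSH)
(with the operands reversed for DO_rsb); the shrp instructions stitch each
shifted limb together from two adjacent vp limbs.  A C sketch of the DO_add
case, assuming 64-bit limbs and 0 < LSH < 64; the return value mirrors r8
(the bits shifted out of the top limb, plus the final carry):

    #include <stdint.h>

    static uint64_t addlshC_n_ref (uint64_t *rp, const uint64_t *up,
                                   const uint64_t *vp, long n, unsigned lsh)
    {
      uint64_t prev = 0, cy = 0;
      for (long i = 0; i < n; i++)
        {
          uint64_t x = (vp[i] << lsh) | (prev >> (64 - lsh)); /* shl/shrp */
          uint64_t w = up[i] + x;                             /* ADDSUB */
          uint64_t c = w < x;                                 /* CMP */
          rp[i] = w + cy;
          cy = c | (rp[i] < w);          /* cmpeqor/add INCR carry chain */
          prev = vp[i];
        }
      return (prev >> (64 - lsh)) + cy;
    }
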
+define(PFDIST, 500)
+
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
+define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
+define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
+define(`s0',`r26') define(`s1',`r27') define(`s2',`r28') define(`s3',`r29')
+define(`x0',`r30') define(`x1',`r31') define(`x2',`r3') define(`x3',`r9')
+
+C r3 r8 r9 r10 r11
+
+ASM_START()
+PROLOGUE(func)
+ .prologue
+ .save ar.lc, r2
+ .body
+ifdef(`HAVE_ABI_32',`
+ addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ nop.i 0
+ addp4 vp = 0, vp C M I
+ nop.m 0
+ zxt4 n = n C I
+ ;;
+')
+ {.mmi; ld8 r11 = [vp], 8 C M01
+ ld8 r10 = [up], 8 C M01
+ mov.i r2 = ar.lc C I0
+}{.mmi; and r14 = 3, n C M I
+ cmp.lt p15, p0 = 4, n C M I
+ add n = -5, n C M I
+ ;;
+}{.mmi; cmp.eq p6, p0 = 1, r14 C M I
+ cmp.eq p7, p0 = 2, r14 C M I
+ cmp.eq p8, p0 = 3, r14 C M I
+}{.bbb
+ (p6) br.dptk .Lb01 C B
+ (p7) br.dptk .Lb10 C B
+ (p8) br.dptk .Lb11 C B
+}
+
+.Lb00:
+ {.mmi; ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ shr.u n = n, 2 C I0
+ ;;
+}{.mmi; ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ shl x3 = r11, LSH C I0
+ ;;
+}{.mmi; ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ shrp x0 = v0, r11, 64-LSH C I0
+}{.mmb; ADDSUB( w3, r10, x3) C M I
+ nop 0
+ (p15) br.dpnt .grt4 C B
+ ;;
+}{.mii; CMP( p7, w3, r10, x3) C M II0
+ shrp x1 = v1, v0, 64-LSH C I0
+ ADDSUB( w0, u0, x0) C M I
+ ;;
+}{.mii; CMP( p8, w0, u0, x0) C M I
+ shrp x2 = v2, v1, 64-LSH C I0
+ ADDSUB( w1, u1, x1) C M I
+}{.mmb; nop 0
+ nop 0
+ br .Lcj4 C B
+}
+ALIGN(32)
+.grt4:
+ {.mii; ld8 v3 = [vp], 8 C M01
+ shrp x0 = v0, r11, 64-LSH C I0
+ CMP( p8, w3, r10, x3) C M I
+ ;;
+}{.mmi; ld8 u3 = [up], 8 C M01
+ add r11 = PFDIST, vp
+ shrp x1 = v1, v0, 64-LSH C I0
+}{.mmi; ld8 v0 = [vp], 8 C M01
+ ADDSUB( w0, u0, x0) C M I
+ nop 0
+ ;;
+}{.mmi; CMP( p6, w0, u0, x0) C M I
+ add r10 = PFDIST, up
+ mov.i ar.lc = n C I0
+}{.mmb; ADDSUB( w1, u1, x1) C M I
+ ld8 u0 = [up], 8 C M01
+ br .LL00 C B
+}
+
+ ALIGN(32)
+.Lb01:
+ifdef(`DO_add',
+` shladd w2 = r11, LSH, r10 C M I
+ shr.u r8 = r11, 64-LSH C retval I0
+ (p15) br.dpnt .grt1 C B
+ ;;
+',`
+ shl x2 = r11, LSH C I0
+ (p15) br.dpnt .grt1 C B
+ ;;
+ ADDSUB( w2, r10, x2) C M I
+ shr.u r8 = r11, 64-LSH C retval I0
+ ;;
+')
+ CMP( p6, w2, r10, x2) C M I
+ br .Lcj1
+
+.grt1: ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ shr.u n = n, 2 C I0
+ ;;
+ ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ mov.i ar.lc = n C FIXME swap with next I0
+ifdef(`DO_add',
+`',`
+ ADDSUB( w2, r10, x2)
+')
+ ;;
+ {.mmi; ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ shrp x3 = v3, r11, 64-LSH C I0
+ ;;
+}{.mmi; ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ shrp x0 = v0, v3, 64-LSH C I0
+}{.mmb; CMP( p6, w2, r10, x2) C M I
+ ADDSUB( w3, u3, x3) C M I
+ br.cloop.dptk .grt5 C B
+ ;;
+}{.mmi; CMP( p7, w3, u3, x3) C M I
+ ADDSUB( w0, u0, x0) C M I
+ shrp x1 = v1, v0, 64-LSH C I0
+}{.mmb; nop 0
+ nop 0
+ br .Lcj5 C B
+}
+.grt5:
+ {.mmi; add r10 = PFDIST, up
+ add r11 = PFDIST, vp
+ shrp x0 = v0, v3, 64-LSH C I0
+}{.mmb; ld8 v3 = [vp], 8 C M01
+ CMP( p8, w3, u3, x3) C M I
+ br .LL01 C B
+}
+ ALIGN(32)
+.Lb10:
+ {.mmi; ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ shl x1 = r11, LSH C I0
+}{.mmb; nop 0
+ nop 0
+ (p15) br.dpnt .grt2 C B
+ ;;
+}{.mmi; ADDSUB( w1, r10, x1) C M I
+ nop 0
+ shrp x2 = v2, r11, 64-LSH C I0
+ ;;
+}{.mmi; CMP( p9, w1, r10, x1) C M I
+ ADDSUB( w2, u2, x2) C M I
+ shr.u r8 = v2, 64-LSH C retval I0
+ ;;
+}{.mmb; CMP( p6, w2, u2, x2) C M I
+ nop 0
+ br .Lcj2 C B
+}
+.grt2:
+ {.mmi; ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ shr.u n = n, 2 C I0
+ ;;
+}{.mmi; ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ mov.i ar.lc = n C I0
+}{.mmi; ADDSUB( w1, r10, x1) C M I
+ nop 0
+ nop 0
+ ;;
+}{.mii; ld8 v1 = [vp], 8 C M01
+ shrp x2 = v2, r11, 64-LSH C I0
+ CMP( p8, w1, r10, x1) C M I
+ ;;
+}{.mmi; add r10 = PFDIST, up
+ ld8 u1 = [up], 8 C M01
+ shrp x3 = v3, v2, 64-LSH C I0
+}{.mmi; add r11 = PFDIST, vp
+ ld8 v2 = [vp], 8 C M01
+ ADDSUB( w2, u2, x2) C M I
+ ;;
+}{.mmi; CMP( p6, w2, u2, x2) C M I
+ ld8 u2 = [up], 8 C M01
+ shrp x0 = v0, v3, 64-LSH C I0
+}{.mib; ADDSUB( w3, u3, x3) C M I
+ nop 0
+ br.cloop.dpnt L(top) C B
+}
+ br L(end) C B
+.Lb11:
+ {.mmi; ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ shl x0 = r11, LSH C I0
+ ;;
+}{.mmi; ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ shr.u n = n, 2 C I0
+}{.mmb; nop 0
+ nop 0
+ (p15) br.dpnt .grt3 C B
+ ;;
+}{.mii; nop 0
+ shrp x1 = v1, r11, 64-LSH C I0
+ ADDSUB( w0, r10, x0) C M I
+ ;;
+}{.mii; CMP( p8, w0, r10, x0) C M I
+ shrp x2 = v2, v1, 64-LSH C I0
+ ADDSUB( w1, u1, x1) C M I
+ ;;
+}{.mmb; CMP( p9, w1, u1, x1) C M I
+ ADDSUB( w2, u2, x2) C M I
+ br .Lcj3 C B
+}
+.grt3:
+ {.mmi; ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ shrp x1 = v1, r11, 64-LSH C I0
+}{.mmi; ADDSUB( w0, r10, x0) C M I
+ nop 0
+ nop 0
+ ;;
+}{.mmi; ld8 v0 = [vp], 8 C M01
+ CMP( p6, w0, r10, x0) C M I
+ mov.i ar.lc = n C I0
+}{.mmi; ld8 u0 = [up], 8 C M01
+ ADDSUB( w1, u1, x1) C M I
+ nop 0
+ ;;
+}{.mmi; add r10 = PFDIST, up
+ add r11 = PFDIST, vp
+ shrp x2 = v2, v1, 64-LSH C I0
+}{.mmb; ld8 v1 = [vp], 8 C M01
+ CMP( p8, w1, u1, x1) C M I
+ br .LL11 C B
+}
+
+C *** MAIN LOOP START ***
+ ALIGN(32)
+L(top): st8 [rp] = w1, 8 C M23
+ lfetch [r10], 32
+ (p8) cmpeqor p6, p0 = LIM, w2 C M I
+ (p8) add w2 = INCR, w2 C M I
+ ld8 v3 = [vp], 8 C M01
+ CMP( p8, w3, u3, x3) C M I
+ ;;
+.LL01: ld8 u3 = [up], 8 C M01
+ shrp x1 = v1, v0, 64-LSH C I0
+ (p6) cmpeqor p8, p0 = LIM, w3 C M I
+ (p6) add w3 = INCR, w3 C M I
+ ld8 v0 = [vp], 8 C M01
+ ADDSUB( w0, u0, x0) C M I
+ ;;
+ st8 [rp] = w2, 8 C M23
+ CMP( p6, w0, u0, x0) C M I
+ nop.b 0
+ ld8 u0 = [up], 8 C M01
+ lfetch [r11], 32
+ ADDSUB( w1, u1, x1) C M I
+ ;;
+.LL00: st8 [rp] = w3, 8 C M23
+ shrp x2 = v2, v1, 64-LSH C I0
+ (p8) cmpeqor p6, p0 = LIM, w0 C M I
+ (p8) add w0 = INCR, w0 C M I
+ ld8 v1 = [vp], 8 C M01
+ CMP( p8, w1, u1, x1) C M I
+ ;;
+.LL11: ld8 u1 = [up], 8 C M01
+ shrp x3 = v3, v2, 64-LSH C I0
+ (p6) cmpeqor p8, p0 = LIM, w1 C M I
+ (p6) add w1 = INCR, w1 C M I
+ ld8 v2 = [vp], 8 C M01
+ ADDSUB( w2, u2, x2) C M I
+ ;;
+ {.mmi; st8 [rp] = w0, 8 C M23
+ CMP( p6, w2, u2, x2) C M I
+ shrp x0 = v0, v3, 64-LSH C I0
+}{.mib;
+ ld8 u2 = [up], 8 C M01
+ ADDSUB( w3, u3, x3) C M I
+ br.cloop.dptk L(top) C B
+ ;;
+}
+C *** MAIN LOOP END ***
+
+L(end):
+ {.mmi; st8 [rp] = w1, 8 C M23
+ (p8) cmpeqor p6, p0 = LIM, w2 C M I
+ shrp x1 = v1, v0, 64-LSH C I0
+}{.mmi;
+ (p8) add w2 = INCR, w2 C M I
+ CMP( p7, w3, u3, x3) C M I
+ ADDSUB( w0, u0, x0) C M I
+ ;;
+}
+.Lcj5:
+ {.mmi; st8 [rp] = w2, 8 C M23
+ (p6) cmpeqor p7, p0 = LIM, w3 C M I
+ shrp x2 = v2, v1, 64-LSH C I0
+}{.mmi;
+ (p6) add w3 = INCR, w3 C M I
+ CMP( p8, w0, u0, x0) C M I
+ ADDSUB( w1, u1, x1) C M I
+ ;;
+}
+.Lcj4:
+ {.mmi; st8 [rp] = w3, 8 C M23
+ (p7) cmpeqor p8, p0 = LIM, w0 C M I
+ mov.i ar.lc = r2 C I0
+}{.mmi;
+ (p7) add w0 = INCR, w0 C M I
+ CMP( p9, w1, u1, x1) C M I
+ ADDSUB( w2, u2, x2) C M I
+ ;;
+}
+.Lcj3:
+ {.mmi; st8 [rp] = w0, 8 C M23
+ (p8) cmpeqor p9, p0 = LIM, w1 C M I
+ shr.u r8 = v2, 64-LSH C I0
+}{.mmi;
+ (p8) add w1 = INCR, w1 C M I
+ CMP( p6, w2, u2, x2) C M I
+ nop 0
+ ;;
+}
+.Lcj2:
+ {.mmi; st8 [rp] = w1, 8 C M23
+ (p9) cmpeqor p6, p0 = LIM, w2 C M I
+ (p9) add w2 = INCR, w2 C M I
+ ;;
+}
+.Lcj1:
+ {.mmb; st8 [rp] = w2 C M23
+ifdef(`DO_rsb',`
+ (p6) add r8 = -1, r8 C M I
+',`
+ (p6) add r8 = 1, r8 C M I
+') br.ret.sptk.many b0 C B
+}
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/bdiv_dbm1c.asm b/vendor/gmp-6.3.0/mpn/ia64/bdiv_dbm1c.asm
new file mode 100644
index 0000000..47e4553
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/bdiv_dbm1c.asm
@@ -0,0 +1,516 @@
+dnl IA-64 mpn_bdiv_dbm1c.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2009 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 4
+C Itanium 2: 2
+
+C TODO
+C * Optimize feed-in and wind-down code, both for speed and code size.
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n', `r34')
+define(`bd', `r35')
+
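The loop below implements the same recurrence as GMP's generic
mpn_bdiv_dbm1c: each step forms the double-limb product up[i] * bd
(xma.l/xma.hu), subtracts its low half from the running value h (kept in
r15, seeded from the fifth argument in r36) with borrow into the high half,
and stores the low result.  A C sketch with a 128-bit intermediate,
assuming 64-bit limbs:

    #include <stdint.h>

    static uint64_t bdiv_dbm1c_ref (uint64_t *qp, const uint64_t *up,
                                    long n, uint64_t bd, uint64_t h)
    {
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * bd;
          uint64_t lo = (uint64_t) p;
          uint64_t hi = (uint64_t) (p >> 64);
          uint64_t borrow = h < lo;     /* cmp.ltu p6, p7 */
          h -= lo;
          qp[i] = h;                    /* st8 */
          h = h - hi - borrow;          /* predicated sub, with extra 1 */
        }
      return h;
    }
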
+ASM_START()
+PROLOGUE(mpn_bdiv_dbm1c)
+ .prologue
+ .save ar.lc, r2
+ .body
+
+ifdef(`HAVE_ABI_32',
+` addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ zxt4 n = n C I
+ ;;
+')
+{.mmb
+ mov r15 = r36 C M I
+ ldf8 f9 = [up], 8 C M
+ nop.b 0 C B
+}
+.Lcommon:
+{.mii
+ adds r16 = -1, n C M I
+ mov r2 = ar.lc C I0
+ and r14 = 3, n C M I
+ ;;
+}
+{.mii
+ setf.sig f6 = bd C M2 M3
+ shr.u r31 = r16, 2 C I0
+ cmp.eq p10, p0 = 0, r14 C M I
+}
+{.mii
+ nop.m 0 C M
+ cmp.eq p11, p0 = 2, r14 C M I
+ cmp.eq p12, p0 = 3, r14 C M I
+ ;;
+}
+{.mii
+ cmp.ne p6, p7 = r0, r0 C M I
+ mov.i ar.lc = r31 C I0
+ cmp.ne p8, p9 = r0, r0 C M I
+}
+{.bbb
+ (p10) br.dptk .Lb00 C B
+ (p11) br.dptk .Lb10 C B
+ (p12) br.dptk .Lb11 C B
+ ;;
+}
+
+.Lb01: br.cloop.dptk .grt1
+ ;;
+ xma.l f38 = f9, f6, f0
+ xma.hu f39 = f9, f6, f0
+ ;;
+ getf.sig r26 = f38
+ getf.sig r27 = f39
+ br .Lcj1
+
+.grt1: ldf8 f10 = [r33], 8
+ ;;
+ ldf8 f11 = [r33], 8
+ ;;
+ ldf8 f12 = [r33], 8
+ ;;
+ xma.l f38 = f9, f6, f0
+ xma.hu f39 = f9, f6, f0
+ ;;
+ ldf8 f13 = [r33], 8
+ ;;
+ xma.l f32 = f10, f6, f0
+ xma.hu f33 = f10, f6, f0
+ br.cloop.dptk .grt5
+
+ ;;
+ getf.sig r26 = f38
+ xma.l f34 = f11, f6, f0
+ xma.hu f35 = f11, f6, f0
+ ;;
+ getf.sig r27 = f39
+ ;;
+ getf.sig r20 = f32
+ xma.l f36 = f12, f6, f0
+ xma.hu f37 = f12, f6, f0
+ ;;
+ getf.sig r21 = f33
+ ;;
+ getf.sig r22 = f34
+ xma.l f38 = f13, f6, f0
+ xma.hu f39 = f13, f6, f0
+ br .Lcj5
+
+.grt5: ldf8 f10 = [r33], 8
+ ;;
+ getf.sig r26 = f38
+ xma.l f34 = f11, f6, f0
+ xma.hu f35 = f11, f6, f0
+ ;;
+ getf.sig r27 = f39
+ ldf8 f11 = [r33], 8
+ ;;
+ getf.sig r20 = f32
+ xma.l f36 = f12, f6, f0
+ xma.hu f37 = f12, f6, f0
+ ;;
+ getf.sig r21 = f33
+ ldf8 f12 = [r33], 8
+ ;;
+ getf.sig r22 = f34
+ xma.l f38 = f13, f6, f0
+ xma.hu f39 = f13, f6, f0
+ br .LL01
+
+.Lb10: ldf8 f13 = [r33], 8
+ br.cloop.dptk .grt2
+ ;;
+
+ xma.l f36 = f9, f6, f0
+ xma.hu f37 = f9, f6, f0
+ ;;
+ xma.l f38 = f13, f6, f0
+ xma.hu f39 = f13, f6, f0
+ ;;
+ getf.sig r24 = f36
+ ;;
+ getf.sig r25 = f37
+ ;;
+ getf.sig r26 = f38
+ ;;
+ getf.sig r27 = f39
+ br .Lcj2
+
+.grt2: ldf8 f10 = [r33], 8
+ ;;
+ ldf8 f11 = [r33], 8
+ ;;
+ xma.l f36 = f9, f6, f0
+ xma.hu f37 = f9, f6, f0
+ ;;
+ ldf8 f12 = [r33], 8
+ ;;
+ xma.l f38 = f13, f6, f0
+ xma.hu f39 = f13, f6, f0
+ ;;
+ ldf8 f13 = [r33], 8
+ ;;
+ getf.sig r24 = f36
+ xma.l f32 = f10, f6, f0
+ xma.hu f33 = f10, f6, f0
+ br.cloop.dptk .grt6
+
+ getf.sig r25 = f37
+ ;;
+ getf.sig r26 = f38
+ xma.l f34 = f11, f6, f0
+ xma.hu f35 = f11, f6, f0
+ ;;
+ getf.sig r27 = f39
+ ;;
+ getf.sig r20 = f32
+ xma.l f36 = f12, f6, f0
+ xma.hu f37 = f12, f6, f0
+ br .Lcj6
+
+.grt6: getf.sig r25 = f37
+ ldf8 f10 = [r33], 8
+ ;;
+ getf.sig r26 = f38
+ xma.l f34 = f11, f6, f0
+ xma.hu f35 = f11, f6, f0
+ ;;
+ getf.sig r27 = f39
+ ldf8 f11 = [r33], 8
+ ;;
+ getf.sig r20 = f32
+ xma.l f36 = f12, f6, f0
+ xma.hu f37 = f12, f6, f0
+ br .LL10
+
+
+.Lb11: ldf8 f12 = [r33], 8
+ ;;
+ ldf8 f13 = [r33], 8
+ br.cloop.dptk .grt3
+ ;;
+
+ xma.l f34 = f9, f6, f0
+ xma.hu f35 = f9, f6, f0
+ ;;
+ xma.l f36 = f12, f6, f0
+ xma.hu f37 = f12, f6, f0
+ ;;
+ getf.sig r22 = f34
+ xma.l f38 = f13, f6, f0
+ xma.hu f39 = f13, f6, f0
+ ;;
+ getf.sig r23 = f35
+ ;;
+ getf.sig r24 = f36
+ ;;
+ getf.sig r25 = f37
+ ;;
+ getf.sig r26 = f38
+ br .Lcj3
+
+.grt3: ldf8 f10 = [r33], 8
+ ;;
+ xma.l f34 = f9, f6, f0
+ xma.hu f35 = f9, f6, f0
+ ;;
+ ldf8 f11 = [r33], 8
+ ;;
+ xma.l f36 = f12, f6, f0
+ xma.hu f37 = f12, f6, f0
+ ;;
+ ldf8 f12 = [r33], 8
+ ;;
+ getf.sig r22 = f34
+ xma.l f38 = f13, f6, f0
+ xma.hu f39 = f13, f6, f0
+ ;;
+ getf.sig r23 = f35
+ ldf8 f13 = [r33], 8
+ ;;
+ getf.sig r24 = f36
+ xma.l f32 = f10, f6, f0
+ xma.hu f33 = f10, f6, f0
+ br.cloop.dptk .grt7
+
+ getf.sig r25 = f37
+ ;;
+ getf.sig r26 = f38
+ xma.l f34 = f11, f6, f0
+ xma.hu f35 = f11, f6, f0
+ br .Lcj7
+
+.grt7: getf.sig r25 = f37
+ ldf8 f10 = [r33], 8
+ ;;
+ getf.sig r26 = f38
+ xma.l f34 = f11, f6, f0
+ xma.hu f35 = f11, f6, f0
+ br .LL11
+
+
+.Lb00: ldf8 f11 = [r33], 8
+ ;;
+ ldf8 f12 = [r33], 8
+ ;;
+ ldf8 f13 = [r33], 8
+ br.cloop.dptk .grt4
+ ;;
+
+ xma.l f32 = f9, f6, f0
+ xma.hu f33 = f9, f6, f0
+ ;;
+ xma.l f34 = f11, f6, f0
+ xma.hu f35 = f11, f6, f0
+ ;;
+ getf.sig r20 = f32
+ xma.l f36 = f12, f6, f0
+ xma.hu f37 = f12, f6, f0
+ ;;
+ getf.sig r21 = f33
+ ;;
+ getf.sig r22 = f34
+ xma.l f38 = f13, f6, f0
+ xma.hu f39 = f13, f6, f0
+ ;;
+ getf.sig r23 = f35
+ ;;
+ getf.sig r24 = f36
+ br .Lcj4
+
+.grt4: xma.l f32 = f9, f6, f0
+ xma.hu f33 = f9, f6, f0
+ ;;
+ ldf8 f10 = [r33], 8
+ ;;
+ xma.l f34 = f11, f6, f0
+ xma.hu f35 = f11, f6, f0
+ ;;
+ ldf8 f11 = [r33], 8
+ ;;
+ getf.sig r20 = f32
+ xma.l f36 = f12, f6, f0
+ xma.hu f37 = f12, f6, f0
+ ;;
+ getf.sig r21 = f33
+ ldf8 f12 = [r33], 8
+ ;;
+ getf.sig r22 = f34
+ xma.l f38 = f13, f6, f0
+ xma.hu f39 = f13, f6, f0
+ ;;
+ getf.sig r23 = f35
+ ldf8 f13 = [r33], 8
+ ;;
+ getf.sig r24 = f36
+ xma.l f32 = f10, f6, f0
+ xma.hu f33 = f10, f6, f0
+ br.cloop.dptk .LL00
+ br .Lcj8
+
+C *** MAIN LOOP START ***
+ ALIGN(32)
+.Ltop:
+ .pred.rel "mutex",p6,p7
+C .mfi
+ getf.sig r24 = f36
+ xma.l f32 = f10, f6, f0
+ (p6) sub r15 = r19, r27, 1
+C .mfi
+ st8 [r32] = r19, 8
+ xma.hu f33 = f10, f6, f0
+ (p7) sub r15 = r19, r27
+ ;;
+.LL00:
+C .mfi
+ getf.sig r25 = f37
+ nop.f 0
+ cmp.ltu p6, p7 = r15, r20
+C .mib
+ ldf8 f10 = [r33], 8
+ sub r16 = r15, r20
+ nop.b 0
+ ;;
+
+C .mfi
+ getf.sig r26 = f38
+ xma.l f34 = f11, f6, f0
+ (p6) sub r15 = r16, r21, 1
+C .mfi
+ st8 [r32] = r16, 8
+ xma.hu f35 = f11, f6, f0
+ (p7) sub r15 = r16, r21
+ ;;
+.LL11:
+C .mfi
+ getf.sig r27 = f39
+ nop.f 0
+ cmp.ltu p6, p7 = r15, r22
+C .mib
+ ldf8 f11 = [r33], 8
+ sub r17 = r15, r22
+ nop.b 0
+ ;;
+
+C .mfi
+ getf.sig r20 = f32
+ xma.l f36 = f12, f6, f0
+ (p6) sub r15 = r17, r23, 1
+C .mfi
+ st8 [r32] = r17, 8
+ xma.hu f37 = f12, f6, f0
+ (p7) sub r15 = r17, r23
+ ;;
+.LL10:
+C .mfi
+ getf.sig r21 = f33
+ nop.f 0
+ cmp.ltu p6, p7 = r15, r24
+C .mib
+ ldf8 f12 = [r33], 8
+ sub r18 = r15, r24
+ nop.b 0
+ ;;
+
+C .mfi
+ getf.sig r22 = f34
+ xma.l f38 = f13, f6, f0
+ (p6) sub r15 = r18, r25, 1
+C .mfi
+ st8 [r32] = r18, 8
+ xma.hu f39 = f13, f6, f0
+ (p7) sub r15 = r18, r25
+ ;;
+.LL01:
+C .mfi
+ getf.sig r23 = f35
+ nop.f 0
+ cmp.ltu p6, p7 = r15, r26
+C .mib
+ ldf8 f13 = [r33], 8
+ sub r19 = r15, r26
+ br.cloop.sptk.few .Ltop
+C *** MAIN LOOP END ***
+ ;;
+
+ getf.sig r24 = f36
+ xma.l f32 = f10, f6, f0
+ (p6) sub r15 = r19, r27, 1
+ st8 [r32] = r19, 8
+ xma.hu f33 = f10, f6, f0
+ (p7) sub r15 = r19, r27
+ ;;
+.Lcj8: getf.sig r25 = f37
+ cmp.ltu p6, p7 = r15, r20
+ sub r16 = r15, r20
+ ;;
+ getf.sig r26 = f38
+ xma.l f34 = f11, f6, f0
+ (p6) sub r15 = r16, r21, 1
+ st8 [r32] = r16, 8
+ xma.hu f35 = f11, f6, f0
+ (p7) sub r15 = r16, r21
+ ;;
+.Lcj7: getf.sig r27 = f39
+ cmp.ltu p6, p7 = r15, r22
+ sub r17 = r15, r22
+ ;;
+ getf.sig r20 = f32
+ xma.l f36 = f12, f6, f0
+ (p6) sub r15 = r17, r23, 1
+ st8 [r32] = r17, 8
+ xma.hu f37 = f12, f6, f0
+ (p7) sub r15 = r17, r23
+ ;;
+.Lcj6: getf.sig r21 = f33
+ cmp.ltu p6, p7 = r15, r24
+ sub r18 = r15, r24
+ ;;
+ getf.sig r22 = f34
+ xma.l f38 = f13, f6, f0
+ (p6) sub r15 = r18, r25, 1
+ st8 [r32] = r18, 8
+ xma.hu f39 = f13, f6, f0
+ (p7) sub r15 = r18, r25
+ ;;
+.Lcj5: getf.sig r23 = f35
+ cmp.ltu p6, p7 = r15, r26
+ sub r19 = r15, r26
+ ;;
+ getf.sig r24 = f36
+ (p6) sub r15 = r19, r27, 1
+ st8 [r32] = r19, 8
+ (p7) sub r15 = r19, r27
+ ;;
+.Lcj4: getf.sig r25 = f37
+ cmp.ltu p6, p7 = r15, r20
+ sub r16 = r15, r20
+ ;;
+ getf.sig r26 = f38
+ (p6) sub r15 = r16, r21, 1
+ st8 [r32] = r16, 8
+ (p7) sub r15 = r16, r21
+ ;;
+.Lcj3: getf.sig r27 = f39
+ cmp.ltu p6, p7 = r15, r22
+ sub r17 = r15, r22
+ ;;
+ (p6) sub r15 = r17, r23, 1
+ st8 [r32] = r17, 8
+ (p7) sub r15 = r17, r23
+ ;;
+.Lcj2: cmp.ltu p6, p7 = r15, r24
+ sub r18 = r15, r24
+ ;;
+ (p6) sub r15 = r18, r25, 1
+ st8 [r32] = r18, 8
+ (p7) sub r15 = r18, r25
+ ;;
+.Lcj1: cmp.ltu p6, p7 = r15, r26
+ sub r19 = r15, r26
+ ;;
+ (p6) sub r8 = r19, r27, 1
+ st8 [r32] = r19
+ (p7) sub r8 = r19, r27
+ mov ar.lc = r2
+ br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/cnd_aors_n.asm b/vendor/gmp-6.3.0/mpn/ia64/cnd_aors_n.asm
new file mode 100644
index 0000000..edd0552
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/cnd_aors_n.asm
@@ -0,0 +1,264 @@
+dnl IA-64 mpn_cnd_add_n/mpn_cnd_sub_n.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: ?
+C Itanium 2: 1.5
+
+C INPUT PARAMETERS
+define(`cnd', `r32')
+define(`rp', `r33')
+define(`up', `r34')
+define(`vp', `r35')
+define(`n', `r36')
+
+ifdef(`OPERATION_cnd_add_n',`
+ define(ADDSUB, add)
+ define(CND, ltu)
+ define(INCR, 1)
+ define(LIM, -1)
+ define(func, mpn_cnd_add_n)
+')
+ifdef(`OPERATION_cnd_sub_n',`
+ define(ADDSUB, sub)
+ define(CND, gtu)
+ define(INCR, -1)
+ define(LIM, 0)
+ define(func, mpn_cnd_sub_n)
+')
+
+define(PFDIST, 160)
+
+C Some useful aliases for registers we use
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
+define(`x0',`r20') define(`x1',`r21') define(`x2',`r22') define(`x3',`r23')
+define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27')
+define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31')
+define(`up1',`up') define(`up2',`r8') define(`upadv',`r1')
+define(`vp1',`vp') define(`vp2',`r9') define(`vpadv',`r11')
+define(`rp1',`rp') define(`rp2',`r10')
+
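The conditionality is branch-free inside the loop: cnd is first widened to
an all-ones or all-zero mask (the predicated mov cnd = -1 / mov cnd = 0 in
the prologue), and every vp limb is ANDed with that mask, so the same code
path adds either vp[i] or zero.  A rough C equivalent of the add flavour,
assuming 64-bit limbs (a reference sketch, not the real entry point):

    #include <stdint.h>

    static uint64_t cnd_add_n_ref (uint64_t cnd, uint64_t *rp,
                                   const uint64_t *up, const uint64_t *vp,
                                   long n)
    {
      uint64_t mask = cnd != 0 ? ~(uint64_t) 0 : 0; /* widen cnd to a mask */
      uint64_t cy = 0;
      for (long i = 0; i < n; i++)
        {
          uint64_t x = vp[i] & mask;   /* and x0 = v0, cnd */
          uint64_t w = up[i] + x;      /* ADDSUB */
          uint64_t c = w < x;          /* cmp.CND */
          rp[i] = w + cy;
          cy = c | (rp[i] < w);        /* cmpeqor/add INCR carry chain */
        }
      return cy;
    }
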
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+ .prologue
+ .save ar.lc, r2
+ .body
+ifdef(`HAVE_ABI_32',`
+ addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ nop.i 0
+ addp4 vp = 0, vp C M I
+ nop.m 0
+ zxt4 n = n C I
+ ;;
+')
+ {.mmi; and r3 = 3, n C M I
+ add n = -1, n C M I
+ mov r2 = ar.lc C I0
+}{.mmi; cmp.ne p6, p7 = 0, cnd C M I
+ add vp2 = 8, vp C M I
+ add up2 = 8, up C M I
+ ;;
+}{.mmi; add upadv = PFDIST, up C M I
+ add vpadv = PFDIST, vp C M I
+ shr.u n = n, 2 C I0
+ .pred.rel "mutex", p6, p7
+}{.mmi; add rp2 = 8, rp C M I
+ (p6) mov cnd = -1 C M I
+ (p7) mov cnd = 0 C M I
+ ;;
+} cmp.eq p9, p0 = 1, r3 C M I
+ cmp.eq p7, p0 = 2, r3 C M I
+ cmp.eq p8, p0 = 3, r3 C M I
+ (p9) br L(b1) C B
+ (p7) br L(b2) C B
+ (p8) br L(b3) C B
+ ;;
+L(b0):
+ {.mmi; ld8 v2 = [vp1], 16 C M01
+ ld8 v3 = [vp2], 16 C M01
+ mov ar.lc = n C I0
+ ;;
+} ld8 u2 = [up1], 16 C M01
+ ld8 u3 = [up2], 16 C M01
+ and x2 = v2, cnd C M I
+ and x3 = v3, cnd C M I
+ ;;
+ ADDSUB w2 = u2, x2 C M I
+ ADDSUB w3 = u3, x3 C M I
+ ;;
+ ld8 v0 = [vp1], 16 C M01
+ ld8 v1 = [vp2], 16 C M01
+ cmp.CND p8, p0 = w2, u2 C M I
+ cmp.CND p9, p0 = w3, u3 C M I
+ br L(lo0)
+
+L(b1): ld8 v1 = [vp1], 8 C M01
+ add vp2 = 8, vp2 C M I
+ add rp2 = 8, rp2 C M I
+ ;;
+ ld8 u1 = [up1], 8 C M01
+ add up2 = 8, up2 C M I
+ and x1 = v1, cnd C M I
+ ;;
+ ADDSUB w1 = u1, x1 C M I
+ cmp.ne p10, p0 = 0, n
+ add n = -1, n
+ ;;
+ cmp.CND p7, p0 = w1, u1 C M I
+ st8 [rp1] = w1, 8 C M23
+ (p10) br L(b0)
+ ;;
+ mov r8 = 0 C M I
+ br L(e1)
+
+L(b3): ld8 v3 = [vp1], 8 C M01
+ add vp2 = 8, vp2 C M I
+ add rp2 = 8, rp2 C M I
+ ;;
+ ld8 u3 = [up1], 8 C M01
+ add up2 = 8, up2 C M I
+ and x3 = v3, cnd C M I
+ ;;
+ ADDSUB w3 = u3, x3 C M I
+ ;;
+ cmp.CND p9, p0 = w3, u3 C M I
+ st8 [rp1] = w3, 8 C M23
+ C fall through
+
+L(b2):
+ {.mmi; ld8 v0 = [vp1], 16 C M01
+ ld8 v1 = [vp2], 16 C M01
+ mov ar.lc = n C I0
+ ;;
+} ld8 u0 = [up1], 16 C M01
+ ld8 u1 = [up2], 16 C M01
+ and x0 = v0, cnd C M I
+ and x1 = v1, cnd C M I
+ ;;
+ ADDSUB w0 = u0, x0 C M I
+ ADDSUB w1 = u1, x1 C M I
+ br.cloop.dptk L(gt2) C B
+ ;;
+ cmp.CND p6, p0 = w0, u0 C M I
+ br L(e2) C B
+L(gt2):
+ ld8 v2 = [vp1], 16 C M01
+ ld8 v3 = [vp2], 16 C M01
+ cmp.CND p6, p0 = w0, u0 C M I
+ cmp.CND p7, p0 = w1, u1 C M I
+ br L(lo2) C B
+
+
+C *** MAIN LOOP START ***
+C ALIGN(32)
+L(top):
+ {.mmi; ld8 v2 = [vp1], 16 C M01
+ ld8 v3 = [vp2], 16 C M01
+ cmp.CND p6, p0 = w0, u0 C M I
+}{.mmi; st8 [rp1] = w2, 16 C M23
+ st8 [rp2] = w3, 16 C M23
+ cmp.CND p7, p0 = w1, u1 C M I
+ ;;
+}
+L(lo2):
+ {.mmi; ld8 u2 = [up1], 16 C M01
+ ld8 u3 = [up2], 16 C M01
+ (p9) cmpeqor p6, p0 = LIM, w0 C M I
+}{.mmi; and x2 = v2, cnd C M I
+ and x3 = v3, cnd C M I
+ (p9) add w0 = INCR, w0 C M I
+ ;;
+}{.mmi; ADDSUB w2 = u2, x2 C M I
+ (p6) cmpeqor p7, p0 = LIM, w1 C M I
+ (p6) add w1 = INCR, w1 C M I
+}{.mmi; ADDSUB w3 = u3, x3 C M I
+ lfetch [upadv], 32
+ nop 0
+ ;;
+}{.mmi; ld8 v0 = [vp1], 16 C M01
+ ld8 v1 = [vp2], 16 C M01
+ cmp.CND p8, p0 = w2, u2 C M I
+}{.mmi; st8 [rp1] = w0, 16 C M23
+ st8 [rp2] = w1, 16 C M23
+ cmp.CND p9, p0 = w3, u3 C M I
+ ;;
+}
+L(lo0):
+ {.mmi; ld8 u0 = [up1], 16 C M01
+ ld8 u1 = [up2], 16 C M01
+ (p7) cmpeqor p8, p0 = LIM, w2 C M I
+}{.mmi; and x0 = v0, cnd C M I
+ and x1 = v1, cnd C M I
+ (p7) add w2 = INCR, w2 C M I
+ ;;
+}{.mmi; ADDSUB w0 = u0, x0 C M I
+ (p8) cmpeqor p9, p0 = LIM, w3 C M I
+ (p8) add w3 = INCR, w3 C M I
+}{.mmb; ADDSUB w1 = u1, x1 C M I
+ lfetch [vpadv], 32
+ br.cloop.dptk L(top) C B
+ ;;
+}
+C *** MAIN LOOP END ***
+
+
+L(end):
+ {.mmi; st8 [rp1] = w2, 16 C M23
+ st8 [rp2] = w3, 16 C M23
+ cmp.CND p6, p0 = w0, u0 C M I
+ ;;
+}
+L(e2):
+ {.mmi; cmp.CND p7, p0 = w1, u1 C M I
+ (p9) cmpeqor p6, p0 = LIM, w0 C M I
+ (p9) add w0 = INCR, w0 C M I
+ ;;
+}{.mmi; mov r8 = 0 C M I
+ (p6) cmpeqor p7, p0 = LIM, w1 C M I
+ (p6) add w1 = INCR, w1 C M I
+ ;;
+}{.mmi; st8 [rp1] = w0, 16 C M23
+ st8 [rp2] = w1, 16 C M23
+ mov ar.lc = r2 C I0
+}
+L(e1):
+ {.mmb; nop 0
+ (p7) mov r8 = 1 C M I
+ br.ret.sptk.many b0 C B
+}
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/copyd.asm b/vendor/gmp-6.3.0/mpn/ia64/copyd.asm
new file mode 100644
index 0000000..b94a1af
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/copyd.asm
@@ -0,0 +1,186 @@
+dnl IA-64 mpn_copyd -- copy limb vector, decrementing.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2001, 2002, 2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 1
+C Itanium 2: 0.5
+
+C INPUT PARAMETERS
+C rp = r32
+C sp = r33
+C n = r34
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+ .prologue
+ .save ar.lc, r2
+ .body
+ifdef(`HAVE_ABI_32',
+` addp4 r32 = 0, r32
+ addp4 r33 = 0, r33
+ sxt4 r34 = r34
+ ;;
+')
+{.mmi
+ shladd r32 = r34, 3, r32
+ shladd r33 = r34, 3, r33
+ mov.i r2 = ar.lc
+}
+{.mmi
+ and r14 = 3, r34
+ cmp.ge p14, p15 = 3, r34
+ add r34 = -4, r34
+ ;;
+}
+{.mmi
+ cmp.eq p8, p0 = 1, r14
+ cmp.eq p10, p0 = 2, r14
+ cmp.eq p12, p0 = 3, r14
+}
+{.bbb
+ (p8) br.dptk .Lb01
+ (p10) br.dptk .Lb10
+ (p12) br.dptk .Lb11
+}
+
+.Lb00: C n = 0, 4, 8, 12, ...
+ add r32 = -8, r32
+ add r33 = -8, r33
+ (p14) br.dptk .Ls00
+ ;;
+ add r21 = -8, r33
+ ld8 r16 = [r33], -16
+ shr r15 = r34, 2
+ ;;
+ ld8 r17 = [r21], -16
+ mov.i ar.lc = r15
+ ld8 r18 = [r33], -16
+ add r20 = -8, r32
+ ;;
+ ld8 r19 = [r21], -16
+ br.cloop.dptk .Loop
+ ;;
+ br.sptk .Lend
+ ;;
+
+.Lb01: C n = 1, 5, 9, 13, ...
+ add r21 = -8, r33
+ add r20 = -8, r32
+ add r33 = -16, r33
+ add r32 = -16, r32
+ ;;
+ ld8 r19 = [r21], -16
+ shr r15 = r34, 2
+ (p14) br.dptk .Ls01
+ ;;
+ ld8 r16 = [r33], -16
+ mov.i ar.lc = r15
+ ;;
+ ld8 r17 = [r21], -16
+ ld8 r18 = [r33], -16
+ br.sptk .Li01
+ ;;
+
+.Lb10: C n = 2, 6, 10, 14, ...
+ add r21 = -16, r33
+ shr r15 = r34, 2
+ add r20 = -16, r32
+ add r32 = -8, r32
+ add r33 = -8, r33
+ ;;
+ ld8 r18 = [r33], -16
+ ld8 r19 = [r21], -16
+ mov.i ar.lc = r15
+ (p14) br.dptk .Ls10
+ ;;
+ ld8 r16 = [r33], -16
+ ld8 r17 = [r21], -16
+ br.sptk .Li10
+ ;;
+
+.Lb11: C n = 3, 7, 11, 15, ...
+ add r21 = -8, r33
+ add r20 = -8, r32
+ add r33 = -16, r33
+ add r32 = -16, r32
+ ;;
+ ld8 r17 = [r21], -16
+ shr r15 = r34, 2
+ ;;
+ ld8 r18 = [r33], -16
+ mov.i ar.lc = r15
+ ld8 r19 = [r21], -16
+ (p14) br.dptk .Ls11
+ ;;
+ ld8 r16 = [r33], -16
+ br.sptk .Li11
+ ;;
+
+ ALIGN(32)
+.Loop:
+.Li00:
+{.mmb
+ st8 [r32] = r16, -16
+ ld8 r16 = [r33], -16
+ nop.b 0
+}
+.Li11:
+{.mmb
+ st8 [r20] = r17, -16
+ ld8 r17 = [r21], -16
+ nop.b 0
+ ;;
+}
+.Li10:
+{.mmb
+ st8 [r32] = r18, -16
+ ld8 r18 = [r33], -16
+ nop.b 0
+}
+.Li01:
+{.mmb
+ st8 [r20] = r19, -16
+ ld8 r19 = [r21], -16
+ br.cloop.dptk .Loop
+ ;;
+}
+.Lend: st8 [r32] = r16, -16
+.Ls11: st8 [r20] = r17, -16
+ ;;
+.Ls10: st8 [r32] = r18, -16
+.Ls01: st8 [r20] = r19, -16
+.Ls00: mov.i ar.lc = r2
+ br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/copyi.asm b/vendor/gmp-6.3.0/mpn/ia64/copyi.asm
new file mode 100644
index 0000000..49ed192
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/copyi.asm
@@ -0,0 +1,182 @@
+dnl IA-64 mpn_copyi -- copy limb vector, incrementing.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2001, 2002, 2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 1
+C Itanium 2: 0.5
+
+C INPUT PARAMETERS
+C rp = r32
+C sp = r33
+C n = r34
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+ .prologue
+ .save ar.lc, r2
+ .body
+ifdef(`HAVE_ABI_32',
+` addp4 r32 = 0, r32
+ addp4 r33 = 0, r33
+ sxt4 r34 = r34
+ ;;
+')
+{.mmi
+ nop 0
+ nop 0
+ mov.i r2 = ar.lc
+}
+{.mmi
+ and r14 = 3, r34
+ cmp.ge p14, p15 = 3, r34
+ add r34 = -4, r34
+ ;;
+}
+{.mmi
+ cmp.eq p8, p0 = 1, r14
+ cmp.eq p10, p0 = 2, r14
+ cmp.eq p12, p0 = 3, r14
+}
+{.bbb
+ (p8) br.dptk .Lb01
+ (p10) br.dptk .Lb10
+ (p12) br.dptk .Lb11
+}
+
+.Lb00: C n = 0, 4, 8, 12, ...
+ (p14) br.dptk .Ls00
+ ;;
+ add r21 = 8, r33
+ ld8 r16 = [r33], 16
+ shr r15 = r34, 2
+ ;;
+ ld8 r17 = [r21], 16
+ mov.i ar.lc = r15
+ ld8 r18 = [r33], 16
+ add r20 = 8, r32
+ ;;
+ ld8 r19 = [r21], 16
+ br.cloop.dptk .Loop
+ ;;
+ br.sptk .Lend
+ ;;
+
+.Lb01: C n = 1, 5, 9, 13, ...
+ add r21 = 0, r33
+ add r20 = 0, r32
+ add r33 = 8, r33
+ add r32 = 8, r32
+ ;;
+ ld8 r19 = [r21], 16
+ shr r15 = r34, 2
+ (p14) br.dptk .Ls01
+ ;;
+ ld8 r16 = [r33], 16
+ mov.i ar.lc = r15
+ ;;
+ ld8 r17 = [r21], 16
+ ld8 r18 = [r33], 16
+ br.sptk .Li01
+ ;;
+
+.Lb10: C n = 2, 6, 10, 14, ...
+ add r21 = 8, r33
+ add r20 = 8, r32
+ ld8 r18 = [r33], 16
+ shr r15 = r34, 2
+ ;;
+ ld8 r19 = [r21], 16
+ mov.i ar.lc = r15
+ (p14) br.dptk .Ls10
+ ;;
+ ld8 r16 = [r33], 16
+ ld8 r17 = [r21], 16
+ br.sptk .Li10
+ ;;
+
+.Lb11: C n = 3, 7, 11, 15, ...
+ add r21 = 0, r33
+ add r20 = 0, r32
+ add r33 = 8, r33
+ add r32 = 8, r32
+ ;;
+ ld8 r17 = [r21], 16
+ shr r15 = r34, 2
+ ;;
+ ld8 r18 = [r33], 16
+ mov.i ar.lc = r15
+ ld8 r19 = [r21], 16
+ (p14) br.dptk .Ls11
+ ;;
+ ld8 r16 = [r33], 16
+ br.sptk .Li11
+ ;;
+
+ ALIGN(32)
+.Loop:
+.Li00:
+{.mmb
+ st8 [r32] = r16, 16
+ ld8 r16 = [r33], 16
+ nop.b 0
+}
+.Li11:
+{.mmb
+ st8 [r20] = r17, 16
+ ld8 r17 = [r21], 16
+ nop.b 0
+ ;;
+}
+.Li10:
+{.mmb
+ st8 [r32] = r18, 16
+ ld8 r18 = [r33], 16
+ nop.b 0
+}
+.Li01:
+{.mmb
+ st8 [r20] = r19, 16
+ ld8 r19 = [r21], 16
+ br.cloop.dptk .Loop
+ ;;
+}
+.Lend: st8 [r32] = r16, 16
+.Ls11: st8 [r20] = r17, 16
+ ;;
+.Ls10: st8 [r32] = r18, 16
+.Ls01: st8 [r20] = r19, 16
+.Ls00: mov.i ar.lc = r2
+ br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/dive_1.asm b/vendor/gmp-6.3.0/mpn/ia64/dive_1.asm
new file mode 100644
index 0000000..5e4a273
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/dive_1.asm
@@ -0,0 +1,236 @@
+dnl IA-64 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde.
+
+dnl Copyright 2003-2005, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 16
+C Itanium 2: 8
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n', `r34')
+define(`divisor', `r35')
+
+define(`lshift', `r24')
+define(`rshift', `r25')
+
+C This code is a bit messy, and not as similar to mode1o.asm as desired.
+
+C The critical path during initialization is for computing the inverse of the
+C divisor. Since odd divisors are probably common, we conditionally execute
+C the initial count_trailing_zeros code and the downshift.
+
+C Possible improvement: Merge more of the feed-in code into the inverse
+C computation.
+
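The inverse of the (odd, possibly downshifted) divisor is computed by a
byte-wide table lookup, via the .Ltab data below, giving 8 correct low
bits, followed by three Newton steps that double the number of correct bits
each time: i' = i*(2 - d*i) mod 2^64, which the xmpy.l/xma.l pairs evaluate
as i*i*(-d) + 2*i.  A C sketch, assuming an odd 64-bit d and a 128-entry
byte table in the style of GMP's binvert_limb_table (the table below is
instead 256 bytes, indexed directly by the divisor's odd low byte):

    #include <stdint.h>

    extern const unsigned char binvert_tab[128]; /* inverse of 2k+1 mod 2^8 */

    static uint64_t binvert_limb_ref (uint64_t d) /* d must be odd */
    {
      uint64_t i = binvert_tab[(d / 2) & 0x7F];   /*  8 correct bits */
      i = 2 * i - i * i * d;                      /* 16 correct bits */
      i = 2 * i - i * i * d;                      /* 32 correct bits */
      i = 2 * i - i * i * d;                      /* 64 bits: i*d == 1 mod 2^64 */
      return i;
    }
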
+ASM_START()
+ .text
+ .align 32
+.Ltab:
+data1 0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
+data1 0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
+data1 0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
+data1 0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
+data1 0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
+data1 0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
+data1 0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
+data1 0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
+data1 0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
+data1 0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
+data1 0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
+data1 0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
+data1 0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
+data1 0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
+data1 0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
+data1 0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
+
+
+PROLOGUE(mpn_divexact_1)
+ .prologue
+ .save ar.lc, r2
+ .body
+
+ {.mmi; add r8 = -1, divisor C M0
+ nop 0 C M1
+ tbit.z p8, p9 = divisor, 0 C I0
+}
+ifdef(`HAVE_ABI_32',
+` addp4 rp = 0, rp C M2 rp extend
+ addp4 up = 0, up C M3 up extend
+ sxt4 n = n') C I1 size extend
+ ;;
+.Lhere:
+ {.mmi; ld8 r20 = [up], 8 C M0 up[0]
+ (p8) andcm r8 = r8, divisor C M1
+ mov r15 = ip C I0 .Lhere
+ ;;
+}{.mii
+ .pred.rel "mutex", p8, p9
+ (p9) mov rshift = 0 C M0
+ (p8) popcnt rshift = r8 C I0 r8 = cnt_lo_zeros(divisor)
+ cmp.eq p6, p10 = 1, n C I1
+ ;;
+}{.mii; add r9 = .Ltab-.Lhere, r15 C M0
+ (p8) shr.u divisor = divisor, rshift C I0
+ nop 0 C I1
+ ;;
+}{.mmi; add n = -4, n C M0 size-1
+ (p10) ld8 r21 = [up], 8 C M1 up[1]
+ mov r14 = 2 C M1 2
+}{.mfi; setf.sig f6 = divisor C M2 divisor
+ mov f9 = f0 C M3 carry FIXME
+ zxt1 r3 = divisor C I1 divisor low byte
+ ;;
+}{.mmi; add r3 = r9, r3 C M0 table offset ip and index
+ sub r16 = 0, divisor C M1 -divisor
+ mov r2 = ar.lc C I0
+}{.mmi; sub lshift = 64, rshift C M2
+ setf.sig f13 = r14 C M3 2 in significand
+ mov r17 = -1 C I1 -1
+ ;;
+}{.mmi; ld1 r3 = [r3] C M0 inverse, 8 bits
+ nop 0 C M1
+ mov ar.lc = n C I0 size-1 loop count
+}{.mmi; setf.sig f12 = r16 C M2 -divisor
+ setf.sig f8 = r17 C M3 -1
+ cmp.eq p7, p0 = -2, n C I1
+ ;;
+}{.mmi; setf.sig f7 = r3 C M2 inverse, 8 bits
+ cmp.eq p8, p0 = -1, n C M0
+ shr.u r23 = r20, rshift C I0
+ ;;
+}
+
+ C f6 divisor
+ C f7 inverse, being calculated
+ C f8 -1, will be -inverse
+ C f9 carry
+ C f12 -divisor
+ C f13 2
+ C f14 scratch
+
+ xmpy.l f14 = f13, f7 C Newton 2*i
+ xmpy.l f7 = f7, f7 C Newton i*i
+ ;;
+ xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 16 bits
+ ;;
+ setf.sig f10 = r23 C speculative, used iff n = 1
+ xmpy.l f14 = f13, f7 C Newton 2*i
+ shl r22 = r21, lshift C speculative, used iff n > 1
+ xmpy.l f7 = f7, f7 C Newton i*i
+ ;;
+ or r31 = r22, r23 C speculative, used iff n > 1
+ xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 32 bits
+ shr.u r23 = r21, rshift C speculative, used iff n > 1
+ ;;
+ setf.sig f11 = r31 C speculative, used iff n > 1
+ xmpy.l f14 = f13, f7 C Newton 2*i
+ xmpy.l f7 = f7, f7 C Newton i*i
+ ;;
+ xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 64 bits
+
+ (p7) br.cond.dptk .Ln2
+ (p10) br.cond.dptk .grt3
+ ;;
+
+.Ln1: xmpy.l f12 = f10, f7 C q = ulimb * inverse
+ br .Lx1
+
+.Ln2:
+ xmpy.l f8 = f7, f8 C -inverse = inverse * -1
+ xmpy.l f12 = f11, f7 C q = ulimb * inverse
+ setf.sig f11 = r23
+ br .Lx2
+
+.grt3:
+ ld8 r21 = [up], 8 C up[2]
+ xmpy.l f8 = f7, f8 C -inverse = inverse * -1
+ ;;
+ shl r22 = r21, lshift
+ ;;
+ xmpy.l f12 = f11, f7 C q = ulimb * inverse
+ ;;
+ or r31 = r22, r23
+ shr.u r23 = r21, rshift
+ ;;
+ setf.sig f11 = r31
+ (p8) br.cond.dptk .Lx3 C branch for n = 3
+ ;;
+ ld8 r21 = [up], 8
+ br .Lent
+
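+C The pipelined loop below implements the exact-division recurrence; in
+C C, with inv the 64-bit inverse computed above, d the (odd) divisor and
+C u[] the source limbs (right-shifted first when the original divisor was
+C even), a sketch with carry c starting at 0:
+C
+C   q = (u[i] - c) * inv;     /* == c * -inv + u[i] * inv (mod 2^64) */
+C   r[i] = q;
+C   c = (uint64_t) (((unsigned __int128) q * d + c) >> 64);
+C
+C The low half of q*d + c is exactly u[i], which is what lets the high
+C half serve as the next carry.
+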
+.Ltop: ld8 r21 = [up], 8
+ xma.l f12 = f9, f8, f10 C q = c * -inverse + si
+ nop.b 0
+ ;;
+.Lent: add r16 = 160, up
+ shl r22 = r21, lshift
+ nop.b 0
+ ;;
+ stf8 [rp] = f12, 8
+ xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c)
+ nop.b 0
+ nop.m 0
+ xmpy.l f10 = f11, f7 C si = ulimb * inverse
+ nop.b 0
+ ;;
+ or r31 = r22, r23
+ shr.u r23 = r21, rshift
+ nop.b 0
+ ;;
+ lfetch [r16]
+ setf.sig f11 = r31
+ br.cloop.sptk.few.clr .Ltop
+
+
+ xma.l f12 = f9, f8, f10 C q = c * -inverse + si
+ ;;
+.Lx3: stf8 [rp] = f12, 8
+ xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c)
+ xmpy.l f10 = f11, f7 C si = ulimb * inverse
+ ;;
+ setf.sig f11 = r23
+ ;;
+ xma.l f12 = f9, f8, f10 C q = c * -inverse + si
+ ;;
+.Lx2: stf8 [rp] = f12, 8
+ xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c)
+ xmpy.l f10 = f11, f7 C si = ulimb * inverse
+ ;;
+ xma.l f12 = f9, f8, f10 C q = c * -inverse + si
+ ;;
+.Lx1: stf8 [rp] = f12, 8
+ mov ar.lc = r2 C I0
+ br.ret.sptk.many b0
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/divrem_1.asm b/vendor/gmp-6.3.0/mpn/ia64/divrem_1.asm
new file mode 100644
index 0000000..e887820
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/divrem_1.asm
@@ -0,0 +1,477 @@
+dnl IA-64 mpn_divrem_1 and mpn_preinv_divrem_1 -- Divide an mpn number by an
+dnl unnormalized limb.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2002, 2004, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C Itanium: 40-42
+C Itanium 2: 29-30
+
+C This was generated by gcc, then the loops were optimized. The preinv entry
+C point was shoehorned into the file. Lots of things outside the loops could
+C be streamlined. It would probably be a good idea to merge the loops for
+C normalized and unnormalized divisor, since the shifting stuff is done for
+C free in parallel with other operations. It would even be possible to merge
+C all loops, if the ld8 were made conditional.
+
+C TODO
+C * Consider delaying inversion for normalized mpn_divrem_1 entry till after
+C computing leading limb.
+C * Inline and interleave limb inversion code with loop setup code.
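+C
+C All the quotient loops use the same preinv division step: with d
+C normalized and di = mpn_invert_limb(d) = floor((B^2-1)/d) - B, B = 2^64,
+C one step is, in C (a sketch using the longlong.h macros; nh is the
+C running remainder, nh < d, and nl the next numerator limb):
+C
+C   umul_ppmm (qh, ql, nh, di);
+C   add_ssaaaa (qh, ql, qh, ql, nh + 1, nl);  /* quotient estimate */
+C   r = nl - qh * d;
+C   if (r > ql)  { qh--; r += d; }
+C   if (r >= d)  { qh++; r -= d; }            /* unlikely */
+C
+C leaving qh as the quotient limb and r as the new remainder.  The loops
+C below use a variant that estimates from nh alone, at the cost of up to
+C three predicated corrections.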
+
+ASM_START()
+
+C HP's assembler requires these declarations for importing mpn_invert_limb
+ .global mpn_invert_limb
+ .type mpn_invert_limb,@function
+
+C INPUT PARAMETERS
+C rp = r32
+C qxn = r33
+C up = r34
+C n = r35
+C vl = r36
+C vlinv = r37 (preinv only)
+C cnt = r38 (preinv only)
+
+PROLOGUE(mpn_preinv_divrem_1)
+ .prologue
+ .save ar.pfs, r42
+ alloc r42 = ar.pfs, 7, 8, 1, 0
+ .save ar.lc, r44
+ mov r44 = ar.lc
+ .save rp, r41
+ mov r41 = b0
+ .body
+ifdef(`HAVE_ABI_32',
+` addp4 r32 = 0, r32
+ sxt4 r33 = r33
+ addp4 r34 = 0, r34
+ sxt4 r35 = r35
+ ;;
+')
+ mov r40 = r38
+ shladd r34 = r35, 3, r34
+ ;;
+ adds r34 = -8, r34
+ ;;
+ ld8 r39 = [r34], -8
+ ;;
+
+ add r15 = r35, r33
+ ;;
+ mov r8 = r37
+ shladd r32 = r15, 3, r32 C r32 = rp + n + qxn
+ cmp.le p8, p0 = 0, r36
+ ;;
+ adds r32 = -8, r32 C r32 = rp + n + qxn - 1
+ cmp.leu p6, p7 = r36, r39
+ (p8) br.cond.dpnt .Lpunnorm
+ ;;
+
+ (p6) addl r15 = 1, r0
+ (p7) mov r15 = r0
+ ;;
+ (p6) sub r38 = r39, r36
+ (p7) mov r38 = r39
+ st8 [r32] = r15, -8
+ adds r35 = -2, r35 C un -= 2
+ br .Lpn
+
+.Lpunnorm:
+ (p6) add r34 = 8, r34
+ mov r38 = 0 C r = 0
+ shl r36 = r36, r40
+ (p6) br.cond.dptk .Lpu
+ ;;
+ shl r38 = r39, r40 C r = ahigh << cnt
+ cmp.ne p8, p0 = 1, r35
+ st8 [r32] = r0, -8
+ adds r35 = -1, r35 C un--
+ (p8) br.cond.dpnt .Lpu
+
+ mov r23 = 1
+ ;;
+ setf.sig f6 = r8
+ setf.sig f12 = r23
+ br .L435
+EPILOGUE()
+
+
+PROLOGUE(mpn_divrem_1)
+ .prologue
+ .save ar.pfs, r42
+ alloc r42 = ar.pfs, 5, 8, 1, 0
+ .save ar.lc, r44
+ mov r44 = ar.lc
+ .save rp, r41
+ mov r41 = b0
+ .body
+ifdef(`HAVE_ABI_32',
+` addp4 r32 = 0, r32
+ sxt4 r33 = r33
+ addp4 r34 = 0, r34
+ sxt4 r35 = r35
+ ;;
+')
+ mov r38 = r0
+ add r15 = r35, r33
+ ;;
+ cmp.ne p6, p7 = 0, r15
+ ;;
+ (p7) mov r8 = r0
+ (p7) br.cond.dpnt .Lret
+ shladd r14 = r15, 3, r32 C r14 = rp + n + qxn
+ cmp.le p6, p7 = 0, r36
+ ;;
+ adds r32 = -8, r14 C r32 = rp + n + qxn - 1
+ (p6) br.cond.dpnt .Lunnorm
+ cmp.eq p6, p7 = 0, r35
+ (p6) br.cond.dpnt .L179
+ shladd r14 = r35, 3, r34
+ ;;
+ adds r14 = -8, r14
+ adds r35 = -1, r35
+ ;;
+ ld8 r38 = [r14]
+ ;;
+ cmp.leu p6, p7 = r36, r38
+ ;;
+ (p6) addl r15 = 1, r0
+ (p7) mov r15 = r0
+ ;;
+ st8 [r32] = r15, -8
+ (p6) sub r38 = r38, r36
+
+.L179:
+ mov r45 = r36
+ adds r35 = -1, r35
+ br.call.sptk.many b0 = mpn_invert_limb
+ ;;
+ shladd r34 = r35, 3, r34
+.Lpn:
+ mov r23 = 1
+ ;;
+ setf.sig f6 = r8
+ setf.sig f12 = r23
+ cmp.le p6, p7 = 0, r35
+ mov r40 = 0
+ (p7) br.cond.dpnt .L435
+ setf.sig f10 = r36
+ mov ar.lc = r35
+ setf.sig f7 = r38
+ ;;
+ sub r28 = -1, r36
+C Develop quotient limbs for normalized divisor
+.Loop1: C 00 C q=r18 nh=r38/f7
+ ld8 r20 = [r34], -8
+ xma.hu f11 = f7, f6, f0
+ ;; C 04
+ xma.l f8 = f11, f12, f7 C q = q + nh
+ ;; C 08
+ getf.sig r18 = f8
+ xma.hu f9 = f8, f10, f0
+ xma.l f8 = f8, f10, f0
+ ;; C 12
+ getf.sig r16 = f9
+ C 13
+ getf.sig r15 = f8
+ ;; C 18
+ cmp.ltu p6, p7 = r20, r15
+ sub r15 = r20, r15
+ sub r16 = r38, r16
+ ;; C 19
+ (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0?
+ (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0?
+ (p6) add r16 = -1, r16
+ (p0) cmp.ne.unc p6, p7 = r0, r0
+ ;; C 20
+ (p8) cmp.ltu p6, p7 = r15, r36
+ (p8) sub r15 = r15, r36
+ (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
+ ;; C 21
+ .pred.rel "mutex",p6,p7
+ (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still?
+ (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still?
+ cmp.ltu p6, p7 = r15, r36 C speculative
+ sub r28 = r15, r36 C speculative, just for cmp
+ ;; C 22
+ (p8) cmp.ltu p6, p7 = r28, r36 C redo last cmp if needed
+ (p8) mov r15 = r28
+ (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
+ ;; C 23
+ (p6) setf.sig f7 = r15
+ (p7) sub r15 = r15, r36
+ (p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0
+ ;; C 24
+ (p7) setf.sig f7 = r15
+ st8 [r32] = r18, -8
+ mov r38 = r15
+ br.cloop.dptk .Loop1
+ C 29/30
+ br.sptk .L435
+ ;;
+.Lunnorm:
+ mux1 r16 = r36, @rev
+ cmp.eq p6, p7 = 0, r35
+ (p6) br.cond.dpnt .L322
+ shladd r34 = r35, 3, r34
+ ;;
+ adds r34 = -8, r34
+ ;;
+ ld8 r39 = [r34]
+ ;;
+ cmp.leu p6, p7 = r36, r39
+ (p6) br.cond.dptk .L322
+ adds r34 = -8, r34
+ ;;
+ mov r38 = r39
+ ;;
+ cmp.ne p6, p7 = 1, r15
+ st8 [r32] = r0, -8
+ ;;
+ (p7) mov r8 = r38
+ (p7) br.cond.dpnt .Lret
+ adds r35 = -1, r35
+.L322:
+ sub r14 = r0, r16
+ ;;
+ or r14 = r16, r14
+ ;;
+ mov r16 = -8
+ czx1.l r14 = r14
+ ;;
+ shladd r16 = r14, 3, r16
+ ;;
+ shr.u r14 = r36, r16
+ ;;
+ cmp.geu p6, p7 = 15, r14
+ ;;
+ (p7) shr.u r14 = r14, 4
+ (p7) adds r16 = 4, r16
+ ;;
+ cmp.geu p6, p7 = 3, r14
+ ;;
+ (p7) shr.u r14 = r14, 2
+ (p7) adds r16 = 2, r16
+ ;;
+ tbit.nz p6, p7 = r14, 1
+ ;;
+ .pred.rel "mutex",p6,p7
+ (p6) sub r40 = 62, r16
+ (p7) sub r40 = 63, r16
+ ;;
+ shl r45 = r36, r40
+ shl r36 = r36, r40
+ shl r38 = r38, r40
+ br.call.sptk.many b0 = mpn_invert_limb
+ ;;
+.Lpu:
+ mov r23 = 1
+ ;;
+ setf.sig f6 = r8
+ setf.sig f12 = r23
+ cmp.eq p6, p7 = 0, r35
+ (p6) br.cond.dpnt .L435
+ sub r16 = 64, r40
+ adds r35 = -2, r35
+ ;;
+ ld8 r39 = [r34], -8
+ cmp.le p6, p7 = 0, r35
+ ;;
+ shr.u r14 = r39, r16
+ ;;
+ or r38 = r14, r38
+ (p7) br.cond.dpnt .Lend3
+ ;;
+ mov r22 = r16
+ setf.sig f10 = r36
+ setf.sig f7 = r38
+ mov ar.lc = r35
+ ;;
+C Develop quotient limbs for unnormalized divisor
+.Loop3:
+ ld8 r14 = [r34], -8
+ xma.hu f11 = f7, f6, f0
+ ;;
+ xma.l f8 = f11, f12, f7 C q = q + nh
+ ;;
+ getf.sig r18 = f8
+ xma.hu f9 = f8, f10, f0
+ shl r20 = r39, r40
+ xma.l f8 = f8, f10, f0
+ shr.u r24 = r14, r22
+ ;;
+ getf.sig r16 = f9
+ getf.sig r15 = f8
+ or r20 = r24, r20
+ ;;
+ cmp.ltu p6, p7 = r20, r15
+ sub r15 = r20, r15
+ sub r16 = r38, r16
+ ;;
+ (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0?
+ (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0?
+ (p6) add r16 = -1, r16
+ (p0) cmp.ne.unc p6, p7 = r0, r0
+ ;;
+ (p8) cmp.ltu p6, p7 = r15, r36
+ (p8) sub r15 = r15, r36
+ (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
+ ;;
+ .pred.rel "mutex",p6,p7
+ (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still?
+ (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still?
+ cmp.ltu p6, p7 = r15, r36 C speculative
+ sub r28 = r15, r36 C speculative, just for cmp
+ ;;
+ (p8) cmp.ltu p6, p7 = r28, r36 C redo last cmp if needed
+ (p8) mov r15 = r28
+ (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
+ ;;
+ (p6) setf.sig f7 = r15
+ (p7) sub r15 = r15, r36
+ (p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0
+ ;;
+ (p7) setf.sig f7 = r15
+ st8 [r32] = r18, -8
+ mov r39 = r14
+ mov r38 = r15
+ br.cloop.dptk .Loop3
+ ;;
+.Lend3:
+ setf.sig f10 = r36
+ setf.sig f7 = r38
+ ;;
+ xma.hu f11 = f7, f6, f0
+ ;;
+ xma.l f8 = f11, f12, f7 C q = q + nh
+ ;;
+ getf.sig r18 = f8
+ xma.hu f9 = f8, f10, f0
+ shl r20 = r39, r40
+ xma.l f8 = f8, f10, f0
+ ;;
+ getf.sig r16 = f9
+ getf.sig r15 = f8
+ ;;
+ cmp.ltu p6, p7 = r20, r15
+ sub r15 = r20, r15
+ sub r16 = r38, r16
+ ;;
+ (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0?
+ (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0?
+ (p6) add r16 = -1, r16
+ (p0) cmp.ne.unc p6, p7 = r0, r0
+ ;;
+ (p8) cmp.ltu p6, p7 = r15, r36
+ (p8) sub r15 = r15, r36
+ (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
+ ;;
+ .pred.rel "mutex",p6,p7
+ (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still?
+ (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still?
+ ;;
+ (p8) sub r15 = r15, r36
+ (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
+ ;;
+ cmp.ltu p6, p7 = r15, r36
+ ;;
+ (p7) sub r15 = r15, r36
+ (p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0
+ ;;
+ st8 [r32] = r18, -8
+ mov r38 = r15
+.L435:
+ adds r35 = -1, r33
+ cmp.le p6, p7 = 1, r33
+ (p7) br.cond.dpnt .Lend4
+ ;;
+ setf.sig f7 = r38
+ setf.sig f10 = r36
+ mov ar.lc = r35
+ ;;
+.Loop4:
+ xma.hu f11 = f7, f6, f0
+ ;;
+ xma.l f8 = f11, f12, f7 C q = q + nh
+ ;;
+ getf.sig r18 = f8
+ xma.hu f9 = f8, f10, f0
+ xma.l f8 = f8, f10, f0
+ ;;
+ getf.sig r16 = f9
+ getf.sig r15 = f8
+ ;;
+ cmp.ltu p6, p7 = 0, r15
+ sub r15 = 0, r15
+ sub r16 = r38, r16
+ ;;
+ (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0?
+ (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0?
+ (p6) add r16 = -1, r16
+ (p0) cmp.ne.unc p6, p7 = r0, r0
+ ;;
+ (p8) cmp.ltu p6, p7 = r15, r36
+ (p8) sub r15 = r15, r36
+ (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
+ ;;
+ .pred.rel "mutex",p6,p7
+ (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still?
+ (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still?
+ cmp.ltu p6, p7 = r15, r36 C speculative
+ sub r28 = r15, r36 C speculative, just for cmp
+ ;;
+ (p8) cmp.ltu p6, p7 = r28, r36 C redo last cmp if needed
+ (p8) mov r15 = r28
+ (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
+ ;;
+ (p6) setf.sig f7 = r15
+ (p7) sub r15 = r15, r36
+ (p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0
+ ;;
+ (p7) setf.sig f7 = r15
+ st8 [r32] = r18, -8
+ mov r38 = r15
+ br.cloop.dptk .Loop4
+ ;;
+.Lend4:
+ shr.u r8 = r38, r40
+.Lret:
+ mov ar.pfs = r42
+ mov ar.lc = r44
+ mov b0 = r41
+ br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/divrem_2.asm b/vendor/gmp-6.3.0/mpn/ia64/divrem_2.asm
new file mode 100644
index 0000000..9864311
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/divrem_2.asm
@@ -0,0 +1,280 @@
+dnl IA-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl Copyright 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C              norm    frac
+C itanium 1       ?       ?
+C itanium 2      29      29
+
+
+C TODO
+C * Inline and interleave limb inversion code with loop setup code.
+C * We should use explicit bundling in much of the code, since it typically
+C cuts some cycles with the GNU assembler.
+
+
+ASM_START()
+
+C HP's assembler requires these declarations for importing mpn_invert_limb
+ .global mpn_invert_limb
+ .type mpn_invert_limb,@function
+
+C INPUT PARAMETERS
+C qp = r32
+C fn = r33
+C np = r34
+C nn = r35
+C dp = r36
+
+define(`f0x1', `f15')
+
+ASM_START()
+PROLOGUE(mpn_divrem_2)
+ .prologue
+ifdef(`HAVE_ABI_32',
+` addp4 r32 = 0, r32 C M I
+ addp4 r34 = 0, r34 C M I
+ zxt4 r35 = r35 C I
+ addp4 r36 = 0, r36 C M I
+ nop.m 0
+ zxt4 r33 = r33 C I
+ ;;
+')
+ .save ar.pfs, r42
+ alloc r42 = ar.pfs, 5, 9, 1, 0
+ shladd r34 = r35, 3, r34
+ adds r14 = 8, r36
+ mov r43 = r1
+ ;;
+ adds r15 = -8, r34
+ ld8 r39 = [r14]
+ .save ar.lc, r45
+ mov r45 = ar.lc
+ adds r14 = -16, r34
+ mov r40 = r0
+ adds r34 = -24, r34
+ ;;
+ ld8 r38 = [r15]
+ .save rp, r41
+ mov r41 = b0
+ .body
+ ld8 r36 = [r36]
+ ld8 r37 = [r14]
+ ;;
+ cmp.gtu p6, p7 = r39, r38
+ (p6) br.cond.dptk .L8
+ ;;
+ cmp.leu p8, p9 = r36, r37
+ cmp.geu p6, p7 = r39, r38
+ ;;
+ (p8) cmp4.ne.and.orcm p6, p7 = 0, r0
+ (p7) br.cond.dptk .L51
+.L8:
+ add r14 = r33, r35 // un + fn
+ mov r46 = r39 // argument to mpn_invert_limb
+ ;;
+ adds r35 = -3, r14
+ ;;
+ cmp.gt p12, p0 = r0, r35
+ (p12) br.cond.dpnt L(end)
+ br.call.sptk.many b0 = mpn_invert_limb
+ ;;
+ setf.sig f11 = r8 // di (non-final)
+ setf.sig f34 = r39 // d1
+ setf.sig f33 = r36 // d0
+ mov r1 = r43
+ ;;
+ mov r17 = 1
+ setf.sig f9 = r38 // n2
+ xma.l f6 = f11, f34, f0 // t0 = LO(di * d1)
+ ;;
+ setf.sig f10 = r37 // n1
+ setf.sig f15 = r17 // 1
+ xma.hu f8 = f11, f33, f0 // s0 = HI(di * d0)
+ ;;
+ getf.sig r17 = f6
+ getf.sig r16 = f8
+ mov ar.lc = r35
+ ;;
+ sub r18 = r0, r39 // -d1
+ add r14 = r17, r36
+ ;;
+ setf.sig f14 = r18 // -d1
+ cmp.leu p8, p9 = r17, r14
+ add r16 = r14, r16
+ ;;
+ (p9) adds r19 = 0, r0
+ (p8) adds r19 = -1, r0
+ cmp.gtu p6, p7 = r14, r16
+ ;;
+ (p6) adds r19 = 1, r19
+ ;;
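+// The ifelse(1,1,...) below selects between two equivalent ways of
+// adjusting di for the low divisor limb: the active first arm is a
+// branch-free, four-way unrolled version; the inactive second arm is a
+// compact loop (.L52).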
+ifelse(1,1,`
+ cmp.gt p7, p6 = r0, r19
+ ;;
+ (p6) adds r8 = -1, r8 // di--
+ (p6) sub r14 = r16, r39 // t0 -= d1
+ (p6) cmp.ltu p6, p7 = r16, r39 // cy for: t0 - d1
+ ;;
+ (p6) cmp.gt p9, p8 = 1, r19
+ (p7) cmp.gt p9, p8 = 0, r19
+ (p6) adds r19 = -1, r19 // t1 -= cy
+ mov r16 = r14
+ ;;
+ (p8) adds r8 = -1, r8 // di--
+ (p8) sub r14 = r16, r39 // t0 -= d1
+ (p8) cmp.ltu p8, p9 = r16, r39 // cy for: t0 - d1
+ ;;
+ (p8) cmp.gt p7, p6 = 1, r19
+ (p9) cmp.gt p7, p6 = 0, r19
+ (p8) adds r19 = -1, r19 // t1 -= cy
+ mov r16 = r14
+ ;;
+ (p6) adds r8 = -1, r8 // di--
+ (p6) sub r14 = r16, r39 // t0 -= d1
+ (p6) cmp.ltu p6, p7 = r16, r39 // cy for: t0 - d1
+ ;;
+ (p6) cmp.gt p9, p8 = 1, r19
+ (p7) cmp.gt p9, p8 = 0, r19
+ (p6) adds r19 = -1, r19 // t1 -= cy
+ mov r16 = r14
+ ;;
+ (p8) adds r8 = -1, r8 // di--
+ (p8) sub r14 = r16, r39 // t0 -= d1
+ (p8) cmp.ltu p8, p9 = r16, r39 // cy for: t0 - d1
+ ;;
+ (p8) adds r19 = -1, r19 // t1 -= cy
+ mov r16 = r14
+',`
+ cmp.gt p8, p9 = r0, r19
+ (p8) br.cond.dpnt .L46
+.L52:
+ cmp.leu p6, p7 = r39, r16
+ sub r14 = r16, r39
+ adds r8 = -1, r8
+ ;;
+ (p7) adds r19 = -1, r19
+ mov r16 = r14
+ ;;
+ (p7) cmp.gt p8, p9 = r0, r19
+ (p9) br.cond.dptk .L52
+.L46:
+')
+ setf.sig f32 = r8 // di
+ shladd r32 = r35, 3, r32
+ ;;
+
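+C The main loop performs one 3/2 division step per limb.  In C, following
+C Moller and Granlund, "Improved division by invariant integers" (a
+C sketch; u2:u1:u0 is the running numerator, d1:d0 the normalized
+C divisor, dinv the reciprocal adjusted above):
+C
+C   umul_ppmm (q1, q0, u2, dinv);
+C   add_ssaaaa (q1, q0, q1, q0, u2, u1);
+C   r1 = u1 - q1 * d1;
+C   umul_ppmm (t1, t0, d0, q1);
+C   sub_ddmmss (r1, r0, r1, u0, t1, t0);
+C   sub_ddmmss (r1, r0, r1, r0, d1, d0);
+C   q1++;
+C   if (r1 >= q0)
+C     { q1--; add_ssaaaa (r1, r0, r1, r0, d1, d0); }
+C   if (r1 > d1 || (r1 == d1 && r0 >= d0))      /* rare; L(fix) below */
+C     { q1++; sub_ddmmss (r1, r0, r1, r0, d1, d0); }
+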
+ ALIGN(16)
+L(top): nop 0
+ nop 0
+ cmp.gt p8, p9 = r33, r35
+ ;;
+ (p8) mov r37 = r0
+ (p9) ld8 r37 = [r34], -8
+ xma.hu f8 = f9, f32, f10 // 0,29
+ xma.l f12 = f9, f32, f10 // 0
+ ;;
+ getf.sig r20 = f12 // q0 4
+ xma.l f13 = f15, f8, f9 // q += n2 4
+ sub r8 = -1, r36 // bitnot d0
+ ;;
+ getf.sig r18 = f13 // 8
+ xma.l f7 = f14, f13, f10 // 8
+ xma.l f6 = f33, f13, f33 // t0 = LO(d0*q+d0) 8
+ xma.hu f9 = f33, f13, f33 // t1 = HI(d0*q+d0) 9
+ ;;
+ getf.sig r38 = f7 // n1 12
+ getf.sig r16 = f6 // 13
+ getf.sig r19 = f9 // 14
+ ;;
+ sub r38 = r38, r39 // n1 -= d1 17
+ ;;
+ cmp.ne p9, p0 = r0, r0 // clear p9
+ cmp.leu p10, p11 = r16, r37 // cy for: n0 - t0 18
+ ;;
+ sub r37 = r37, r16 // n0 -= t0 19
+ (p11) sub r38 = r38, r19, 1 // n1 -= t1 - cy 19
+ (p10) sub r38 = r38, r19 // n1 -= t1 19
+ ;;
+ cmp.gtu p6, p7 = r20, r38 // n1 >= q0 20
+ ;;
+ (p7) cmp.ltu p9, p0 = r8, r37 // 21
+ (p6) add r18 = 1, r18 //
+ (p7) add r37 = r37, r36 // 21
+ (p7) add r38 = r38, r39 // 21
+ ;;
+ setf.sig f10 = r37 // n1 22
+ (p9) add r38 = 1, r38 // 22
+ ;;
+ setf.sig f9 = r38 // n2 23
+ cmp.gtu p6, p7 = r39, r38 // 23
+ (p7) br.cond.spnt L(fix)
+L(bck): st8 [r32] = r18, -8
+ adds r35 = -1, r35
+ br.cloop.sptk.few L(top)
+ ;;
+
+L(end): add r14 = 8, r34
+ add r15 = 16, r34
+ mov b0 = r41
+ ;;
+ st8 [r14] = r37
+ st8 [r15] = r38
+ mov ar.pfs = r42
+ mov r8 = r40
+ mov ar.lc = r45
+ br.ret.sptk.many b0
+ ;;
+.L51:
+ .pred.rel "mutex", p8, p9
+ sub r37 = r37, r36
+ (p9) sub r38 = r38, r39, 1
+ (p8) sub r38 = r38, r39
+ adds r40 = 1, r0
+ br .L8
+ ;;
+
+L(fix): cmp.geu p6, p7 = r39, r38
+ cmp.leu p8, p9 = r36, r37
+ ;;
+ (p8) cmp4.ne.and.orcm p6, p7 = 0, r0
+ (p6) br.cond.dptk L(bck)
+ sub r37 = r37, r36
+ (p9) sub r38 = r38, r39, 1
+ (p8) sub r38 = r38, r39
+ adds r18 = 1, r18
+ ;;
+ setf.sig f9 = r38 // n2
+ setf.sig f10 = r37 // n1
+ br L(bck)
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/gcd_11.asm b/vendor/gmp-6.3.0/mpn/ia64/gcd_11.asm
new file mode 100644
index 0000000..6137227
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/gcd_11.asm
@@ -0,0 +1,110 @@
+dnl Itanium-2 mpn_gcd_11
+
+dnl Copyright 2002-2005, 2012, 2013, 2015, 2019 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bitpair (1x1 gcd)
+C Itanium: ?
+C Itanium 2: 4.5
+
+
+ASM_START()
+
+C ctz_table[n] is one less than the number of trailing zeros of n (the
+C code always shifts by one before applying it), or MAXSHIFT if n==0, in
+C which case L(count_better) counts the zeros directly instead.
+
+deflit(MAXSHIFT, 7)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+ .rodata
+ ALIGN(m4_lshift(1,MAXSHIFT)) C align table to allow using dep
+ctz_table:
+ data1 MAXSHIFT
+forloop(i,1,MASK,
+` data1 m4_count_trailing_zeros(i)-1
+')
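+
+C The loop implements the shift-happy binary GCD.  A C sketch, with
+C __builtin_ctzll standing in for the capped table lookup (both operands
+C must be odd on entry):
+C
+C   uint64_t gcd_11 (uint64_t x, uint64_t y)
+C   {
+C     while (x != y)
+C       {
+C         uint64_t d = x - y;
+C         if (x < y)
+C           d = -d, y = x;               /* keep the smaller operand in y */
+C         x = d >> __builtin_ctzll (d);  /* d is even; x becomes odd */
+C       }
+C     return y;
+C   }
+C
+C Aligning the table to 2^MAXSHIFT bytes lets a single dep instruction
+C form the entry address by depositing the low bits of the difference
+C into the table base.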
+
+define(`x0', r32)
+define(`y0', r33)
+
+PROLOGUE(mpn_gcd_11)
+ .prologue
+ .body
+ addl r22 = @ltoff(ctz_table), r1
+ ;;
+ ld8 r22 = [r22]
+ br L(ent)
+ ;;
+
+ ALIGN(32)
+L(top):
+ .pred.rel "mutex", p6,p7
+ {.mmi; (p7) mov y0 = x0
+ (p6) sub x0 = x0, y0
+ dep r21 = r19, r22, 0, MAXSHIFT C concat(table,lowbits)
+}{.mmi; and r20 = MASK, r19
+ (p7) mov x0 = r19
+ and r23 = 6, r19
+ ;;
+}{.mmi; cmp.eq p6,p0 = 4, r23
+ cmp.eq p7,p0 = 0, r23
+ shr.u x0 = x0, 1 C shift-by-1, always OK
+}{.mmb; ld1 r16 = [r21]
+ cmp.eq p10,p0 = 0, r20
+ (p10) br.spnt.few.clr L(count_better)
+ ;;
+}
+L(bck):
+ .pred.rel "mutex", p6,p7
+ {.mii; nop 0
+ (p6) shr.u x0 = x0, 1 C u was ...100 before shift-by-1 above
+ (p7) shr.u x0 = x0, r16 C u was ...000 before shift-by-1 above
+ ;;
+}
+L(ent):
+ {.mmi; sub r19 = y0, x0
+ cmp.gtu p6,p7 = x0, y0
+ cmp.ne p8,p0 = x0, y0
+}{.mmb; nop 0
+ nop 0
+ (p8) br.sptk.few.clr L(top)
+}
+
+L(end): mov r8 = y0
+ br.ret.sptk.many b0
+
+L(count_better):
+ add r20 = -1, x0
+ ;;
+ andcm r23 = r20, x0
+ ;;
+ popcnt r16 = r23
+ br L(bck)
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/ia64/gmp-mparam.h
new file mode 100644
index 0000000..34d2bf3
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/gmp-mparam.h
@@ -0,0 +1,212 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 900MHz Itanium2 (olympic.gmplib.org) */
+/* FFT tuning limit = 59,194,709 */
+/* Generated by tuneup.c, 2019-10-13, gcc 4.2 */
+
+#define MOD_1_1P_METHOD 2 /* 17.40% faster than 1 */
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1 /* 1.35% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 10
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
+
+#define DIV_1_VS_MUL_1_PERCENT 316
+
+#define MUL_TOOM22_THRESHOLD 47
+#define MUL_TOOM33_THRESHOLD 89
+#define MUL_TOOM44_THRESHOLD 220
+#define MUL_TOOM6H_THRESHOLD 327
+#define MUL_TOOM8H_THRESHOLD 454
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 143
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 153
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 226
+
+#define SQR_BASECASE_THRESHOLD 11
+#define SQR_TOOM2_THRESHOLD 98
+#define SQR_TOOM3_THRESHOLD 135
+#define SQR_TOOM4_THRESHOLD 272
+#define SQR_TOOM6_THRESHOLD 354
+#define SQR_TOOM8_THRESHOLD 490
+
+#define MULMID_TOOM42_THRESHOLD 99
+
+#define MULMOD_BNM1_THRESHOLD 23
+#define SQRMOD_BNM1_THRESHOLD 27
+
+#define MUL_FFT_MODF_THRESHOLD 840 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 840, 5}, { 30, 6}, { 16, 5}, { 33, 6}, \
+ { 17, 5}, { 36, 6}, { 35, 7}, { 18, 6}, \
+ { 37, 7}, { 19, 6}, { 42, 7}, { 37, 8}, \
+ { 19, 7}, { 43, 8}, { 23, 7}, { 47, 8}, \
+ { 43, 9}, { 23, 8}, { 51, 9}, { 27, 8}, \
+ { 57, 9}, { 31, 8}, { 63, 9}, { 35, 8}, \
+ { 71, 9}, { 43,10}, { 23, 9}, { 55,10}, \
+ { 31, 9}, { 71,10}, { 39, 9}, { 83,10}, \
+ { 47, 9}, { 99,10}, { 55,11}, { 31,10}, \
+ { 87,11}, { 47,10}, { 111,12}, { 31,11}, \
+ { 63,10}, { 135,11}, { 79,10}, { 167,11}, \
+ { 95,10}, { 191,11}, { 111,12}, { 63,11}, \
+ { 143,10}, { 287,11}, { 159,12}, { 95,11}, \
+ { 207,13}, { 63,12}, { 127,11}, { 271,12}, \
+ { 159,11}, { 335,10}, { 671,12}, { 191,10}, \
+ { 799,12}, { 223,13}, { 127,12}, { 287,11}, \
+ { 607,12}, { 319,11}, { 671,13}, { 191,12}, \
+ { 383,11}, { 799,10}, { 1599,12}, { 415,11}, \
+ { 863,14}, { 127,13}, { 255,12}, { 543,11}, \
+ { 1119,12}, { 607,13}, { 319,12}, { 735,11}, \
+ { 1471,12}, { 863,13}, { 447,12}, { 927,11}, \
+ { 1855,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1055,11}, { 2111,12}, { 1119,13}, { 575,12}, \
+ { 1247,13}, { 639,12}, { 1311,13}, { 703,12}, \
+ { 1471,13}, { 831,12}, { 1727,13}, { 895,12}, \
+ { 1791,13}, { 959,15}, { 255,14}, { 511,13}, \
+ { 1087,12}, { 2239,13}, { 1215,14}, { 639,13}, \
+ { 1471,14}, { 767,13}, { 1727,14}, { 895,13}, \
+ { 1855,12}, { 3711,13}, { 1919,15}, { 511,14}, \
+ { 1023,13}, { 2111,12}, { 4223,13}, { 2175,14}, \
+ { 1151,13}, { 2495,14}, { 1279,13}, { 2623,14}, \
+ { 1407,15}, { 767,14}, { 1663,13}, { 3455,14}, \
+ { 1919,16}, { 511,15}, { 1023,14}, { 2175,13}, \
+ { 4479,14}, { 2431,15}, { 1279,14}, { 2943,15}, \
+ { 1535,14}, { 3455,15}, { 1791,14}, { 16384,15}, \
+ { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 149
+#define MUL_FFT_THRESHOLD 8576
+
+#define SQR_FFT_MODF_THRESHOLD 765 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 765, 5}, { 36, 6}, { 37, 7}, { 19, 6}, \
+ { 42, 7}, { 43, 8}, { 37, 9}, { 19, 8}, \
+ { 43, 9}, { 23, 8}, { 49, 9}, { 27, 8}, \
+ { 57, 9}, { 43,10}, { 23, 9}, { 55,10}, \
+ { 31, 9}, { 71,10}, { 39, 9}, { 83,10}, \
+ { 47, 9}, { 99,10}, { 55,11}, { 31,10}, \
+ { 87,11}, { 47,10}, { 111,12}, { 31,11}, \
+ { 63,10}, { 135,11}, { 79,10}, { 175,11}, \
+ { 95,10}, { 199,11}, { 111,12}, { 63,11}, \
+ { 159,12}, { 95,11}, { 191,10}, { 399,11}, \
+ { 207,13}, { 63,12}, { 127,10}, { 511, 9}, \
+ { 1023,10}, { 527,11}, { 271,12}, { 159,10}, \
+ { 703,12}, { 191,11}, { 399,10}, { 799,11}, \
+ { 431,12}, { 223,13}, { 127,12}, { 255,11}, \
+ { 527,10}, { 1055,11}, { 559,12}, { 287,11}, \
+ { 607,10}, { 1215,11}, { 703,13}, { 191,12}, \
+ { 383,11}, { 799,12}, { 415,11}, { 863,12}, \
+ { 447,14}, { 127,13}, { 255,12}, { 511,11}, \
+ { 1055,12}, { 543,11}, { 1119,12}, { 607,11}, \
+ { 1215,12}, { 735,13}, { 383,12}, { 799,11}, \
+ { 1599,12}, { 863,13}, { 447,12}, { 991,14}, \
+ { 255,13}, { 511,12}, { 1055,11}, { 2111,12}, \
+ { 1119,13}, { 575,12}, { 1215,13}, { 639,12}, \
+ { 1311,13}, { 703,12}, { 1407,14}, { 383,13}, \
+ { 767,12}, { 1599,13}, { 831,12}, { 1727,13}, \
+ { 895,12}, { 1791,13}, { 959,12}, { 1919,15}, \
+ { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \
+ { 1087,12}, { 2239,13}, { 1151,12}, { 2303,13}, \
+ { 1215,14}, { 639,13}, { 1279,12}, { 2559,13}, \
+ { 1471,14}, { 767,13}, { 1727,14}, { 895,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2239,14}, \
+ { 1151,13}, { 2495,14}, { 1279,13}, { 2623,14}, \
+ { 1407,15}, { 767,14}, { 1663,13}, { 3455,14}, \
+ { 1919,16}, { 511,15}, { 1023,14}, { 2175,13}, \
+ { 4479,14}, { 2431,15}, { 1279,14}, { 2943,15}, \
+ { 1535,14}, { 3455,15}, { 1791,14}, { 16384,15}, \
+ { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 153
+#define SQR_FFT_THRESHOLD 6272
+
+#define MULLO_BASECASE_THRESHOLD 39
+#define MULLO_DC_THRESHOLD 0 /* never mpn_mullo_basecase */
+#define MULLO_MUL_N_THRESHOLD 17050
+#define SQRLO_BASECASE_THRESHOLD 0 /* always */
+#define SQRLO_DC_THRESHOLD 134
+#define SQRLO_SQR_THRESHOLD 12322
+
+#define DC_DIV_QR_THRESHOLD 73
+#define DC_DIVAPPR_Q_THRESHOLD 262
+#define DC_BDIV_QR_THRESHOLD 111
+#define DC_BDIV_Q_THRESHOLD 315
+
+#define INV_MULMOD_BNM1_THRESHOLD 92
+#define INV_NEWTON_THRESHOLD 15
+#define INV_APPR_THRESHOLD 17
+
+#define BINV_NEWTON_THRESHOLD 280
+#define REDC_1_TO_REDC_2_THRESHOLD 0 /* always */
+#define REDC_2_TO_REDC_N_THRESHOLD 172
+
+#define MU_DIV_QR_THRESHOLD 1470
+#define MU_DIVAPPR_Q_THRESHOLD 1210
+#define MUPI_DIV_QR_THRESHOLD 0 /* always */
+#define MU_BDIV_QR_THRESHOLD 1566
+#define MU_BDIV_Q_THRESHOLD 1787
+
+#define POWM_SEC_TABLE 3,22,139,1867
+
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 42
+#define SET_STR_DC_THRESHOLD 1339
+#define SET_STR_PRECOMPUTE_THRESHOLD 3934
+
+#define FAC_DSC_THRESHOLD 866
+#define FAC_ODD_THRESHOLD 0 /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD 20
+#define HGCD2_DIV1_METHOD 3 /* 13.73% faster than 1 */
+#define HGCD_THRESHOLD 129
+#define HGCD_APPR_THRESHOLD 202
+#define HGCD_REDUCE_THRESHOLD 4455
+#define GCD_DC_THRESHOLD 658
+#define GCDEXT_DC_THRESHOLD 469
+#define JACOBI_BASE_METHOD 2 /* 0.62% faster than 4 */
+
+/* Tuneup completed successfully, took 199042 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/ia64/hamdist.asm b/vendor/gmp-6.3.0/mpn/ia64/hamdist.asm
new file mode 100644
index 0000000..477df4c
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/hamdist.asm
@@ -0,0 +1,365 @@
+dnl IA-64 mpn_hamdist -- mpn Hamming distance.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2003-2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 2
+C Itanium 2: 1
+
+C INPUT PARAMETERS
+define(`up', `r32')
+define(`vp', `r33')
+define(`n', `r34')
+
+define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
+define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
+define(`x0',`r24') define(`x1',`r25') define(`x2',`r26') define(`x3',`r27')
+define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
+define(`s',`r8')
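+
+C The unrolled code below sums exactly this (a C sketch):
+C
+C   uint64_t hamdist (const uint64_t *up, const uint64_t *vp, long n)
+C   {
+C     uint64_t s = 0;
+C     for (long i = 0; i < n; i++)
+C       s += __builtin_popcountll (up[i] ^ vp[i]);
+C     return s;
+C   }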
+
+
+ASM_START()
+PROLOGUE(mpn_hamdist)
+ .prologue
+ifdef(`HAVE_ABI_32',
+` addp4 up = 0, up C M I
+ addp4 vp = 0, vp C M I
+ zxt4 n = n C I
+ ;;
+')
+
+ {.mmi; ld8 r10 = [up], 8 C load first ulimb M01
+ ld8 r11 = [vp], 8 C load first vlimb M01
+ mov.i r2 = ar.lc C save ar.lc I0
+}{.mmi; and r14 = 3, n C M I
+ cmp.lt p15, p0 = 4, n C small count? M I
+ add n = -5, n C M I
+ ;;
+}{.mmi; cmp.eq p6, p0 = 1, r14 C M I
+ cmp.eq p7, p0 = 2, r14 C M I
+ cmp.eq p8, p0 = 3, r14 C M I
+}{.bbb
+ (p6) br.dptk .Lb01 C B
+ (p7) br.dptk .Lb10 C B
+ (p8) br.dptk .Lb11 C B
+}
+
+
+.Lb00: ld8 u1 = [up], 8 C M01
+ ld8 v1 = [vp], 8 C M01
+ shr.u n = n, 2 C I0
+ xor x0 = r10, r11 C M I
+ ;;
+ ld8 u2 = [up], 8 C M01
+ ld8 v2 = [vp], 8 C M01
+ mov.i ar.lc = n C I0
+ xor x1 = u1, v1 C M I
+ ;;
+ ld8 u3 = [up], 8 C M01
+ ld8 v3 = [vp], 8 C M01
+ xor x2 = u2, v2 C M I
+ mov s = 0 C M I
+ (p15) br.cond.dptk .grt4 C B
+ ;;
+ popcnt c0 = x0 C I0
+ xor x3 = u3, v3 C M I
+ ;;
+ popcnt c1 = x1 C I0
+ ;;
+ popcnt c2 = x2 C I0
+ br .Lcj4 C B
+
+.grt4: ld8 u0 = [up], 8 C M01
+ ld8 v0 = [vp], 8 C M01
+ xor x1 = u1, v1 C M I
+ ;;
+ ld8 u1 = [up], 8 C M01
+ ld8 v1 = [vp], 8 C M01
+ xor x2 = u2, v2 C M I
+ ;;
+ ld8 u2 = [up], 8 C M01
+ ld8 v2 = [vp], 8 C M01
+ popcnt c0 = x0 C I0
+ xor x3 = u3, v3 C M I
+ ;;
+ ld8 u3 = [up], 8 C M01
+ ld8 v3 = [vp], 8 C M01
+ popcnt c1 = x1 C I0
+ xor x0 = u0, v0 C M I
+ br.cloop.dpnt .grt8 C B
+
+ popcnt c2 = x2 C I0
+ xor x1 = u1, v1 C M I
+ br .Lcj8 C B
+
+.grt8: ld8 u0 = [up], 8 C M01
+ ld8 v0 = [vp], 8 C M01
+ popcnt c2 = x2 C I0
+ xor x1 = u1, v1 C M I
+ br .LL00 C B
+
+
+.Lb01: xor x3 = r10, r11 C M I
+ shr.u n = n, 2 C I0
+ (p15) br.cond.dptk .grt1 C B
+ ;;
+ popcnt r8 = x3 C I0
+ br.ret.sptk.many b0 C B
+
+.grt1: ld8 u0 = [up], 8 C M01
+ ld8 v0 = [vp], 8 C M01
+ mov.i ar.lc = n C I0
+ ;;
+ ld8 u1 = [up], 8 C M01
+ ld8 v1 = [vp], 8 C M01
+ mov s = 0 C M I
+ ;;
+ ld8 u2 = [up], 8 C M01
+ ld8 v2 = [vp], 8 C M01
+ ;;
+ ld8 u3 = [up], 8 C M01
+ ld8 v3 = [vp], 8 C M01
+ xor x0 = u0, v0 C M I
+ br.cloop.dpnt .grt5 C B
+
+ xor x1 = u1, v1 C M I
+ ;;
+ popcnt c3 = x3 C I0
+ xor x2 = u2, v2 C M I
+ ;;
+ popcnt c0 = x0 C I0
+ xor x3 = u3, v3 C M I
+ ;;
+ popcnt c1 = x1 C I0
+ br .Lcj5 C B
+
+.grt5: ld8 u0 = [up], 8 C M01
+ ld8 v0 = [vp], 8 C M01
+ xor x1 = u1, v1 C M I
+ ;;
+ ld8 u1 = [up], 8 C M01
+ ld8 v1 = [vp], 8 C M01
+ popcnt c3 = x3 C I0
+ xor x2 = u2, v2 C M I
+ ;;
+ ld8 u2 = [up], 8 C M01
+ ld8 v2 = [vp], 8 C M01
+ popcnt c0 = x0 C I0
+ xor x3 = u3, v3 C M I
+ ;;
+ ld8 u3 = [up], 8 C M01
+ ld8 v3 = [vp], 8 C M01
+ popcnt c1 = x1 C I0
+ xor x0 = u0, v0 C M I
+ br.cloop.dpnt .Loop C B
+ br .Lend C B
+
+
+.Lb10: ld8 u3 = [up], 8 C M01
+ ld8 v3 = [vp], 8 C M01
+ xor x2 = r10, r11 C M I
+ (p15) br.cond.dptk .grt2 C B
+ ;;
+ xor x3 = u3, v3 C M I
+ ;;
+ popcnt c2 = x2 C I0
+ ;;
+ popcnt c3 = x3 C I0
+ ;;
+ add s = c2, c3 C M I
+ br.ret.sptk.many b0 C B
+
+.grt2: ld8 u0 = [up], 8 C M01
+ ld8 v0 = [vp], 8 C M01
+ shr.u n = n, 2 C I0
+ ;;
+ ld8 u1 = [up], 8 C M01
+ ld8 v1 = [vp], 8 C M01
+ mov.i ar.lc = n C I0
+ mov s = 0 C M I
+ ;;
+ ld8 u2 = [up], 8 C M01
+ ld8 v2 = [vp], 8 C M01
+ xor x3 = u3, v3 C M I
+ ;;
+ ld8 u3 = [up], 8 C M01
+ ld8 v3 = [vp], 8 C M01
+ xor x0 = u0, v0 C M I
+ br.cloop.dptk .grt6 C B
+
+ popcnt c2 = x2 C I0
+ xor x1 = u1, v1 C M I
+ ;;
+ popcnt c3 = x3 C I0
+ xor x2 = u2, v2 C M I
+ ;;
+ popcnt c0 = x0 C I0
+ xor x3 = u3, v3 C M I
+ br .Lcj6 C B
+
+.grt6: ld8 u0 = [up], 8 C M01
+ ld8 v0 = [vp], 8 C M01
+ popcnt c2 = x2 C I0
+ xor x1 = u1, v1 C M I
+ ;;
+ ld8 u1 = [up], 8 C M01
+ ld8 v1 = [vp], 8 C M01
+ popcnt c3 = x3 C I0
+ xor x2 = u2, v2 C M I
+ ;;
+ ld8 u2 = [up], 8 C M01
+ ld8 v2 = [vp], 8 C M01
+ popcnt c0 = x0 C I0
+ xor x3 = u3, v3 C M I
+ br .LL10 C B
+
+
+.Lb11: ld8 u2 = [up], 8 C M01
+ ld8 v2 = [vp], 8 C M01
+ shr.u n = n, 2 C I0
+ xor x1 = r10, r11 C M I
+ ;;
+ ld8 u3 = [up], 8 C M01
+ ld8 v3 = [vp], 8 C M01
+ xor x2 = u2, v2 C M I
+ (p15) br.cond.dptk .grt3 C B
+ ;;
+ xor x3 = u3, v3 C M I
+ ;;
+ popcnt c1 = x1 C I0
+ ;;
+ popcnt c2 = x2 C I0
+ ;;
+ popcnt c3 = x3 C I0
+ ;;
+ add s = c1, c2 C M I
+ ;;
+ add s = s, c3 C M I
+ br.ret.sptk.many b0 C B
+
+.grt3: ld8 u0 = [up], 8 C M01
+ ld8 v0 = [vp], 8 C M01
+ mov.i ar.lc = n C I0
+ ;;
+ ld8 u1 = [up], 8 C M01
+ ld8 v1 = [vp], 8 C M01
+ mov s = 0 C M I
+ ;;
+ ld8 u2 = [up], 8 C M01
+ ld8 v2 = [vp], 8 C M01
+ xor x3 = u3, v3 C M I
+ ;;
+ ld8 u3 = [up], 8 C M01
+ ld8 v3 = [vp], 8 C M01
+ popcnt c1 = x1 C I0
+ xor x0 = u0, v0 C M I
+ br.cloop.dptk .grt7 C B
+ popcnt c2 = x2 C I0
+ xor x1 = u1, v1 C M I
+ ;;
+ popcnt c3 = x3 C I0
+ xor x2 = u2, v2 C M I
+ br .Lcj7 C B
+
+.grt7: ld8 u0 = [up], 8 C M01
+ ld8 v0 = [vp], 8 C M01
+ popcnt c2 = x2 C I0
+ xor x1 = u1, v1 C M I
+ ;;
+ ld8 u1 = [up], 8 C M01
+ ld8 v1 = [vp], 8 C M01
+ popcnt c3 = x3 C I0
+ xor x2 = u2, v2 C M I
+ br .LL11 C B
+
+
+ ALIGN(32)
+.Loop: ld8 u0 = [up], 8 C M01
+ ld8 v0 = [vp], 8 C M01
+ popcnt c2 = x2 C I0
+ add s = s, c3 C M I
+ xor x1 = u1, v1 C M I
+ nop.b 1 C -
+ ;;
+.LL00: ld8 u1 = [up], 8 C M01
+ ld8 v1 = [vp], 8 C M01
+ popcnt c3 = x3 C I0
+ add s = s, c0 C M I
+ xor x2 = u2, v2 C M I
+ nop.b 1 C -
+ ;;
+.LL11: ld8 u2 = [up], 8 C M01
+ ld8 v2 = [vp], 8 C M01
+ popcnt c0 = x0 C I0
+ add s = s, c1 C M I
+ xor x3 = u3, v3 C M I
+ nop.b 1 C -
+ ;;
+.LL10: ld8 u3 = [up], 8 C M01
+ ld8 v3 = [vp], 8 C M01
+ popcnt c1 = x1 C I0
+ add s = s, c2 C M I
+ xor x0 = u0, v0 C M I
+ br.cloop.dptk .Loop C B
+ ;;
+
+.Lend: popcnt c2 = x2 C I0
+ add s = s, c3 C M I
+ xor x1 = u1, v1 C M I
+ ;;
+.Lcj8: popcnt c3 = x3 C I0
+ add s = s, c0 C M I
+ xor x2 = u2, v2 C M I
+ ;;
+.Lcj7: popcnt c0 = x0 C I0
+ add s = s, c1 C M I
+ xor x3 = u3, v3 C M I
+ ;;
+.Lcj6: popcnt c1 = x1 C I0
+ add s = s, c2 C M I
+ ;;
+.Lcj5: popcnt c2 = x2 C I0
+ add s = s, c3 C M I
+ ;;
+.Lcj4: popcnt c3 = x3 C I0
+ add s = s, c0 C M I
+ ;;
+ add s = s, c1 C M I
+ ;;
+ add s = s, c2 C M I
+ ;;
+ add s = s, c3 C M I
+ mov.i ar.lc = r2 C I0
+ br.ret.sptk.many b0 C B
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/ia64-defs.m4 b/vendor/gmp-6.3.0/mpn/ia64/ia64-defs.m4
new file mode 100644
index 0000000..f71d280
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/ia64-defs.m4
@@ -0,0 +1,147 @@
+divert(-1)
+
+
+dnl Copyright 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl ia64 assembler comments are C++ style "//" to the end of line. gas
+dnl also accepts "#" as a comment, if it's the first non-blank on a line.
+dnl
+dnl BSD m4 can't handle a multi-character comment like "//" (see notes in
+dnl mpn/asm-defs.m4). For now the default "#" is left, but with care taken
+dnl not to put any macros after "foo#" (since of course they won't expand).
+
+
+define(`ASM_START',
+m4_assert_numargs(0)
+`')
+
+
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl 32-byte alignment is used for the benefit of itanium-2, where the code
+dnl fetcher will only take 2 bundles from a 32-byte aligned target. At
+dnl 16mod32 it only reads 1 in the first cycle. This might not make any
+dnl difference if the rotate buffers are full or there's other work holding
+dnl up execution, but we use 32 bytes to give the best chance of peak
+dnl throughput.
+dnl
+dnl We can use .align here despite the gas bug noted in mpn/ia64/README,
+dnl since we're not expecting to execute across a PROLOGUE(), at least not
+dnl currently.
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+ `
+ .text
+ .align 32
+ .global $1#
+ .proc $1#
+$1:')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+ `
+ .endp $1#
+')
+
+define(`DATASTART',
+ `dnl
+ DATA
+$1:')
+define(`DATAEND',`dnl')
+
+define(`ASM_END',`dnl')
+
+
+dnl Usage: ALIGN(bytes)
+dnl
+dnl Emit a ".align" directive. "bytes" is eval()ed, so can be an
+dnl expression.
+dnl
+dnl This version overrides the definition in mpn/asm-defs.m4. We suppress
+dnl any .align if the gas byte-swapped-nops bug was detected by configure
+dnl GMP_ASM_IA64_ALIGN_OK.
+
+define(`ALIGN',
+m4_assert_numargs(1)
+m4_assert_defined(`IA64_ALIGN_OK')
+`ifelse(IA64_ALIGN_OK,no,,
+`.align eval($1)')')
+
+
+dnl Usage: ASSERT([pr] [,code])
+dnl
+dnl Require that the given predicate register is true after executing the
+dnl test code. For example,
+dnl
+dnl ASSERT(p6,
+dnl ` cmp.eq p6,p0 = r3, r4')
+dnl
+dnl If the predicate register argument is empty then nothing is tested, the
+dnl code is just executed. This can be used for setups required by later
+dnl ASSERTs. The code argument can be omitted to just test a predicate
+dnl with no special setup code.
+dnl
+dnl For convenience, stops are inserted before and after the code emitted.
+
+define(ASSERT,
+m4_assert_numargs_range(1,2)
+m4_assert_defined(`WANT_ASSERT')
+`ifelse(WANT_ASSERT,1,
+` ;;
+ifelse(`$2',,,
+`$2
+ ;;
+')
+ifelse(`$1',,,
+`($1) br .LASSERTok`'ASSERT_label_counter ;;
+ cmp.ne p6,p6 = r0, r0 C illegal instruction
+ ;;
+.LASSERTok`'ASSERT_label_counter:
+define(`ASSERT_label_counter',eval(ASSERT_label_counter+1))
+')
+')')
+define(`ASSERT_label_counter',1)
+
+define(`getfsig', `getf.sig')
+define(`setfsig', `setf.sig')
+define(`cmpeq', `cmp.eq')
+define(`cmpne', `cmp.ne')
+define(`cmpltu', `cmp.ltu')
+define(`cmpleu', `cmp.leu')
+define(`cmpgtu', `cmp.gtu')
+define(`cmpgeu', `cmp.geu')
+define(`cmple', `cmp.le')
+define(`cmpgt', `cmp.gt')
+define(`cmpeqor', `cmp.eq.or')
+define(`cmpequc', `cmp.eq.unc')
+
+divert
diff --git a/vendor/gmp-6.3.0/mpn/ia64/invert_limb.asm b/vendor/gmp-6.3.0/mpn/ia64/invert_limb.asm
new file mode 100644
index 0000000..5effdda
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/invert_limb.asm
@@ -0,0 +1,105 @@
+dnl IA-64 mpn_invert_limb -- Invert a normalized limb.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde.
+
+dnl Copyright 2000, 2002, 2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C d = r32
+
+C cycles
+C Itanium: 74
+C Itanium 2: 50+6
+
+C It should be possible to avoid the xmpy.hu and the following tests by
+C explicitly chopping in the last fma. That would save about 10 cycles.
+
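+C The value computed is floor((B^2-1)/d) - B with B = 2^64, for d with
+C its high bit set.  A C sketch using 128-bit division in place of the
+C frcpa iteration:
+C
+C   uint64_t invert_limb (uint64_t d)   /* d normalized */
+C   {
+C     return (uint64_t)
+C       ((((unsigned __int128) ~d << 64) | ~(uint64_t) 0) / d);
+C   }
+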
+ASM_START()
+ .sdata
+ .align 16
+ifdef(`HAVE_DOUBLE_IEEE_LITTLE_ENDIAN',`
+.LC0: data4 0x00000000, 0x80000000, 0x0000403f, 0x00000000 C 2^64
+.LC1: data4 0x00000000, 0x80000000, 0x0000407f, 0x00000000 C 2^128
+
+',`ifdef(`HAVE_DOUBLE_IEEE_BIG_ENDIAN',`
+.LC0: data4 0x403f8000, 0x00000000, 0x00000000, 0x00000000 C 2^64
+.LC1: data4 0x407f8000, 0x00000000, 0x00000000, 0x00000000 C 2^128
+
+',`m4_error(`Oops, need to know float endianness
+')')')
+
+
+PROLOGUE(mpn_invert_limb)
+ C 00
+ addl r14 = @gprel(.LC0), gp
+ addl r15 = @gprel(.LC1), gp
+ setf.sig f7 = r32
+ add r9 = r32, r32 C check for d = 2^63
+ ;; C 01
+ ldfe f10 = [r14] C 2^64
+ ldfe f8 = [r15] C 2^128
+ cmp.eq p6, p0 = 0, r9 C check for d = 2^63
+ mov r8 = -1 C retval for 2^63
+ (p6) br.ret.spnt.many b0
+ ;; C 07
+ fmpy.s1 f11 = f7, f10 C f11 = d * 2^64
+ fnma.s1 f6 = f7, f10, f8 C f6 = 2^128 - d * 2^64
+ ;; C 11
+ frcpa.s1 f8, p6 = f6, f7
+ ;; C 15
+ (p6) fnma.s1 f9 = f7, f8, f1
+ (p6) fmpy.s1 f10 = f6, f8
+ ;; C 19
+ (p6) fmpy.s1 f11 = f9, f9
+ (p6) fma.s1 f10 = f9, f10, f10
+ ;; C 23
+ (p6) fma.s1 f8 = f9, f8, f8
+ (p6) fma.s1 f9 = f11, f10, f10
+ ;; C 27
+ (p6) fma.s1 f8 = f11, f8, f8
+ (p6) fnma.s1 f10 = f7, f9, f6
+ ;; C 31
+ (p6) fma.s1 f8 = f10, f8, f9
+ ;; C 35
+ fcvt.fxu.trunc.s1 f8 = f8
+ ;; C 39
+ getf.sig r8 = f8
+ xmpy.hu f10 = f8, f7 C di * d
+ ;; C 43
+ getf.sig r14 = f10
+ andcm r9 = -1, r32 C one's complement
+ ;; C 48
+ cmp.ltu p6, p0 = r9, r14 C got overflow?
+ ;; C 49
+ (p6) add r8 = -1, r8 C adjust di down
+ br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/logops_n.asm b/vendor/gmp-6.3.0/mpn/ia64/logops_n.asm
new file mode 100644
index 0000000..e4a2f61
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/logops_n.asm
@@ -0,0 +1,292 @@
+dnl IA-64 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
+dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2003-2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 2
+C Itanium 2: 1
+
+C TODO
+C * Use rp,rpx scheme of aors_n.asm to allow parallel stores (useful in
+C wind-down code).
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`vp', `r34')
+define(`n', `r35')
+
+ifdef(`OPERATION_and_n',
+` define(`func',`mpn_and_n')
+ define(`logop', `and $1 = $2, $3')
+ define(`notormov', `mov $1 = $2')')
+ifdef(`OPERATION_andn_n',
+` define(`func',`mpn_andn_n')
+ define(`logop', `andcm $1 = $2, $3')
+ define(`notormov', `mov $1 = $2')')
+ifdef(`OPERATION_nand_n',
+` define(`func',`mpn_nand_n')
+ define(`logop', `and $1 = $2, $3')
+ define(`notormov', `sub $1 = -1, $2')')
+ifdef(`OPERATION_ior_n',
+` define(`func',`mpn_ior_n')
+ define(`logop', `or $1 = $2, $3')
+ define(`notormov', `mov $1 = $2')')
+ifdef(`OPERATION_iorn_n',
+` define(`func',`mpn_iorn_n')
+ define(`logop', `andcm $1 = $3, $2')
+ define(`notormov', `sub $1 = -1, $2')')
+ifdef(`OPERATION_nior_n',
+` define(`func',`mpn_nior_n')
+ define(`logop', `or $1 = $2, $3')
+ define(`notormov', `sub $1 = -1, $2')')
+ifdef(`OPERATION_xor_n',
+` define(`func',`mpn_xor_n')
+ define(`logop', `xor $1 = $2, $3')
+ define(`notormov', `mov $1 = $2')')
+ifdef(`OPERATION_xnor_n',
+` define(`func',`mpn_xnor_n')
+ define(`logop', `xor $1 = $2, $3')
+ define(`notormov', `sub $1 = -1, $2')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
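+
+C Each generated function computes rp[i] = notormov(logop(up[i], vp[i]));
+C e.g. mpn_nand_n is, in C,
+C
+C   void mpn_nand_n (uint64_t *rp, const uint64_t *up,
+C                    const uint64_t *vp, long n)
+C   {
+C     for (long i = 0; i < n; i++)
+C       rp[i] = ~(up[i] & vp[i]);
+C   }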
+
+ASM_START()
+PROLOGUE(func)
+ .prologue
+ .save ar.lc, r2
+ .body
+ifdef(`HAVE_ABI_32',
+` addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ addp4 vp = 0, vp C M I
+ nop.m 0
+ nop.m 0
+ zxt4 n = n C I
+ ;;
+')
+{.mmi
+ ld8 r10 = [up], 8 C M
+ ld8 r11 = [vp], 8 C M
+ mov.i r2 = ar.lc C I0
+}
+{.mmi
+ and r14 = 3, n C M I
+ cmp.lt p15, p14 = 4, n C M I
+ shr.u n = n, 2 C I0
+ ;;
+}
+{.mmi
+ cmp.eq p6, p0 = 1, r14 C M I
+ cmp.eq p7, p0 = 2, r14 C M I
+ cmp.eq p8, p0 = 3, r14 C M I
+}
+{.bbb
+ (p6) br.dptk .Lb01 C B
+ (p7) br.dptk .Lb10 C B
+ (p8) br.dptk .Lb11 C B
+}
+
+.Lb00: ld8 r17 = [up], 8 C M
+ ld8 r21 = [vp], 8 C M
+ add n = -2, n C M I
+ ;;
+ ld8 r18 = [up], 8 C M
+ ld8 r22 = [vp], 8 C M
+ ;;
+ ld8 r19 = [up], 8 C M
+ ld8 r23 = [vp], 8 C M
+ (p15) br.cond.dpnt .grt4 C B
+
+ logop( r14, r10, r11) C M I
+ ;;
+ logop( r15, r17, r21) C M I
+ notormov( r8, r14) C M I
+ br .Lcj4 C B
+
+.grt4: logop( r14, r10, r11) C M I
+ ld8 r16 = [up], 8 C M
+ ld8 r20 = [vp], 8 C M
+ ;;
+ logop( r15, r17, r21) C M I
+ ld8 r17 = [up], 8 C M
+ mov.i ar.lc = n C I0
+ notormov( r8, r14) C M I
+ ld8 r21 = [vp], 8 C M
+ br .LL00 C B
+
+.Lb01: add n = -1, n C M I
+ logop( r15, r10, r11) C M I
+ (p15) br.cond.dpnt .grt1 C B
+ ;;
+
+ notormov( r9, r15) C M I
+ br .Lcj1 C B
+
+.grt1: ld8 r16 = [up], 8 C M
+ ld8 r20 = [vp], 8 C M
+ ;;
+ ld8 r17 = [up], 8 C M
+ ld8 r21 = [vp], 8 C M
+ mov.i ar.lc = n C I0
+ ;;
+ ld8 r18 = [up], 8 C M
+ ld8 r22 = [vp], 8 C M
+ ;;
+ ld8 r19 = [up], 8 C M
+ ld8 r23 = [vp], 8 C M
+ br.cloop.dptk .grt5 C B
+ ;;
+
+ logop( r14, r16, r20) C M I
+ notormov( r9, r15) C M I
+ br .Lcj5 C B
+
+.grt5: logop( r14, r16, r20) C M I
+ ld8 r16 = [up], 8 C M
+ notormov( r9, r15) C M I
+ ld8 r20 = [vp], 8 C M
+ br .LL01 C B
+
+.Lb10: ld8 r19 = [up], 8 C M
+ ld8 r23 = [vp], 8 C M
+ (p15) br.cond.dpnt .grt2 C B
+
+ logop( r14, r10, r11) C M I
+ ;;
+ logop( r15, r19, r23) C M I
+ notormov( r8, r14) C M I
+ br .Lcj2 C B
+
+.grt2: ld8 r16 = [up], 8 C M
+ ld8 r20 = [vp], 8 C M
+ add n = -1, n C M I
+ ;;
+ ld8 r17 = [up], 8 C M
+ ld8 r21 = [vp], 8 C M
+ logop( r14, r10, r11) C M I
+ ;;
+ ld8 r18 = [up], 8 C M
+ ld8 r22 = [vp], 8 C M
+ mov.i ar.lc = n C I0
+ ;;
+ logop( r15, r19, r23) C M I
+ ld8 r19 = [up], 8 C M
+ notormov( r8, r14) C M I
+ ld8 r23 = [vp], 8 C M
+ br.cloop.dptk .Loop C B
+ br .Lcj6 C B
+
+.Lb11: ld8 r18 = [up], 8 C M
+ ld8 r22 = [vp], 8 C M
+ add n = -1, n C M I
+ ;;
+ ld8 r19 = [up], 8 C M
+ ld8 r23 = [vp], 8 C M
+ logop( r15, r10, r11) C M I
+ (p15) br.cond.dpnt .grt3 C B
+ ;;
+
+ logop( r14, r18, r22) C M I
+ notormov( r9, r15) C M I
+ br .Lcj3 C B
+
+.grt3: ld8 r16 = [up], 8 C M
+ ld8 r20 = [vp], 8 C M
+ ;;
+ ld8 r17 = [up], 8 C M
+ ld8 r21 = [vp], 8 C M
+ mov.i ar.lc = n C I0
+ ;;
+ logop( r14, r18, r22) C M I
+ ld8 r18 = [up], 8 C M
+ notormov( r9, r15) C M I
+ ld8 r22 = [vp], 8 C M
+ br .LL11 C B
+
+C *** MAIN LOOP START ***
+ ALIGN(32)
+.Loop: st8 [rp] = r8, 8 C M
+ logop( r14, r16, r20) C M I
+ notormov( r9, r15) C M I
+ ld8 r16 = [up], 8 C M
+ ld8 r20 = [vp], 8 C M
+ nop.b 0
+ ;;
+.LL01: st8 [rp] = r9, 8 C M
+ logop( r15, r17, r21) C M I
+ notormov( r8, r14) C M I
+ ld8 r17 = [up], 8 C M
+ ld8 r21 = [vp], 8 C M
+ nop.b 0
+ ;;
+.LL00: st8 [rp] = r8, 8 C M
+ logop( r14, r18, r22) C M I
+ notormov( r9, r15) C M I
+ ld8 r18 = [up], 8 C M
+ ld8 r22 = [vp], 8 C M
+ nop.b 0
+ ;;
+.LL11: st8 [rp] = r9, 8 C M
+ logop( r15, r19, r23) C M I
+ notormov( r8, r14) C M I
+ ld8 r19 = [up], 8 C M
+ ld8 r23 = [vp], 8 C M
+ br.cloop.dptk .Loop ;; C B
+C *** MAIN LOOP END ***
+
+.Lcj6: st8 [rp] = r8, 8 C M
+ logop( r14, r16, r20) C M I
+ notormov( r9, r15) C M I
+ ;;
+.Lcj5: st8 [rp] = r9, 8 C M
+ logop( r15, r17, r21) C M I
+ notormov( r8, r14) C M I
+ ;;
+.Lcj4: st8 [rp] = r8, 8 C M
+ logop( r14, r18, r22) C M I
+ notormov( r9, r15) C M I
+ ;;
+.Lcj3: st8 [rp] = r9, 8 C M
+ logop( r15, r19, r23) C M I
+ notormov( r8, r14) C M I
+ ;;
+.Lcj2: st8 [rp] = r8, 8 C M
+ notormov( r9, r15) C M I
+ ;;
+.Lcj1: st8 [rp] = r9, 8 C M
+ mov.i ar.lc = r2 C I0
+ br.ret.sptk.many b0 C B
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/lorrshift.asm b/vendor/gmp-6.3.0/mpn/ia64/lorrshift.asm
new file mode 100644
index 0000000..694aaf0
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/lorrshift.asm
@@ -0,0 +1,358 @@
+dnl IA-64 mpn_lshift/mpn_rshift.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2000-2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 2
+C Itanium 2: 1
+
+C This code is scheduled deeply since the plain shift instructions shr and shl
+C have a latency of 4 (on Itanium) or 3 (on Itanium 2). Poor scheduling of
+C these instructions causes a 10-cycle replay trap on Itanium.
+
+C The ld8 scheduling depth should probably be reduced to make the function
+C smaller.
+C Good lfetch will make sure we never stall anyway.
+
+C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
+C at cycle 2. Judicious use of predicates could allow us to issue more ld8's
+C in the prologue.
+
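+C For reference, mpn_lshift in C (1 <= cnt < 64; walking the limbs from
+C the top, as the asm does with UPD = -8, makes the operation safe for
+C rp == up):
+C
+C   uint64_t mpn_lshift (uint64_t *rp, const uint64_t *up,
+C                        long n, unsigned cnt)
+C   {
+C     unsigned tnc = 64 - cnt;
+C     uint64_t retval = up[n - 1] >> tnc;
+C     for (long i = n - 1; i > 0; i--)
+C       rp[i] = (up[i] << cnt) | (up[i - 1] >> tnc);
+C     rp[0] = up[0] << cnt;
+C     return retval;
+C   }
+C
+C mpn_rshift mirrors this, walking from the bottom (UPD = 8).
+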
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n', `r34')
+define(`cnt',`r35')
+
+define(`tnc',`r9')
+
+ifdef(`OPERATION_lshift',`
+ define(`FSH',`shl')
+ define(`BSH',`shr.u')
+ define(`UPD',`-8')
+ define(`POFF',`-512')
+ define(`PUPD',`-32')
+ define(`func',`mpn_lshift')
+')
+ifdef(`OPERATION_rshift',`
+ define(`FSH',`shr.u')
+ define(`BSH',`shl')
+ define(`UPD',`8')
+ define(`POFF',`512')
+ define(`PUPD',`32')
+ define(`func',`mpn_rshift')
+')
+
+MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
+
+ASM_START()
+PROLOGUE(func)
+ .prologue
+ .save ar.lc, r2
+ .body
+ifdef(`HAVE_ABI_32',
+` addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ sxt4 n = n C M I
+ nop.m 0
+ nop.m 0
+ zxt4 cnt = cnt C I
+ ;;
+')
+
+ {.mmi; cmp.lt p14, p15 = 4, n C M I
+ and r14 = 3, n C M I
+ mov.i r2 = ar.lc C I0
+}{.mmi; add r15 = -1, n C M I
+ sub tnc = 64, cnt C M I
+ add r16 = -5, n
+ ;;
+}{.mmi; cmp.eq p6, p0 = 1, r14 C M I
+ cmp.eq p7, p0 = 2, r14 C M I
+ shr.u n = r16, 2 C I0
+}{.mmi; cmp.eq p8, p0 = 3, r14 C M I
+ifdef(`OPERATION_lshift',
+` shladd up = r15, 3, up C M I
+ shladd rp = r15, 3, rp') C M I
+ ;;
+}{.mmi; add r11 = POFF, up C M I
+ ld8 r10 = [up], UPD C M01
+ mov.i ar.lc = n C I0
+}{.bbb;
+ (p6) br.dptk .Lb01
+ (p7) br.dptk .Lb10
+ (p8) br.dptk .Lb11
+ ;; }
+
+.Lb00: ld8 r19 = [up], UPD
+ ;;
+ ld8 r16 = [up], UPD
+ ;;
+ ld8 r17 = [up], UPD
+ BSH r8 = r10, tnc C function return value
+ ;;
+ FSH r24 = r10, cnt
+ BSH r25 = r19, tnc
+ (p14) br.cond.dptk .grt4
+ ;;
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ ;;
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ ;;
+ or r14 = r25, r24
+ FSH r22 = r17, cnt
+ BSH r23 = r10, tnc
+ br .Lr4
+
+.grt4: ld8 r18 = [up], UPD
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ ;;
+ ld8 r19 = [up], UPD
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ ;;
+ ld8 r16 = [up], UPD
+ FSH r22 = r17, cnt
+ BSH r23 = r18, tnc
+ ;;
+ or r14 = r25, r24
+ ld8 r17 = [up], UPD
+ br.cloop.dpnt .Ltop
+ br .Lbot
+
+.Lb01:
+ (p15) BSH r8 = r10, tnc C function return value I
+ (p15) FSH r22 = r10, cnt C I
+ (p15) br.cond.dptk .Lr1 C return B
+
+.grt1: ld8 r18 = [up], UPD
+ ;;
+ ld8 r19 = [up], UPD
+ BSH r8 = r10, tnc C function return value
+ ;;
+ ld8 r16 = [up], UPD
+ FSH r22 = r10, cnt
+ BSH r23 = r18, tnc
+ ;;
+ ld8 r17 = [up], UPD
+ FSH r24 = r18, cnt
+ BSH r25 = r19, tnc
+ br.cloop.dpnt .grt5
+ ;;
+ or r15 = r23, r22
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ ;;
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ br .Lr5
+
+.grt5: ld8 r18 = [up], UPD
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ ;;
+ ld8 r19 = [up], UPD
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ ;;
+ or r15 = r23, r22
+ ld8 r16 = [up], UPD
+ br .LL01
+
+
+.Lb10: ld8 r17 = [up], UPD
+ (p14) br.cond.dptk .grt2
+
+ BSH r8 = r10, tnc C function return value
+ ;;
+ FSH r20 = r10, cnt
+ BSH r21 = r17, tnc
+ ;;
+ or r14 = r21, r20
+ FSH r22 = r17, cnt
+ br .Lr2 C return
+
+.grt2: ld8 r18 = [up], UPD
+ BSH r8 = r10, tnc C function return value
+ ;;
+ ld8 r19 = [up], UPD
+ FSH r20 = r10, cnt
+ BSH r21 = r17, tnc
+ ;;
+ ld8 r16 = [up], UPD
+ FSH r22 = r17, cnt
+ BSH r23 = r18, tnc
+ ;;
+ {.mmi; ld8 r17 = [up], UPD
+ or r14 = r21, r20
+ FSH r24 = r18, cnt
+}{.mib; nop 0
+ BSH r25 = r19, tnc
+ br.cloop.dpnt .grt6
+ ;; }
+
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ br .Lr6
+
+.grt6: ld8 r18 = [up], UPD
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ ;;
+ ld8 r19 = [up], UPD
+ br .LL10
+
+
+.Lb11: ld8 r16 = [up], UPD
+ ;;
+ ld8 r17 = [up], UPD
+ BSH r8 = r10, tnc C function return value
+ (p14) br.cond.dptk .grt3
+ ;;
+
+ FSH r26 = r10, cnt
+ BSH r27 = r16, tnc
+ ;;
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ ;;
+ or r15 = r27, r26
+ FSH r22 = r17, cnt
+ br .Lr3 C return
+
+.grt3: ld8 r18 = [up], UPD
+ FSH r26 = r10, cnt
+ BSH r27 = r16, tnc
+ ;;
+ ld8 r19 = [up], UPD
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ ;;
+ ld8 r16 = [up], UPD
+ FSH r22 = r17, cnt
+ BSH r23 = r18, tnc
+ ;;
+ ld8 r17 = [up], UPD
+ br.cloop.dpnt .grt7
+
+ or r15 = r27, r26
+ FSH r24 = r18, cnt
+ BSH r25 = r19, tnc
+ br .Lr7
+
+.grt7: or r15 = r27, r26
+ FSH r24 = r18, cnt
+ BSH r25 = r19, tnc
+ ld8 r18 = [up], UPD
+ br .LL11
+
+C *** MAIN LOOP START ***
+ ALIGN(32)
+.Ltop:
+ {.mmi; st8 [rp] = r14, UPD C M2
+ or r15 = r27, r26 C M3
+ FSH r24 = r18, cnt C I0
+}{.mmi; ld8 r18 = [up], UPD C M1
+ lfetch [r11], PUPD
+ BSH r25 = r19, tnc C I1
+ ;; }
+.LL11:
+ {.mmi; st8 [rp] = r15, UPD
+ or r14 = r21, r20
+ FSH r26 = r19, cnt
+}{.mmi; ld8 r19 = [up], UPD
+ nop.m 0
+ BSH r27 = r16, tnc
+ ;; }
+.LL10:
+ {.mmi; st8 [rp] = r14, UPD
+ or r15 = r23, r22
+ FSH r20 = r16, cnt
+}{.mmi; ld8 r16 = [up], UPD
+ nop.m 0
+ BSH r21 = r17, tnc
+ ;; }
+.LL01:
+ {.mmi; st8 [rp] = r15, UPD
+ or r14 = r25, r24
+ FSH r22 = r17, cnt
+}{.mib; ld8 r17 = [up], UPD
+ BSH r23 = r18, tnc
+ br.cloop.dptk .Ltop
+ ;; }
+C *** MAIN LOOP END ***
+
+.Lbot:
+ {.mmi; st8 [rp] = r14, UPD
+ or r15 = r27, r26
+ FSH r24 = r18, cnt
+}{.mib; nop 0
+ BSH r25 = r19, tnc
+ nop 0
+ ;; }
+.Lr7:
+ {.mmi; st8 [rp] = r15, UPD
+ or r14 = r21, r20
+ FSH r26 = r19, cnt
+}{.mib; nop 0
+ BSH r27 = r16, tnc
+ nop 0
+ ;; }
+.Lr6:
+ {.mmi; st8 [rp] = r14, UPD
+ or r15 = r23, r22
+ FSH r20 = r16, cnt
+}{.mib; nop 0
+ BSH r21 = r17, tnc
+ nop 0
+ ;; }
+.Lr5: st8 [rp] = r15, UPD
+ or r14 = r25, r24
+ FSH r22 = r17, cnt
+ ;;
+.Lr4: st8 [rp] = r14, UPD
+ or r15 = r27, r26
+ ;;
+.Lr3: st8 [rp] = r15, UPD
+ or r14 = r21, r20
+ ;;
+.Lr2: st8 [rp] = r14, UPD
+ ;;
+.Lr1: st8 [rp] = r22, UPD C M23
+ mov ar.lc = r2 C I0
+ br.ret.sptk.many b0 C B
+EPILOGUE(func)
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/lshiftc.asm b/vendor/gmp-6.3.0/mpn/ia64/lshiftc.asm
new file mode 100644
index 0000000..e8cec87
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/lshiftc.asm
@@ -0,0 +1,463 @@
+dnl IA-64 mpn_lshiftc.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2000-2005, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: ?
+C Itanium 2: 1.25
+
+C This code is scheduled deeply since the plain shift instructions shr and shl
+C have a latency of 4 (on Itanium) or 3 (on Itanium 2). Poor scheduling of
+C these instructions causes a 10-cycle replay trap on Itanium.
+
+C The ld8 scheduling should probably be decreased to make the function smaller.
+C Good lfetch will make sure we never stall anyway.
+
+C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
+C at cycle 2. Judicious use of predicates could allow us to issue more ld8's
+C in the prologue.
+
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n', `r34')
+define(`cnt',`r35')
+
+define(`tnc',`r9')
+
+define(`FSH',`shl')
+define(`BSH',`shr.u')
+define(`UPD',`-8')
+define(`POFF',`-512')
+define(`PUPD',`-32')
+define(`func',`mpn_lshiftc')
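+
+C A hedged C sketch of the per-limb operation (mpn_lshiftc is a left
+C shift with complemented result; the "sub r31 = -1, x" instructions
+C below implement the NOT, since -1 - x == ~x):
+C
+C	mp_limb_t lshiftc_limb (mp_limb_t hi, mp_limb_t lo, unsigned cnt)
+C	{
+C	  return ~((hi << cnt) | (lo >> (64 - cnt)));
+C	}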
+
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+ .prologue
+ .save ar.lc, r2
+ .body
+ifdef(`HAVE_ABI_32',
+` addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ sxt4 n = n C M I
+ nop.m 0
+ nop.m 0
+ zxt4 cnt = cnt C I
+ ;;
+')
+
+ {.mmi; nop 0 C M I
+ and r14 = 3, n C M I
+ mov.i r2 = ar.lc C I0
+}{.mmi; add r15 = -1, n C M I
+ sub tnc = 64, cnt C M I
+ nop 0
+ ;;
+}{.mmi; cmp.eq p6, p0 = 1, r14 C M I
+ cmp.eq p7, p0 = 2, r14 C M I
+ shr.u n = r15, 2 C I0
+}{.mmi; cmp.eq p8, p0 = 3, r14 C M I
+ shladd up = r15, 3, up C M I
+ shladd rp = r15, 3, rp C M I
+ ;;
+}{.mmi; add r11 = POFF, up C M I
+ ld8 r10 = [up], UPD C M01
+ mov.i ar.lc = n C I0
+}{.bbb;
+ (p6) br.dptk .Lb01
+ (p7) br.dptk .Lb10
+ (p8) br.dptk .Lb11
+ ;; }
+
+.Lb00:
+ ld8 r19 = [up], UPD
+ ;;
+ ld8 r16 = [up], UPD
+ ;;
+ ld8 r17 = [up], UPD
+ BSH r8 = r10, tnc
+ br.cloop.dptk L(gt4)
+ ;;
+ FSH r24 = r10, cnt
+ BSH r25 = r19, tnc
+ ;;
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ ;;
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ ;;
+ or r14 = r25, r24
+ FSH r22 = r17, cnt
+ ;;
+ or r15 = r27, r26
+ sub r31 = -1, r14
+ br .Lr4
+
+L(gt4):
+ {.mmi; nop 0
+ nop 0
+ FSH r24 = r10, cnt
+}{.mmi; ld8 r18 = [up], UPD
+ nop 0
+ BSH r25 = r19, tnc
+ ;; }
+ {.mmi; nop 0
+ nop 0
+ FSH r26 = r19, cnt
+}{.mmi; ld8 r19 = [up], UPD
+ nop 0
+ BSH r27 = r16, tnc
+ ;; }
+ {.mmi; nop 0
+ nop 0
+ FSH r20 = r16, cnt
+}{.mmi; ld8 r16 = [up], UPD
+ nop 0
+ BSH r21 = r17, tnc
+ ;; }
+ {.mmi; nop 0
+ or r14 = r25, r24
+ FSH r22 = r17, cnt
+}{.mib; ld8 r17 = [up], UPD
+ BSH r23 = r18, tnc
+ br.cloop.dptk L(gt8)
+ ;; }
+ {.mmi; nop 0
+ or r15 = r27, r26
+ FSH r24 = r18, cnt
+}{.mib; sub r31 = -1, r14
+ BSH r25 = r19, tnc
+ br .Lr8 }
+
+L(gt8):
+ or r15 = r27, r26
+ FSH r24 = r18, cnt
+ ld8 r18 = [up], UPD
+ sub r31 = -1, r14
+ BSH r25 = r19, tnc
+ br .LL00
+
+.Lb01:
+ br.cloop.dptk L(gt1)
+ ;;
+ BSH r8 = r10, tnc
+ FSH r22 = r10, cnt
+ ;;
+ sub r31 = -1, r22
+ br .Lr1
+ ;;
+L(gt1):
+ ld8 r18 = [up], UPD
+ BSH r8 = r10, tnc
+ FSH r22 = r10, cnt
+ ;;
+ ld8 r19 = [up], UPD
+ ;;
+ ld8 r16 = [up], UPD
+ ;;
+ ld8 r17 = [up], UPD
+ BSH r23 = r18, tnc
+ br.cloop.dptk L(gt5)
+ ;;
+ nop 0
+ FSH r24 = r18, cnt
+ BSH r25 = r19, tnc
+ ;;
+ nop 0
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ ;;
+ or r15 = r23, r22
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ ;;
+ or r14 = r25, r24
+ FSH r22 = r17, cnt
+ sub r31 = -1, r15
+ br .Lr5
+
+L(gt5):
+ {.mmi; nop 0
+ nop 0
+ FSH r24 = r18, cnt
+}{.mmi; ld8 r18 = [up], UPD
+ nop 0
+ BSH r25 = r19, tnc
+ ;; }
+ {.mmi; nop 0
+ nop 0
+ FSH r26 = r19, cnt
+}{.mmi; ld8 r19 = [up], UPD
+ nop 0
+ BSH r27 = r16, tnc
+ ;; }
+ {.mmi; nop 0
+ or r15 = r23, r22
+ FSH r20 = r16, cnt
+}{.mmi; ld8 r16 = [up], UPD
+ nop 0
+ BSH r21 = r17, tnc
+ ;; }
+ {.mmi; or r14 = r25, r24
+ sub r31 = -1, r15
+ FSH r22 = r17, cnt
+}{.mib; ld8 r17 = [up], UPD
+ BSH r23 = r18, tnc
+ br L(end)
+ ;; }
+
+.Lb10:
+ ld8 r17 = [up], UPD
+ br.cloop.dptk L(gt2)
+ ;;
+ BSH r8 = r10, tnc
+ FSH r20 = r10, cnt
+ ;;
+ BSH r21 = r17, tnc
+ FSH r22 = r17, cnt
+ ;;
+ or r14 = r21, r20
+ ;;
+ sub r31 = -1, r14
+ br .Lr2
+ ;;
+L(gt2):
+ ld8 r18 = [up], UPD
+ BSH r8 = r10, tnc
+ FSH r20 = r10, cnt
+ ;;
+ ld8 r19 = [up], UPD
+ ;;
+ ld8 r16 = [up], UPD
+ BSH r21 = r17, tnc
+ FSH r22 = r17, cnt
+ ;;
+ ld8 r17 = [up], UPD
+ BSH r23 = r18, tnc
+ br.cloop.dptk L(gt6)
+ ;;
+ nop 0
+ FSH r24 = r18, cnt
+ BSH r25 = r19, tnc
+ ;;
+ or r14 = r21, r20
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ ;;
+ {.mmi; nop 0
+ or r15 = r23, r22
+ FSH r20 = r16, cnt
+}{.mib; sub r31 = -1, r14
+ BSH r21 = r17, tnc
+ br .Lr6
+ ;; }
+L(gt6):
+ {.mmi; nop 0
+ nop 0
+ FSH r24 = r18, cnt
+}{.mmi; ld8 r18 = [up], UPD
+ nop 0
+ BSH r25 = r19, tnc
+ ;; }
+ {.mmi; nop 0
+ or r14 = r21, r20
+ FSH r26 = r19, cnt
+}{.mmi; ld8 r19 = [up], UPD
+ nop 0
+ BSH r27 = r16, tnc
+ ;; }
+ {.mmi; or r15 = r23, r22
+ sub r31 = -1, r14
+ FSH r20 = r16, cnt
+}{.mib; ld8 r16 = [up], UPD
+ BSH r21 = r17, tnc
+ br .LL10
+}
+
+.Lb11:
+ ld8 r16 = [up], UPD
+ ;;
+ ld8 r17 = [up], UPD
+ BSH r8 = r10, tnc
+ FSH r26 = r10, cnt
+ br.cloop.dptk L(gt3)
+ ;;
+ BSH r27 = r16, tnc
+ ;;
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ ;;
+ FSH r22 = r17, cnt
+ ;;
+ or r15 = r27, r26
+ ;;
+ or r14 = r21, r20
+ sub r31 = -1, r15
+ br .Lr3
+ ;;
+L(gt3):
+ ld8 r18 = [up], UPD
+ ;;
+ ld8 r19 = [up], UPD
+ BSH r27 = r16, tnc
+ ;;
+ {.mmi; nop 0
+ nop 0
+ FSH r20 = r16, cnt
+}{.mmi; ld8 r16 = [up], UPD
+ nop 0
+ BSH r21 = r17, tnc
+ ;;
+}{.mmi; nop 0
+ nop 0
+ FSH r22 = r17, cnt
+}{.mib; ld8 r17 = [up], UPD
+ BSH r23 = r18, tnc
+ br.cloop.dptk L(gt7)
+ ;; }
+ or r15 = r27, r26
+ FSH r24 = r18, cnt
+ BSH r25 = r19, tnc
+ ;;
+ {.mmi; nop 0
+ or r14 = r21, r20
+ FSH r26 = r19, cnt
+}{.mib; sub r31 = -1, r15
+ BSH r27 = r16, tnc
+ br .Lr7
+}
+L(gt7):
+ {.mmi; nop 0
+ or r15 = r27, r26
+ FSH r24 = r18, cnt
+}{.mmi; ld8 r18 = [up], UPD
+ nop 0
+ BSH r25 = r19, tnc
+ ;; }
+ {.mmi; or r14 = r21, r20
+ sub r31 = -1, r15
+ FSH r26 = r19, cnt
+}{.mib; ld8 r19 = [up], UPD
+ BSH r27 = r16, tnc
+ br .LL11
+}
+
+C *** MAIN LOOP START ***
+ ALIGN(32)
+L(top):
+.LL01:
+ {.mmi; st8 [rp] = r31, UPD C M2
+ or r15 = r27, r26 C M3
+ FSH r24 = r18, cnt C I0
+}{.mmi; ld8 r18 = [up], UPD C M0
+ sub r31 = -1, r14 C M1
+ BSH r25 = r19, tnc C I1
+ ;; }
+.LL00:
+ {.mmi; st8 [rp] = r31, UPD
+ or r14 = r21, r20
+ FSH r26 = r19, cnt
+}{.mmi; ld8 r19 = [up], UPD
+ sub r31 = -1, r15
+ BSH r27 = r16, tnc
+ ;; }
+.LL11:
+ {.mmi; st8 [rp] = r31, UPD
+ or r15 = r23, r22
+ FSH r20 = r16, cnt
+}{.mmi; ld8 r16 = [up], UPD
+ sub r31 = -1, r14
+ BSH r21 = r17, tnc
+ ;; }
+.LL10:
+ {.mmi; st8 [rp] = r31, UPD
+ or r14 = r25, r24
+ FSH r22 = r17, cnt
+}{.mmi; ld8 r17 = [up], UPD
+ sub r31 = -1, r15
+ BSH r23 = r18, tnc
+ ;; }
+L(end): lfetch [r11], PUPD
+ br.cloop.dptk L(top)
+C *** MAIN LOOP END ***
+
+ {.mmi; st8 [rp] = r31, UPD
+ or r15 = r27, r26
+ FSH r24 = r18, cnt
+}{.mib; sub r31 = -1, r14
+ BSH r25 = r19, tnc
+ nop 0
+ ;; }
+.Lr8:
+ {.mmi; st8 [rp] = r31, UPD
+ or r14 = r21, r20
+ FSH r26 = r19, cnt
+}{.mib; sub r31 = -1, r15
+ BSH r27 = r16, tnc
+ nop 0
+ ;; }
+.Lr7:
+ {.mmi; st8 [rp] = r31, UPD
+ or r15 = r23, r22
+ FSH r20 = r16, cnt
+}{.mib; sub r31 = -1, r14
+ BSH r21 = r17, tnc
+ nop 0
+ ;; }
+.Lr6: st8 [rp] = r31, UPD
+ or r14 = r25, r24
+ FSH r22 = r17, cnt
+ sub r31 = -1, r15
+ ;;
+.Lr5: st8 [rp] = r31, UPD
+ or r15 = r27, r26
+ sub r31 = -1, r14
+ ;;
+.Lr4: st8 [rp] = r31, UPD
+ or r14 = r21, r20
+ sub r31 = -1, r15
+ ;;
+.Lr3: st8 [rp] = r31, UPD
+ sub r31 = -1, r14
+ ;;
+.Lr2: st8 [rp] = r31, UPD
+ sub r31 = -1, r22
+ ;;
+.Lr1: st8 [rp] = r31, UPD C M23
+ mov ar.lc = r2 C I0
+ br.ret.sptk.many b0 C B
+EPILOGUE(func)
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm
new file mode 100644
index 0000000..7789117
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm
@@ -0,0 +1,237 @@
+dnl IA-64 mpn_mod_34lsub1
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2003-2005, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: ?
+C Itanium 2: 1
+
+
+C INPUT PARAMETERS
+define(`up', `r32')
+define(`n', `r33')
+
+C Some useful aliases for registers we use
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16')
+define(`a0',`r17') define(`a1',`r18') define(`a2',`r19')
+define(`c0',`r20') define(`c1',`r21') define(`c2',`r22')
+
+C This is a fairly simple-minded implementation. One could approach 0.67 c/l
+C with a more sophisticated implementation. If we're really crazy, we could
+C super-unroll, storing carries just in predicate registers, then copy them to
+C a general register, and population count them from there. That'd bring us
+C close to 3 insn/limb, for nearly 0.5 c/l.
+
+C Computing n/3 needs 16 cycles, which is a lot of startup overhead.
+C We therefore use a plain while-style loop:
+C add n = -3, n
+C cmp.le p9, p0 = 3, n
+C (p9) br.cond .Loop
+C Alternatively, we could table n/3 for, say, n < 256, and predicate the
+C 16-cycle code.
+
+C The summing-up code at the end was written quickly, and could surely be
+C vastly improved.
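+
+C For orientation, a hedged C reference of the congruence being used
+C (2^48 == 1 mod 2^48-1, so limb i carries weight 2^(16*(i mod 3)) and
+C anything past bit 48 wraps back to bit 0; mod_34lsub1_ref is
+C illustrative only and assumes n stays well below 2^32):
+C
+C	mp_limb_t mod_34lsub1_ref (const mp_limb_t *up, mp_size_t n)
+C	{
+C	  unsigned __int128 s = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    s += (unsigned __int128) up[i] << (16 * (i % 3));
+C	  while (s >> 48)
+C	    s = (s & 0xffffffffffffULL) + (s >> 48);	/* fold */
+C	  return (mp_limb_t) s;	/* congruent to {up,n} mod 2^48-1 */
+C	}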
+
+ASM_START()
+PROLOGUE(mpn_mod_34lsub1)
+ .prologue
+ .save ar.lc, r2
+ .body
+ifdef(`HAVE_ABI_32',`
+ addp4 up = 0, up C M I
+ nop.m 0
+ zxt4 n = n C I
+ ;;
+')
+
+ifelse(0,1,`
+ movl r14 = 0xAAAAAAAAAAAAAAAB
+ ;;
+ setf.sig f6 = r14
+ setf.sig f7 = r33
+ ;;
+ xmpy.hu f6 = f6, f7
+ ;;
+ getf.sig r8 = f6
+ ;;
+ shr.u r8 = r8, 1 C Loop count
+ ;;
+ mov.i ar.lc = r8
+')
+
+ ld8 u0 = [up], 8
+ cmp.ne p9, p0 = 1, n
+ (p9) br L(gt1)
+ ;;
+ shr.u r8 = u0, 48
+ dep.z r27 = u0, 0, 48
+ ;;
+ add r8 = r8, r27
+ br.ret.sptk.many b0
+
+
+L(gt1):
+ {.mmi; nop.m 0
+ mov a0 = 0
+ add n = -2, n
+}{.mmi; mov c0 = 0
+ mov c1 = 0
+ mov c2 = 0
+ ;;
+}{.mmi; ld8 u1 = [up], 8
+ mov a1 = 0
+ cmp.ltu p6, p0 = r0, r0 C clear p6
+}{.mmb; cmp.gt p9, p0 = 3, n
+ mov a2 = 0
+ (p9) br.cond.dptk L(end)
+ ;;
+}
+ ALIGN(32)
+L(top):
+ {.mmi; ld8 u2 = [up], 8
+ (p6) add c0 = 1, c0
+ cmp.ltu p7, p0 = a0, u0
+}{.mmb; sub a0 = a0, u0
+ add n = -3, n
+ nop.b 0
+ ;;
+}{.mmi; ld8 u0 = [up], 8
+ (p7) add c1 = 1, c1
+ cmp.ltu p8, p0 = a1, u1
+}{.mmb; sub a1 = a1, u1
+ cmp.le p9, p0 = 3, n
+ nop.b 0
+ ;;
+}{.mmi; ld8 u1 = [up], 8
+ (p8) add c2 = 1, c2
+ cmp.ltu p6, p0 = a2, u2
+}{.mmb; sub a2 = a2, u2
+ nop.m 0
+dnl br.cloop.dptk L(top)
+ (p9) br.cond.dptk L(top)
+ ;;
+}
+L(end):
+ cmp.eq p10, p0 = 0, n
+ cmp.eq p11, p0 = 1, n
+ (p10) br L(0)
+
+L(2):
+ {.mmi; ld8 u2 = [up], 8
+ (p6) add c0 = 1, c0
+ cmp.ltu p7, p0 = a0, u0
+}{.mmb; sub a0 = a0, u0
+ nop.m 0
+ (p11) br L(1)
+ ;;
+} ld8 u0 = [up], 8
+ (p7) add c1 = 1, c1
+ cmp.ltu p8, p0 = a1, u1
+ sub a1 = a1, u1
+ ;;
+ (p8) add c2 = 1, c2
+ cmp.ltu p6, p0 = a2, u2
+ sub a2 = a2, u2
+ ;;
+ (p6) add c0 = 1, c0
+ cmp.ltu p7, p0 = a0, u0
+ sub a0 = a0, u0
+ ;;
+ (p7) add c1 = 1, c1
+ br L(com)
+
+
+L(1):
+ (p7) add c1 = 1, c1
+ cmp.ltu p8, p0 = a1, u1
+ sub a1 = a1, u1
+ ;;
+ (p8) add c2 = 1, c2
+ cmp.ltu p6, p0 = a2, u2
+ sub a2 = a2, u2
+ ;;
+ (p6) add c0 = 1, c0
+ br L(com)
+
+
+L(0):
+ (p6) add c0 = 1, c0
+ cmp.ltu p7, p0 = a0, u0
+ sub a0 = a0, u0
+ ;;
+ (p7) add c1 = 1, c1
+ cmp.ltu p8, p0 = a1, u1
+ sub a1 = a1, u1
+ ;;
+ (p8) add c2 = 1, c2
+
+L(com):
+C Fold the three 64-bit accumulators into a 48-bit residue.  They
+C carry weights 2^0, 2^16 and 2^32; whatever spills past bit 48
+C wraps back to bit 0, since 2^48 == 1 (mod 2^48 - 1).
+ shr.u r24 = a0, 48 C 16 bits
+ shr.u r25 = a1, 32 C 32 bits
+ shr.u r26 = a2, 16 C 48 bits
+ ;;
+ shr.u r10 = c0, 48 C 16 bits, always zero
+ shr.u r11 = c1, 32 C 32 bits
+ shr.u r30 = c2, 16 C 48 bits
+ ;;
+ dep.z r27 = a0, 0, 48 C 48 bits
+ dep.z r28 = a1, 16, 32 C 48 bits
+ dep.z r29 = a2, 32, 16 C 48 bits
+ dep.z r31 = c0, 0, 48 C 48 bits
+ dep.z r14 = c1, 16, 32 C 48 bits
+ dep.z r15 = c2, 32, 16 C 48 bits
+ ;;
+ {.mmi; add r24 = r24, r25
+ add r26 = r26, r27
+ add r28 = r28, r29
+}{.mmi; add r10 = r10, r11
+ add r30 = r30, r31
+ add r14 = r14, r15
+ ;;
+}
+ movl r8 = 0xffffffffffff0
+ add r24 = r24, r26
+ add r10 = r10, r30
+ ;;
+ add r24 = r24, r28
+ add r10 = r10, r14
+ ;;
+ sub r8 = r8, r24
+ ;;
+ add r8 = r8, r10
+ br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/mode1o.asm b/vendor/gmp-6.3.0/mpn/ia64/mode1o.asm
new file mode 100644
index 0000000..14d5e81
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/mode1o.asm
@@ -0,0 +1,342 @@
+dnl Itanium-2 mpn_modexact_1c_odd -- mpn by 1 exact remainder.
+
+dnl Contributed to the GNU project by Kevin Ryde.
+
+dnl Copyright 2003-2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C Itanium: 15
+C Itanium 2: 8
+
+
+dnl Usage: ABI32(`code')
+dnl
+dnl Emit the given code only under HAVE_ABI_32.
+dnl
+define(ABI32,
+m4_assert_onearg()
+`ifdef(`HAVE_ABI_32',`$1')')
+
+
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C
+C The modexact algorithm is usually conceived as a dependent chain
+C
+C l = src[i] - c
+C q = low(l * inverse)
+C c = high(q*divisor) + (src[i]<c)
+C
+C but we can work the src[i]-c into an xma by calculating si=src[i]*inverse
+C separately (off the dependent chain) and using
+C
+C q = low(c * inverse + si)
+C c = high(q*divisor + c)
+C
+C This means the dependent chain is simply xma.l followed by xma.hu, for a
+C total 8 cycles/limb on itanium-2.
+C
+C The reason xma.hu works for the new c is that the low of q*divisor is
+C src[i]-c (being the whole purpose of the q generated, and it can be
+C verified algebraically). If there was an underflow from src[i]-c, then
+C there will be an overflow from (src[i]-c)+c, thereby adding 1 to the new c
+C the same as the borrow bit (src[i]<c) gives in the first style shown.
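+C
+C A hedged C sketch of one iteration of the reworked chain (64-bit
+C mp_limb_t assumed; the variable names are illustrative):
+C
+C	mp_limb_t si = src[i] * inverse;	/* off the chain */
+C	mp_limb_t q  = c * inverse + si;	/* xma.l */
+C	c = (mp_limb_t)
+C	    (((unsigned __int128) q * divisor + c) >> 64);	/* xma.hu */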
+C
+C Incidentally, fcmp is not an option for treating src[i]-c, since it
+C apparently traps to the kernel for unnormalized operands like those used
+C and generated by ldf8 and xma. On one GNU/Linux system it took about 1200
+C cycles.
+C
+C
+C First Limb:
+C
+C The first limb uses q = (src[0]-c) * inverse shown in the first style.
+C This lets us get the first q as soon as the inverse is ready, without
+C going through si=s*inverse. Basically at the start we have c and can use
+C it while waiting for the inverse, whereas for the second and subsequent
+C limbs it's the other way around, ie. we have the inverse and are waiting
+C for c.
+C
+C At .Lentry the first two instructions in the loop have already been done:
+C the load of f11=src[1] at the start (predicated on size>=2), and the
+C calculation of q by the initial, different scheme.
+C
+C
+C Entry Sequence:
+C
+C In the entry sequence, the critical path is the calculation of the
+C inverse, so this is begun first and optimized. Apart from that, ar.lc is
+C established nice and early so the br.cloop's should predict perfectly.
+C And the load for the low limbs src[0] and src[1] can be initiated long
+C ahead of where they're needed.
+C
+C
+C Inverse Calculation:
+C
+C The initial 8-bit inverse is calculated using a table lookup. If it hits
+C L1 (which is likely if we're called several times) then it should take a
+C total 4 cycles, otherwise hopefully L2 for 9 cycles. This is considered
+C the best approach, on balance. It could be done bitwise, but that would
+C probably be about 14 cycles (2 per bit beyond the first couple). Or it
+C could be taken from 4 bits to 8 with xmpy doubling as used beyond 8 bits,
+C but that would be about 11 cycles.
+C
+C The table is not the same as binvert_limb_table, instead it's 256 bytes,
+C designed to be indexed by the low byte of the divisor. The divisor is
+C always odd, so the relevant data is every second byte in the table. The
+C padding lets us use zxt1 instead of extr.u; the latter would cost an extra
+C cycle because it must go down I0, and we're using the first I0 slot to get
+C ip. The extra 128 bytes of padding should be insignificant compared to
+C typical ia64 code bloat.
+C
+C Having the table in .text allows us to use IP-relative addressing,
+C avoiding a fetch from ltoff. .rodata is apparently not suitable for
+C IP-relative use; it gets a linker relocation overflow on GNU/Linux.
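+C
+C A hedged C sketch of the bit-doubling step used below (Newton's
+C iteration for the inverse mod 2^64; each pass doubles the number of
+C correct low bits, and the table name is illustrative):
+C
+C	mp_limb_t i = table[d & 0xff];	/* 8 correct bits */
+C	i = 2*i - d*i*i;	/* 16 bits; xma computes i*i*-d + 2*i */
+C	i = 2*i - d*i*i;	/* 32 bits */
+C	i = 2*i - d*i*i;	/* 64 bits: now d*i == 1 mod 2^64 */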
+C
+C
+C Load Scheduling:
+C
+C In the main loop, the data loads are scheduled for an L2 hit, which means
+C 6 cycles for the data ready to use. In fact we end up 7 cycles ahead. In
+C any case that scheduling is achieved simply by doing the load (and xmpy.l
+C for "si") in the immediately preceding iteration.
+C
+C The main loop requires size >= 2, and we handle size==1 by an initial
+C br.cloop to enter the loop only if size>1. Since ar.lc is established
+C early, this should predict perfectly.
+C
+C
+C Not done:
+C
+C Consideration was given to using a plain "(src[0]-c) % divisor" for
+C size==1, but cycle counting suggests about 50 for the sort of approach
+C taken by gcc __umodsi3, versus about 47 for the modexact. (Both assuming
+C L1 hits for their respective fetching.)
+C
+C Consideration was given to a test for high<divisor, replacing the last
+C loop iteration with c-=src[size-1] followed by c+=d on underflow.
+C Branching on high<divisor wouldn't be good since a mispredict would cost
+C more than the loop iteration saved, and the condition is of course data
+C dependent. So the theory would be to shorten the loop count if
+C high<divisor, and predicate extra operations at the end. That would mean
+C a gain of 6 when high<divisor, or a cost of 2 if not.
+C
+C Whether such a tradeoff is a win on average depends on assumptions about
+C how many bits in the high and the divisor. If both are uniformly
+C distributed then high<divisor about 50% of the time. But smallish
+C divisors (less chance of high<divisor) might be more likely from
+C applications (mpz_divisible_ui, mpz_gcd_ui, etc). Though biggish divisors
+C would be normal internally from say mpn/generic/perfsqr.c. On balance,
+C for the moment, it's felt the gain is not really enough to be worth the
+C trouble.
+C
+C
+C Enhancement:
+C
+C Process two source limbs per iteration using a two-limb inverse and a
+C sequence like
+C
+C ql = low (c * il + sil) quotient low limb
+C qlc = high(c * il + sil)
+C qh1 = low (c * ih + sih) quotient high, partial
+C
+C cl = high (ql * d + c) carry out of low
+C qh = low (qlc * 1 + qh1) quotient high limb
+C
+C new c = high (qh * d + cl) carry out of high
+C
+C This would be 13 cycles/iteration, giving 6.5 cycles/limb. The two limb
+C s*inverse as sih:sil = sh:sl * ih:il would be calculated off the dependent
+C chain with 4 multiplies. The bigger inverse would take extra time to
+C calculate, but a one limb iteration to handle an odd size could be done as
+C soon as 64-bits of inverse were ready.
+C
+C Perhaps this could even extend to a 3 limb inverse, which might promise 17
+C or 18 cycles for 3 limbs, giving 5.66 or 6.0 cycles/limb.
+C
+
+ASM_START()
+ .explicit
+
+ .text
+ .align 32
+.Ltable:
+data1 0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
+data1 0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
+data1 0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
+data1 0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
+data1 0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
+data1 0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
+data1 0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
+data1 0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
+data1 0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
+data1 0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
+data1 0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
+data1 0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
+data1 0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
+data1 0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
+data1 0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
+data1 0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
+
+
+PROLOGUE(mpn_modexact_1c_odd)
+
+ C r32 src
+ C r33 size
+ C r34 divisor
+ C r35 carry
+
+ .prologue
+.Lhere:
+{ .mmi; add r33 = -1, r33 C M0 size-1
+ mov r14 = 2 C M1 2
+ mov r15 = ip C I0 .Lhere
+}{.mmi; setf.sig f6 = r34 C M2 divisor
+ setf.sig f9 = r35 C M3 carry
+ zxt1 r3 = r34 C I1 divisor low byte
+} ;;
+
+{ .mmi; add r3 = .Ltable-.Lhere, r3 C M0 table offset ip and index
+ sub r16 = 0, r34 C M1 -divisor
+ .save ar.lc, r2
+ mov r2 = ar.lc C I0
+}{.mmi; .body
+ setf.sig f13 = r14 C M2 2 in significand
+ mov r17 = -1 C M3 -1
+ABI32(` zxt4 r33 = r33') C I1 size extend
+} ;;
+
+{ .mmi; add r3 = r3, r15 C M0 table entry address
+ABI32(` addp4 r32 = 0, r32') C M1 src extend
+ mov ar.lc = r33 C I0 size-1 loop count
+}{.mmi; setf.sig f12 = r16 C M2 -divisor
+ setf.sig f8 = r17 C M3 -1
+} ;;
+
+{ .mmi; ld1 r3 = [r3] C M0 inverse, 8 bits
+ ldf8 f10 = [r32], 8 C M1 src[0]
+ cmp.ne p6,p0 = 0, r33 C I0 test size!=1
+} ;;
+
+ C Wait for table load.
+	C Hope for an L1 hit of 1 cycle to the ALU, but it could be more.
+ setf.sig f7 = r3 C M2 inverse, 8 bits
+(p6) ldf8 f11 = [r32], 8 C M1 src[1], if size!=1
+ ;;
+
+ C 5 cycles
+
+ C f6 divisor
+ C f7 inverse, being calculated
+ C f8 -1, will be -inverse
+ C f9 carry
+ C f10 src[0]
+ C f11 src[1]
+ C f12 -divisor
+ C f13 2
+ C f14 scratch
+
+ xmpy.l f14 = f13, f7 C 2*i
+ xmpy.l f7 = f7, f7 C i*i
+ ;;
+ xma.l f7 = f7, f12, f14 C i*i*-d + 2*i, inverse 16 bits
+ ;;
+
+ xmpy.l f14 = f13, f7 C 2*i
+ xmpy.l f7 = f7, f7 C i*i
+ ;;
+ xma.l f7 = f7, f12, f14 C i*i*-d + 2*i, inverse 32 bits
+ ;;
+
+ xmpy.l f14 = f13, f7 C 2*i
+ xmpy.l f7 = f7, f7 C i*i
+ ;;
+
+ xma.l f7 = f7, f12, f14 C i*i*-d + 2*i, inverse 64 bits
+ xma.l f10 = f9, f8, f10 C sc = c * -1 + src[0]
+ ;;
+ASSERT(p6, `
+ xmpy.l f15 = f6, f7 ;; C divisor*inverse
+ getf.sig r31 = f15 ;;
+ cmp.eq p6,p0 = 1, r31 C should == 1
+')
+
+ xmpy.l f10 = f10, f7 C q = sc * inverse
+ xmpy.l f8 = f7, f8 C -inverse = inverse * -1
+ br.cloop.sptk.few.clr .Lentry C main loop, if size > 1
+ ;;
+
+ C size==1, finish up now
+ xma.hu f9 = f10, f6, f9 C c = high(q * divisor + c)
+ mov ar.lc = r2 C I0
+ ;;
+ getf.sig r8 = f9 C M2 return c
+ br.ret.sptk.many b0
+
+
+
+.Ltop:
+ C r2 saved ar.lc
+ C f6 divisor
+ C f7 inverse
+ C f8 -inverse
+ C f9 carry
+ C f10 src[i] * inverse
+ C f11 scratch src[i+1]
+
+ add r16 = 160, r32
+ ldf8 f11 = [r32], 8 C src[i+1]
+ ;;
+ C 2 cycles
+
+ lfetch [r16]
+ xma.l f10 = f9, f8, f10 C q = c * -inverse + si
+ ;;
+ C 3 cycles
+
+.Lentry:
+ xma.hu f9 = f10, f6, f9 C c = high(q * divisor + c)
+ xmpy.l f10 = f11, f7 C si = src[i] * inverse
+ br.cloop.sptk.few.clr .Ltop
+ ;;
+
+
+
+ xma.l f10 = f9, f8, f10 C q = c * -inverse + si
+ mov ar.lc = r2 C I0
+ ;;
+ xma.hu f9 = f10, f6, f9 C c = high(q * divisor + c)
+ ;;
+ getf.sig r8 = f9 C M2 return c
+ br.ret.sptk.many b0
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/mul_1.asm b/vendor/gmp-6.3.0/mpn/ia64/mul_1.asm
new file mode 100644
index 0000000..21bf6d0
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/mul_1.asm
@@ -0,0 +1,584 @@
+dnl IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and
+dnl store the result in a second limb vector.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2000-2004, 2006, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 4.0
+C Itanium 2: 2.0
+
+C TODO
+C * Further optimize feed-in and wind-down code, both for speed and code size.
+C * Handle low limb input and results specially, using a common stf8 in the
+C epilogue.
+C * Use 1 c/l carry propagation scheme in wind-down code.
+C * Use extra pointer register for `up' to speed up feed-in loads.
+C * Work out final differences with addmul_1.asm.
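+C
+C For reference, a hedged C sketch of the branch-free carry scheme the
+C main loop implements with mutex predicate pairs (lo/hi stand for the
+C xma.l/xma.hu halves of u[i]*v; the names are illustrative):
+C
+C	mp_limb_t sum = lo[i] + hi[i-1] + cy;
+C	cy = cy ? (sum <= lo[i]) : (sum < lo[i]);	/* cmp.leu / cmp.ltu */
+C	rp[i] = sum;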
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n', `r34')
+define(`vl', `r35')
+define(`cy', `r36') C for mpn_mul_1c
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ .prologue
+ .save ar.lc, r2
+ .body
+
+ifdef(`HAVE_ABI_32',
+` addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ zxt4 n = n C I
+ ;;
+')
+{.mfi
+ adds r15 = -1, n C M I
+ mov f9 = f0 C F
+ mov.i r2 = ar.lc C I0
+}
+{.mmi
+ ldf8 f7 = [up], 8 C M
+ nop.m 0 C M
+ and r14 = 3, n C M I
+ ;;
+}
+.Lcommon:
+{.mii
+ setf.sig f6 = vl C M2 M3
+ shr.u r31 = r15, 2 C I0
+ cmp.eq p10, p0 = 0, r14 C M I
+}
+{.mii
+ cmp.eq p11, p0 = 2, r14 C M I
+ cmp.eq p12, p0 = 3, r14 C M I
+ nop.i 0 C I
+ ;;
+}
+{.mii
+ cmp.ne p6, p7 = r0, r0 C M I
+ mov.i ar.lc = r31 C I0
+ cmp.ne p8, p9 = r0, r0 C M I
+}
+{.bbb
+ (p10) br.dptk .Lb00 C B
+ (p11) br.dptk .Lb10 C B
+ (p12) br.dptk .Lb11 C B
+ ;;
+}
+
+.Lb01: mov r20 = 0
+ br.cloop.dptk .grt1 C B
+
+ xma.l f39 = f7, f6, f9 C F
+ xma.hu f43 = f7, f6, f9 C F
+ ;;
+ getf.sig r8 = f43 C M2
+ stf8 [rp] = f39 C M2 M3
+ mov.i ar.lc = r2 C I0
+ br.ret.sptk.many b0 C B
+
+.grt1:
+ ldf8 f32 = [up], 8
+ ;;
+ ldf8 f33 = [up], 8
+ ;;
+ ldf8 f34 = [up], 8
+ xma.l f39 = f7, f6, f9
+ xma.hu f43 = f7, f6, f9
+ ;;
+ ldf8 f35 = [up], 8
+ br.cloop.dptk .grt5
+
+ xma.l f36 = f32, f6, f0
+ xma.hu f40 = f32, f6, f0
+ ;;
+ stf8 [rp] = f39, 8
+ xma.l f37 = f33, f6, f0
+ xma.hu f41 = f33, f6, f0
+ ;;
+ getf.sig r21 = f43
+ getf.sig r18 = f36
+ xma.l f38 = f34, f6, f0
+ xma.hu f42 = f34, f6, f0
+ ;;
+ getf.sig r22 = f40
+ getf.sig r19 = f37
+ xma.l f39 = f35, f6, f0
+ xma.hu f43 = f35, f6, f0
+ ;;
+ getf.sig r23 = f41
+ getf.sig r16 = f38
+ br .Lcj5
+
+.grt5:
+ xma.l f36 = f32, f6, f0
+ xma.hu f40 = f32, f6, f0
+ ;;
+ getf.sig r17 = f39
+ ldf8 f32 = [up], 8
+ xma.l f37 = f33, f6, f0
+ xma.hu f41 = f33, f6, f0
+ ;;
+ getf.sig r21 = f43
+ ldf8 f33 = [up], 8
+ xma.l f38 = f34, f6, f0
+ ;;
+ getf.sig r18 = f36
+ xma.hu f42 = f34, f6, f0
+ ;;
+ getf.sig r22 = f40
+ ldf8 f34 = [up], 8
+ xma.l f39 = f35, f6, f0
+ ;;
+ getf.sig r19 = f37
+ xma.hu f43 = f35, f6, f0
+ br .LL01
+
+
+.Lb10: ldf8 f35 = [up], 8
+ mov r23 = 0
+ br.cloop.dptk .grt2
+
+ xma.l f38 = f7, f6, f9
+ xma.hu f42 = f7, f6, f9
+ ;;
+ stf8 [rp] = f38, 8
+ xma.l f39 = f35, f6, f42
+ xma.hu f43 = f35, f6, f42
+ ;;
+ getf.sig r8 = f43
+ stf8 [rp] = f39
+ mov.i ar.lc = r2
+ br.ret.sptk.many b0
+
+
+.grt2:
+ ldf8 f32 = [up], 8
+ ;;
+ ldf8 f33 = [up], 8
+ xma.l f38 = f7, f6, f9
+ xma.hu f42 = f7, f6, f9
+ ;;
+ ldf8 f34 = [up], 8
+ xma.l f39 = f35, f6, f0
+ xma.hu f43 = f35, f6, f0
+ ;;
+ ldf8 f35 = [up], 8
+ br.cloop.dptk .grt6
+
+ stf8 [rp] = f38, 8
+ xma.l f36 = f32, f6, f0
+ xma.hu f40 = f32, f6, f0
+ ;;
+ getf.sig r20 = f42
+ getf.sig r17 = f39
+ xma.l f37 = f33, f6, f0
+ xma.hu f41 = f33, f6, f0
+ ;;
+ getf.sig r21 = f43
+ getf.sig r18 = f36
+ xma.l f38 = f34, f6, f0
+ xma.hu f42 = f34, f6, f0
+ ;;
+ getf.sig r22 = f40
+ getf.sig r19 = f37
+ xma.l f39 = f35, f6, f0
+ xma.hu f43 = f35, f6, f0
+ br .Lcj6
+
+.grt6:
+ getf.sig r16 = f38
+ xma.l f36 = f32, f6, f0
+ xma.hu f40 = f32, f6, f0
+ ;;
+ getf.sig r20 = f42
+ ldf8 f32 = [up], 8
+ xma.l f37 = f33, f6, f0
+ ;;
+ getf.sig r17 = f39
+ xma.hu f41 = f33, f6, f0
+ ;;
+ getf.sig r21 = f43
+ ldf8 f33 = [up], 8
+ xma.l f38 = f34, f6, f0
+ ;;
+ getf.sig r18 = f36
+ xma.hu f42 = f34, f6, f0
+ br .LL10
+
+
+.Lb11: ldf8 f34 = [up], 8
+ mov r22 = 0
+ ;;
+ ldf8 f35 = [up], 8
+ br.cloop.dptk .grt3
+ ;;
+
+ xma.l f37 = f7, f6, f9
+ xma.hu f41 = f7, f6, f9
+ xma.l f38 = f34, f6, f0
+ xma.hu f42 = f34, f6, f0
+ xma.l f39 = f35, f6, f0
+ xma.hu f43 = f35, f6, f0
+ ;;
+ getf.sig r23 = f41
+ stf8 [rp] = f37, 8
+ getf.sig r16 = f38
+ getf.sig r20 = f42
+ getf.sig r17 = f39
+ getf.sig r8 = f43
+ br .Lcj3
+
+.grt3:
+ ldf8 f32 = [up], 8
+ xma.l f37 = f7, f6, f9
+ xma.hu f41 = f7, f6, f9
+ ;;
+ ldf8 f33 = [up], 8
+ xma.l f38 = f34, f6, f0
+ xma.hu f42 = f34, f6, f0
+ ;;
+ getf.sig r19 = f37
+ ldf8 f34 = [up], 8
+ xma.l f39 = f35, f6, f0
+ xma.hu f43 = f35, f6, f0
+ ;;
+ getf.sig r23 = f41
+ ldf8 f35 = [up], 8
+ br.cloop.dptk .grt7
+
+ getf.sig r16 = f38
+ xma.l f36 = f32, f6, f0
+ getf.sig r20 = f42
+ xma.hu f40 = f32, f6, f0
+ ;;
+ getf.sig r17 = f39
+ xma.l f37 = f33, f6, f0
+ getf.sig r21 = f43
+ xma.hu f41 = f33, f6, f0
+ ;;
+ getf.sig r18 = f36
+ st8 [rp] = r19, 8
+ xma.l f38 = f34, f6, f0
+ xma.hu f42 = f34, f6, f0
+ br .Lcj7
+
+.grt7:
+ getf.sig r16 = f38
+ xma.l f36 = f32, f6, f0
+ xma.hu f40 = f32, f6, f0
+ ;;
+ getf.sig r20 = f42
+ ldf8 f32 = [up], 8
+ xma.l f37 = f33, f6, f0
+ ;;
+ getf.sig r17 = f39
+ xma.hu f41 = f33, f6, f0
+ br .LL11
+
+
+.Lb00: ldf8 f33 = [up], 8
+ mov r21 = 0
+ ;;
+ ldf8 f34 = [up], 8
+ ;;
+ ldf8 f35 = [up], 8
+ xma.l f36 = f7, f6, f9
+ xma.hu f40 = f7, f6, f9
+ br.cloop.dptk .grt4
+
+ xma.l f37 = f33, f6, f0
+ xma.hu f41 = f33, f6, f0
+ xma.l f38 = f34, f6, f0
+ xma.hu f42 = f34, f6, f0
+ ;;
+ getf.sig r22 = f40
+ stf8 [rp] = f36, 8
+ xma.l f39 = f35, f6, f0
+ getf.sig r19 = f37
+ xma.hu f43 = f35, f6, f0
+ ;;
+ getf.sig r23 = f41
+ getf.sig r16 = f38
+ getf.sig r20 = f42
+ getf.sig r17 = f39
+ br .Lcj4
+
+.grt4:
+ ldf8 f32 = [up], 8
+ xma.l f37 = f33, f6, f0
+ xma.hu f41 = f33, f6, f0
+ ;;
+ getf.sig r18 = f36
+ ldf8 f33 = [up], 8
+ xma.l f38 = f34, f6, f0
+ xma.hu f42 = f34, f6, f0
+ ;;
+ getf.sig r22 = f40
+ ldf8 f34 = [up], 8
+ xma.l f39 = f35, f6, f0
+ ;;
+ getf.sig r19 = f37
+ getf.sig r23 = f41
+ xma.hu f43 = f35, f6, f0
+ ldf8 f35 = [up], 8
+ br.cloop.dptk .grt8
+
+ getf.sig r16 = f38
+ xma.l f36 = f32, f6, f0
+ getf.sig r20 = f42
+ xma.hu f40 = f32, f6, f0
+ ;;
+ getf.sig r17 = f39
+ st8 [rp] = r18, 8
+ xma.l f37 = f33, f6, f0
+ xma.hu f41 = f33, f6, f0
+ br .Lcj8
+
+.grt8:
+ getf.sig r16 = f38
+ xma.l f36 = f32, f6, f0
+ xma.hu f40 = f32, f6, f0
+ br .LL00
+
+
+C *** MAIN LOOP START ***
+ ALIGN(32)
+.Loop:
+ .pred.rel "mutex",p6,p7
+ getf.sig r16 = f38
+ xma.l f36 = f32, f6, f0
+ (p6) cmp.leu p8, p9 = r24, r17
+ st8 [rp] = r24, 8
+ xma.hu f40 = f32, f6, f0
+ (p7) cmp.ltu p8, p9 = r24, r17
+ ;;
+.LL00:
+ .pred.rel "mutex",p8,p9
+ getf.sig r20 = f42
+ (p8) add r24 = r18, r21, 1
+ nop.b 0
+ ldf8 f32 = [up], 8
+ (p9) add r24 = r18, r21
+ nop.b 0
+ ;;
+ .pred.rel "mutex",p8,p9
+ getf.sig r17 = f39
+ xma.l f37 = f33, f6, f0
+ (p8) cmp.leu p6, p7 = r24, r18
+ st8 [rp] = r24, 8
+ xma.hu f41 = f33, f6, f0
+ (p9) cmp.ltu p6, p7 = r24, r18
+ ;;
+.LL11:
+ .pred.rel "mutex",p6,p7
+ getf.sig r21 = f43
+ (p6) add r24 = r19, r22, 1
+ nop.b 0
+ ldf8 f33 = [up], 8
+ (p7) add r24 = r19, r22
+ nop.b 0
+ ;;
+ .pred.rel "mutex",p6,p7
+ getf.sig r18 = f36
+ xma.l f38 = f34, f6, f0
+ (p6) cmp.leu p8, p9 = r24, r19
+ st8 [rp] = r24, 8
+ xma.hu f42 = f34, f6, f0
+ (p7) cmp.ltu p8, p9 = r24, r19
+ ;;
+.LL10:
+ .pred.rel "mutex",p8,p9
+ getf.sig r22 = f40
+ (p8) add r24 = r16, r23, 1
+ nop.b 0
+ ldf8 f34 = [up], 8
+ (p9) add r24 = r16, r23
+ nop.b 0
+ ;;
+ .pred.rel "mutex",p8,p9
+ getf.sig r19 = f37
+ xma.l f39 = f35, f6, f0
+ (p8) cmp.leu p6, p7 = r24, r16
+ st8 [rp] = r24, 8
+ xma.hu f43 = f35, f6, f0
+ (p9) cmp.ltu p6, p7 = r24, r16
+ ;;
+.LL01:
+ .pred.rel "mutex",p6,p7
+ getf.sig r23 = f41
+ (p6) add r24 = r17, r20, 1
+ nop.b 0
+ ldf8 f35 = [up], 8
+ (p7) add r24 = r17, r20
+ br.cloop.dptk .Loop
+C *** MAIN LOOP END ***
+ ;;
+
+.Lcj9:
+ .pred.rel "mutex",p6,p7
+ getf.sig r16 = f38
+ xma.l f36 = f32, f6, f0
+ (p6) cmp.leu p8, p9 = r24, r17
+ st8 [rp] = r24, 8
+ xma.hu f40 = f32, f6, f0
+ (p7) cmp.ltu p8, p9 = r24, r17
+ ;;
+ .pred.rel "mutex",p8,p9
+ getf.sig r20 = f42
+ (p8) add r24 = r18, r21, 1
+ (p9) add r24 = r18, r21
+ ;;
+ .pred.rel "mutex",p8,p9
+ getf.sig r17 = f39
+ xma.l f37 = f33, f6, f0
+ (p8) cmp.leu p6, p7 = r24, r18
+ st8 [rp] = r24, 8
+ xma.hu f41 = f33, f6, f0
+ (p9) cmp.ltu p6, p7 = r24, r18
+ ;;
+.Lcj8:
+ .pred.rel "mutex",p6,p7
+ getf.sig r21 = f43
+ (p6) add r24 = r19, r22, 1
+ (p7) add r24 = r19, r22
+ ;;
+ .pred.rel "mutex",p6,p7
+ getf.sig r18 = f36
+ xma.l f38 = f34, f6, f0
+ (p6) cmp.leu p8, p9 = r24, r19
+ st8 [rp] = r24, 8
+ xma.hu f42 = f34, f6, f0
+ (p7) cmp.ltu p8, p9 = r24, r19
+ ;;
+.Lcj7:
+ .pred.rel "mutex",p8,p9
+ getf.sig r22 = f40
+ (p8) add r24 = r16, r23, 1
+ (p9) add r24 = r16, r23
+ ;;
+ .pred.rel "mutex",p8,p9
+ getf.sig r19 = f37
+ xma.l f39 = f35, f6, f0
+ (p8) cmp.leu p6, p7 = r24, r16
+ st8 [rp] = r24, 8
+ xma.hu f43 = f35, f6, f0
+ (p9) cmp.ltu p6, p7 = r24, r16
+ ;;
+.Lcj6:
+ .pred.rel "mutex",p6,p7
+ getf.sig r23 = f41
+ (p6) add r24 = r17, r20, 1
+ (p7) add r24 = r17, r20
+ ;;
+ .pred.rel "mutex",p6,p7
+ (p6) cmp.leu p8, p9 = r24, r17
+ (p7) cmp.ltu p8, p9 = r24, r17
+ getf.sig r16 = f38
+ st8 [rp] = r24, 8
+ ;;
+.Lcj5:
+ .pred.rel "mutex",p8,p9
+ getf.sig r20 = f42
+ (p8) add r24 = r18, r21, 1
+ (p9) add r24 = r18, r21
+ ;;
+ .pred.rel "mutex",p8,p9
+ (p8) cmp.leu p6, p7 = r24, r18
+ (p9) cmp.ltu p6, p7 = r24, r18
+ getf.sig r17 = f39
+ st8 [rp] = r24, 8
+ ;;
+.Lcj4:
+ .pred.rel "mutex",p6,p7
+ getf.sig r8 = f43
+ (p6) add r24 = r19, r22, 1
+ (p7) add r24 = r19, r22
+ ;;
+ .pred.rel "mutex",p6,p7
+ st8 [rp] = r24, 8
+ (p6) cmp.leu p8, p9 = r24, r19
+ (p7) cmp.ltu p8, p9 = r24, r19
+ ;;
+.Lcj3:
+ .pred.rel "mutex",p8,p9
+ (p8) add r24 = r16, r23, 1
+ (p9) add r24 = r16, r23
+ ;;
+ .pred.rel "mutex",p8,p9
+ st8 [rp] = r24, 8
+ (p8) cmp.leu p6, p7 = r24, r16
+ (p9) cmp.ltu p6, p7 = r24, r16
+ ;;
+.Lcj2:
+ .pred.rel "mutex",p6,p7
+ (p6) add r24 = r17, r20, 1
+ (p7) add r24 = r17, r20
+ ;;
+ .pred.rel "mutex",p6,p7
+ st8 [rp] = r24, 8
+ (p6) cmp.leu p8, p9 = r24, r17
+ (p7) cmp.ltu p8, p9 = r24, r17
+ ;;
+ (p8) add r8 = 1, r8
+ mov.i ar.lc = r2
+ br.ret.sptk.many b0
+EPILOGUE()
+
+PROLOGUE(mpn_mul_1c)
+ .prologue
+ .save ar.lc, r2
+ .body
+
+ifdef(`HAVE_ABI_32',
+` addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ zxt4 n = n C I
+ ;;
+')
+{.mmi
+ adds r15 = -1, n C M I
+ setf.sig f9 = cy C M2 M3
+ mov.i r2 = ar.lc C I0
+}
+{.mmb
+ ldf8 f7 = [up], 8 C M
+ and r14 = 3, n C M I
+ br.sptk .Lcommon
+ ;;
+}
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/mul_2.asm b/vendor/gmp-6.3.0/mpn/ia64/mul_2.asm
new file mode 100644
index 0000000..5343f64
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/mul_2.asm
@@ -0,0 +1,625 @@
+dnl IA-64 mpn_mul_2 -- Multiply an n-limb number by a 2-limb number and
+dnl store the result in an (n+1)-limb number.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2004, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: ?
+C Itanium 2: 1.5
+
+C TODO
+C * Clean up variable names, and try to decrease the number of distinct
+C registers used.
+C * Clean up feed-in code to not require zeroing several registers.
+C * Make sure we don't depend on uninitialized predicate registers.
+C * Could perhaps save a few cycles by using 1 c/l carry propagation in
+C wind-down code.
+C * Ultimately rewrite. The problem with this code is that it first uses a
+C loaded u value in one xma pair, then leaves it live over several unrelated
+C xma pairs, before it uses it again. It should actually be quite possible
+C to just swap some aligned xma pairs around. But we should then schedule
+C u loads further from the first use.
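+C
+C Functional reference in terms of documented mpn calls (a hedged
+C sketch of the contract, not of this implementation):
+C
+C	mp_limb_t mul_2_ref (mp_ptr rp, mp_srcptr up, mp_size_t n,
+C	                     mp_srcptr vp)
+C	{
+C	  rp[n] = mpn_mul_1 (rp, up, n, vp[0]);	/* rp[0..n-1] + carry */
+C	  return mpn_addmul_1 (rp + 1, up, n, vp[1]);	/* add second row */
+C	}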
+
+C INPUT PARAMETERS
+define(`rp',`r32')
+define(`up',`r33')
+define(`n',`r34')
+define(`vp',`r35')
+
+define(`srp',`r3')
+
+define(`v0',`f6')
+define(`v1',`f7')
+
+define(`s0',`r14')
+define(`acc0',`r15')
+
+define(`pr0_0',`r16') define(`pr0_1',`r17')
+define(`pr0_2',`r18') define(`pr0_3',`r19')
+
+define(`pr1_0',`r20') define(`pr1_1',`r21')
+define(`pr1_2',`r22') define(`pr1_3',`r23')
+
+define(`acc1_0',`r24') define(`acc1_1',`r25')
+define(`acc1_2',`r26') define(`acc1_3',`r27')
+
+dnl define(`',`r28')
+dnl define(`',`r29')
+dnl define(`',`r30')
+dnl define(`',`r31')
+
+define(`fp0b_0',`f8') define(`fp0b_1',`f9')
+define(`fp0b_2',`f10') define(`fp0b_3',`f11')
+
+define(`fp1a_0',`f12') define(`fp1a_1',`f13')
+define(`fp1a_2',`f14') define(`fp1a_3',`f15')
+
+define(`fp1b_0',`f32') define(`fp1b_1',`f33')
+define(`fp1b_2',`f34') define(`fp1b_3',`f35')
+
+define(`fp2a_0',`f36') define(`fp2a_1',`f37')
+define(`fp2a_2',`f38') define(`fp2a_3',`f39')
+
+define(`u_0',`f44') define(`u_1',`f45')
+define(`u_2',`f46') define(`u_3',`f47')
+
+define(`ux',`f49')
+define(`uy',`f51')
+
+ASM_START()
+PROLOGUE(mpn_mul_2)
+ .prologue
+ .save ar.lc, r2
+ .body
+
+ifdef(`HAVE_ABI_32',`
+ {.mmi; addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ addp4 vp = 0, vp C M I
+}{.mmi; nop 1
+ nop 1
+ zxt4 n = n C I
+ ;;
+}')
+
+ {.mmi; ldf8 ux = [up], 8 C M
+ ldf8 v0 = [vp], 8 C M
+ mov r2 = ar.lc C I0
+}{.mmi; nop 1 C M
+ and r14 = 3, n C M I
+ add n = -2, n C M I
+ ;;
+}{.mmi; ldf8 uy = [up], 8 C M
+ ldf8 v1 = [vp] C M
+ shr.u n = n, 2 C I0
+}{.mmi; nop 1 C M
+ cmp.eq p10, p0 = 1, r14 C M I
+ cmp.eq p11, p0 = 2, r14 C M I
+ ;;
+}{.mmi; nop 1 C M
+ cmp.eq p12, p0 = 3, r14 C M I
+ mov ar.lc = n C I0
+}{.bbb; (p10) br.dptk L(b01) C B
+ (p11) br.dptk L(b10) C B
+ (p12) br.dptk L(b11) C B
+ ;;
+}
+ ALIGN(32)
+L(b00): ldf8 u_1 = [up], 8
+ mov acc1_2 = 0
+ mov pr1_2 = 0
+ mov pr0_3 = 0
+ cmp.ne p8, p9 = r0, r0
+ ;;
+ xma.l fp0b_3 = ux, v0, f0
+ cmp.ne p12, p13 = r0, r0
+ ldf8 u_2 = [up], 8
+ xma.hu fp1a_3 = ux, v0, f0
+ br.cloop.dptk L(gt4)
+
+ xma.l fp0b_0 = uy, v0, f0
+ xma.hu fp1a_0 = uy, v0, f0
+ ;;
+ getfsig acc0 = fp0b_3
+ xma.l fp1b_3 = ux, v1, fp1a_3
+ xma.hu fp2a_3 = ux, v1, fp1a_3
+ ;;
+ xma.l fp0b_1 = u_1, v0, f0
+ xma.hu fp1a_1 = u_1, v0, f0
+ ;;
+ getfsig pr0_0 = fp0b_0
+ xma.l fp1b_0 = uy, v1, fp1a_0
+ xma.hu fp2a_0 = uy, v1, fp1a_0
+ ;;
+ getfsig pr1_3 = fp1b_3
+ getfsig acc1_3 = fp2a_3
+ xma.l fp0b_2 = u_2, v0, f0
+ xma.hu fp1a_2 = u_2, v0, f0
+ br L(cj4)
+
+L(gt4): xma.l fp0b_0 = uy, v0, f0
+ xma.hu fp1a_0 = uy, v0, f0
+ ;;
+ getfsig acc0 = fp0b_3
+ xma.l fp1b_3 = ux, v1, fp1a_3
+ ldf8 u_3 = [up], 8
+ xma.hu fp2a_3 = ux, v1, fp1a_3
+ ;;
+ xma.l fp0b_1 = u_1, v0, f0
+ xma.hu fp1a_1 = u_1, v0, f0
+ ;;
+ getfsig pr0_0 = fp0b_0
+ xma.l fp1b_0 = uy, v1, fp1a_0
+ xma.hu fp2a_0 = uy, v1, fp1a_0
+ ;;
+ ldf8 u_0 = [up], 8
+ getfsig pr1_3 = fp1b_3
+ xma.l fp0b_2 = u_2, v0, f0
+ ;;
+ getfsig acc1_3 = fp2a_3
+ xma.hu fp1a_2 = u_2, v0, f0
+ br L(00)
+
+
+ ALIGN(32)
+L(b01): ldf8 u_0 = [up], 8 C M
+ mov acc1_1 = 0 C M I
+ mov pr1_1 = 0 C M I
+ mov pr0_2 = 0 C M I
+ cmp.ne p6, p7 = r0, r0 C M I
+ ;;
+ xma.l fp0b_2 = ux, v0, f0 C F
+ cmp.ne p10, p11 = r0, r0 C M I
+ ldf8 u_1 = [up], 8 C M
+ xma.hu fp1a_2 = ux, v0, f0 C F
+ ;;
+ xma.l fp0b_3 = uy, v0, f0 C F
+ xma.hu fp1a_3 = uy, v0, f0 C F
+ ;;
+ getfsig acc0 = fp0b_2 C M
+ xma.l fp1b_2 = ux, v1,fp1a_2 C F
+ ldf8 u_2 = [up], 8 C M
+ xma.hu fp2a_2 = ux, v1,fp1a_2 C F
+ br.cloop.dptk L(gt5)
+
+ xma.l fp0b_0 = u_0, v0, f0 C F
+ xma.hu fp1a_0 = u_0, v0, f0 C F
+ ;;
+ getfsig pr0_3 = fp0b_3 C M
+ xma.l fp1b_3 = uy, v1,fp1a_3 C F
+ xma.hu fp2a_3 = uy, v1,fp1a_3 C F
+ ;;
+ getfsig pr1_2 = fp1b_2 C M
+ getfsig acc1_2 = fp2a_2 C M
+ xma.l fp0b_1 = u_1, v0, f0 C F
+ xma.hu fp1a_1 = u_1, v0, f0 C F
+ br L(cj5)
+
+L(gt5): xma.l fp0b_0 = u_0, v0, f0
+ xma.hu fp1a_0 = u_0, v0, f0
+ ;;
+ getfsig pr0_3 = fp0b_3
+ xma.l fp1b_3 = uy, v1, fp1a_3
+ xma.hu fp2a_3 = uy, v1, fp1a_3
+ ;;
+ ldf8 u_3 = [up], 8
+ getfsig pr1_2 = fp1b_2
+ xma.l fp0b_1 = u_1, v0, f0
+ ;;
+ getfsig acc1_2 = fp2a_2
+ xma.hu fp1a_1 = u_1, v0, f0
+ br L(01)
+
+
+ ALIGN(32)
+L(b10): br.cloop.dptk L(gt2)
+ xma.l fp0b_1 = ux, v0, f0
+ xma.hu fp1a_1 = ux, v0, f0
+ ;;
+ xma.l fp0b_2 = uy, v0, f0
+ xma.hu fp1a_2 = uy, v0, f0
+ ;;
+ stf8 [rp] = fp0b_1, 8
+ xma.l fp1b_1 = ux, v1, fp1a_1
+ xma.hu fp2a_1 = ux, v1, fp1a_1
+ ;;
+ getfsig acc0 = fp0b_2
+ xma.l fp1b_2 = uy, v1, fp1a_2
+ xma.hu fp2a_2 = uy, v1, fp1a_2
+ ;;
+ getfsig pr1_1 = fp1b_1
+ getfsig acc1_1 = fp2a_1
+ mov ar.lc = r2
+ getfsig pr1_2 = fp1b_2
+ getfsig r8 = fp2a_2
+ ;;
+ add s0 = pr1_1, acc0
+ ;;
+ st8 [rp] = s0, 8
+ cmp.ltu p8, p9 = s0, pr1_1
+ sub r31 = -1, acc1_1
+ ;;
+ .pred.rel "mutex", p8, p9
+ (p8) add acc0 = pr1_2, acc1_1, 1
+ (p9) add acc0 = pr1_2, acc1_1
+ (p8) cmp.leu p10, p0 = r31, pr1_2
+ (p9) cmp.ltu p10, p0 = r31, pr1_2
+ ;;
+ st8 [rp] = acc0, 8
+ (p10) add r8 = 1, r8
+ br.ret.sptk.many b0
+
+L(gt2): ldf8 u_3 = [up], 8
+ mov acc1_0 = 0
+ mov pr1_0 = 0
+ ;;
+ mov pr0_1 = 0
+ xma.l fp0b_1 = ux, v0, f0
+ ldf8 u_0 = [up], 8
+ xma.hu fp1a_1 = ux, v0, f0
+ ;;
+ xma.l fp0b_2 = uy, v0, f0
+ xma.hu fp1a_2 = uy, v0, f0
+ ;;
+ getfsig acc0 = fp0b_1
+ xma.l fp1b_1 = ux, v1, fp1a_1
+ xma.hu fp2a_1 = ux, v1, fp1a_1
+ ;;
+ ldf8 u_1 = [up], 8
+ xma.l fp0b_3 = u_3, v0, f0
+ xma.hu fp1a_3 = u_3, v0, f0
+ ;;
+ getfsig pr0_2 = fp0b_2
+ xma.l fp1b_2 = uy, v1, fp1a_2
+ xma.hu fp2a_2 = uy, v1, fp1a_2
+ ;;
+ ldf8 u_2 = [up], 8
+ getfsig pr1_1 = fp1b_1
+ ;;
+ {.mfi; getfsig acc1_1 = fp2a_1
+ xma.l fp0b_0 = u_0, v0, f0
+ cmp.ne p8, p9 = r0, r0
+}{.mfb; cmp.ne p12, p13 = r0, r0
+ xma.hu fp1a_0 = u_0, v0, f0
+ br L(10)
+}
+
+ ALIGN(32)
+L(b11): mov acc1_3 = 0
+ mov pr1_3 = 0
+ mov pr0_0 = 0
+ ldf8 u_2 = [up], 8
+ cmp.ne p6, p7 = r0, r0
+ br.cloop.dptk L(gt3)
+ ;;
+ xma.l fp0b_0 = ux, v0, f0
+ xma.hu fp1a_0 = ux, v0, f0
+ ;;
+ cmp.ne p10, p11 = r0, r0
+ xma.l fp0b_1 = uy, v0, f0
+ xma.hu fp1a_1 = uy, v0, f0
+ ;;
+ getfsig acc0 = fp0b_0
+ xma.l fp1b_0 = ux, v1, fp1a_0
+ xma.hu fp2a_0 = ux, v1, fp1a_0
+ ;;
+ xma.l fp0b_2 = u_2, v0, f0
+ xma.hu fp1a_2 = u_2, v0, f0
+ ;;
+ getfsig pr0_1 = fp0b_1
+ xma.l fp1b_1 = uy, v1, fp1a_1
+ xma.hu fp2a_1 = uy, v1, fp1a_1
+ ;;
+ getfsig pr1_0 = fp1b_0
+ getfsig acc1_0 = fp2a_0
+ br L(cj3)
+
+L(gt3): xma.l fp0b_0 = ux, v0, f0
+ cmp.ne p10, p11 = r0, r0
+ ldf8 u_3 = [up], 8
+ xma.hu fp1a_0 = ux, v0, f0
+ ;;
+ xma.l fp0b_1 = uy, v0, f0
+ xma.hu fp1a_1 = uy, v0, f0
+ ;;
+ getfsig acc0 = fp0b_0
+ xma.l fp1b_0 = ux, v1, fp1a_0
+ ldf8 u_0 = [up], 8
+ xma.hu fp2a_0 = ux, v1, fp1a_0
+ ;;
+ xma.l fp0b_2 = u_2, v0, f0
+ xma.hu fp1a_2 = u_2, v0, f0
+ ;;
+ getfsig pr0_1 = fp0b_1
+ xma.l fp1b_1 = uy, v1, fp1a_1
+ xma.hu fp2a_1 = uy, v1, fp1a_1
+ ;;
+ ldf8 u_1 = [up], 8
+ getfsig pr1_0 = fp1b_0
+ ;;
+ getfsig acc1_0 = fp2a_0
+ xma.l fp0b_3 = u_3, v0, f0
+ xma.hu fp1a_3 = u_3, v0, f0
+ br L(11)
+
+
+C *** MAIN LOOP START ***
+ ALIGN(32)
+L(top): C 00
+ .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ ldf8 u_3 = [up], 8
+ getfsig pr1_2 = fp1b_2
+ (p8) cmp.leu p6, p7 = acc0, pr0_1
+ (p9) cmp.ltu p6, p7 = acc0, pr0_1
+ (p12) cmp.leu p10, p11 = s0, pr1_0
+ (p13) cmp.ltu p10, p11 = s0, pr1_0
+ ;; C 01
+ .pred.rel "mutex", p6, p7
+ getfsig acc1_2 = fp2a_2
+ st8 [rp] = s0, 8
+ xma.l fp0b_1 = u_1, v0, f0
+ (p6) add acc0 = pr0_2, acc1_0, 1
+ (p7) add acc0 = pr0_2, acc1_0
+ xma.hu fp1a_1 = u_1, v0, f0
+ ;; C 02
+L(01):
+ .pred.rel "mutex", p10, p11
+ getfsig pr0_0 = fp0b_0
+ xma.l fp1b_0 = u_0, v1, fp1a_0
+ (p10) add s0 = pr1_1, acc0, 1
+ (p11) add s0 = pr1_1, acc0
+ xma.hu fp2a_0 = u_0, v1, fp1a_0
+ nop 1
+ ;; C 03
+ .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ ldf8 u_0 = [up], 8
+ getfsig pr1_3 = fp1b_3
+ (p6) cmp.leu p8, p9 = acc0, pr0_2
+ (p7) cmp.ltu p8, p9 = acc0, pr0_2
+ (p10) cmp.leu p12, p13 = s0, pr1_1
+ (p11) cmp.ltu p12, p13 = s0, pr1_1
+ ;; C 04
+ .pred.rel "mutex", p8, p9
+ getfsig acc1_3 = fp2a_3
+ st8 [rp] = s0, 8
+ xma.l fp0b_2 = u_2, v0, f0
+ (p8) add acc0 = pr0_3, acc1_1, 1
+ (p9) add acc0 = pr0_3, acc1_1
+ xma.hu fp1a_2 = u_2, v0, f0
+ ;; C 05
+L(00):
+ .pred.rel "mutex", p12, p13
+ getfsig pr0_1 = fp0b_1
+ xma.l fp1b_1 = u_1, v1, fp1a_1
+ (p12) add s0 = pr1_2, acc0, 1
+ (p13) add s0 = pr1_2, acc0
+ xma.hu fp2a_1 = u_1, v1, fp1a_1
+ nop 1
+ ;; C 06
+ .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ ldf8 u_1 = [up], 8
+ getfsig pr1_0 = fp1b_0
+ (p8) cmp.leu p6, p7 = acc0, pr0_3
+ (p9) cmp.ltu p6, p7 = acc0, pr0_3
+ (p12) cmp.leu p10, p11 = s0, pr1_2
+ (p13) cmp.ltu p10, p11 = s0, pr1_2
+ ;; C 07
+ .pred.rel "mutex", p6, p7
+ getfsig acc1_0 = fp2a_0
+ st8 [rp] = s0, 8
+ xma.l fp0b_3 = u_3, v0, f0
+ (p6) add acc0 = pr0_0, acc1_2, 1
+ (p7) add acc0 = pr0_0, acc1_2
+ xma.hu fp1a_3 = u_3, v0, f0
+ ;; C 08
+L(11):
+ .pred.rel "mutex", p10, p11
+ getfsig pr0_2 = fp0b_2
+ xma.l fp1b_2 = u_2, v1, fp1a_2
+ (p10) add s0 = pr1_3, acc0, 1
+ (p11) add s0 = pr1_3, acc0
+ xma.hu fp2a_2 = u_2, v1, fp1a_2
+ nop 1
+ ;; C 09
+ .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ ldf8 u_2 = [up], 8
+ getfsig pr1_1 = fp1b_1
+ (p6) cmp.leu p8, p9 = acc0, pr0_0
+ (p7) cmp.ltu p8, p9 = acc0, pr0_0
+ (p10) cmp.leu p12, p13 = s0, pr1_3
+ (p11) cmp.ltu p12, p13 = s0, pr1_3
+ ;; C 10
+ .pred.rel "mutex", p8, p9
+ getfsig acc1_1 = fp2a_1
+ st8 [rp] = s0, 8
+ xma.l fp0b_0 = u_0, v0, f0
+ (p8) add acc0 = pr0_1, acc1_3, 1
+ (p9) add acc0 = pr0_1, acc1_3
+ xma.hu fp1a_0 = u_0, v0, f0
+ ;; C 11
+L(10):
+ .pred.rel "mutex", p12, p13
+ getfsig pr0_3 = fp0b_3
+ xma.l fp1b_3 = u_3, v1, fp1a_3
+ (p12) add s0 = pr1_0, acc0, 1
+ (p13) add s0 = pr1_0, acc0
+ xma.hu fp2a_3 = u_3, v1, fp1a_3
+ br.cloop.dptk L(top)
+ ;;
+C *** MAIN LOOP END ***
+
+ .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ {.mmi; getfsig pr1_2 = fp1b_2
+ st8 [rp] = s0, 8
+ (p8) cmp.leu p6, p7 = acc0, pr0_1
+}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
+ (p12) cmp.leu p10, p11 = s0, pr1_0
+ (p13) cmp.ltu p10, p11 = s0, pr1_0
+ ;;
+} .pred.rel "mutex", p6, p7
+ {.mfi; getfsig acc1_2 = fp2a_2
+ xma.l fp0b_1 = u_1, v0, f0
+ nop 1
+}{.mmf; (p6) add acc0 = pr0_2, acc1_0, 1
+ (p7) add acc0 = pr0_2, acc1_0
+ xma.hu fp1a_1 = u_1, v0, f0
+ ;;
+}
+L(cj5):
+ .pred.rel "mutex", p10, p11
+ {.mfi; getfsig pr0_0 = fp0b_0
+ xma.l fp1b_0 = u_0, v1, fp1a_0
+ (p10) add s0 = pr1_1, acc0, 1
+}{.mfi; (p11) add s0 = pr1_1, acc0
+ xma.hu fp2a_0 = u_0, v1, fp1a_0
+ nop 1
+ ;;
+} .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ {.mmi; getfsig pr1_3 = fp1b_3
+ st8 [rp] = s0, 8
+ (p6) cmp.leu p8, p9 = acc0, pr0_2
+}{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
+ (p10) cmp.leu p12, p13 = s0, pr1_1
+ (p11) cmp.ltu p12, p13 = s0, pr1_1
+ ;;
+} .pred.rel "mutex", p8, p9
+ {.mfi; getfsig acc1_3 = fp2a_3
+ xma.l fp0b_2 = u_2, v0, f0
+ nop 1
+}{.mmf; (p8) add acc0 = pr0_3, acc1_1, 1
+ (p9) add acc0 = pr0_3, acc1_1
+ xma.hu fp1a_2 = u_2, v0, f0
+ ;;
+}
+L(cj4):
+ .pred.rel "mutex", p12, p13
+ {.mfi; getfsig pr0_1 = fp0b_1
+ xma.l fp1b_1 = u_1, v1, fp1a_1
+ (p12) add s0 = pr1_2, acc0, 1
+}{.mfi; (p13) add s0 = pr1_2, acc0
+ xma.hu fp2a_1 = u_1, v1, fp1a_1
+ nop 1
+ ;;
+} .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ {.mmi; getfsig pr1_0 = fp1b_0
+ st8 [rp] = s0, 8
+ (p8) cmp.leu p6, p7 = acc0, pr0_3
+}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3
+ (p12) cmp.leu p10, p11 = s0, pr1_2
+ (p13) cmp.ltu p10, p11 = s0, pr1_2
+ ;;
+} .pred.rel "mutex", p6, p7
+ {.mmi; getfsig acc1_0 = fp2a_0
+ (p6) add acc0 = pr0_0, acc1_2, 1
+ (p7) add acc0 = pr0_0, acc1_2
+ ;;
+}
+L(cj3):
+ .pred.rel "mutex", p10, p11
+ {.mfi; getfsig pr0_2 = fp0b_2
+ xma.l fp1b_2 = u_2, v1, fp1a_2
+ (p10) add s0 = pr1_3, acc0, 1
+}{.mfi; (p11) add s0 = pr1_3, acc0
+ xma.hu fp2a_2 = u_2, v1, fp1a_2
+ nop 1
+ ;;
+} .pred.rel "mutex", p6, p7
+ .pred.rel "mutex", p10, p11
+ {.mmi; getfsig pr1_1 = fp1b_1
+ st8 [rp] = s0, 8
+ (p6) cmp.leu p8, p9 = acc0, pr0_0
+}{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0
+ (p10) cmp.leu p12, p13 = s0, pr1_3
+ (p11) cmp.ltu p12, p13 = s0, pr1_3
+ ;;
+} .pred.rel "mutex", p8, p9
+ {.mmi; getfsig acc1_1 = fp2a_1
+ (p8) add acc0 = pr0_1, acc1_3, 1
+ (p9) add acc0 = pr0_1, acc1_3
+ ;;
+} .pred.rel "mutex", p12, p13
+ {.mmi; (p12) add s0 = pr1_0, acc0, 1
+ (p13) add s0 = pr1_0, acc0
+ nop 1
+ ;;
+} .pred.rel "mutex", p8, p9
+ .pred.rel "mutex", p12, p13
+ {.mmi; getfsig pr1_2 = fp1b_2
+ st8 [rp] = s0, 8
+ (p8) cmp.leu p6, p7 = acc0, pr0_1
+}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
+ (p12) cmp.leu p10, p11 = s0, pr1_0
+ (p13) cmp.ltu p10, p11 = s0, pr1_0
+ ;;
+} .pred.rel "mutex", p6, p7
+ {.mmi; getfsig r8 = fp2a_2
+ (p6) add acc0 = pr0_2, acc1_0, 1
+ (p7) add acc0 = pr0_2, acc1_0
+ ;;
+} .pred.rel "mutex", p10, p11
+ {.mmi; (p10) add s0 = pr1_1, acc0, 1
+ (p11) add s0 = pr1_1, acc0
+ (p6) cmp.leu p8, p9 = acc0, pr0_2
+ ;;
+} .pred.rel "mutex", p10, p11
+ {.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
+ (p10) cmp.leu p12, p13 = s0, pr1_1
+ (p11) cmp.ltu p12, p13 = s0, pr1_1
+ ;;
+} .pred.rel "mutex", p8, p9
+ {.mmi; st8 [rp] = s0, 8
+ (p8) add acc0 = pr1_2, acc1_1, 1
+ (p9) add acc0 = pr1_2, acc1_1
+ ;;
+} .pred.rel "mutex", p8, p9
+ {.mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2
+ (p9) cmp.ltu p10, p11 = acc0, pr1_2
+ (p12) add acc0 = 1, acc0
+ ;;
+}{.mmi; st8 [rp] = acc0, 8
+ (p12) cmpeqor p10, p0 = 0, acc0
+ nop 1
+ ;;
+}{.mib; (p10) add r8 = 1, r8
+ mov ar.lc = r2
+ br.ret.sptk.many b0
+}
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/popcount.asm b/vendor/gmp-6.3.0/mpn/ia64/popcount.asm
new file mode 100644
index 0000000..c0b5c5c
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/popcount.asm
@@ -0,0 +1,200 @@
+dnl IA-64 mpn_popcount -- mpn population count.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2000-2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 1.5
+C Itanium 2: 1
+
+C INPUT PARAMETERS
+define(`up', `r32')
+define(`n', `r33')
+
+define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
+define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
+define(`s',`r8')
+
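+C For reference, a minimal C sketch of what this routine computes
+C (ref_popcount is a hypothetical name, not part of GMP; assumes a
+C 64-bit limb and GCC's __builtin_popcountll):
+C
+C   mp_limb_t
+C   ref_popcount (const mp_limb_t *up, mp_size_t n)
+C   {
+C     mp_limb_t s = 0;
+C     for (mp_size_t i = 0; i < n; i++)
+C       s += __builtin_popcountll (up[i]);   /* one popcnt per limb */
+C     return s;
+C   }
+C
+C The code below unrolls this loop 4x and dispatches on n mod 4.
+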
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+ .prologue
+ifdef(`HAVE_ABI_32',
+` addp4 up = 0, up C M I
+ nop.m 0
+ zxt4 n = n C I
+ ;;
+')
+
+ {.mmi; add r9 = 512, up C prefetch pointer M I
+ ld8 r10 = [up], 8 C load first limb M01
+ mov.i r2 = ar.lc C save ar.lc I0
+}{.mmi; and r14 = 3, n C M I
+ cmp.lt p15, p14 = 4, n C small count? M I
+ add n = -5, n C M I
+ ;;
+}{.mmi; cmp.eq p6, p0 = 1, r14 C M I
+ cmp.eq p7, p0 = 2, r14 C M I
+ cmp.eq p8, p0 = 3, r14 C M I
+}{.bbb
+ (p6) br.dptk .Lb01 C B
+ (p7) br.dptk .Lb10 C B
+ (p8) br.dptk .Lb11 C B
+}
+
+
+.Lb00: ld8 u1 = [up], 8 C M01
+ shr.u n = n, 2 C I0
+ mov s = 0 C M I
+ ;;
+ ld8 u2 = [up], 8 C M01
+ popcnt c0 = r10 C I0
+ mov.i ar.lc = n C I0
+ ;;
+ ld8 u3 = [up], 8 C M01
+ popcnt c1 = u1 C I0
+ (p15) br.cond.dptk .grt4 C B
+ ;;
+ nop.m 0 C -
+ nop.m 0 C -
+ popcnt c2 = u2 C I0
+ ;;
+ mov s = c0 C M I
+ popcnt c3 = u3 C I0
+ br .Lcj4 C B
+
+.grt4: ld8 u0 = [up], 8 C M01
+ popcnt c2 = u2 C I0
+ br .LL00 C B
+
+
+.Lb01:
+ popcnt s = r10 C I0
+ (p14) br.ret.sptk.many b0 C B
+
+.grt1: ld8 u0 = [up], 8 C M01
+ shr.u n = n, 2 C I0
+ ;;
+ ld8 u1 = [up], 8 C M01
+ mov.i ar.lc = n C I0
+ ;;
+ ld8 u2 = [up], 8 C M01
+ popcnt c0 = u0 C I0
+ mov c3 = 0 C I0
+
+ ;;
+ ld8 u3 = [up], 8 C M01
+ popcnt c1 = u1 C I0
+ br.cloop.dptk .Loop C B
+ br .Lend C B
+
+
+.Lb10: ld8 u3 = [up], 8 C M01
+ shr.u n = n, 2 C I0
+ (p15) br.cond.dptk .grt2 C B
+
+ popcnt s = r10 C I0
+ ;;
+ popcnt c3 = u3 C I0
+ br .Lcj2 C B
+
+.grt2: ld8 u0 = [up], 8 C M01
+ mov.i ar.lc = n C I0
+ popcnt c2 = r10 C I0
+ ;;
+ ld8 u1 = [up], 8 C M01
+ popcnt c3 = u3 C I0
+ mov s = 0 C M I
+ ;;
+ ld8 u2 = [up], 8 C M01
+ popcnt c0 = u0 C I0
+ br .LL10 C B
+
+
+.Lb11: ld8 u2 = [up], 8 C M01
+ shr.u n = n, 2 C I0
+ mov s = 0 C M I
+ ;;
+ ld8 u3 = [up], 8 C M01
+ popcnt s = r10 C I0
+ (p15) br.cond.dptk .grt3 C B
+
+ popcnt c2 = u2 C I0
+ ;;
+ popcnt c3 = u3 C I0
+ br .Lcj3 C B
+
+.grt3: ld8 u0 = [up], 8 C M01
+ popcnt c2 = u2 C I0
+ mov.i ar.lc = n C I0
+ mov c1 = 0
+ ;;
+ ld8 u1 = [up], 8 C M01
+ popcnt c3 = u3 C I0
+ br .LL11 C B
+
+
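+C Four limbs per iteration: each limb is loaded (ld8), counted (popcnt),
+C and added into s one rotation later.  .LL00, .LL11 and .LL10 are the
+C feed-in entry points for the n mod 4 cases, and the lfetch keeps the
+C prefetch pointer (r9, primed to up + 512) running ahead of the loads.
+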
+.Loop: ld8 u0 = [up], 8 C M01
+ popcnt c2 = u2 C I0
+ add s = s, c3 C M I
+ ;;
+.LL00: ld8 u1 = [up], 8 C M01
+ popcnt c3 = u3 C I0
+ add s = s, c0 C M I
+ ;;
+.LL11: ld8 u2 = [up], 8 C M01
+ popcnt c0 = u0 C I0
+ add s = s, c1 C M I
+ ;;
+.LL10: ld8 u3 = [up], 8 C M01
+ popcnt c1 = u1 C I0
+ add s = s, c2 C M I
+ lfetch [r9], 32 C M01
+ nop.m 0 C -
+ br.cloop.dptk .Loop C B
+ ;;
+
+.Lend: popcnt c2 = u2 C I0
+ add s = s, c3 C M I
+ ;;
+ popcnt c3 = u3 C I0
+ add s = s, c0 C M I
+ ;;
+.Lcj4: add s = s, c1 C M I
+ ;;
+.Lcj3: add s = s, c2 C M I
+ ;;
+.Lcj2: add s = s, c3 C M I
+ mov.i ar.lc = r2 C I0
+ br.ret.sptk.many b0 C B
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/rsh1aors_n.asm b/vendor/gmp-6.3.0/mpn/ia64/rsh1aors_n.asm
new file mode 100644
index 0000000..3c7defb
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/rsh1aors_n.asm
@@ -0,0 +1,447 @@
+dnl IA-64 mpn_rsh1add_n/mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2003-2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 2.5
+C Itanium 2: 1.5
+
+C TODO
+C * Rewrite function entry code using aorslsh1_n.asm style.
+C * Micro-optimize feed-in and wind-down code.
+
+C INPUT PARAMETERS
+define(`rp',`r32')
+define(`up',`r33')
+define(`vp',`r34')
+define(`n',`r35')
+
+ifdef(`OPERATION_rsh1add_n',`
+ define(ADDSUB, add)
+ define(PRED, ltu)
+ define(INCR, 1)
+ define(LIM, -1)
+ define(func, mpn_rsh1add_n)
+')
+ifdef(`OPERATION_rsh1sub_n',`
+ define(ADDSUB, sub)
+ define(PRED, gtu)
+ define(INCR, -1)
+ define(LIM, 0)
+ define(func, mpn_rsh1sub_n)
+')
+
+C Some useful aliases for registers we use
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
+define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
+define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
+define(`x0',`r26') define(`x1',`r9') define(`x2',`r30') define(`x3',`r31')
+
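+C For reference, a minimal C sketch of mpn_rsh1add_n's semantics
+C (ref_rsh1add_n is a hypothetical name, not part of GMP; assumes a
+C 64-bit limb):
+C
+C   mp_limb_t
+C   ref_rsh1add_n (mp_limb_t *rp, const mp_limb_t *up,
+C                  const mp_limb_t *vp, mp_size_t n)
+C   {
+C     mp_limb_t prev = up[0] + vp[0];
+C     mp_limb_t cy = prev < up[0];               /* carry out of limb 0 */
+C     mp_limb_t retval = prev & 1;               /* bit shifted out */
+C     for (mp_size_t i = 1; i < n; i++)
+C       {
+C         mp_limb_t w = up[i] + vp[i] + cy;
+C         cy = cy ? (w <= up[i]) : (w < up[i]);  /* carry out */
+C         rp[i-1] = (w << 63) | (prev >> 1);     /* cf. shrp below */
+C         prev = w;
+C       }
+C     rp[n-1] = (cy << 63) | (prev >> 1);
+C     return retval;
+C   }
+C
+C mpn_rsh1sub_n is the same with subtraction and borrows; the ADDSUB,
+C PRED, INCR and LIM macros below select between the two variants.
+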
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+PROLOGUE(func)
+ .prologue
+ .save ar.lc, r2
+ .body
+ifdef(`HAVE_ABI_32',`
+ addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ addp4 vp = 0, vp C M I
+ nop.m 0
+ nop.m 0
+ zxt4 n = n C I
+ ;;
+')
+ {.mmi; ld8 r11 = [vp], 8 C M01
+ ld8 r10 = [up], 8 C M01
+ mov.i r2 = ar.lc C I0
+}{.mmi; and r14 = 3, n C M I
+ cmp.lt p15, p0 = 4, n C M I
+ add n = -4, n C M I
+ ;;
+}{.mmi; cmp.eq p6, p0 = 1, r14 C M I
+ cmp.eq p7, p0 = 2, r14 C M I
+ cmp.eq p8, p0 = 3, r14 C M I
+}{.bbb
+ (p6) br.dptk .Lb01 C B
+ (p7) br.dptk .Lb10 C B
+ (p8) br.dptk .Lb11 C B
+}
+
+.Lb00: ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ shr.u n = n, 2 C I0
+ ;;
+ ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ ADDSUB w3 = r10, r11 C M I
+ ;;
+ ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ (p15) br.dpnt .grt4 C B
+ ;;
+
+ cmp.PRED p7, p0 = w3, r10 C M I
+ and r8 = 1, w3 C M I
+ ADDSUB w0 = u0, v0 C M I
+ ;;
+ cmp.PRED p8, p0 = w0, u0 C M I
+ ADDSUB w1 = u1, v1 C M I
+ ;;
+ cmp.PRED p9, p0 = w1, u1 C M I
+ (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
+ (p7) add w0 = INCR, w0 C M I
+ ;;
+ shrp x3 = w0, w3, 1 C I0
+ ADDSUB w2 = u2, v2 C M I
+ (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
+ (p8) add w1 = INCR, w1 C M I
+ br .Lcj4 C B
+
+.grt4: ld8 v3 = [vp], 8 C M01
+ cmp.PRED p7, p0 = w3, r10 C M I
+ ld8 u3 = [up], 8 C M01
+ and r8 = 1, w3 C M I
+ ;;
+ ADDSUB w0 = u0, v0 C M I
+ ld8 v0 = [vp], 8 C M01
+ add n = -1, n
+ ;;
+ cmp.PRED p8, p0 = w0, u0 C M I
+ ld8 u0 = [up], 8 C M01
+ ADDSUB w1 = u1, v1 C M I
+ ;;
+ ld8 v1 = [vp], 8 C M01
+ mov.i ar.lc = n C I0
+ cmp.PRED p9, p0 = w1, u1 C M I
+ ld8 u1 = [up], 8 C M01
+ (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
+ (p7) add w0 = INCR, w0 C M I
+ ;;
+ ADDSUB w2 = u2, v2 C M I
+ ld8 v2 = [vp], 8 C M01
+ shrp x3 = w0, w3, 1 C I0
+ (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
+ (p8) add w1 = INCR, w1 C M I
+ br .LL00 C B
+
+
+.Lb01: ADDSUB w2 = r10, r11 C M I
+ shr.u n = n, 2 C I0
+ (p15) br.dpnt .grt1 C B
+ ;;
+
+ cmp.PRED p6, p7 = w2, r10 C M I
+ shr.u x2 = w2, 1 C I0
+ and r8 = 1, w2 C M I
+ ;;
+ (p6) dep x2 = -1, x2, 63, 1 C I0
+ br .Lcj1 C B
+
+.grt1: ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ ;;
+ ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ mov.i ar.lc = n C FIXME swap with next I0
+ ;;
+ ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ ;;
+ ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ cmp.PRED p6, p0 = w2, r10 C M I
+ and r8 = 1, w2 C M I
+ ADDSUB w3 = u3, v3 C M I
+ br.cloop.dptk .grt5 C B
+ ;;
+
+ cmp.PRED p7, p0 = w3, u3 C M I
+ ;;
+ ADDSUB w0 = u0, v0 C M I
+ (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
+ (p6) add w3 = INCR, w3 C M I
+ ;;
+ cmp.PRED p8, p0 = w0, u0 C M I
+ shrp x2 = w3, w2, 1 C I0
+ ADDSUB w1 = u1, v1 C M I
+ ;;
+ cmp.PRED p9, p0 = w1, u1 C M I
+ (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
+ (p7) add w0 = INCR, w0 C M I
+ br .Lcj5 C B
+
+.grt5: ld8 v3 = [vp], 8 C M01
+ cmp.PRED p7, p0 = w3, u3 C M I
+ ld8 u3 = [up], 8 C M01
+ ;;
+ ADDSUB w0 = u0, v0 C M I
+ ld8 v0 = [vp], 8 C M01
+ (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
+ (p6) add w3 = INCR, w3 C M I
+ ;;
+ cmp.PRED p8, p0 = w0, u0 C M I
+ shrp x2 = w3, w2, 1 C I0
+ ld8 u0 = [up], 8 C M01
+ ADDSUB w1 = u1, v1 C M I
+ ;;
+ ld8 v1 = [vp], 8 C M01
+ cmp.PRED p9, p0 = w1, u1 C M I
+ ld8 u1 = [up], 8 C M01
+ (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
+ (p7) add w0 = INCR, w0 C M I
+ br .LL01 C B
+
+
+.Lb10: ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ shr.u n = n, 2 C I0
+ ADDSUB w1 = r10, r11 C M I
+ (p15) br.dpnt .grt2 C B
+ ;;
+
+ cmp.PRED p9, p0 = w1, r10 C M I
+ and r8 = 1, w1 C M I
+ ADDSUB w2 = u2, v2 C M I
+ ;;
+ cmp.PRED p6, p0 = w2, u2 C M I
+ ;;
+ (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
+ (p9) add w2 = INCR, w2 C M I
+ ;;
+ shrp x1 = w2, w1, 1 C I0
+ shr.u x2 = w2, 1 C I0
+ br .Lcj2 C B
+
+.grt2: ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ ;;
+ ld8 v0 = [vp], 8 C M01
+ ld8 u0 = [up], 8 C M01
+ mov.i ar.lc = n C I0
+ ;;
+ ld8 v1 = [vp], 8 C M01
+ cmp.PRED p9, p0 = w1, r10 C M I
+ ld8 u1 = [up], 8 C M01
+ and r8 = 1, w1 C M I
+ ;;
+ ADDSUB w2 = u2, v2 C M I
+ ld8 v2 = [vp], 8 C M01
+ ;;
+ cmp.PRED p6, p0 = w2, u2 C M I
+ ld8 u2 = [up], 8 C M01
+ ADDSUB w3 = u3, v3 C M I
+ br.cloop.dptk .grt6 C B
+ ;;
+
+ cmp.PRED p7, p0 = w3, u3 C M I
+ (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
+ (p9) add w2 = INCR, w2 C M I
+ ;;
+ shrp x1 = w2, w1, 1 C I0
+ ADDSUB w0 = u0, v0 C M I
+ (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
+ (p6) add w3 = INCR, w3 C M I
+ br .Lcj6 C B
+
+.grt6: ld8 v3 = [vp], 8 C M01
+ cmp.PRED p7, p0 = w3, u3 C M I
+ ld8 u3 = [up], 8 C M01
+ (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
+ (p9) add w2 = INCR, w2 C M I
+ ;;
+ shrp x1 = w2, w1, 1 C I0
+ ADDSUB w0 = u0, v0 C M I
+ ld8 v0 = [vp], 8 C M01
+ (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
+ (p6) add w3 = INCR, w3 C M I
+ br .LL10 C B
+
+
+.Lb11: ld8 v1 = [vp], 8 C M01
+ ld8 u1 = [up], 8 C M01
+ shr.u n = n, 2 C I0
+ ;;
+ ld8 v2 = [vp], 8 C M01
+ ld8 u2 = [up], 8 C M01
+ ADDSUB w0 = r10, r11 C M I
+ (p15) br.dpnt .grt3 C B
+ ;;
+
+ cmp.PRED p8, p0 = w0, r10 C M I
+ ADDSUB w1 = u1, v1 C M I
+ and r8 = 1, w0 C M I
+ ;;
+ cmp.PRED p9, p0 = w1, u1 C M I
+ ;;
+ ADDSUB w2 = u2, v2 C M I
+ (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
+ (p8) add w1 = INCR, w1 C M I
+ ;;
+ cmp.PRED p6, p0 = w2, u2 C M I
+ shrp x0 = w1, w0, 1 C I0
+ ;;
+ (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
+ (p9) add w2 = INCR, w2 C M I
+ br .Lcj3 C B
+
+.grt3: ld8 v3 = [vp], 8 C M01
+ ld8 u3 = [up], 8 C M01
+ ;;
+ ld8 v0 = [vp], 8 C M01
+ mov.i ar.lc = n C I0
+ cmp.PRED p8, p0 = w0, r10 C M I
+ ld8 u0 = [up], 8 C M01
+ ADDSUB w1 = u1, v1 C M I
+ and r8 = 1, w0 C M I
+ ;;
+ ld8 v1 = [vp], 8 C M01
+ cmp.PRED p9, p0 = w1, u1 C M I
+ ld8 u1 = [up], 8 C M01
+ ;;
+ ADDSUB w2 = u2, v2 C M I
+ ld8 v2 = [vp], 8 C M01
+ (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
+ (p8) add w1 = INCR, w1 C M I
+ ;;
+ cmp.PRED p6, p0 = w2, u2 C M I
+ shrp x0 = w1, w0, 1 C I0
+ ld8 u2 = [up], 8 C M01
+ ADDSUB w3 = u3, v3 C M I
+ br.cloop.dptk .grt7 C B
+ ;;
+
+ cmp.PRED p7, p0 = w3, u3 C M I
+ (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
+ (p9) add w2 = INCR, w2 C M I
+ br .Lcj7 C B
+
+.grt7: ld8 v3 = [vp], 8 C M01
+ cmp.PRED p7, p0 = w3, u3 C M I
+ ld8 u3 = [up], 8 C M01
+ (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
+ (p9) add w2 = INCR, w2 C M I
+ br .LL11 C B
+
+
+C *** MAIN LOOP START ***
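+C Four limbs per iteration.  Each cmp.PRED detects a carry/borrow out of
+C ADDSUB, the cmp.eq.or/add pairs against LIM/INCR fold in the incoming
+C carry from the previous limb, and shrp pastes adjacent result limbs
+C together to effect the 1-bit right shift.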
+ ALIGN(32)
+.Loop: st8 [rp] = x3, 8 C M23
+ ld8 v3 = [vp], 8 C M01
+ cmp.PRED p7, p0 = w3, u3 C M I
+ ld8 u3 = [up], 8 C M01
+ (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
+ (p9) add w2 = INCR, w2 C M I
+ ;;
+.LL11: st8 [rp] = x0, 8 C M23
+ shrp x1 = w2, w1, 1 C I0
+ ADDSUB w0 = u0, v0 C M I
+ ld8 v0 = [vp], 8 C M01
+ (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
+ (p6) add w3 = INCR, w3 C M I
+ ;;
+.LL10: cmp.PRED p8, p0 = w0, u0 C M I
+ shrp x2 = w3, w2, 1 C I0
+ nop.b 0
+ ld8 u0 = [up], 8 C M01
+ ADDSUB w1 = u1, v1 C M I
+ nop.b 0
+ ;;
+ st8 [rp] = x1, 8 C M23
+ ld8 v1 = [vp], 8 C M01
+ cmp.PRED p9, p0 = w1, u1 C M I
+ ld8 u1 = [up], 8 C M01
+ (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
+ (p7) add w0 = INCR, w0 C M I
+ ;;
+.LL01: st8 [rp] = x2, 8 C M23
+ shrp x3 = w0, w3, 1 C I0
+ ADDSUB w2 = u2, v2 C M I
+ ld8 v2 = [vp], 8 C M01
+ (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
+ (p8) add w1 = INCR, w1 C M I
+ ;;
+.LL00: cmp.PRED p6, p0 = w2, u2 C M I
+ shrp x0 = w1, w0, 1 C I0
+ nop.b 0
+ ld8 u2 = [up], 8 C M01
+ ADDSUB w3 = u3, v3 C M I
+ br.cloop.dptk .Loop C B
+ ;;
+C *** MAIN LOOP END ***
+
+.Lskip: st8 [rp] = x3, 8 C M23
+ cmp.PRED p7, p0 = w3, u3 C M I
+ (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
+ (p9) add w2 = INCR, w2 C M I
+ ;;
+.Lcj7: st8 [rp] = x0, 8 C M23
+ shrp x1 = w2, w1, 1 C I0
+ ADDSUB w0 = u0, v0 C M I
+ (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
+ (p6) add w3 = INCR, w3 C M I
+ ;;
+.Lcj6: cmp.PRED p8, p0 = w0, u0 C M I
+ shrp x2 = w3, w2, 1 C I0
+ ADDSUB w1 = u1, v1 C M I
+ ;;
+ st8 [rp] = x1, 8 C M23
+ cmp.PRED p9, p0 = w1, u1 C M I
+ (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
+ (p7) add w0 = INCR, w0 C M I
+ ;;
+.Lcj5: st8 [rp] = x2, 8 C M23
+ shrp x3 = w0, w3, 1 C I0
+ ADDSUB w2 = u2, v2 C M I
+ (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
+ (p8) add w1 = INCR, w1 C M I
+ ;;
+.Lcj4: cmp.PRED p6, p0 = w2, u2 C M I
+ shrp x0 = w1, w0, 1 C I0
+ ;;
+ st8 [rp] = x3, 8 C M23
+ (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
+ (p9) add w2 = INCR, w2 C M I
+ ;;
+.Lcj3: st8 [rp] = x0, 8 C M23
+ shrp x1 = w2, w1, 1 C I0
+ shr.u x2 = w2, 1 C I0
+ ;;
+.Lcj2: st8 [rp] = x1, 8 C M23
+ (p6) dep x2 = -1, x2, 63, 1 C I0
+ ;;
+.Lcj1: st8 [rp] = x2 C M23
+ mov.i ar.lc = r2 C I0
+ br.ret.sptk.many b0 C B
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/sec_tabselect.asm b/vendor/gmp-6.3.0/mpn/ia64/sec_tabselect.asm
new file mode 100644
index 0000000..9b11cde
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/sec_tabselect.asm
@@ -0,0 +1,148 @@
+dnl IA-64 mpn_sec_tabselect.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: ?
+C Itanium 2: 2.5
+
+C NOTES
+C * Using software pipelining could trivially yield 2 c/l without unrolling,
+C or 1+epsilon with unrolling. (This code was modelled after the powerpc64
+C code, for simplicity.)
+
+C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp', `r32')
+define(`tp', `r33')
+define(`n', `r34')
+define(`nents', `r35')
+define(`which', `r36')
+
+define(`mask', `r8')
+
+define(`rp1', `r32')
+define(`tp1', `r33')
+define(`rp2', `r14')
+define(`tp2', `r15')
+
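+C For reference, a minimal C sketch of the masking scheme used below
+C (ref_sec_tabselect is a hypothetical name, not part of GMP): every
+C table entry is read and combined under a mask, so the memory access
+C pattern is independent of `which'.
+C
+C   void
+C   ref_sec_tabselect (mp_limb_t *rp, const mp_limb_t *tp,
+C                      mp_size_t n, mp_size_t nents, mp_size_t which)
+C   {
+C     for (mp_size_t k = 0; k < nents; k++)
+C       {
+C         mp_limb_t mask = -(mp_limb_t) (k == which); /* all ones iff hit */
+C         for (mp_size_t i = 0; i < n; i++)           /* and/andcm/or below */
+C           rp[i] = (tp[k * n + i] & mask) | (rp[i] & ~mask);
+C       }
+C   }
+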
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+ .prologue
+ .save ar.lc, r2
+ .body
+ifdef(`HAVE_ABI_32',`
+ {.mmi; addp4 rp = 0, rp C M I
+ addp4 tp = 0, tp C M I
+ zxt4 n = n C I
+}{.mii; nop 0
+ zxt4 nents = nents C I
+ zxt4 which = which C I
+ ;;
+}')
+ {.mmi; add rp2 = 8, rp1
+ add tp2 = 8, tp1
+ add r6 = -2, n
+ ;;
+}{.mmi; cmp.eq p10, p0 = 1, n
+ and r9 = 1, n C n odd? controls the one-limb peel before L(top)
+ shr.u r6 = r6, 1 C inner loop count
+ ;;
+}{.mmi; cmp.eq p8, p0 = 0, r9
+ sub which = nents, which
+ shl n = n, 3
+ ;;
+}
+L(outer):
+ {.mmi; cmp.eq p6, p7 = which, nents C are we at the selected table entry?
+ nop 0
+ mov ar.lc = r6 C I0
+ ;;
+}{.mmb;
+ (p6) mov mask = -1
+ (p7) mov mask = 0
+ (p8) br.dptk L(top) C branch to loop entry if n even
+ ;;
+}{.mmi; ld8 r16 = [tp1], 8
+ add tp2 = 8, tp2
+ nop 0
+ ;;
+}{.mmi; ld8 r18 = [rp1]
+ and r16 = r16, mask
+ nop 0
+ ;;
+}{.mmi; andcm r18 = r18, mask
+ ;;
+ or r16 = r16, r18
+ nop 0
+ ;;
+}{.mmb; st8 [rp1] = r16, 8
+ add rp2 = 8, rp2
+ (p10) br.dpnt L(end)
+}
+ ALIGN(32)
+L(top):
+ {.mmi; ld8 r16 = [tp1], 16
+ ld8 r17 = [tp2], 16
+ nop 0
+ ;;
+}{.mmi; ld8 r18 = [rp1]
+ and r16 = r16, mask
+ nop 0
+}{.mmi; ld8 r19 = [rp2]
+ and r17 = r17, mask
+ nop 0
+ ;;
+}{.mmi; andcm r18 = r18, mask
+ andcm r19 = r19, mask
+ nop 0
+ ;;
+}{.mmi; or r16 = r16, r18
+ or r17 = r17, r19
+ nop 0
+ ;;
+}{.mmb; st8 [rp1] = r16, 16
+ st8 [rp2] = r17, 16
+ br.cloop.dptk L(top)
+ ;;
+}
+L(end):
+ {.mmi; sub rp1 = rp1, n C move rp back to beginning
+ sub rp2 = rp2, n C move rp back to beginning
+ cmp.ne p9, p0 = 1, nents
+}{.mmb; add nents = -1, nents
+ nop 0
+ (p9) br.dptk L(outer)
+ ;;
+}{.mib; nop 0
+ nop 0
+ br.ret.sptk.many b0
+}
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm b/vendor/gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm
new file mode 100644
index 0000000..727f489
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm
@@ -0,0 +1,156 @@
+dnl IA-64 mpn_sqr_diag_addlsh1
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2010, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: ?
+C Itanium 2: 2 (unrolling could bring it to 1.5 + epsilon)
+
+C Exact performance table. The 2nd line is this code, the 3rd line is ctop-
+C less code. In an assembly sqr_basecase, the ctop-full numbers will become a
+C few cycles better since we can mitigate the many I0 instructions.
+C
+C 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
+C - 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 Needs updating
+C - 13 16 17 18 20 21 23 25 26 30 31 31 33 34 36 38 39 42 43
+
+C We should keep in mind that this code takes linear time in a O(n^2) context
+C and that it will only be used under SQR_TOOM2_THRESHOLD, which might become
+C around 60. Keeping overhead down for smallish operands (< 10) is more
+C important than optimal cycle counts.
+
+C TODO
+C * Make sure we don't depend on uninitialised r-registers, f-registers, or
+C   p-registers.
+C * Optimise by doing first two loop iterations in function header.
+
+C INPUT PARAMETERS
+define(`rp_param', `r32') define(`rp', `r14') C size: 2n
+define(`tp_param', `r33') define(`tp', `r15') C size: 2n - 2
+define(`up_param', `r34') define(`up', `r31') C size: n
+define(`n', `r35')
+
+ifdef(`HAVE_ABI_32',`
+ define(`ABI64', `')
+ define(`ABI32', `$1')
+',`
+ define(`ABI64', `$1')
+ define(`ABI32', `')
+')
+
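+C For reference, a C sketch of the operation for n >= 2 (our reading of
+C the semantics, believed to match the generic C fallback in
+C mpn/generic/sqr_basecase.c; mpn_lshift and mpn_add_n are real GMP
+C functions, the wrapper name and unsigned __int128 are ours; tp is
+C clobbered, as its "size: 2n - 2" note above suggests):
+C
+C   void
+C   ref_sqr_diag_addlsh1 (mp_limb_t *rp, mp_limb_t *tp,
+C                         const mp_limb_t *up, mp_size_t n)
+C   {
+C     for (mp_size_t i = 0; i < n; i++)        /* diagonal squares */
+C       {
+C         unsigned __int128 sq = (unsigned __int128) up[i] * up[i];
+C         rp[2*i] = (mp_limb_t) sq;
+C         rp[2*i+1] = (mp_limb_t) (sq >> 64);
+C       }
+C     mp_limb_t cy = mpn_lshift (tp, tp, 2*n - 2, 1);   /* 2 * {tp,2n-2} */
+C     cy += mpn_add_n (rp + 1, rp + 1, tp, 2*n - 2);
+C     rp[2*n - 1] += cy;
+C   }
+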
+ASM_START()
+PROLOGUE(mpn_sqr_diag_addlsh1)
+
+ .prologue
+ .save ar.pfs, r2
+ .save ar.lc, r3
+ .body
+
+ {.mii; alloc r2 = ar.pfs, 4,24,0,24 C M
+ mov r3 = ar.lc C I0
+ ABI64(` nop 4711 ')
+ ABI32(` zxt4 n = n ')
+}{.mmi; ABI64(` mov tp = tp_param ') C M I
+ ABI32(` addp4 tp = 0, tp_param') C M I
+ ABI64(` mov up = up_param ') C M I
+ ABI32(` addp4 up = 0, up_param') C M I
+ ABI64(` mov rp = rp_param ') C M I
+ ABI32(` addp4 rp = 0, rp_param') C M I
+ ;;
+}{.mmi; ld8 r36 = [tp], 8 C M
+ add r20 = -2, n C M I
+ mov r9 = ar.ec C I0
+ ;;
+}{.mmi; ld8 r32 = [tp], 8 C M
+ mov r16 = 0 C M I
+ mov ar.ec = 7 C I0
+ ;;
+}{.mmi; nop 4711
+ mov r44 = 0 C M I
+ mov ar.lc = r20 C I0
+ ;;
+}{.mii; mov r33 = 0
+ mov r10 = pr C I0
+ mov pr.rot = 0x30000 C I0
+ ;;
+} br.cexit.spnt.few.clr L(end)
+
+dnl *** MAIN LOOP START ***
+ ALIGN(32)
+L(top):
+ {.mfi; (p18) ldf8 f33 = [up], 8 C M
+ (p20) xma.l f36 = f35, f35, f42 C F
+ (p41) cmpequc p50, p0 = -1, r44 C M I
+}{.mfi; setfsig f40 = r16 C M23
+ (p20) xma.hu f38 = f35, f35, f42 C F
+ (p23) add r50 = r41, r49 C M I
+ ;;
+}{.mmi; (p16) ld8 r36 = [tp], 8 C M
+ (p23) cmpltu p40, p0 = r50, r41 C cyout hi M I
+ (p19) shrp r45 = r38, r35, 63 C non-critical I0
+}{.mmi; (p21) getfsig r39 = f39 C hi M2
+ (p24) st8 [rp] = r51, 8 C hi M23
+ (p41) add r44 = 1, r44 C M I
+ ;;
+}{.mmi; (p16) ld8 r32 = [tp], 8 C M
+ (p50) cmpeqor p40, p0 = -1, r50 C cyout hi M I
+ (p17) shrp r16 = r33, r37, 63 C critical I0
+}{.mmi; (p21) getfsig r42 = f37 C lo M2
+ (p23) st8 [rp] = r44, 8 C lo M23
+ (p50) add r50 = 1, r50 C M I
+ ;;
+} br.ctop.sptk.few.clr L(top) C B
+dnl *** MAIN LOOP END ***
+ ;;
+L(end):
+ {.mmi; nop 4711
+ (p41) add r44 = 1, r44 C M I
+ shr.u r48 = r39, 63 C I0
+ ;;
+}{.mmi; st8 [rp] = r51, 8 C M23
+ (p41) cmpequc p6, p0 = 0, r44 C M I
+ add r50 = r41, r48 C M I
+ ;;
+}{.mmi; st8 [rp] = r44, 8 C M23
+ (p6) add r50 = 1, r50 C M I
+ mov ar.lc = r3 C I0
+ ;;
+}{.mii; st8 [rp] = r50 C M23
+ mov ar.ec = r9 C I0
+ mov pr = r10 C I0
+ ;;
+}{.mib; nop 4711
+ mov ar.pfs = r2 C I0
+ br.ret.sptk.many b0 C B
+}
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/ia64/submul_1.asm b/vendor/gmp-6.3.0/mpn/ia64/submul_1.asm
new file mode 100644
index 0000000..cb2a552
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/ia64/submul_1.asm
@@ -0,0 +1,647 @@
+dnl IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
+dnl result from a second limb vector.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2000-2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 4.0
+C Itanium 2: 2.25 (alignment dependent, sometimes it seems to need 3 c/l)
+
+C TODO
+C * Optimize feed-in and wind-down code, both for speed and code size.
+C * Handle low limb input and results specially, using a common stf8 in the
+C epilogue.
+C * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in
+C 2nd bundle. This will allow the bbb bundle to be one cycle earlier and
+C save a cycle.
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n', `r34')
+define(`vl', `r35')
+
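+C For reference, a minimal C sketch of the semantics (ref_submul_1 is a
+C hypothetical name, not part of GMP; assumes a 64-bit limb and unsigned
+C __int128): {rp,n} -= {up,n} * vl, returning the borrow, 0 <= borrow <= vl.
+C
+C   mp_limb_t
+C   ref_submul_1 (mp_limb_t *rp, const mp_limb_t *up,
+C                 mp_size_t n, mp_limb_t vl)
+C   {
+C     mp_limb_t cy = 0;                        /* running borrow */
+C     for (mp_size_t i = 0; i < n; i++)
+C       {
+C         unsigned __int128 p = (unsigned __int128) up[i] * vl + cy;
+C         mp_limb_t plo = (mp_limb_t) p;
+C         cy = (mp_limb_t) (p >> 64) + (rp[i] < plo);
+C         rp[i] = rp[i] - plo;
+C       }
+C     return cy;
+C   }
+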
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ .prologue
+ .save ar.lc, r2
+ .body
+
+ifdef(`HAVE_ABI_32',
+` addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ zxt4 n = n C I
+ ;;
+')
+{.mmi
+ mov r10 = rp C M I
+ mov r9 = up C M I
+ sub vl = r0, vl C M I negate vl
+}
+{.mmi
+ ldf8 f8 = [rp], 8 C M
+ ldf8 f7 = [up], 8 C M
+ add r19 = -1, n C M I n - 1
+ ;;
+}
+{.mmi
+ cmp.eq p6, p0 = 0, vl C M I
+ mov r8 = 0 C M I zero cylimb
+ mov r2 = ar.lc C I0
+}
+{.mmi
+ setf.sig f6 = vl C M2 M3
+ and r14 = 3, n C M I
+ shr.u r19 = r19, 2 C I0
+ ;;
+}
+{.mmb
+ nop 0
+ cmp.eq p10, p0 = 0, r14 C M I
+ (p6) br.spnt .Ldone C B vl == 0
+}
+{.mmi
+ cmp.eq p11, p0 = 2, r14 C M I
+ cmp.eq p12, p0 = 3, r14 C M I
+ mov ar.lc = r19 C I0
+}
+{.bbb
+ (p10) br.dptk .Lb00 C B
+ (p11) br.dptk .Lb10 C B
+ (p12) br.dptk .Lb11 C B
+ ;;
+}
+
+.Lb01: br.cloop.dptk .grt1
+
+ xma.l f39 = f7, f6, f8
+ xma.hu f43 = f7, f6, f8
+ ;;
+ getf.sig r27 = f39 C lo
+ getf.sig r31 = f43 C hi
+ ld8 r20 = [r9], 8
+ br .Lcj1
+
+.grt1: ldf8 f44 = [rp], 8
+ ldf8 f32 = [up], 8
+ ;;
+ ldf8 f45 = [rp], 8
+ ldf8 f33 = [up], 8
+ ;;
+ ldf8 f46 = [rp], 8
+ xma.l f39 = f7, f6, f8
+ ldf8 f34 = [up], 8
+ xma.hu f43 = f7, f6, f8
+ ;;
+ ldf8 f47 = [rp], 8
+ xma.l f36 = f32, f6, f44
+ ldf8 f35 = [up], 8
+ xma.hu f40 = f32, f6, f44
+ br.cloop.dptk .grt5
+ ;;
+
+ getf.sig r27 = f39 C lo
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ getf.sig r31 = f43 C hi
+ getf.sig r24 = f36 C lo
+ xma.l f38 = f34, f6, f46
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ getf.sig r28 = f40 C hi
+ getf.sig r25 = f37 C lo
+ xma.l f39 = f35, f6, f47
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ ;;
+ getf.sig r29 = f41 C hi
+ getf.sig r26 = f38 C lo
+ ld8 r23 = [r9], 8
+ br .Lcj5
+
+.grt5: ldf8 f44 = [rp], 8
+ ldf8 f32 = [up], 8
+ ;;
+ getf.sig r27 = f39 C lo
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ ldf8 f45 = [rp], 8
+ getf.sig r31 = f43 C hi
+ ldf8 f33 = [up], 8
+ ;;
+ getf.sig r24 = f36 C lo
+ xma.l f38 = f34, f6, f46
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ ldf8 f46 = [rp], 8
+ getf.sig r28 = f40 C hi
+ ldf8 f34 = [up], 8
+ ;;
+ getf.sig r25 = f37 C lo
+ xma.l f39 = f35, f6, f47
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ ;;
+ ldf8 f47 = [rp], 8
+ getf.sig r29 = f41 C hi
+ ldf8 f35 = [up], 8
+ ;;
+ getf.sig r26 = f38 C lo
+ xma.l f36 = f32, f6, f44
+ ld8 r23 = [r9], 8
+ xma.hu f40 = f32, f6, f44
+ br.cloop.dptk .Loop
+ br .Lend
+
+
+.Lb10: ldf8 f47 = [rp], 8
+ ldf8 f35 = [up], 8
+ br.cloop.dptk .grt2
+
+ xma.l f38 = f7, f6, f8
+ xma.hu f42 = f7, f6, f8
+ ;;
+ xma.l f39 = f35, f6, f47
+ xma.hu f43 = f35, f6, f47
+ ;;
+ getf.sig r26 = f38 C lo
+ getf.sig r30 = f42 C hi
+ ld8 r23 = [r9], 8
+ ;;
+ getf.sig r27 = f39 C lo
+ getf.sig r31 = f43 C hi
+ ld8 r20 = [r9], 8
+ br .Lcj2
+
+.grt2: ldf8 f44 = [rp], 8
+ ldf8 f32 = [up], 8
+ ;;
+ ldf8 f45 = [rp], 8
+ ldf8 f33 = [up], 8
+ xma.l f38 = f7, f6, f8
+ xma.hu f42 = f7, f6, f8
+ ;;
+ ldf8 f46 = [rp], 8
+ ldf8 f34 = [up], 8
+ xma.l f39 = f35, f6, f47
+ xma.hu f43 = f35, f6, f47
+ ;;
+ ldf8 f47 = [rp], 8
+ ldf8 f35 = [up], 8
+ ;;
+ getf.sig r26 = f38 C lo
+ xma.l f36 = f32, f6, f44
+ ld8 r23 = [r9], 8
+ xma.hu f40 = f32, f6, f44
+ br.cloop.dptk .grt6
+
+ getf.sig r30 = f42 C hi
+ ;;
+ getf.sig r27 = f39 C lo
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ getf.sig r31 = f43 C hi
+ getf.sig r24 = f36 C lo
+ xma.l f38 = f34, f6, f46
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ getf.sig r28 = f40 C hi
+ getf.sig r25 = f37 C lo
+ xma.l f39 = f35, f6, f47
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ br .Lcj6
+
+.grt6: ldf8 f44 = [rp], 8
+ getf.sig r30 = f42 C hi
+ ldf8 f32 = [up], 8
+ ;;
+ getf.sig r27 = f39 C lo
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ ldf8 f45 = [rp], 8
+ getf.sig r31 = f43 C hi
+ ldf8 f33 = [up], 8
+ ;;
+ getf.sig r24 = f36 C lo
+ xma.l f38 = f34, f6, f46
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ ldf8 f46 = [rp], 8
+ getf.sig r28 = f40 C hi
+ ldf8 f34 = [up], 8
+ ;;
+ getf.sig r25 = f37 C lo
+ xma.l f39 = f35, f6, f47
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ br .LL10
+
+
+.Lb11: ldf8 f46 = [rp], 8
+ ldf8 f34 = [up], 8
+ ;;
+ ldf8 f47 = [rp], 8
+ ldf8 f35 = [up], 8
+ br.cloop.dptk .grt3
+
+ xma.l f37 = f7, f6, f8
+ xma.hu f41 = f7, f6, f8
+ ;;
+ xma.l f38 = f34, f6, f46
+ xma.hu f42 = f34, f6, f46
+ ;;
+ getf.sig r25 = f37 C lo
+ xma.l f39 = f35, f6, f47
+ xma.hu f43 = f35, f6, f47
+ ;;
+ getf.sig r29 = f41 C hi
+ ld8 r22 = [r9], 8
+ ;;
+ getf.sig r26 = f38 C lo
+ getf.sig r30 = f42 C hi
+ ld8 r23 = [r9], 8
+ ;;
+ getf.sig r27 = f39 C lo
+ getf.sig r31 = f43 C hi
+ ld8 r20 = [r9], 8
+ br .Lcj3
+
+.grt3: ldf8 f44 = [rp], 8
+ xma.l f37 = f7, f6, f8
+ ldf8 f32 = [up], 8
+ xma.hu f41 = f7, f6, f8
+ ;;
+ ldf8 f45 = [rp], 8
+ xma.l f38 = f34, f6, f46
+ ldf8 f33 = [up], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ ldf8 f46 = [rp], 8
+ ldf8 f34 = [up], 8
+ ;;
+ getf.sig r25 = f37 C lo
+ xma.l f39 = f35, f6, f47
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ ;;
+ ldf8 f47 = [rp], 8
+ getf.sig r29 = f41 C hi
+ ldf8 f35 = [up], 8
+ ;;
+ getf.sig r26 = f38 C lo
+ xma.l f36 = f32, f6, f44
+ ld8 r23 = [r9], 8
+ xma.hu f40 = f32, f6, f44
+ br.cloop.dptk .grt7
+ ;;
+
+ getf.sig r30 = f42 C hi
+ getf.sig r27 = f39 C lo
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ getf.sig r31 = f43 C hi
+ getf.sig r24 = f36 C lo
+ xma.l f38 = f34, f6, f46
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ br .Lcj7
+
+.grt7: ldf8 f44 = [rp], 8
+ getf.sig r30 = f42 C hi
+ ldf8 f32 = [up], 8
+ ;;
+ getf.sig r27 = f39 C lo
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ ldf8 f45 = [rp], 8
+ getf.sig r31 = f43 C hi
+ ldf8 f33 = [up], 8
+ ;;
+ getf.sig r24 = f36 C lo
+ xma.l f38 = f34, f6, f46
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ br .LL11
+
+
+.Lb00: ldf8 f45 = [rp], 8
+ ldf8 f33 = [up], 8
+ ;;
+ ldf8 f46 = [rp], 8
+ ldf8 f34 = [up], 8
+ ;;
+ ldf8 f47 = [rp], 8
+ xma.l f36 = f7, f6, f8
+ ldf8 f35 = [up], 8
+ xma.hu f40 = f7, f6, f8
+ br.cloop.dptk .grt4
+
+ xma.l f37 = f33, f6, f45
+ xma.hu f41 = f33, f6, f45
+ ;;
+ getf.sig r24 = f36 C lo
+ xma.l f38 = f34, f6, f46
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ getf.sig r28 = f40 C hi
+ xma.l f39 = f35, f6, f47
+ getf.sig r25 = f37 C lo
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ ;;
+ getf.sig r29 = f41 C hi
+ getf.sig r26 = f38 C lo
+ ld8 r23 = [r9], 8
+ ;;
+ getf.sig r30 = f42 C hi
+ getf.sig r27 = f39 C lo
+ ld8 r20 = [r9], 8
+ br .Lcj4
+
+.grt4: ldf8 f44 = [rp], 8
+ xma.l f37 = f33, f6, f45
+ ldf8 f32 = [up], 8
+ xma.hu f41 = f33, f6, f45
+ ;;
+ ldf8 f45 = [rp], 8
+ ldf8 f33 = [up], 8
+ xma.l f38 = f34, f6, f46
+ getf.sig r24 = f36 C lo
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ ;;
+ ldf8 f46 = [rp], 8
+ getf.sig r28 = f40 C hi
+ ldf8 f34 = [up], 8
+ xma.l f39 = f35, f6, f47
+ getf.sig r25 = f37 C lo
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ ;;
+ ldf8 f47 = [rp], 8
+ getf.sig r29 = f41 C hi
+ ldf8 f35 = [up], 8
+ ;;
+ getf.sig r26 = f38 C lo
+ xma.l f36 = f32, f6, f44
+ ld8 r23 = [r9], 8
+ xma.hu f40 = f32, f6, f44
+ br.cloop.dptk .grt8
+ ;;
+
+ getf.sig r30 = f42 C hi
+ getf.sig r27 = f39 C lo
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ br .Lcj8
+
+.grt8: ldf8 f44 = [rp], 8
+ getf.sig r30 = f42 C hi
+ ldf8 f32 = [up], 8
+ ;;
+ getf.sig r27 = f39 C lo
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ br .LL00
+
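+C Main loop, four limbs per iteration.  Since vl was negated on entry,
+C each xma.l/xma.hu pair in effect computes rp[i] + up[i]*(2^64 - vl);
+C the integer side then recovers the borrow by subtracting the xma.hu
+C result from a second copy of up[i] (loaded through r9) and adjusting
+C for the low-limb compare under p6.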
+ ALIGN(32)
+.Loop:
+{.mmi
+ ldf8 f44 = [rp], 8
+ cmp.ltu p6, p0 = r27, r8 C lo cmp
+ sub r14 = r27, r8 C lo sub
+}
+{.mmi
+ getf.sig r30 = f42 C hi
+ ldf8 f32 = [up], 8
+ sub r8 = r20, r31 C hi sub
+ ;; C 01
+}
+{.mmf
+ getf.sig r27 = f39 C lo
+ st8 [r10] = r14, 8
+ xma.l f37 = f33, f6, f45
+}
+{.mfi
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ (p6) add r8 = 1, r8
+ ;; C 02
+}
+{.mmi
+.LL00: ldf8 f45 = [rp], 8
+ cmp.ltu p6, p0 = r24, r8
+ sub r14 = r24, r8
+}
+{.mmi
+ getf.sig r31 = f43 C hi
+ ldf8 f33 = [up], 8
+ sub r8 = r21, r28
+ ;; C 03
+}
+{.mmf
+ getf.sig r24 = f36 C lo
+ st8 [r10] = r14, 8
+ xma.l f38 = f34, f6, f46
+}
+{.mfi
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ (p6) add r8 = 1, r8
+ ;; C 04
+}
+{.mmi
+.LL11: ldf8 f46 = [rp], 8
+ cmp.ltu p6, p0 = r25, r8
+ sub r14 = r25, r8
+}
+{.mmi
+ getf.sig r28 = f40 C hi
+ ldf8 f34 = [up], 8
+ sub r8 = r22, r29
+ ;; C 05
+}
+{.mmf
+ getf.sig r25 = f37 C lo
+ st8 [r10] = r14, 8
+ xma.l f39 = f35, f6, f47
+}
+{.mfi
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ (p6) add r8 = 1, r8
+ ;; C 06
+}
+{.mmi
+.LL10: ldf8 f47 = [rp], 8
+ cmp.ltu p6, p0 = r26, r8
+ sub r14 = r26, r8
+}
+{.mmi
+ getf.sig r29 = f41 C hi
+ ldf8 f35 = [up], 8
+ sub r8 = r23, r30
+ ;; C 07
+}
+{.mmf
+ getf.sig r26 = f38 C lo
+ st8 [r10] = r14, 8
+ xma.l f36 = f32, f6, f44
+}
+{.mfi
+ ld8 r23 = [r9], 8
+ xma.hu f40 = f32, f6, f44
+ (p6) add r8 = 1, r8
+}
+ br.cloop.dptk .Loop
+ ;;
+
+.Lend:
+ cmp.ltu p6, p0 = r27, r8
+ sub r14 = r27, r8
+ getf.sig r30 = f42
+ sub r8 = r20, r31
+ ;;
+ getf.sig r27 = f39
+ st8 [r10] = r14, 8
+ xma.l f37 = f33, f6, f45
+ ld8 r20 = [r9], 8
+ xma.hu f41 = f33, f6, f45
+ (p6) add r8 = 1, r8
+ ;;
+.Lcj8:
+ cmp.ltu p6, p0 = r24, r8
+ sub r14 = r24, r8
+ getf.sig r31 = f43
+ sub r8 = r21, r28
+ ;;
+ getf.sig r24 = f36
+ st8 [r10] = r14, 8
+ xma.l f38 = f34, f6, f46
+ ld8 r21 = [r9], 8
+ xma.hu f42 = f34, f6, f46
+ (p6) add r8 = 1, r8
+ ;;
+.Lcj7:
+ cmp.ltu p6, p0 = r25, r8
+ sub r14 = r25, r8
+ getf.sig r28 = f40
+ sub r8 = r22, r29
+ ;;
+ getf.sig r25 = f37
+ st8 [r10] = r14, 8
+ xma.l f39 = f35, f6, f47
+ ld8 r22 = [r9], 8
+ xma.hu f43 = f35, f6, f47
+ (p6) add r8 = 1, r8
+ ;;
+.Lcj6:
+ cmp.ltu p6, p0 = r26, r8
+ sub r14 = r26, r8
+ getf.sig r29 = f41
+ sub r8 = r23, r30
+ ;;
+ getf.sig r26 = f38
+ st8 [r10] = r14, 8
+ ld8 r23 = [r9], 8
+ (p6) add r8 = 1, r8
+ ;;
+.Lcj5:
+ cmp.ltu p6, p0 = r27, r8
+ sub r14 = r27, r8
+ getf.sig r30 = f42
+ sub r8 = r20, r31
+ ;;
+ getf.sig r27 = f39
+ st8 [r10] = r14, 8
+ ld8 r20 = [r9], 8
+ (p6) add r8 = 1, r8
+ ;;
+.Lcj4:
+ cmp.ltu p6, p0 = r24, r8
+ sub r14 = r24, r8
+ getf.sig r31 = f43
+ sub r8 = r21, r28
+ ;;
+ st8 [r10] = r14, 8
+ (p6) add r8 = 1, r8
+ ;;
+.Lcj3:
+ cmp.ltu p6, p0 = r25, r8
+ sub r14 = r25, r8
+ sub r8 = r22, r29
+ ;;
+ st8 [r10] = r14, 8
+ (p6) add r8 = 1, r8
+ ;;
+.Lcj2:
+ cmp.ltu p6, p0 = r26, r8
+ sub r14 = r26, r8
+ sub r8 = r23, r30
+ ;;
+ st8 [r10] = r14, 8
+ (p6) add r8 = 1, r8
+ ;;
+.Lcj1:
+ cmp.ltu p6, p0 = r27, r8
+ sub r14 = r27, r8
+ sub r8 = r20, r31
+ ;;
+ st8 [r10] = r14, 8
+ mov ar.lc = r2
+ (p6) add r8 = 1, r8
+ br.ret.sptk.many b0
+.Ldone: mov ar.lc = r2
+ br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()