aboutsummaryrefslogtreecommitdiff
path: root/vendor/gmp-6.3.0/mpn/x86
diff options
context:
space:
mode:
authorThomas Voss <mail@thomasvoss.com> 2024-06-21 23:36:36 +0200
committerThomas Voss <mail@thomasvoss.com> 2024-06-21 23:42:26 +0200
commita89a14ef5da44684a16b204e7a70460cc8c4922a (patch)
treeb23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/x86
parent1db63fcedab0b288820d66e100b1877b1a5a8851 (diff)
Basic constant folding implementation
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/x86')
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/README525
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/aors_n.asm202
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/aorsmul_1.asm214
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/aorrlsh1_n.asm53
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/aorrlsh2_n.asm53
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/aorrlshC_n.asm156
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/aors_n.asm159
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/aorslshC_n.asm247
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/bdiv_q_1.asm35
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/cnd_add_n.asm113
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/cnd_sub_n.asm124
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/dive_1.asm34
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/gmp-mparam.h214
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/logops_n.asm151
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/lshift.asm218
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/lshiftc.asm159
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/mmx/copyd.asm34
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/mmx/copyi.asm34
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/mmx/hamdist.asm34
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/mod_34lsub1.asm34
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/mode1o.asm34
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/rshift.asm152
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/sse2/aorsmul_1.asm174
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/sse2/bdiv_dbm1c.asm34
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/sse2/divrem_1.asm34
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/sse2/mod_1_1.asm34
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/sse2/mod_1_4.asm34
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/sse2/mul_1.asm124
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/sse2/mul_basecase.asm501
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/sse2/popcount.asm35
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/sse2/sqr_basecase.asm634
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/sublsh1_n.asm34
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/atom/sublsh2_n.asm57
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/bd1/gmp-mparam.h211
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/bd2/gmp-mparam.h214
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/bd4/gmp-mparam.h225
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/bdiv_dbm1c.asm129
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/bdiv_q_1.asm208
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/bt1/gmp-mparam.h218
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/bt2/gmp-mparam.h214
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/cnd_aors_n.asm124
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/copyd.asm91
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/copyi.asm99
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/core2/gmp-mparam.h210
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/coreibwl/gmp-mparam.h216
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/coreihwl/gmp-mparam.h216
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/coreinhm/gmp-mparam.h223
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/coreisbr/gmp-mparam.h215
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/darwin.m4102
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/dive_1.asm190
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/divrem_1.asm233
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/divrem_2.asm199
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/fat/com.c32
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/fat/fat.c530
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/fat/fat_entry.asm243
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/fat/gmp-mparam.h71
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/fat/lshiftc.c32
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/fat/mod_1.c32
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/fat/mod_1_1.c36
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/fat/mod_1_2.c36
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/fat/mod_1_4.c36
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/fat/mode1o.c32
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/fat/mullo_basecase.c32
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/fat/redc_1.c32
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/fat/redc_2.c32
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/gcd_11.asm126
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/geode/gmp-mparam.h141
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/gmp-mparam.h38
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/goldmont/gmp-mparam.h219
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/i486/gmp-mparam.h69
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k10/gmp-mparam.h217
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/README251
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/aors_n.asm337
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/aorsmul_1.asm391
-rwxr-xr-xvendor/gmp-6.3.0/mpn/x86/k6/cross.pl182
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/divrem_1.asm203
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/gmp-mparam.h166
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/copyd.asm118
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/lshift.asm294
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/rshift.asm293
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/mmx/com.asm103
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm282
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm226
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm130
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm236
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm130
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/mod_34lsub1.asm190
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/mode1o.asm176
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/mul_1.asm292
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/mul_basecase.asm612
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/pre_mod_1.asm146
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k6/sqr_basecase.asm680
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/README174
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/addlsh1_n.asm196
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/aors_n.asm258
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/aorsmul_1.asm167
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/bdiv_q_1.asm245
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/dive_1.asm208
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/gcd_11.asm107
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/gmp-mparam.h263
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/invert_limb.asm194
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/mmx/com.asm125
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/mmx/copyd.asm144
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/mmx/copyi.asm157
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/mmx/divrem_1.asm832
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/mmx/lshift.asm481
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/mmx/popham.asm213
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/mmx/rshift.asm480
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/mod_1_1.asm221
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/mod_1_4.asm260
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/mod_34lsub1.asm188
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/mode1o.asm181
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/mul_1.asm237
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/mul_basecase.asm602
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/sqr_basecase.asm635
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k7/sublsh1_n.asm173
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/k8/gmp-mparam.h215
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/lshift.asm106
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/mmx/sec_tabselect.asm163
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/mod_34lsub1.asm183
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/mul_1.asm140
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/mul_basecase.asm223
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/nano/gmp-mparam.h162
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/README125
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/aors_n.asm156
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/aorsmul_1.asm320
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm287
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/copyd.asm178
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/dive_1.asm267
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/gcd_11.asm83
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/gmp-mparam.h194
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/lshsub_n.asm169
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm767
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h218
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm38
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/mmx/popham.asm39
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm38
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/mod_34lsub1.asm190
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/mode1o.asm170
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/mul_basecase.asm607
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/p3mmx/popham.asm42
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/sqr_basecase.asm649
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/sse2/addmul_1.asm37
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/sse2/gmp-mparam.h200
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_1.asm34
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_4.asm34
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm38
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/sse2/mul_basecase.asm35
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/sse2/popcount.asm35
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/sse2/sqr_basecase.asm35
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/p6/sse2/submul_1.asm35
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/README181
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/aors_n.asm203
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm144
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm266
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/com.asm181
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/copyd.asm146
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/copyi.asm164
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/dive_1.asm264
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h76
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/hamdist.asm154
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/logops_n.asm176
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/lshift.asm243
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h163
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm40
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm463
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm371
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm468
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/mod_34lsub1.asm192
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/mode1o.asm279
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/mul_1.asm177
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/mul_2.asm150
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm142
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/popcount.asm146
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/rshift.asm243
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm528
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/README124
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/copyd.asm71
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/copyi.asm93
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/lshift.asm39
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/popham.asm203
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/rshift.asm39
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm101
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm108
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm189
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm141
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm234
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm95
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm114
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm216
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm645
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h213
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm166
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm269
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm175
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm175
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm164
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm662
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm281
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm126
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm705
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm119
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm182
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/rshift.asm108
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/sec_tabselect.asm106
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/silvermont/gmp-mparam.h222
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/skylake/gmp-mparam.h211
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/sqr_basecase.asm359
-rwxr-xr-xvendor/gmp-6.3.0/mpn/x86/t-zdisp.sh71
-rwxr-xr-xvendor/gmp-6.3.0/mpn/x86/t-zdisp2.pl147
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/udiv.asm52
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/umul.asm51
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/x86-defs.m41024
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/zn1/gmp-mparam.h220
-rw-r--r--vendor/gmp-6.3.0/mpn/x86/zn2/gmp-mparam.h226
215 files changed, 42594 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/x86/README b/vendor/gmp-6.3.0/mpn/x86/README
new file mode 100644
index 0000000..8d7ac90
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/README
@@ -0,0 +1,525 @@
+Copyright 1999-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+ X86 MPN SUBROUTINES
+
+
+This directory contains mpn functions for various 80x86 chips.
+
+
+CODE ORGANIZATION
+
+ x86 i386, generic
+ x86/i486 i486
+ x86/pentium Intel Pentium (P5, P54)
+ x86/pentium/mmx Intel Pentium with MMX (P55)
+ x86/p6 Intel Pentium Pro
+ x86/p6/mmx Intel Pentium II, III
+ x86/p6/p3mmx Intel Pentium III
+ x86/k6 \ AMD K6
+ x86/k6/mmx /
+ x86/k6/k62mmx AMD K6-2
+ x86/k7 \ AMD Athlon
+ x86/k7/mmx /
+ x86/pentium4 \
+ x86/pentium4/mmx | Intel Pentium 4
+ x86/pentium4/sse2 /
+
+
+The top-level x86 directory contains blended style code, meant to be
+reasonable on all x86s.
+
+
+
+STATUS
+
+The code is well-optimized for AMD and Intel chips, but there's nothing
+specific for Cyrix chips, nor for actual 80386 and 80486 chips.
+
+
+
+ASM FILES
+
+The x86 .asm files are BSD style assembler code, first put through m4 for
+macro processing. The generic mpn/asm-defs.m4 is used, together with
+mpn/x86/x86-defs.m4. See comments in those files.
+
+The code is meant for use with GNU "gas" or a system "as". There's no
+support for assemblers that demand Intel style code.
+
+
+
+STACK FRAME
+
+m4 macros are used to define the parameters passed on the stack, and these
+act like comments on what the stack frame looks like too. For example,
+mpn_mul_1() has the following.
+
+ defframe(PARAM_MULTIPLIER, 16)
+ defframe(PARAM_SIZE, 12)
+ defframe(PARAM_SRC, 8)
+ defframe(PARAM_DST, 4)
+
+PARAM_MULTIPLIER becomes `FRAME+16(%esp)', and the others similarly. The
+return address is at offset 0, but there's not normally any need to access
+that.
+
+FRAME is redefined as necessary through the code so it's the number of bytes
+pushed on the stack, and hence the offsets in the parameter macros stay
+correct. At the start of a routine FRAME should be zero.
+
+ deflit(`FRAME',0)
+ ...
+ deflit(`FRAME',4)
+ ...
+ deflit(`FRAME',8)
+ ...
+
+Helper macros FRAME_pushl(), FRAME_popl(), FRAME_addl_esp() and
+FRAME_subl_esp() exist to adjust FRAME for the effect of those instructions,
+and can be used instead of explicit definitions if preferred.
+defframe_pushl() is a combination FRAME_pushl() and defframe().
+
+There's generally some slackness in redefining FRAME. If new values aren't
+going to get used then the redefinitions are omitted to keep from cluttering
+up the code. This happens for instance at the end of a routine, where there
+might be just four pops and then a ret, so FRAME isn't getting used.
+
+Local variables and saved registers can be similarly defined, with negative
+offsets representing stack space below the initial stack pointer. For
+example,
+
+ defframe(SAVE_ESI, -4)
+ defframe(SAVE_EDI, -8)
+ defframe(VAR_COUNTER,-12)
+
+ deflit(STACK_SPACE, 12)
+
+Here STACK_SPACE gets used in a "subl $STACK_SPACE, %esp" to allocate the
+space, and that instruction must be followed by a redefinition of FRAME
+(setting it equal to STACK_SPACE) to reflect the change in %esp.
+
+Definitions for pushed registers are only put in when they're going to be
+used. If registers are just saved and restored with pushes and pops then
+definitions aren't made.
+
+
+
+ASSEMBLER EXPRESSIONS
+
+Only addition and subtraction seem to be universally available, certainly
+that's all the Solaris 8 "as" seems to accept. If expressions are wanted
+then m4 eval() should be used.
+
+In particular note that a "/" anywhere in a line starts a comment in Solaris
+"as", and in some configurations of gas too.
+
+ addl $32/2, %eax <-- wrong
+
+ addl $eval(32/2), %eax <-- right
+
+Binutils gas/config/tc-i386.c has a choice between "/" being a comment
+anywhere in a line, or only at the start. FreeBSD patches 2.9.1 to select
+the latter, and from 2.9.5 it's the default for GNU/Linux too.
+
+
+
+ASSEMBLER COMMENTS
+
+Solaris "as" doesn't support "#" commenting, using /* */ instead. For that
+reason "C" commenting is used (see asm-defs.m4) and the intermediate ".s"
+files have no comments.
+
+Any comments before include(`../config.m4') must use m4 "dnl", since it's
+only after the include that "C" is available. By convention "dnl" is also
+used for comments about m4 macros.
+
+
+
+TEMPORARY LABELS
+
+Temporary numbered labels like "1:" used as "1f" or "1b" are available in
+"gas" and Solaris "as", but not in SCO "as". Normal L() labels should be
+used instead, possibly with a counter to make them unique, see jadcl0() in
+x86-defs.m4 for instance. A separate counter for each macro makes it
+possible to nest them, for instance movl_text_address() can be used within
+an ASSERT().
+
+"1:" etc must be avoided in gcc __asm__ blocks too. "%=" for generating a
+unique number looks like a good alternative, but is that actually a
+documented feature? In any case this problem doesn't currently arise.
+
+
+
+ZERO DISPLACEMENTS
+
+In a couple of places addressing modes like 0(%ebx) with a byte-sized zero
+displacement are wanted, rather than (%ebx) with no displacement. These are
+either for computed jumps or to get desirable code alignment. Explicit
+.byte sequences are used to ensure the assembler doesn't turn 0(%ebx) into
+(%ebx). The Zdisp() macro in x86-defs.m4 is used for this.
+
+Current gas 2.9.5 or recent 2.9.1 leave 0(%ebx) as written, but old gas
+1.92.3 changes it. In general changing would be the sort of "optimization"
+an assembler might perform, hence explicit ".byte"s are used where
+necessary.
+
+
+
+SHLD/SHRD INSTRUCTIONS
+
+The %cl count forms of double shift instructions like "shldl %cl,%eax,%ebx"
+must be written "shldl %eax,%ebx" for some assemblers. gas takes either,
+Solaris "as" doesn't allow %cl, gcc generates %cl for gas and NeXT (which is
+gas), and omits %cl elsewhere.
+
+For GMP an autoconf test GMP_ASM_X86_SHLDL_CL is used to determine whether
+%cl should be used, and the macros shldl, shrdl, shldw and shrdw in
+mpn/x86/x86-defs.m4 pass through or omit %cl as necessary. See the comments
+with those macros for usage.
+
+
+
+IMUL INSTRUCTION
+
+GCC config/i386/i386.md (cvs rev 1.187, 21 Oct 00) under *mulsi3_1 notes
+that the following two forms produce identical object code
+
+ imul $12, %eax
+ imul $12, %eax, %eax
+
+but that the former isn't accepted by some assemblers, in particular the SCO
+OSR5 COFF assembler. GMP follows GCC and uses only the latter form.
+
+(This applies only to immediate operands, the three operand form is only
+valid with an immediate.)
+
+
+
+DIRECTION FLAG
+
+The x86 calling conventions say that the direction flag should be clear at
+function entry and exit. (See iBCS2 and SVR4 ABI books, references below.)
+Although this has been so since the year dot, it's not absolutely clear
+whether it's universally respected. Since it's better to be safe than
+sorry, GMP follows glibc and does a "cld" if it depends on the direction
+flag being clear. This happens only in a few places.
+
+
+
+POSITION INDEPENDENT CODE
+
+ Coding Style
+
+ Defining the symbol PIC in m4 processing selects SVR4 / ELF style
+ position independent code. This is necessary for shared libraries
+ because they can be mapped into different processes at different virtual
+ addresses. Actually, relocations are allowed but text pages with
+ relocations aren't shared, defeating the purpose of a shared library.
+
+ The GOT is used to access global data, and the PLT is used for
+ functions. The use of the PLT adds a fixed cost to every function call,
+ and the GOT adds a cost to any function accessing global variables.
+ These are small but might be noticeable when working with small
+ operands.
+
+ Scope
+
+ It's intended, as a matter of policy, that references within libgmp are
+ resolved within libgmp. Certainly there's no need for an application to
+ replace any internals, and we take the view that there's no value in an
+ application subverting anything documented either.
+
+ Resolving references within libgmp in theory means calls can be made with a
+ plain PC-relative call instruction, which is faster and smaller than going
+ through the PLT, and data references can be similarly PC-relative, saving a
+ GOT entry and fetch from there. Unfortunately the normal linker behaviour
+ doesn't allow us to do this.
+
+ By default an R_386_PC32 PC-relative reference, either for a call or for
+ data, is left in libgmp.so by the linker so that it can be resolved at
+ runtime to a location in the application or another shared library. This
+ means a text segment relocation which we don't want.
+
+ -Bsymbolic
+
+ Under the "-Bsymbolic" option, the linker resolves references to symbols
+ within libgmp.so. This gives us the desired effect for R_386_PC32,
+ ie. it's resolved at link time. It also resolves R_386_PLT32 calls
+ directly to their target without creating a PLT entry (though if this is
+ done to normal compiler-generated code it still leaves a setup of %ebx
+ to _GLOBAL_OFFSET_TABLE_ which may then be unnecessary).
+
+ Unfortunately -Bsymbolic does bad things to global variables defined in
+ a shared library but accessed by non-PIC code from the mainline (or a
+ static library).
+
+ The problem is that the mainline needs a fixed data address to avoid
+ text segment relocations, so space is allocated in its data segment and
+ the value from the variable is copied from the shared library's data
+ segment when the library is loaded. Under -Bsymbolic, however,
+ references in the shared library are then resolved still to the shared
+ library data area. Not surprisingly it bombs badly to have mainline
+ code and library code accessing different locations for what should be
+ one variable.
+
+ Note that this -Bsymbolic effect for the shared library is not just for
+ R_386_PC32 offsets which might have been cooked up in assembler, but is
+ done also for the contents of GOT entries. -Bsymbolic simply applies a
+ general rule that symbols are resolved first from the local module.
+
+ Visibility Attributes
+
+ GCC __attribute__ ((visibility ("protected"))), which is available in
+ recent versions, eg. 3.3, is probably what we'd like to use. It makes
+ gcc generate plain PC-relative calls to indicated functions, and directs
+ the linker to resolve references to the given function within the link
+ module.
+
+ Unfortunately, as of debian binutils 2.13.90.0.16 at least, the
+ resulting libgmp.so comes out with text segment relocations, references
+ are not resolved at link time. If the gcc description is to be believed
+ this is this not how it should work. If a symbol cannot be overridden
+ by another module then surely references within that module can be
+ resolved immediately (ie. at link time).
+
+ Present
+
+ In any case, all this means that we have no optimizations we can
+ usefully make to function or variable usages, neither for assembler nor
+ C code. Perhaps in the future the visibility attribute will work as
+ we'd like.
+
+
+
+
+GLOBAL OFFSET TABLE
+
+The magic _GLOBAL_OFFSET_TABLE_ used by code establishing the address of the
+GOT sometimes requires an extra underscore prefix. SVR4 systems and NetBSD
+don't need a prefix, OpenBSD does need one. Note that NetBSD and OpenBSD
+are both a.out underscore systems, so the prefix for _GLOBAL_OFFSET_TABLE_
+is not simply the same as the prefix for ordinary globals.
+
+In any case in the asm code we write _GLOBAL_OFFSET_TABLE_ and let a macro
+in x86-defs.m4 add an extra underscore if required (according to a configure
+test).
+
+Old gas 1.92.3 which comes with FreeBSD 2.2.8 gets a segmentation fault when
+asked to assemble the following,
+
+ L1:
+ addl $_GLOBAL_OFFSET_TABLE_+[.-L1], %ebx
+
+It seems that using the label in the same instruction it refers to is the
+problem, since a nop in between works. But the simplest workaround is to
+follow gcc and omit the +[.-L1] since it does nothing,
+
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+
+Current gas 2.10 generates incorrect object code when %eax is used in such a
+construction (with or without +[.-L1]),
+
+ addl $_GLOBAL_OFFSET_TABLE_, %eax
+
+The R_386_GOTPC gets a displacement of 2 rather than the 1 appropriate for
+the 1 byte opcode of "addl $n,%eax". The best workaround is just to use any
+other register, since then it's a two byte opcode+mod/rm. GCC for example
+always uses %ebx (which is needed for calls through the PLT).
+
+A similar problem occurs in an leal (again with or without a +[.-L1]),
+
+ leal _GLOBAL_OFFSET_TABLE_(%edi), %ebx
+
+This time the R_386_GOTPC gets a displacement of 0 rather than the 2
+appropriate for the opcode and mod/rm, making this form unusable.
+
+
+
+
+SIMPLE LOOPS
+
+The overheads in setting up for an unrolled loop can mean that at small
+sizes a simple loop is faster. Making small sizes go fast is important,
+even if it adds a cycle or two to bigger sizes. To this end various
+routines choose between a simple loop and an unrolled loop according to
+operand size. The path to the simple loop, or to special case code for
+small sizes, is always as fast as possible.
+
+Adding a simple loop requires a conditional jump to choose between the
+simple and unrolled code. The size of a branch misprediction penalty
+affects whether a simple loop is worthwhile.
+
+The convention is for an m4 definition UNROLL_THRESHOLD to set the crossover
+point, with sizes < UNROLL_THRESHOLD using the simple loop, sizes >=
+UNROLL_THRESHOLD using the unrolled loop. If position independent code adds
+a couple of cycles to an unrolled loop setup, the threshold will vary with
+PIC or non-PIC. Something like the following is typical.
+
+ deflit(UNROLL_THRESHOLD, ifdef(`PIC',10,8))
+
+There's no automated way to determine the threshold. Setting it to a small
+value and then to a big value makes it possible to measure the simple and
+unrolled loops each over a range of sizes, from which the crossover point
+can be determined. Alternately, just adjust the threshold up or down until
+there's no more speedups.
+
+
+
+UNROLLED LOOP CODING
+
+The x86 addressing modes allow a byte displacement of -128 to +127, making
+it possible to access 256 bytes, which is 64 limbs, without adjusting
+pointer registers within the loop. Dword sized displacements can be used
+too, but they increase code size, and unrolling to 64 ought to be enough.
+
+When unrolling to the full 64 limbs/loop, the limb at the top of the loop
+will have a displacement of -128, so pointers have to have a corresponding
++128 added before entering the loop. When unrolling to 32 limbs/loop
+displacements 0 to 127 can be used with 0 at the top of the loop and no
+adjustment needed to the pointers.
+
+Where 64 limbs/loop is supported, the +128 adjustment is done only when 64
+limbs/loop is selected. Usually the gain in speed using 64 instead of 32 or
+16 is small, so support for 64 limbs/loop is generally only for comparison.
+
+
+
+COMPUTED JUMPS
+
+When working from least significant limb to most significant limb (most
+routines) the computed jump and pointer calculations in preparation for an
+unrolled loop are as follows.
+
+ S = operand size in limbs
+ N = number of limbs per loop (UNROLL_COUNT)
+ L = log2 of unrolling (UNROLL_LOG2)
+ M = mask for unrolling (UNROLL_MASK)
+ C = code bytes per limb in the loop
+ B = bytes per limb (4 for x86)
+
+ computed jump (-S & M) * C + entrypoint
+ subtract from pointers (-S & M) * B
+ initial loop counter (S-1) >> L
+ displacements 0 to B*(N-1)
+
+The loop counter is decremented at the end of each loop, and the looping
+stops when the decrement takes the counter to -1. The displacements are for
+the addressing accessing each limb, eg. a load with "movl disp(%ebx), %eax".
+
+Usually the multiply by "C" can be handled without an imul, using instead an
+leal, or a shift and subtract.
+
+When working from most significant to least significant limb (eg. mpn_lshift
+and mpn_copyd), the calculations change as follows.
+
+ add to pointers (-S & M) * B
+ displacements 0 to -B*(N-1)
+
+
+
+OLD GAS 1.92.3
+
+This version comes with FreeBSD 2.2.8 and has a couple of gremlins that
+affect GMP code.
+
+Firstly, an expression involving two forward references to labels comes out
+as zero. For example,
+
+ addl $bar-foo, %eax
+ foo:
+ nop
+ bar:
+
+This should lead to "addl $1, %eax", but it comes out as "addl $0, %eax".
+When only one forward reference is involved, it works correctly, as for
+example,
+
+ foo:
+ addl $bar-foo, %eax
+ nop
+ bar:
+
+Secondly, an expression involving two labels can't be used as the
+displacement for an leal. For example,
+
+ foo:
+ nop
+ bar:
+ leal bar-foo(%eax,%ebx,8), %ecx
+
+A slightly cryptic error is given, "Unimplemented segment type 0 in
+parse_operand". When only one label is used it's ok, and the label can be a
+forward reference too, as for example,
+
+ leal foo(%eax,%ebx,8), %ecx
+ nop
+ foo:
+
+These problems only affect PIC computed jump calculations. The workarounds
+are just to do an leal without a displacement and then an addl, and to make
+sure the code is placed so that there's at most one forward reference in the
+addl.
+
+
+
+REFERENCES
+
+"Intel Architecture Software Developer's Manual", volumes 1, 2a, 2b, 3a, 3b,
+2006, order numbers 253665 through 253669. Available on-line,
+
+ ftp://download.intel.com/design/Pentium4/manuals/25366518.pdf
+ ftp://download.intel.com/design/Pentium4/manuals/25366618.pdf
+ ftp://download.intel.com/design/Pentium4/manuals/25366718.pdf
+ ftp://download.intel.com/design/Pentium4/manuals/25366818.pdf
+ ftp://download.intel.com/design/Pentium4/manuals/25366918.pdf
+
+
+"System V Application Binary Interface", Unix System Laboratories Inc, 1992,
+published by Prentice Hall, ISBN 0-13-880410-9. And the "Intel386 Processor
+Supplement", AT&T, 1991, ISBN 0-13-877689-X. These have details of calling
+conventions and ELF shared library PIC coding. Versions of both available
+on-line,
+
+ http://www.sco.com/developer/devspecs
+
+"Intel386 Family Binary Compatibility Specification 2", Intel Corporation,
+published by McGraw-Hill, 1991, ISBN 0-07-031219-2. (Same as the above 386
+ABI supplement.)
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/vendor/gmp-6.3.0/mpn/x86/aors_n.asm b/vendor/gmp-6.3.0/mpn/x86/aors_n.asm
new file mode 100644
index 0000000..5d359f5
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/aors_n.asm
@@ -0,0 +1,202 @@
+dnl x86 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl Copyright 1992, 1994-1996, 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5 3.375
+C P6 3.125
+C K6 3.5
+C K7 2.25
+C P4 8.75
+
+
+ifdef(`OPERATION_add_n',`
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+
+',`ifdef(`OPERATION_sub_n',`
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(M4_function_nc)
+deflit(`FRAME',0)
+
+ pushl %edi FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC1,%esi
+ movl PARAM_SRC2,%edx
+ movl PARAM_SIZE,%ecx
+
+ movl %ecx,%eax
+ shrl $3,%ecx C compute count for unrolled loop
+ negl %eax
+ andl $7,%eax C get index where to start loop
+ jz L(oopgo) C necessary special case for 0
+ incl %ecx C adjust loop count
+ shll $2,%eax C adjustment for pointers...
+ subl %eax,%edi C ... since they are offset ...
+ subl %eax,%esi C ... by a constant when we ...
+ subl %eax,%edx C ... enter the loop
+ shrl $2,%eax C restore previous value
+
+ifdef(`PIC',`
+ C Calculate start address in loop for PIC. Due to limitations in
+ C old gas, LF(M4_function_n,oop)-L(0a)-3 cannot be put into the leal
+ call L(0a)
+L(0a): leal (%eax,%eax,8),%eax
+ addl (%esp),%eax
+ addl $L(oop)-L(0a)-3,%eax
+ addl $4,%esp
+',`
+ C Calculate start address in loop for non-PIC.
+ leal L(oop)-3(%eax,%eax,8),%eax
+')
+
+ C These lines initialize carry from the 5th parameter. Should be
+ C possible to simplify.
+ pushl %ebp FRAME_pushl()
+ movl PARAM_CARRY,%ebp
+ shrl %ebp C shift bit 0 into carry
+ popl %ebp FRAME_popl()
+
+ jmp *%eax C jump into loop
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(M4_function_n)
+deflit(`FRAME',0)
+
+ pushl %edi FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC1,%esi
+ movl PARAM_SRC2,%edx
+ movl PARAM_SIZE,%ecx
+
+ movl %ecx,%eax
+ shrl $3,%ecx C compute count for unrolled loop
+ negl %eax
+ andl $7,%eax C get index where to start loop
+ jz L(oop) C necessary special case for 0
+ incl %ecx C adjust loop count
+ shll $2,%eax C adjustment for pointers...
+ subl %eax,%edi C ... since they are offset ...
+ subl %eax,%esi C ... by a constant when we ...
+ subl %eax,%edx C ... enter the loop
+ shrl $2,%eax C restore previous value
+
+ifdef(`PIC',`
+ C Calculate start address in loop for PIC. Due to limitations in
+ C some assemblers, L(oop)-L(0b)-3 cannot be put into the leal
+ call L(0b)
+L(0b): leal (%eax,%eax,8),%eax
+ addl (%esp),%eax
+ addl $L(oop)-L(0b)-3,%eax
+ addl $4,%esp
+',`
+ C Calculate start address in loop for non-PIC.
+ leal L(oop)-3(%eax,%eax,8),%eax
+')
+ jmp *%eax C jump into loop
+
+L(oopgo):
+ pushl %ebp FRAME_pushl()
+ movl PARAM_CARRY,%ebp
+ shrl %ebp C shift bit 0 into carry
+ popl %ebp FRAME_popl()
+
+ ALIGN(16)
+L(oop): movl (%esi),%eax
+ M4_inst (%edx),%eax
+ movl %eax,(%edi)
+ movl 4(%esi),%eax
+ M4_inst 4(%edx),%eax
+ movl %eax,4(%edi)
+ movl 8(%esi),%eax
+ M4_inst 8(%edx),%eax
+ movl %eax,8(%edi)
+ movl 12(%esi),%eax
+ M4_inst 12(%edx),%eax
+ movl %eax,12(%edi)
+ movl 16(%esi),%eax
+ M4_inst 16(%edx),%eax
+ movl %eax,16(%edi)
+ movl 20(%esi),%eax
+ M4_inst 20(%edx),%eax
+ movl %eax,20(%edi)
+ movl 24(%esi),%eax
+ M4_inst 24(%edx),%eax
+ movl %eax,24(%edi)
+ movl 28(%esi),%eax
+ M4_inst 28(%edx),%eax
+ movl %eax,28(%edi)
+ leal 32(%edi),%edi
+ leal 32(%esi),%esi
+ leal 32(%edx),%edx
+ decl %ecx
+ jnz L(oop)
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/aorsmul_1.asm b/vendor/gmp-6.3.0/mpn/x86/aorsmul_1.asm
new file mode 100644
index 0000000..8c13997
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/aorsmul_1.asm
@@ -0,0 +1,214 @@
+dnl x86 __gmpn_addmul_1 (for 386 and 486) -- Multiply a limb vector with a
+dnl limb and add the result to a second limb vector.
+
+dnl Copyright 1992, 1994, 1997, 1999-2002, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom
+C AMD K6
+C AMD K7
+C AMD K8 3.875
+C AMD K10
+
+
+ifdef(`OPERATION_addmul_1',`
+ define(ADDSUB, addl)
+ define(M4_function_1, mpn_addmul_1)
+ define(M4_function_1c, mpn_addmul_1c)
+
+',`ifdef(`OPERATION_submul_1',`
+ define(ADDSUB, subl)
+ define(M4_function_1, mpn_submul_1)
+
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1 mpn_addmul_1c)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+
+define(PARAM_CARRY, `FRAME+20(%esp)')
+define(PARAM_MULTIPLIER, `FRAME+16(%esp)')
+define(PARAM_SIZE, `FRAME+12(%esp)')
+define(PARAM_SRC, `FRAME+8(%esp)')
+define(PARAM_DST, `FRAME+4(%esp)')
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(M4_function_1)
+deflit(`FRAME',0)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_SRC, %esi
+ movl PARAM_SIZE, %ecx
+ movl PARAM_DST, %edi
+
+ movl (%esi), %eax
+ mull PARAM_MULTIPLIER
+
+ testb $1, %cl
+ jnz L(bx1)
+
+L(bx0): movl %eax, %ebx
+ movl %edx, %ebp
+ shrl $2, %ecx
+ jnc L(lo0)
+
+L(b10): leal -8(%esi), %esi
+ leal -8(%edi), %edi
+ incl %ecx
+ jmp L(lo2)
+
+L(bx1): movl %eax, %ebp
+ movl %edx, %ebx
+ shrl $2, %ecx
+ jc L(b11)
+
+L(b01): leal 4(%edi), %edi
+ jz L(end)
+ leal 4(%esi), %esi
+ jmp L(top)
+
+L(b11): leal -4(%esi), %esi
+ leal -4(%edi), %edi
+ incl %ecx
+ jmp L(lo3)
+
+ ALIGN(16)
+L(top): movl (%esi), %eax
+ mull PARAM_MULTIPLIER
+ ADDSUB %ebp, -4(%edi)
+ adcl %eax, %ebx
+ movl $0, %ebp
+ adcl %edx, %ebp
+L(lo0): movl 4(%esi), %eax
+ mull PARAM_MULTIPLIER
+ ADDSUB %ebx, (%edi)
+ adcl %eax, %ebp
+ movl $0, %ebx
+ adcl %edx, %ebx
+L(lo3): movl 8(%esi), %eax
+ mull PARAM_MULTIPLIER
+ ADDSUB %ebp, 4(%edi)
+ adcl %eax, %ebx
+ movl $0, %ebp
+ adcl %edx, %ebp
+L(lo2): movl 12(%esi), %eax
+ mull PARAM_MULTIPLIER
+ ADDSUB %ebx, 8(%edi)
+ adcl %eax, %ebp
+ movl $0, %ebx
+ adcl %edx, %ebx
+
+ leal 16(%esi), %esi
+ leal 16(%edi), %edi
+ decl %ecx
+ jnz L(top)
+
+L(end): xor %eax, %eax
+ ADDSUB %ebp, -4(%edi)
+ adcl %ebx, %eax
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
+ifdef(`OPERATION_addmul_1',`
+ ALIGN(32)
+PROLOGUE(M4_function_1c)
+deflit(`FRAME',0)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_SRC, %esi
+ movl PARAM_SIZE, %ecx
+ movl PARAM_DST, %edi
+
+ movl (%esi), %eax
+ mull PARAM_MULTIPLIER
+
+ testb $1, %cl
+ jnz L(cx1)
+
+ movl PARAM_CARRY, %ebx
+ xorl %ebp, %ebp
+
+L(cx0): addl %eax, %ebx
+ adcl %edx, %ebp
+ shrl $2, %ecx
+ jnc L(lo0)
+
+L(c10): leal -8(%esi), %esi
+ leal -8(%edi), %edi
+ incl %ecx
+ jmp L(lo2)
+
+L(cx1): movl PARAM_CARRY, %ebp
+ xorl %ebx, %ebx
+
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ shrl $2, %ecx
+ jc L(c11)
+
+L(c01): leal 4(%edi), %edi
+ jz L(end)
+ leal 4(%esi), %esi
+ jmp L(top)
+
+L(c11): leal -4(%esi), %esi
+ leal -4(%edi), %edi
+ incl %ecx
+ jmp L(lo3)
+EPILOGUE()
+')
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/aorrlsh1_n.asm b/vendor/gmp-6.3.0/mpn/x86/atom/aorrlsh1_n.asm
new file mode 100644
index 0000000..cd1a650
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/aorrlsh1_n.asm
@@ -0,0 +1,53 @@
+dnl Intel Atom mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
+
+dnl Contributed to the GNU project by Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 31)
+
+ifdef(`OPERATION_addlsh1_n', `
+ define(M4_inst, adc)
+ define(M4_opp, sub)
+ define(M4_function, mpn_addlsh1_n)
+ define(M4_function_c, mpn_addlsh1_nc)
+',`ifdef(`OPERATION_rsblsh1_n', `
+ define(M4_inst, sbb)
+ define(M4_opp, add)
+ define(M4_function, mpn_rsblsh1_n)
+ define(M4_function_c, mpn_rsblsh1_nc)
+',`m4_error(`Need OPERATION_addlsh1_n or OPERATION_rsblsh1_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+
+include_mpn(`x86/atom/aorrlshC_n.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/aorrlsh2_n.asm b/vendor/gmp-6.3.0/mpn/x86/atom/aorrlsh2_n.asm
new file mode 100644
index 0000000..10f4419
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/aorrlsh2_n.asm
@@ -0,0 +1,53 @@
+dnl Intel Atom mpn_addlsh2_n/mpn_rsblsh2_n -- rp[] = (vp[] << 2) +- up[]
+
+dnl Contributed to the GNU project by Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 30)
+
+ifdef(`OPERATION_addlsh2_n', `
+ define(M4_inst, adcl)
+ define(M4_opp, subl)
+ define(M4_function, mpn_addlsh2_n)
+ define(M4_function_c, mpn_addlsh2_nc)
+',`ifdef(`OPERATION_rsblsh2_n', `
+ define(M4_inst, sbbl)
+ define(M4_opp, addl)
+ define(M4_function, mpn_rsblsh2_n)
+ define(M4_function_c, mpn_rsblsh2_nc)
+',`m4_error(`Need OPERATION_addlsh2_n or OPERATION_rsblsh2_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_addlsh2_nc mpn_rsblsh2_n mpn_rsblsh2_nc)
+
+include_mpn(`x86/atom/aorrlshC_n.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/aorrlshC_n.asm b/vendor/gmp-6.3.0/mpn/x86/atom/aorrlshC_n.asm
new file mode 100644
index 0000000..71cfe49
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/aorrlshC_n.asm
@@ -0,0 +1,156 @@
+dnl Intel Atom mpn_addlshC_n/mpn_rsblshC_n -- rp[] = (vp[] << C) +- up[]
+
+dnl Contributed to the GNU project by Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_addlshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t mpn_addlshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C mp_limb_t mpn_rsblshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t mpn_rsblshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_signed_limb_t carry);
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 6
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CORB, 20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_DBLD, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBP,`PARAM_DBLD')
+define(SAVE_VP,`PARAM_SRC')
+define(SAVE_UP,`PARAM_DST')
+
+define(M, eval(m4_lshift(1,LSH)))
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebx')
+
+ASM_START()
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(M4_function_c)
+deflit(`FRAME',0)
+ movl PARAM_CORB, %eax
+ movl %eax, %edx
+ shr $LSH, %edx
+ andl $1, %edx
+ M4_opp %edx, %eax
+ jmp L(start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ xor %eax, %eax
+ xor %edx, %edx
+L(start_nc):
+ push rp FRAME_pushl()
+
+ mov PARAM_SIZE, %ecx C size
+ mov PARAM_DST, rp
+ mov up, SAVE_UP
+ incl %ecx C size + 1
+ mov PARAM_SRC, up
+ mov vp, SAVE_VP
+ shr %ecx C (size+1)\2
+ mov PARAM_DBLD, vp
+ mov %ebp, SAVE_EBP
+ mov %ecx, VAR_COUNT
+ jnc L(entry) C size odd
+
+ shr %edx C size even
+ mov (vp), %ecx
+ lea 4(vp), vp
+ lea (%eax,%ecx,M), %edx
+ mov %ecx, %eax
+ lea -4(up), up
+ lea -4(rp), rp
+ jmp L(enteven)
+
+ ALIGN(16)
+L(oop):
+ lea (%eax,%ecx,M), %ebp
+ shr $RSH, %ecx
+ mov 4(vp), %eax
+ shr %edx
+ lea 8(vp), vp
+ M4_inst (up), %ebp
+ lea (%ecx,%eax,M), %edx
+ mov %ebp, (rp)
+L(enteven):
+ M4_inst 4(up), %edx
+ lea 8(up), up
+ mov %edx, 4(rp)
+ adc %edx, %edx
+ shr $RSH, %eax
+ lea 8(rp), rp
+L(entry):
+ mov (vp), %ecx
+ decl VAR_COUNT
+ jnz L(oop)
+
+ lea (%eax,%ecx,M), %ebp
+ shr $RSH, %ecx
+ shr %edx
+ mov SAVE_VP, vp
+ M4_inst (up), %ebp
+ mov %ecx, %eax
+ mov SAVE_UP, up
+ M4_inst $0, %eax
+ mov %ebp, (rp)
+ mov SAVE_EBP, %ebp
+ pop rp FRAME_popl()
+ ret
+EPILOGUE()
+
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/aors_n.asm b/vendor/gmp-6.3.0/mpn/x86/atom/aors_n.asm
new file mode 100644
index 0000000..45ec287
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/aors_n.asm
@@ -0,0 +1,159 @@
+dnl Intel Atom mpn_add_n/mpn_sub_n -- rp[] = up[] +- vp[].
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Marco Bodrato.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 3
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+ifdef(`OPERATION_add_n', `
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+ define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+ define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size. The return value is the carry bit from the top of the result (1
+C or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation. Note values other than 1 or 0 here will lead to garbage
+C results.
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_RP,`PARAM_SIZE')
+define(SAVE_VP,`PARAM_SRC1')
+define(SAVE_UP,`PARAM_DST')
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebx')
+define(`cy', `%ecx')
+define(`r1', `%ecx')
+define(`r2', `%edx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+deflit(`FRAME',0)
+
+PROLOGUE(M4_function_n)
+ xor cy, cy C carry
+L(start):
+ mov PARAM_SIZE, %eax C size
+ mov rp, SAVE_RP
+ mov PARAM_DST, rp
+ mov up, SAVE_UP
+ mov PARAM_SRC1, up
+ shr %eax C size >> 1
+ mov vp, SAVE_VP
+ mov PARAM_SRC2, vp
+ jz L(one) C size == 1
+ jc L(three) C size % 2 == 1
+
+ shr cy
+ mov (up), r2
+ lea 4(up), up
+ lea 4(vp), vp
+ lea -4(rp), rp
+ jmp L(entry)
+L(one):
+ shr cy
+ mov (up), r1
+ jmp L(end)
+L(three):
+ shr cy
+ mov (up), r1
+
+ ALIGN(16)
+L(oop):
+ M4_inst (vp), r1
+ lea 8(up), up
+ mov -4(up), r2
+ lea 8(vp), vp
+ mov r1, (rp)
+L(entry):
+ M4_inst -4(vp), r2
+ lea 8(rp), rp
+ dec %eax
+ mov (up), r1
+ mov r2, -4(rp)
+ jnz L(oop)
+
+L(end): C %eax is zero here
+ mov SAVE_UP, up
+ M4_inst (vp), r1
+ mov SAVE_VP, vp
+ mov r1, (rp)
+ adc %eax, %eax
+ mov SAVE_RP, rp
+ ret
+EPILOGUE()
+
+PROLOGUE(M4_function_nc)
+ mov PARAM_CARRY, cy C carry
+ jmp L(start)
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/aorslshC_n.asm b/vendor/gmp-6.3.0/mpn/x86/atom/aorslshC_n.asm
new file mode 100644
index 0000000..75ace65
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/aorslshC_n.asm
@@ -0,0 +1,247 @@
+dnl Intel Atom mpn_addlshC_n/mpn_sublshC_n -- rp[] = up[] +- (vp[] << C)
+
+dnl Contributed to the GNU project by Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_addlshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C mp_limb_t mpn_addlshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t carry);
+C mp_limb_t mpn_sublshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,);
+C mp_limb_t mpn_sublshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_signed_limb_t borrow);
+
+defframe(PARAM_CORB, 16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+C mp_limb_t mpn_addlshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size,);
+C mp_limb_t mpn_addlshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C mp_limb_t mpn_sublshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size,);
+C mp_limb_t mpn_sublshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t borrow);
+
+C if src1 == dst, _ip1 is used
+
+C cycles/limb
+C dst!=src1,src2 dst==src1
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 7 6
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(GPARAM_CORB, 20)
+defframe(GPARAM_SIZE, 16)
+defframe(GPARAM_SRC2, 12)
+
+dnl re-use parameter space
+define(SAVE_EBP,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_UP,`PARAM_DST')
+
+define(M, eval(m4_lshift(1,LSH)))
+define(`rp', `%edi')
+define(`up', `%esi')
+
+ASM_START()
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(M4_ip_function_c)
+deflit(`FRAME',0)
+ movl PARAM_CORB, %ecx
+ movl %ecx, %edx
+ shr $LSH, %edx
+ andl $1, %edx
+ M4_opp %edx, %ecx
+ jmp L(start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_ip_function)
+deflit(`FRAME',0)
+
+ xor %ecx, %ecx
+ xor %edx, %edx
+L(start_nc):
+ push rp FRAME_pushl()
+ mov PARAM_DST, rp
+ mov up, SAVE_UP
+ mov PARAM_SRC, up
+ mov %ebx, SAVE_EBX
+ mov PARAM_SIZE, %ebx C size
+L(inplace):
+ incl %ebx C size + 1
+ shr %ebx C (size+1)\2
+ mov %ebp, SAVE_EBP
+ jnc L(entry) C size odd
+
+ add %edx, %edx C size even
+ mov %ecx, %ebp
+ mov (up), %ecx
+ lea -4(rp), rp
+ lea (%ebp,%ecx,M), %eax
+ lea 4(up), up
+ jmp L(enteven)
+
+ ALIGN(16)
+L(oop):
+ lea (%ecx,%eax,M), %ebp
+ shr $RSH, %eax
+ mov 4(up), %ecx
+ add %edx, %edx
+ lea 8(up), up
+ M4_inst %ebp, (rp)
+ lea (%eax,%ecx,M), %eax
+
+L(enteven):
+ M4_inst %eax, 4(rp)
+ lea 8(rp), rp
+
+ sbb %edx, %edx
+ shr $RSH, %ecx
+
+L(entry):
+ mov (up), %eax
+ decl %ebx
+ jnz L(oop)
+
+ lea (%ecx,%eax,M), %ebp
+ shr $RSH, %eax
+ shr %edx
+ M4_inst %ebp, (rp)
+ mov SAVE_UP, up
+ adc $0, %eax
+ mov SAVE_EBP, %ebp
+ mov SAVE_EBX, %ebx
+ pop rp FRAME_popl()
+ ret
+EPILOGUE()
+
+PROLOGUE(M4_function_c)
+deflit(`FRAME',0)
+ movl GPARAM_CORB, %ecx
+ movl %ecx, %edx
+ shr $LSH, %edx
+ andl $1, %edx
+ M4_opp %edx, %ecx
+ jmp L(generic_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ xor %ecx, %ecx
+ xor %edx, %edx
+L(generic_nc):
+ push rp FRAME_pushl()
+ mov PARAM_DST, rp
+ mov up, SAVE_UP
+ mov PARAM_SRC, up
+ cmp rp, up
+ mov %ebx, SAVE_EBX
+ jne L(general)
+ mov GPARAM_SIZE, %ebx C size
+ mov GPARAM_SRC2, up
+ jmp L(inplace)
+
+L(general):
+ mov GPARAM_SIZE, %eax C size
+ mov %ebx, SAVE_EBX
+ incl %eax C size + 1
+ mov up, %ebx C vp
+ mov GPARAM_SRC2, up C up
+ shr %eax C (size+1)\2
+ mov %ebp, SAVE_EBP
+ mov %eax, GPARAM_SIZE
+ jnc L(entry2) C size odd
+
+ add %edx, %edx C size even
+ mov %ecx, %ebp
+ mov (up), %ecx
+ lea -4(rp), rp
+ lea -4(%ebx), %ebx
+ lea (%ebp,%ecx,M), %eax
+ lea 4(up), up
+ jmp L(enteven2)
+
+ ALIGN(16)
+L(oop2):
+ lea (%ecx,%eax,M), %ebp
+ shr $RSH, %eax
+ mov 4(up), %ecx
+ add %edx, %edx
+ lea 8(up), up
+ mov (%ebx), %edx
+ M4_inst %ebp, %edx
+ lea (%eax,%ecx,M), %eax
+ mov %edx, (rp)
+L(enteven2):
+ mov 4(%ebx), %edx
+ lea 8(%ebx), %ebx
+ M4_inst %eax, %edx
+ mov %edx, 4(rp)
+ sbb %edx, %edx
+ shr $RSH, %ecx
+ lea 8(rp), rp
+L(entry2):
+ mov (up), %eax
+ decl GPARAM_SIZE
+ jnz L(oop2)
+
+ lea (%ecx,%eax,M), %ebp
+ shr $RSH, %eax
+ shr %edx
+ mov (%ebx), %edx
+ M4_inst %ebp, %edx
+ mov %edx, (rp)
+ mov SAVE_UP, up
+ adc $0, %eax
+ mov SAVE_EBP, %ebp
+ mov SAVE_EBX, %ebx
+ pop rp FRAME_popl()
+ ret
+EPILOGUE()
+
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/x86/atom/bdiv_q_1.asm
new file mode 100644
index 0000000..31e908e
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/bdiv_q_1.asm
@@ -0,0 +1,35 @@
+dnl Intel Atom mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel
+dnl division by 1-limb divisor, returning quotient only.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+include_mpn(`x86/pentium/bdiv_q_1.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/cnd_add_n.asm b/vendor/gmp-6.3.0/mpn/x86/atom/cnd_add_n.asm
new file mode 100644
index 0000000..50bf2ad
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/cnd_add_n.asm
@@ -0,0 +1,113 @@
+dnl X86 mpn_cnd_add_n optimised for Intel Atom.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C P5 ?
+C P6 model 0-8,10-12 ?
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) ?
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) ?
+C P4 model 3-4 (Prescott) ?
+C Intel atom 4.67
+C AMD K6 ?
+C AMD K7 ?
+C AMD K8 ?
+
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebp')
+define(`n', `%ecx')
+define(`cnd', `20(%esp)')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_cnd_add_n)
+ push %edi
+ push %esi
+ push %ebx
+ push %ebp
+
+ mov cnd, %eax C make cnd into a mask (1)
+ mov 24(%esp), rp
+ neg %eax C make cnd into a mask (1)
+ mov 28(%esp), up
+ sbb %eax, %eax C make cnd into a mask (1)
+ mov 32(%esp), vp
+ mov %eax, cnd C make cnd into a mask (1)
+ mov 36(%esp), n
+
+ xor %edx, %edx
+
+ shr $1, n
+ jnc L(top)
+
+ mov 0(vp), %eax
+ and cnd, %eax
+ lea 4(vp), vp
+ add 0(up), %eax
+ lea 4(rp), rp
+ lea 4(up), up
+ sbb %edx, %edx
+ mov %eax, -4(rp)
+ inc n
+ dec n
+ je L(end)
+
+L(top): sbb %edx, %edx
+ mov 0(vp), %eax
+ and cnd, %eax
+ lea 8(vp), vp
+ lea 8(rp), rp
+ mov -4(vp), %ebx
+ and cnd, %ebx
+ add %edx, %edx
+ adc 0(up), %eax
+ lea 8(up), up
+ mov %eax, -8(rp)
+ adc -4(up), %ebx
+ dec n
+ mov %ebx, -4(rp)
+ jne L(top)
+
+L(end): mov $0, %eax
+ adc %eax, %eax
+
+ pop %ebp
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/cnd_sub_n.asm b/vendor/gmp-6.3.0/mpn/x86/atom/cnd_sub_n.asm
new file mode 100644
index 0000000..221bedc
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/cnd_sub_n.asm
@@ -0,0 +1,124 @@
+dnl X86 mpn_cnd_sub_n optimised for Intel Atom.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C P5 ?
+C P6 model 0-8,10-12 ?
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) ?
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) ?
+C P4 model 3-4 (Prescott) ?
+C Intel atom 5.67
+C AMD K6 ?
+C AMD K7 ?
+C AMD K8 ?
+
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebp')
+define(`n', `%ecx')
+define(`cnd', `20(%esp)')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_cnd_sub_n)
+ push %edi
+ push %esi
+ push %ebx
+ push %ebp
+
+ mov cnd, %eax C make cnd into a mask (1)
+ mov 24(%esp), rp
+ neg %eax C make cnd into a mask (1)
+ mov 28(%esp), up
+ sbb %eax, %eax C make cnd into a mask (1)
+ mov 32(%esp), vp
+ mov %eax, cnd C make cnd into a mask (1)
+ mov 36(%esp), n
+
+ xor %edx, %edx
+
+ inc n
+ shr n
+ jnc L(ent)
+
+ mov 0(vp), %eax
+ and cnd, %eax
+ lea 4(vp), vp
+ mov 0(up), %edx
+ sub %eax, %edx
+ lea 4(rp), rp
+ lea 4(up), up
+ mov %edx, -4(rp)
+ sbb %edx, %edx C save cy
+
+L(ent): mov 0(vp), %ebx
+ and cnd, %ebx
+ add %edx, %edx C restore cy
+ mov 0(up), %edx
+ dec n
+ je L(end)
+
+L(top): sbb %ebx, %edx
+ mov 4(vp), %eax
+ mov %edx, 0(rp)
+ sbb %edx, %edx C save cy
+ mov 8(vp), %ebx
+ lea 8(up), up
+ and cnd, %ebx
+ and cnd, %eax
+ add %edx, %edx C restore cy
+ mov -4(up), %edx
+ lea 8(rp), rp
+ sbb %eax, %edx
+ mov %edx, -4(rp)
+ dec n
+ mov 0(up), %edx
+ lea 8(vp), vp
+ jne L(top)
+
+L(end): sbb %ebx, %edx
+ mov %edx, 0(rp)
+
+ mov $0, %eax
+ adc %eax, %eax
+
+ pop %ebp
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/dive_1.asm b/vendor/gmp-6.3.0/mpn/x86/atom/dive_1.asm
new file mode 100644
index 0000000..71036a1
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/dive_1.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_divexact_1)
+include_mpn(`x86/pentium/dive_1.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/atom/gmp-mparam.h
new file mode 100644
index 0000000..e025bb7
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/gmp-mparam.h
@@ -0,0 +1,214 @@
+/* Intel Atom/32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1600 MHz Diamondville (Atom 330) */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 3
+#define MOD_1_UNNORM_THRESHOLD 5
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 11
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 17
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1 /* 72.60% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 35
+
+#define DIV_1_VS_MUL_1_PERCENT 236
+
+#define MUL_TOOM22_THRESHOLD 22
+#define MUL_TOOM33_THRESHOLD 81
+#define MUL_TOOM44_THRESHOLD 178
+#define MUL_TOOM6H_THRESHOLD 270
+#define MUL_TOOM8H_THRESHOLD 399
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 126
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 115
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 129
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 115
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 117
+#define SQR_TOOM4_THRESHOLD 178
+#define SQR_TOOM6_THRESHOLD 366
+#define SQR_TOOM8_THRESHOLD 527
+
+#define MULMID_TOOM42_THRESHOLD 50
+
+#define MULMOD_BNM1_THRESHOLD 13
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define MUL_FFT_MODF_THRESHOLD 404 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 404, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
+ { 11, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
+ { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
+ { 39, 8}, { 79, 9}, { 47, 8}, { 95,10}, \
+ { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95, 9}, { 191,10}, { 111,11}, \
+ { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \
+ { 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \
+ { 159,11}, { 95,10}, { 191, 9}, { 383,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
+ { 271, 9}, { 543,10}, { 287, 9}, { 575,10}, \
+ { 303,11}, { 159,10}, { 351, 9}, { 703,10}, \
+ { 367, 9}, { 735,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 415,11}, { 223,10}, { 447,12}, \
+ { 127,11}, { 255,10}, { 543,11}, { 287,10}, \
+ { 607,11}, { 319,10}, { 671,11}, { 351,10}, \
+ { 735,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,10}, { 831,11}, { 447,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \
+ { 319,11}, { 735,12}, { 383,11}, { 831,12}, \
+ { 447,11}, { 959,13}, { 255,12}, { 511,11}, \
+ { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \
+ { 1279,12}, { 703,11}, { 1407,13}, { 383,12}, \
+ { 831,11}, { 1663,12}, { 959,14}, { 255,13}, \
+ { 511,12}, { 1215,13}, { 639,12}, { 1471,13}, \
+ { 767,12}, { 1599,13}, { 895,12}, { 1791,14}, \
+ { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \
+ { 2431,13}, { 1407,14}, { 767,13}, { 1663,12}, \
+ { 3455,13}, { 1791,15}, { 511,14}, { 1023,13}, \
+ { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \
+ { 1535,13}, { 3455,14}, { 1791,13}, { 3839,15}, \
+ { 1023,14}, { 2047,13}, { 4223,14}, { 2303,13}, \
+ { 4991,12}, { 9983,14}, { 2815,13}, { 5887,15}, \
+ { 1535,14}, { 3839,16} }
+#define MUL_FFT_TABLE3_SIZE 158
+#define MUL_FFT_THRESHOLD 4544
+
+#define SQR_FFT_MODF_THRESHOLD 368 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 368, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \
+ { 13, 5}, { 27, 6}, { 25, 7}, { 13, 6}, \
+ { 28, 7}, { 15, 6}, { 31, 7}, { 17, 6}, \
+ { 35, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
+ { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
+ { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \
+ { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \
+ { 255, 9}, { 135,10}, { 79, 9}, { 159, 8}, \
+ { 319,10}, { 95, 9}, { 191,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \
+ { 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \
+ { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,10}, { 287, 9}, \
+ { 575,10}, { 303, 9}, { 607,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \
+ { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 415,11}, { 223,10}, { 447,12}, \
+ { 127,11}, { 255,10}, { 543,11}, { 287,10}, \
+ { 607,11}, { 319,10}, { 671,11}, { 351,10}, \
+ { 703,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,10}, { 831,11}, { 447,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \
+ { 319,11}, { 671,10}, { 1343,11}, { 735,12}, \
+ { 383,11}, { 831,12}, { 447,11}, { 959,13}, \
+ { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
+ { 1215,12}, { 639,11}, { 1343,12}, { 703,13}, \
+ { 383,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1215,13}, { 639,12}, { 1471,13}, { 767,12}, \
+ { 1599,13}, { 895,14}, { 511,13}, { 1023,12}, \
+ { 2111,13}, { 1151,12}, { 2431,13}, { 1407,14}, \
+ { 767,13}, { 1663,12}, { 3455,15}, { 511,14}, \
+ { 1023,13}, { 2175,12}, { 4351,13}, { 2431,14}, \
+ { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \
+ { 3455,14}, { 1791,13}, { 3839,15}, { 1023,14}, \
+ { 2047,13}, { 4351,14}, { 2303,13}, { 4991,12}, \
+ { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \
+ { 3839,16} }
+#define SQR_FFT_TABLE3_SIZE 161
+#define SQR_FFT_THRESHOLD 3712
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 56
+#define MULLO_MUL_N_THRESHOLD 8907
+#define SQRLO_BASECASE_THRESHOLD 6
+#define SQRLO_DC_THRESHOLD 111
+#define SQRLO_SQR_THRESHOLD 6654
+
+#define DC_DIV_QR_THRESHOLD 67
+#define DC_DIVAPPR_Q_THRESHOLD 252
+#define DC_BDIV_QR_THRESHOLD 63
+#define DC_BDIV_Q_THRESHOLD 172
+
+#define INV_MULMOD_BNM1_THRESHOLD 42
+#define INV_NEWTON_THRESHOLD 250
+#define INV_APPR_THRESHOLD 250
+
+#define BINV_NEWTON_THRESHOLD 276
+#define REDC_1_TO_REDC_N_THRESHOLD 68
+
+#define MU_DIV_QR_THRESHOLD 1334
+#define MU_DIVAPPR_Q_THRESHOLD 1442
+#define MUPI_DIV_QR_THRESHOLD 116
+#define MU_BDIV_QR_THRESHOLD 1142
+#define MU_BDIV_Q_THRESHOLD 1341
+
+#define POWM_SEC_TABLE 1,16,98,376,1259
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 23
+#define SET_STR_DC_THRESHOLD 298
+#define SET_STR_PRECOMPUTE_THRESHOLD 1037
+
+#define FAC_DSC_THRESHOLD 171
+#define FAC_ODD_THRESHOLD 34
+
+#define MATRIX22_STRASSEN_THRESHOLD 17
+#define HGCD2_DIV1_METHOD 3 /* 3.71% faster than 1 */
+#define HGCD_THRESHOLD 128
+#define HGCD_APPR_THRESHOLD 186
+#define HGCD_REDUCE_THRESHOLD 2479
+#define GCD_DC_THRESHOLD 465
+#define GCDEXT_DC_THRESHOLD 339
+#define JACOBI_BASE_METHOD 3 /* 2.58% faster than 2 */
+
+/* Tuneup completed successfully, took 214190 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/logops_n.asm b/vendor/gmp-6.3.0/mpn/x86/atom/logops_n.asm
new file mode 100644
index 0000000..3cb6d73
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/logops_n.asm
@@ -0,0 +1,151 @@
+dnl Intel Atom mpn_and_n,...,mpn_xnor_n -- bitwise logical operations.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Marco Bodrato.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C op nop opn
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 3 3.5 3.5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+define(M4_choose_op,
+`ifdef(`OPERATION_$1',`
+define(`M4_function', `mpn_$1')
+define(`M4_want_pre', `$4')
+define(`M4_inst', `$3')
+define(`M4_want_post',`$2')
+')')
+define(M4pre, `ifelse(M4_want_pre, yes,`$1')')
+define(M4post,`ifelse(M4_want_post,yes,`$1')')
+
+M4_choose_op( and_n, , andl, )
+M4_choose_op( andn_n, , andl, yes)
+M4_choose_op( nand_n, yes, andl, )
+M4_choose_op( ior_n, , orl, )
+M4_choose_op( iorn_n, , orl, yes)
+M4_choose_op( nior_n, yes, orl, )
+M4_choose_op( xor_n, , xorl, )
+M4_choose_op( xnor_n, yes, xorl, )
+
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+C void M4_function (mp_ptr dst, mp_srcptr src2, mp_srcptr src1, mp_size_t size);
+C
+
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC1, 12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_RP,`PARAM_SIZE')
+define(SAVE_VP,`PARAM_SRC1')
+define(SAVE_UP,`PARAM_DST')
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebx')
+define(`cnt', `%eax')
+define(`r1', `%ecx')
+define(`r2', `%edx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+deflit(`FRAME',0)
+
+PROLOGUE(M4_function)
+ mov PARAM_SIZE, cnt C size
+ mov rp, SAVE_RP
+ mov PARAM_DST, rp
+ mov up, SAVE_UP
+ mov PARAM_SRC1, up
+ shr cnt C size >> 1
+ mov vp, SAVE_VP
+ mov PARAM_SRC2, vp
+ mov (up), r1
+ jz L(end) C size == 1
+ jnc L(even) C size % 2 == 0
+
+ ALIGN(16)
+L(oop):
+M4pre(` notl_or_xorl_GMP_NUMB_MASK(r1)')
+ M4_inst (vp), r1
+ lea 8(up), up
+ mov -4(up), r2
+M4post(` notl_or_xorl_GMP_NUMB_MASK(r1)')
+ lea 8(vp), vp
+ mov r1, (rp)
+L(entry):
+M4pre(` notl_or_xorl_GMP_NUMB_MASK(r2)')
+ M4_inst -4(vp), r2
+ lea 8(rp), rp
+M4post(` notl_or_xorl_GMP_NUMB_MASK(r2)')
+ dec cnt
+ mov (up), r1
+ mov r2, -4(rp)
+ jnz L(oop)
+
+L(end):
+M4pre(` notl_or_xorl_GMP_NUMB_MASK(r1)')
+ mov SAVE_UP, up
+ M4_inst (vp), r1
+M4post(`notl_or_xorl_GMP_NUMB_MASK(r1)')
+ mov SAVE_VP, vp
+ mov r1, (rp)
+ mov SAVE_RP, rp
+ ret
+
+L(even):
+ mov r1, r2
+ lea 4(up), up
+ lea 4(vp), vp
+ lea -4(rp), rp
+ jmp L(entry)
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/atom/lshift.asm
new file mode 100644
index 0000000..f2c70dd
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/lshift.asm
@@ -0,0 +1,218 @@
+dnl Intel Atom mpn_lshift -- mpn left shift.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned cnt);
+
+C cycles/limb
+C cnt!=1 cnt==1
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 5 2.5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CNT, 16)
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_UP,`PARAM_CNT')
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_EBP,`PARAM_DST')
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`cnt', `%ecx')
+
+ASM_START()
+ TEXT
+ ALIGN(8)
+deflit(`FRAME',0)
+PROLOGUE(mpn_lshift)
+ mov PARAM_CNT, cnt
+ mov PARAM_SIZE, %edx
+ mov up, SAVE_UP
+ mov PARAM_SRC, up
+ push rp FRAME_pushl()
+ mov PARAM_DST, rp
+
+C We can use faster code for shift-by-1 under certain conditions.
+ cmp $1,cnt
+ jne L(normal)
+ cmpl rp, up
+ jnc L(special) C jump if s_ptr + 1 >= res_ptr
+ leal (up,%edx,4),%eax
+ cmpl %eax,rp
+ jnc L(special) C jump if res_ptr >= s_ptr + size
+
+L(normal):
+ lea -4(up,%edx,4), up
+ mov %ebx, SAVE_EBX
+ lea -4(rp,%edx,4), rp
+
+ shr %edx
+ mov (up), %eax
+ mov %edx, VAR_COUNT
+ jnc L(evn)
+
+ mov %eax, %ebx
+ shl %cl, %ebx
+ neg cnt
+ shr %cl, %eax
+ test %edx, %edx
+ jnz L(gt1)
+ mov %ebx, (rp)
+ jmp L(quit)
+
+L(gt1): mov %ebp, SAVE_EBP
+ push %eax
+ mov -4(up), %eax
+ mov %eax, %ebp
+ shr %cl, %eax
+ jmp L(lo1)
+
+L(evn): mov %ebp, SAVE_EBP
+ neg cnt
+ mov %eax, %ebp
+ mov -4(up), %edx
+ shr %cl, %eax
+ mov %edx, %ebx
+ shr %cl, %edx
+ neg cnt
+ decl VAR_COUNT
+ lea 4(rp), rp
+ lea -4(up), up
+ jz L(end)
+ push %eax FRAME_pushl()
+
+ ALIGN(8)
+L(top): shl %cl, %ebp
+ or %ebp, %edx
+ shl %cl, %ebx
+ neg cnt
+ mov -4(up), %eax
+ mov %eax, %ebp
+ mov %edx, -4(rp)
+ shr %cl, %eax
+ lea -8(rp), rp
+L(lo1): mov -8(up), %edx
+ or %ebx, %eax
+ mov %edx, %ebx
+ shr %cl, %edx
+ lea -8(up), up
+ neg cnt
+ mov %eax, (rp)
+ decl VAR_COUNT
+ jg L(top)
+
+ pop %eax FRAME_popl()
+L(end):
+ shl %cl, %ebp
+ shl %cl, %ebx
+ or %ebp, %edx
+ mov SAVE_EBP, %ebp
+ mov %edx, -4(rp)
+ mov %ebx, -8(rp)
+
+L(quit):
+ mov SAVE_UP, up
+ mov SAVE_EBX, %ebx
+ pop rp FRAME_popl()
+ ret
+
+L(special):
+deflit(`FRAME',4)
+ lea 3(%edx), %eax C size + 3
+ dec %edx C size - 1
+ mov (up), %ecx
+ shr $2, %eax C (size + 3) / 4
+ and $3, %edx C (size - 1) % 4
+ jz L(goloop) C jmp if size == 1 (mod 4)
+ shr %edx
+ jnc L(odd) C jum if size == 3 (mod 4)
+
+ add %ecx, %ecx
+ lea 4(up), up
+ mov %ecx, (rp)
+ mov (up), %ecx
+ lea 4(rp), rp
+
+ dec %edx
+ jnz L(goloop) C jump if size == 0 (mod 4)
+L(odd): lea -8(up), up
+ lea -8(rp), rp
+ jmp L(sentry) C reached if size == 2 or 3 (mod 4)
+
+L(sloop):
+ adc %ecx, %ecx
+ mov 4(up), %edx
+ mov %ecx, (rp)
+ adc %edx, %edx
+ mov 8(up), %ecx
+ mov %edx, 4(rp)
+L(sentry):
+ adc %ecx, %ecx
+ mov 12(up), %edx
+ mov %ecx, 8(rp)
+ adc %edx, %edx
+ lea 16(up), up
+ mov %edx, 12(rp)
+ lea 16(rp), rp
+ mov (up), %ecx
+L(goloop):
+ decl %eax
+ jnz L(sloop)
+
+L(squit):
+ adc %ecx, %ecx
+ mov %ecx, (rp)
+ adc %eax, %eax
+
+ mov SAVE_UP, up
+ pop rp FRAME_popl()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/lshiftc.asm b/vendor/gmp-6.3.0/mpn/x86/atom/lshiftc.asm
new file mode 100644
index 0000000..5be53ed
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/lshiftc.asm
@@ -0,0 +1,159 @@
+dnl Intel Atom mpn_lshiftc -- mpn left shift with complement.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_lshiftc (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned cnt);
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 5.5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CNT, 16)
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_UP,`PARAM_CNT')
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_EBP,`PARAM_DST')
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`cnt', `%ecx')
+
+ASM_START()
+ TEXT
+
+PROLOGUE(mpn_lshiftc)
+deflit(`FRAME',0)
+ mov PARAM_CNT, cnt
+ mov PARAM_SIZE, %edx
+ mov up, SAVE_UP
+ mov PARAM_SRC, up
+ push rp FRAME_pushl()
+ mov PARAM_DST, rp
+
+ lea -4(up,%edx,4), up
+ mov %ebx, SAVE_EBX
+ lea -4(rp,%edx,4), rp
+
+ shr %edx
+ mov (up), %eax
+ mov %edx, VAR_COUNT
+ jnc L(evn)
+
+ mov %eax, %ebx
+ shl %cl, %ebx
+ neg cnt
+ shr %cl, %eax
+ test %edx, %edx
+ jnz L(gt1)
+ not %ebx
+ mov %ebx, (rp)
+ jmp L(quit)
+
+L(gt1): mov %ebp, SAVE_EBP
+ push %eax
+ mov -4(up), %eax
+ mov %eax, %ebp
+ shr %cl, %eax
+ jmp L(lo1)
+
+L(evn): mov %ebp, SAVE_EBP
+ neg cnt
+ mov %eax, %ebp
+ mov -4(up), %edx
+ shr %cl, %eax
+ mov %edx, %ebx
+ shr %cl, %edx
+ neg cnt
+ decl VAR_COUNT
+ lea 4(rp), rp
+ lea -4(up), up
+ jz L(end)
+ push %eax FRAME_pushl()
+
+L(top): shl %cl, %ebp
+ or %ebp, %edx
+ shl %cl, %ebx
+ neg cnt
+ not %edx
+ mov -4(up), %eax
+ mov %eax, %ebp
+ mov %edx, -4(rp)
+ shr %cl, %eax
+ lea -8(rp), rp
+L(lo1): mov -8(up), %edx
+ or %ebx, %eax
+ mov %edx, %ebx
+ shr %cl, %edx
+ not %eax
+ lea -8(up), up
+ neg cnt
+ mov %eax, (rp)
+ decl VAR_COUNT
+ jg L(top)
+
+ pop %eax FRAME_popl()
+L(end):
+ shl %cl, %ebp
+ shl %cl, %ebx
+ or %ebp, %edx
+ mov SAVE_EBP, %ebp
+ not %edx
+ not %ebx
+ mov %edx, -4(rp)
+ mov %ebx, -8(rp)
+
+L(quit):
+ mov SAVE_UP, up
+ mov SAVE_EBX, %ebx
+ pop rp FRAME_popl()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/mmx/copyd.asm b/vendor/gmp-6.3.0/mpn/x86/atom/mmx/copyd.asm
new file mode 100644
index 0000000..b80fb03
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/mmx/copyd.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_copyd -- copy limb vector, decrementing.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86/k7/mmx/copyd.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/mmx/copyi.asm b/vendor/gmp-6.3.0/mpn/x86/atom/mmx/copyi.asm
new file mode 100644
index 0000000..49b6b8d
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/mmx/copyi.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_copyi -- copy limb vector, incrementing.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86/k7/mmx/copyi.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/mmx/hamdist.asm b/vendor/gmp-6.3.0/mpn/x86/atom/mmx/hamdist.asm
new file mode 100644
index 0000000..3fe8253
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/mmx/hamdist.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_hamdist -- hamming distance.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86/k7/mmx/popham.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/x86/atom/mod_34lsub1.asm
new file mode 100644
index 0000000..6d57ba3
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/mod_34lsub1.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_34lsub1)
+include_mpn(`x86/p6/mod_34lsub1.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/mode1o.asm b/vendor/gmp-6.3.0/mpn/x86/atom/mode1o.asm
new file mode 100644
index 0000000..c9ee6bd
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/mode1o.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_modexact_1_odd -- exact division style remainder.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_modexact_1_odd mpn_modexact_1c_odd)
+include_mpn(`x86/pentium/mode1o.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/atom/rshift.asm
new file mode 100644
index 0000000..1cb5dbe
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/rshift.asm
@@ -0,0 +1,152 @@
+dnl Intel Atom mpn_rshift -- mpn right shift.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl Converted from AMD64 by Marco Bodrato.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned cnt);
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CNT, 16)
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_UP,`PARAM_CNT')
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_EBP,`PARAM_DST')
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`cnt', `%ecx')
+
+ASM_START()
+ TEXT
+ ALIGN(8)
+deflit(`FRAME',0)
+PROLOGUE(mpn_rshift)
+ mov PARAM_CNT, cnt
+ mov PARAM_SIZE, %edx
+ mov up, SAVE_UP
+ mov PARAM_SRC, up
+ push rp FRAME_pushl()
+ mov PARAM_DST, rp
+ mov %ebx, SAVE_EBX
+
+ shr %edx
+ mov (up), %eax
+ mov %edx, VAR_COUNT
+ jnc L(evn)
+
+ mov %eax, %ebx
+ shr %cl, %ebx
+ neg cnt
+ shl %cl, %eax
+ test %edx, %edx
+ jnz L(gt1)
+ mov %ebx, (rp)
+ jmp L(quit)
+
+L(gt1): mov %ebp, SAVE_EBP
+ push %eax
+ mov 4(up), %eax
+ mov %eax, %ebp
+ shl %cl, %eax
+ jmp L(lo1)
+
+L(evn): mov %ebp, SAVE_EBP
+ neg cnt
+ mov %eax, %ebp
+ mov 4(up), %edx
+ shl %cl, %eax
+ mov %edx, %ebx
+ shl %cl, %edx
+ neg cnt
+ decl VAR_COUNT
+ lea -4(rp), rp
+ lea 4(up), up
+ jz L(end)
+ push %eax FRAME_pushl()
+
+ ALIGN(8)
+L(top): shr %cl, %ebp
+ or %ebp, %edx
+ shr %cl, %ebx
+ neg cnt
+ mov 4(up), %eax
+ mov %eax, %ebp
+ mov %edx, 4(rp)
+ shl %cl, %eax
+ lea 8(rp), rp
+L(lo1): mov 8(up), %edx
+ or %ebx, %eax
+ mov %edx, %ebx
+ shl %cl, %edx
+ lea 8(up), up
+ neg cnt
+ mov %eax, (rp)
+ decl VAR_COUNT
+ jg L(top)
+
+ pop %eax FRAME_popl()
+L(end):
+ shr %cl, %ebp
+ shr %cl, %ebx
+ or %ebp, %edx
+ mov SAVE_EBP, %ebp
+ mov %edx, 4(rp)
+ mov %ebx, 8(rp)
+
+L(quit):
+ mov SAVE_UP, up
+ mov SAVE_EBX, %ebx
+ pop rp FRAME_popl()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/sse2/aorsmul_1.asm b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/aorsmul_1.asm
new file mode 100644
index 0000000..969a14a
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/aorsmul_1.asm
@@ -0,0 +1,174 @@
+dnl x86-32 mpn_addmul_1 and mpn_submul_1 optimised for Intel Atom.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C cycles/limb
+C P5 -
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 8
+C AMD K6
+C AMD K7 -
+C AMD K8
+C AMD K10
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`n', `%ecx')
+
+ifdef(`OPERATION_addmul_1',`
+ define(ADDSUB, add)
+ define(func_1, mpn_addmul_1)
+ define(func_1c, mpn_addmul_1c)')
+ifdef(`OPERATION_submul_1',`
+ define(ADDSUB, sub)
+ define(func_1, mpn_submul_1)
+ define(func_1c, mpn_submul_1c)')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_1)
+ xor %edx, %edx
+L(ent): push %edi
+ push %esi
+ push %ebx
+ mov 16(%esp), rp
+ mov 20(%esp), up
+ mov 24(%esp), n
+ movd 28(%esp), %mm7
+ test $1, n
+ jz L(fi0or2)
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ shr $2, n
+ jnc L(fi1)
+
+L(fi3): lea -8(up), up
+ lea -8(rp), rp
+ movd 12(up), %mm1
+ movd %mm0, %ebx
+ pmuludq %mm7, %mm1
+ add $1, n C increment and clear carry
+ jmp L(lo3)
+
+L(fi1): movd %mm0, %ebx
+ jz L(wd1)
+ movd 4(up), %mm1
+ pmuludq %mm7, %mm1
+ jmp L(lo1)
+
+L(fi0or2):
+ movd (up), %mm1
+ pmuludq %mm7, %mm1
+ shr $2, n
+ movd 4(up), %mm0
+ jc L(fi2)
+ lea -4(up), up
+ lea -4(rp), rp
+ movd %mm1, %eax
+ pmuludq %mm7, %mm0
+ jmp L(lo0)
+
+L(fi2): lea 4(up), up
+ add $1, n C increment and clear carry
+ movd %mm1, %eax
+ lea -12(rp), rp
+ jmp L(lo2)
+
+C ALIGN(16) C alignment seems irrelevant
+L(top): movd 4(up), %mm1
+ adc $0, %edx
+ ADDSUB %eax, 12(rp)
+ movd %mm0, %ebx
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+L(lo1): psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ ADDSUB %ebx, (rp)
+L(lo0): psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ movd %mm0, %ebx
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ ADDSUB %eax, 4(rp)
+L(lo3): psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ ADDSUB %ebx, 8(rp)
+L(lo2): psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ dec n
+ jnz L(top)
+
+L(end): adc n, %edx C n is zero here
+ ADDSUB %eax, 12(rp)
+ movd %mm0, %ebx
+ lea 16(rp), rp
+L(wd1): psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %eax
+ adc n, %eax
+ ADDSUB %ebx, (rp)
+ emms
+ adc n, %eax
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+EPILOGUE()
+PROLOGUE(func_1c)
+ mov 20(%esp), %edx C carry
+ jmp L(ent)
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/sse2/bdiv_dbm1c.asm b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/bdiv_dbm1c.asm
new file mode 100644
index 0000000..782e914
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/bdiv_dbm1c.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_bdiv_dbm1.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_bdiv_dbm1c)
+include_mpn(`x86/pentium4/sse2/bdiv_dbm1c.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/sse2/divrem_1.asm b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/divrem_1.asm
new file mode 100644
index 0000000..f84709a
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/divrem_1.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_divrem_1 -- mpn by limb division.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_preinv_divrem_1 mpn_divrem_1c mpn_divrem_1)
+include_mpn(`x86/pentium4/sse2/divrem_1.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/sse2/mod_1_1.asm b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/mod_1_1.asm
new file mode 100644
index 0000000..ae6581d
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/mod_1_1.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom/SSE2 mpn_mod_1_1.
+
+dnl Copyright 2009, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_1_1p)
+include_mpn(`x86/pentium4/sse2/mod_1_1.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/sse2/mod_1_4.asm b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/mod_1_4.asm
new file mode 100644
index 0000000..31faa3f
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/mod_1_4.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom/SSE2 mpn_mod_1_4.
+
+dnl Copyright 2009, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_1s_4p)
+include_mpn(`x86/pentium4/sse2/mod_1_4.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/sse2/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/mul_1.asm
new file mode 100644
index 0000000..aa3bb97
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/mul_1.asm
@@ -0,0 +1,124 @@
+dnl Intel Atom mpn_mul_1.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C cycles/limb
+C P5 -
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 7.5
+C AMD K6 -
+C AMD K7 -
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_MUL, 16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+define(`rp', `%edx')
+define(`up', `%esi')
+define(`n', `%ecx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+deflit(`FRAME',0)
+
+PROLOGUE(mpn_mul_1c)
+ movd PARAM_CARRY, %mm6 C carry
+ jmp L(ent)
+EPILOGUE()
+
+ ALIGN(8) C for compact code
+PROLOGUE(mpn_mul_1)
+ pxor %mm6, %mm6
+L(ent): push %esi FRAME_pushl()
+ mov PARAM_SRC, up
+ mov PARAM_SIZE, %eax C size
+ movd PARAM_MUL, %mm7
+ movd (up), %mm0
+ mov %eax, n
+ and $3, %eax
+ pmuludq %mm7, %mm0
+ mov PARAM_DST, rp
+ jz L(lo0)
+ cmp $2, %eax
+ lea -16(up,%eax,4),up
+ lea -16(rp,%eax,4),rp
+ jc L(lo1)
+ jz L(lo2)
+ jmp L(lo3)
+
+ ALIGN(16)
+L(top): movd (up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+L(lo0): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+L(lo3): paddq %mm0, %mm6
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 4(rp)
+ psrlq $32, %mm6
+L(lo2): paddq %mm0, %mm6
+ movd 12(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 8(rp)
+ psrlq $32, %mm6
+L(lo1): paddq %mm0, %mm6
+ sub $4, n
+ movd %mm6, 12(rp)
+ lea 16(up), up
+ ja L(top)
+
+ psrlq $32, %mm6
+ movd %mm6, %eax
+ emms
+ pop %esi FRAME_popl()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/sse2/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/mul_basecase.asm
new file mode 100644
index 0000000..97d3aeb
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/mul_basecase.asm
@@ -0,0 +1,501 @@
+dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result in
+dnl a third limb vector.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
+C 4 large loops into one; we could use it for the outer loop branch.
+C * Optimise code outside of inner loops.
+C * Write combined addmul_1 feed-in a wind-down code, and use when iterating
+C outer each loop. ("Overlapping software pipelining")
+C * Postpone push of ebx until we know vn > 1. Perhaps use caller-saves regs
+C for inlined mul_1, allowing us to postpone all pushes.
+C * Perhaps write special code for vn <= un < M, for some small M.
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xn,
+C mp_srcptr yp, mp_size_t yn);
+C
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`un', `%ecx')
+define(`vp', `%ebp')
+define(`vn', `36(%esp)')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ push %edi
+ push %esi
+ push %ebx
+ push %ebp
+ mov 20(%esp), rp
+ mov 24(%esp), up
+ mov 28(%esp), un
+ mov 32(%esp), vp
+
+ movd (up), %mm0
+ movd (vp), %mm7
+ pmuludq %mm7, %mm0
+ pxor %mm6, %mm6
+
+ mov un, %eax
+ and $3, %eax
+ jz L(of0)
+ cmp $2, %eax
+ jc L(of1)
+ jz L(of2)
+
+C ================================================================
+ jmp L(m3)
+ ALIGN(16)
+L(lm3): movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(m3): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 4(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ sub $4, un
+ movd %mm6, 8(rp)
+ lea 16(up), up
+ ja L(lm3)
+
+ psrlq $32, %mm6
+ movd %mm6, 12(rp)
+
+ decl vn
+ jz L(done)
+ lea -8(rp), rp
+
+L(ol3): mov 28(%esp), un
+ neg un
+ lea 4(vp), vp
+ movd (vp), %mm7 C read next V limb
+ mov 24(%esp), up
+ lea 16(rp,un,4), rp
+
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ sar $2, un
+ movd 4(up), %mm1
+ movd %mm0, %ebx
+ pmuludq %mm7, %mm1
+ lea -8(up), up
+ xor %edx, %edx C zero edx and CF
+ jmp L(a3)
+
+L(la3): movd 4(up), %mm1
+ adc $0, %edx
+ add %eax, 12(rp)
+ movd %mm0, %ebx
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %ebx, (rp)
+ psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ movd %mm0, %ebx
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %eax, 4(rp)
+L(a3): psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %ebx, 8(rp)
+ psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ inc un
+ jnz L(la3)
+
+ adc un, %edx C un is zero here
+ add %eax, 12(rp)
+ movd %mm0, %ebx
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %eax
+ adc un, %eax
+ add %ebx, 16(rp)
+ adc un, %eax
+ mov %eax, 20(rp)
+
+ decl vn
+ jnz L(ol3)
+ jmp L(done)
+
+C ================================================================
+ ALIGN(16)
+L(lm0): movd (up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+L(of0): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 4(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 12(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ sub $4, un
+ movd %mm6, 12(rp)
+ lea 16(up), up
+ ja L(lm0)
+
+ psrlq $32, %mm6
+ movd %mm6, 16(rp)
+
+ decl vn
+ jz L(done)
+ lea -4(rp), rp
+
+L(ol0): mov 28(%esp), un
+ neg un
+ lea 4(vp), vp
+ movd (vp), %mm7 C read next V limb
+ mov 24(%esp), up
+ lea 20(rp,un,4), rp
+
+ movd (up), %mm1
+ pmuludq %mm7, %mm1
+ sar $2, un
+ movd 4(up), %mm0
+ lea -4(up), up
+ movd %mm1, %eax
+ pmuludq %mm7, %mm0
+ xor %edx, %edx C zero edx and CF
+ jmp L(a0)
+
+L(la0): movd 4(up), %mm1
+ adc $0, %edx
+ add %eax, 12(rp)
+ movd %mm0, %ebx
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %ebx, (rp)
+L(a0): psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ movd %mm0, %ebx
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %eax, 4(rp)
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %ebx, 8(rp)
+ psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ inc un
+ jnz L(la0)
+
+ adc un, %edx C un is zero here
+ add %eax, 12(rp)
+ movd %mm0, %ebx
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %eax
+ adc un, %eax
+ add %ebx, 16(rp)
+ adc un, %eax
+ mov %eax, 20(rp)
+
+ decl vn
+ jnz L(ol0)
+ jmp L(done)
+
+C ================================================================
+ ALIGN(16)
+L(lm1): movd -12(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd -8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -12(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(of1): paddq %mm0, %mm6
+ sub $4, un
+ movd %mm6, (rp)
+ lea 16(up), up
+ ja L(lm1)
+
+ psrlq $32, %mm6
+ movd %mm6, 4(rp)
+
+ decl vn
+ jz L(done)
+ lea -16(rp), rp
+
+L(ol1): mov 28(%esp), un
+ neg un
+ lea 4(vp), vp
+ movd (vp), %mm7 C read next V limb
+ mov 24(%esp), up
+ lea 24(rp,un,4), rp
+
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ sar $2, un
+ movd %mm0, %ebx
+ movd 4(up), %mm1
+ pmuludq %mm7, %mm1
+ xor %edx, %edx C zero edx and CF
+ inc un
+ jmp L(a1)
+
+L(la1): movd 4(up), %mm1
+ adc $0, %edx
+ add %eax, 12(rp)
+ movd %mm0, %ebx
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+L(a1): psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %ebx, (rp)
+ psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ movd %mm0, %ebx
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %eax, 4(rp)
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %ebx, 8(rp)
+ psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ inc un
+ jnz L(la1)
+
+ adc un, %edx C un is zero here
+ add %eax, 12(rp)
+ movd %mm0, %ebx
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %eax
+ adc un, %eax
+ add %ebx, 16(rp)
+ adc un, %eax
+ mov %eax, 20(rp)
+
+ decl vn
+ jnz L(ol1)
+ jmp L(done)
+
+C ================================================================
+ ALIGN(16)
+L(lm2): movd -8(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(of2): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ sub $4, un
+ movd %mm6, 4(rp)
+ lea 16(up), up
+ ja L(lm2)
+
+ psrlq $32, %mm6
+ movd %mm6, 8(rp)
+
+ decl vn
+ jz L(done)
+ lea -12(rp), rp
+
+L(ol2): mov 28(%esp), un
+ neg un
+ lea 4(vp), vp
+ movd (vp), %mm7 C read next V limb
+ mov 24(%esp), up
+ lea 12(rp,un,4), rp
+
+ movd (up), %mm1
+ pmuludq %mm7, %mm1
+ sar $2, un
+ movd 4(up), %mm0
+ lea 4(up), up
+ movd %mm1, %eax
+ xor %edx, %edx C zero edx and CF
+ jmp L(lo2)
+
+L(la2): movd 4(up), %mm1
+ adc $0, %edx
+ add %eax, 12(rp)
+ movd %mm0, %ebx
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %ebx, (rp)
+ psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ movd %mm0, %ebx
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %eax, 4(rp)
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %edx
+ movd %mm1, %eax
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %ebx, 8(rp)
+L(lo2): psrlq $32, %mm1
+ adc %edx, %eax
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ inc un
+ jnz L(la2)
+
+ adc un, %edx C un is zero here
+ add %eax, 12(rp)
+ movd %mm0, %ebx
+ psrlq $32, %mm0
+ adc %edx, %ebx
+ movd %mm0, %eax
+ adc un, %eax
+ add %ebx, 16(rp)
+ adc un, %eax
+ mov %eax, 20(rp)
+
+ decl vn
+ jnz L(ol2)
+C jmp L(done)
+
+C ================================================================
+L(done):
+ emms
+ pop %ebp
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/sse2/popcount.asm b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/popcount.asm
new file mode 100644
index 0000000..7847aec
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/popcount.asm
@@ -0,0 +1,35 @@
+dnl Intel Atom mpn_popcount -- population count.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/sse2/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/sqr_basecase.asm
new file mode 100644
index 0000000..af19ed8
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/sse2/sqr_basecase.asm
@@ -0,0 +1,634 @@
+dnl x86 mpn_sqr_basecase -- square an mpn number, optimised for atom.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
+C 4 large loops into one; we could use it for the outer loop branch.
+C * Optimise code outside of inner loops.
+C * Write combined addmul_1 feed-in a wind-down code, and use when iterating
+C outer each loop. ("Overlapping software pipelining")
+C * Perhaps use caller-saves regs for inlined mul_1, allowing us to postpone
+C all pushes.
+C * Perhaps write special code for n < M, for some small M.
+C * Replace inlined addmul_1 with smaller code from aorsmul_1.asm, or perhaps
+C with even less pipelined code.
+C * We run the outer loop until we have a 2-limb by 1-limb addmul_1 left.
+C Consider breaking out earlier, saving high the cost of short loops.
+
+C void mpn_sqr_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xn);
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`n', `%ecx')
+
+define(`un', `%ebp')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+ push %edi
+ push %esi
+ mov 12(%esp), rp
+ mov 16(%esp), up
+ mov 20(%esp), n
+
+ lea 4(rp), rp C write triangular product starting at rp[1]
+ dec n
+ movd (up), %mm7
+
+ jz L(one)
+ lea 4(up), up
+ push %ebx
+ push %ebp
+ mov n, %eax
+
+ movd (up), %mm0
+ neg n
+ pmuludq %mm7, %mm0
+ pxor %mm6, %mm6
+ mov n, un
+
+ and $3, %eax
+ jz L(of0)
+ cmp $2, %eax
+ jc L(of1)
+ jz L(of2)
+
+C ================================================================
+ jmp L(m3)
+ ALIGN(16)
+L(lm3): movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(m3): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 4(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ add $4, un
+ movd %mm6, 8(rp)
+ lea 16(up), up
+ js L(lm3)
+
+ psrlq $32, %mm6
+ movd %mm6, 12(rp)
+
+ inc n
+C jz L(done)
+ lea -12(up), up
+ lea 4(rp), rp
+ jmp L(ol2)
+
+C ================================================================
+ ALIGN(16)
+L(lm0): movd (up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+L(of0): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 4(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd 12(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, 8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ add $4, un
+ movd %mm6, 12(rp)
+ lea 16(up), up
+ js L(lm0)
+
+ psrlq $32, %mm6
+ movd %mm6, 16(rp)
+
+ inc n
+C jz L(done)
+ lea -8(up), up
+ lea 8(rp), rp
+ jmp L(ol3)
+
+C ================================================================
+ ALIGN(16)
+L(lm1): movd -12(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd -8(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -12(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(of1): paddq %mm0, %mm6
+ add $4, un
+ movd %mm6, (rp)
+ lea 16(up), up
+ js L(lm1)
+
+ psrlq $32, %mm6
+ movd %mm6, 4(rp)
+
+ inc n
+ jz L(done) C goes away when we add special n=2 code
+ lea -20(up), up
+ lea -4(rp), rp
+ jmp L(ol0)
+
+C ================================================================
+ ALIGN(16)
+L(lm2): movd -8(up), %mm0
+ pmuludq %mm7, %mm0
+ psrlq $32, %mm6
+ lea 16(rp), rp
+ paddq %mm0, %mm6
+ movd -4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -8(rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, -4(rp)
+ psrlq $32, %mm6
+L(of2): paddq %mm0, %mm6
+ movd 4(up), %mm0
+ pmuludq %mm7, %mm0
+ movd %mm6, (rp)
+ psrlq $32, %mm6
+ paddq %mm0, %mm6
+ add $4, un
+ movd %mm6, 4(rp)
+ lea 16(up), up
+ js L(lm2)
+
+ psrlq $32, %mm6
+ movd %mm6, 8(rp)
+
+ inc n
+C jz L(done)
+ lea -16(up), up
+C lea (rp), rp
+C jmp L(ol1)
+
+C ================================================================
+
+L(ol1): lea 4(up,n,4), up
+ movd (up), %mm7 C read next U invariant limb
+ lea 8(rp,n,4), rp
+ mov n, un
+
+ movd 4(up), %mm1
+ pmuludq %mm7, %mm1
+ sar $2, un
+ movd %mm1, %ebx
+ inc un
+ jz L(re1)
+
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ xor %edx, %edx C zero edx and CF
+ jmp L(a1)
+
+L(la1): adc $0, %edx
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %eax, (rp)
+L(a1): psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ movd %mm0, %eax
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %ebx, 4(rp)
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %eax, 8(rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ inc un
+ movd 4(up), %mm1
+ jnz L(la1)
+
+ adc un, %edx C un is zero here
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ adc un, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %eax
+ adc un, %eax
+ add %ebx, 4(rp)
+ adc un, %eax
+ mov %eax, 8(rp)
+
+ inc n
+
+C ================================================================
+
+L(ol0): lea (up,n,4), up
+ movd 4(up), %mm7 C read next U invariant limb
+ lea 4(rp,n,4), rp
+ mov n, un
+
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ sar $2, un
+ movd 12(up), %mm1
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ xor %edx, %edx C zero edx and CF
+ jmp L(a0)
+
+L(la0): adc $0, %edx
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ movd %mm0, %eax
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %ebx, 4(rp)
+L(a0): psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %eax, 8(rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ inc un
+ movd 4(up), %mm1
+ jnz L(la0)
+
+ adc un, %edx C un is zero here
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ adc un, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %eax
+ adc un, %eax
+ add %ebx, 4(rp)
+ adc un, %eax
+ mov %eax, 8(rp)
+
+ inc n
+
+C ================================================================
+
+L(ol3): lea 12(up,n,4), up
+ movd -8(up), %mm7 C read next U invariant limb
+ lea (rp,n,4), rp C put rp back
+ mov n, un
+
+ movd -4(up), %mm1
+ pmuludq %mm7, %mm1
+ sar $2, un
+ movd %mm1, %ebx
+ movd (up), %mm0
+ xor %edx, %edx C zero edx and CF
+ jmp L(a3)
+
+L(la3): adc $0, %edx
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ movd %mm0, %eax
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %ebx, 4(rp)
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %eax, 8(rp)
+L(a3): psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ inc un
+ movd 4(up), %mm1
+ jnz L(la3)
+
+ adc un, %edx C un is zero here
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ adc un, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %eax
+ adc un, %eax
+ add %ebx, 4(rp)
+ adc un, %eax
+ mov %eax, 8(rp)
+
+ inc n
+
+C ================================================================
+
+L(ol2): lea 8(up,n,4), up
+ movd -4(up), %mm7 C read next U invariant limb
+ lea 12(rp,n,4), rp
+ mov n, un
+
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ xor %edx, %edx
+ sar $2, un
+ movd 4(up), %mm1
+ test un, un C clear carry
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ inc un
+ jnz L(a2)
+ jmp L(re2)
+
+L(la2): adc $0, %edx
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+L(a2): psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ movd 8(up), %mm0
+ pmuludq %mm7, %mm0
+ adc $0, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ movd %mm0, %eax
+ movd 12(up), %mm1
+ pmuludq %mm7, %mm1
+ adc $0, %edx
+ add %ebx, 4(rp)
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ lea 16(up), up
+ movd (up), %mm0
+ adc $0, %edx
+ add %eax, 8(rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %edx
+ pmuludq %mm7, %mm0
+ inc un
+ movd 4(up), %mm1
+ jnz L(la2)
+
+ adc un, %edx C un is zero here
+ add %ebx, 12(rp)
+ movd %mm0, %eax
+ pmuludq %mm7, %mm1
+ lea 16(rp), rp
+ psrlq $32, %mm0
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ adc un, %edx
+ add %eax, (rp)
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %eax
+ adc un, %eax
+ add %ebx, 4(rp)
+ adc un, %eax
+ mov %eax, 8(rp)
+
+ inc n
+ jmp L(ol1)
+
+C ================================================================
+L(re2): psrlq $32, %mm0
+ movd (up), %mm7 C read next U invariant limb
+ adc %edx, %eax
+ movd %mm0, %edx
+ movd %mm1, %ebx
+ adc un, %edx
+ add %eax, (rp)
+ lea 4(rp), rp
+ psrlq $32, %mm1
+ adc %edx, %ebx
+ movd %mm1, %eax
+ movd 4(up), %mm1
+ adc un, %eax
+ add %ebx, (rp)
+ pmuludq %mm7, %mm1
+ adc un, %eax
+ mov %eax, 4(rp)
+ movd %mm1, %ebx
+
+L(re1): psrlq $32, %mm1
+ add %ebx, 4(rp)
+ movd %mm1, %eax
+ adc un, %eax
+ xor n, n C make n zeroness assumption below true
+ mov %eax, 8(rp)
+
+L(done): C n is zero here
+ mov 24(%esp), up
+ mov 28(%esp), %eax
+
+ movd (up), %mm0
+ inc %eax
+ pmuludq %mm0, %mm0
+ lea 4(up), up
+ mov 20(%esp), rp
+ shr %eax
+ movd %mm0, (rp)
+ psrlq $32, %mm0
+ lea -12(rp), rp
+ mov %eax, 28(%esp)
+ jnc L(odd)
+
+ movd %mm0, %ebp
+ movd (up), %mm0
+ lea 8(rp), rp
+ pmuludq %mm0, %mm0
+ lea -4(up), up
+ add 8(rp), %ebp
+ movd %mm0, %edx
+ adc 12(rp), %edx
+ rcr n
+ jmp L(ent)
+
+C ALIGN(16) C alignment seems irrelevant
+L(top): movd (up), %mm1
+ adc n, n
+ movd %mm0, %eax
+ pmuludq %mm1, %mm1
+ movd 4(up), %mm0
+ adc (rp), %eax
+ movd %mm1, %ebx
+ pmuludq %mm0, %mm0
+ psrlq $32, %mm1
+ adc 4(rp), %ebx
+ movd %mm1, %ebp
+ movd %mm0, %edx
+ adc 8(rp), %ebp
+ adc 12(rp), %edx
+ rcr n C FIXME: isn't this awfully slow on atom???
+ adc %eax, (rp)
+ adc %ebx, 4(rp)
+L(ent): lea 8(up), up
+ adc %ebp, 8(rp)
+ psrlq $32, %mm0
+ adc %edx, 12(rp)
+L(odd): decl 28(%esp)
+ lea 16(rp), rp
+ jnz L(top)
+
+L(end): adc n, n
+ movd %mm0, %eax
+ adc n, %eax
+ mov %eax, (rp)
+
+L(rtn): emms
+ pop %ebp
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+
+L(one): pmuludq %mm7, %mm7
+ movq %mm7, -4(rp)
+ emms
+ pop %esi
+ pop %edi
+ ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/sublsh1_n.asm b/vendor/gmp-6.3.0/mpn/x86/atom/sublsh1_n.asm
new file mode 100644
index 0000000..d3e7e5b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/sublsh1_n.asm
@@ -0,0 +1,34 @@
+dnl Intel Atom mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_sublsh1_n_ip1)
+include_mpn(`x86/k7/sublsh1_n.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/atom/sublsh2_n.asm b/vendor/gmp-6.3.0/mpn/x86/atom/sublsh2_n.asm
new file mode 100644
index 0000000..79405cf
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/atom/sublsh2_n.asm
@@ -0,0 +1,57 @@
+dnl Intel Atom mpn_addlsh2_n/mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2).
+
+dnl Contributed to the GNU project by Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 30)
+
+ifdef(`OPERATION_addlsh2_n', `
+ define(M4_inst, adcl)
+ define(M4_opp, subl)
+ define(M4_function, mpn_addlsh2_n)
+ define(M4_function_c, mpn_addlsh2_nc)
+ define(M4_ip_function_c, mpn_addlsh2_nc_ip1)
+ define(M4_ip_function, mpn_addlsh2_n_ip1)
+',`ifdef(`OPERATION_sublsh2_n', `
+ define(M4_inst, sbbl)
+ define(M4_opp, addl)
+ define(M4_function, mpn_sublsh2_n)
+ define(M4_function_c, mpn_sublsh2_nc)
+ define(M4_ip_function_c, mpn_sublsh2_nc_ip1)
+ define(M4_ip_function, mpn_sublsh2_n_ip1)
+',`m4_error(`Need OPERATION_addlsh2_n or OPERATION_sublsh2_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_sublsh2_n mpn_sublsh2_nc mpn_sublsh2_n_ip1 mpn_sublsh2_nc_ip1)
+
+include_mpn(`x86/atom/aorslshC_n.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/bd1/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/bd1/gmp-mparam.h
new file mode 100644
index 0000000..254cfea
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/bd1/gmp-mparam.h
@@ -0,0 +1,211 @@
+/* AMD bd1 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3600-3800 MHz Bulldozer Zambezi */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-27, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 15
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1 /* 59.59% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD 5
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 27
+
+#define DIV_1_VS_MUL_1_PERCENT 245
+
+#define MUL_TOOM22_THRESHOLD 32
+#define MUL_TOOM33_THRESHOLD 89
+#define MUL_TOOM44_THRESHOLD 154
+#define MUL_TOOM6H_THRESHOLD 230
+#define MUL_TOOM8H_THRESHOLD 351
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 101
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 111
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 46
+#define SQR_TOOM3_THRESHOLD 87
+#define SQR_TOOM4_THRESHOLD 216
+#define SQR_TOOM6_THRESHOLD 294
+#define SQR_TOOM8_THRESHOLD 442
+
+#define MULMID_TOOM42_THRESHOLD 50
+
+#define MULMOD_BNM1_THRESHOLD 22
+#define SQRMOD_BNM1_THRESHOLD 26
+
+#define MUL_FFT_MODF_THRESHOLD 636 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 636, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \
+ { 28, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
+ { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \
+ { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \
+ { 191,11}, { 63, 7}, { 1023, 8}, { 543,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335,11}, { 191,10}, \
+ { 399,11}, { 223,12}, { 127,11}, { 255,10}, \
+ { 543,11}, { 287,10}, { 607,11}, { 319,10}, \
+ { 639,12}, { 191,11}, { 383,10}, { 799,11}, \
+ { 415,13}, { 127,12}, { 255,11}, { 543,10}, \
+ { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \
+ { 1343,11}, { 735,12}, { 383,11}, { 799,10}, \
+ { 1599,11}, { 863,12}, { 447,11}, { 895,13}, \
+ { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
+ { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \
+ { 1471,13}, { 383,12}, { 767,11}, { 1599,12}, \
+ { 831,11}, { 1727,12}, { 895,14}, { 255,13}, \
+ { 511,12}, { 1087,11}, { 2239,10}, { 4479,12}, \
+ { 1215,13}, { 639,12}, { 1471,11}, { 2943,13}, \
+ { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \
+ { 1919,14}, { 511,13}, { 1023,12}, { 2239,11}, \
+ { 4479,13}, { 1151,12}, { 2495,11}, { 4991,13}, \
+ { 1279,12}, { 2623,13}, { 1407,12}, { 2943,14}, \
+ { 767,13}, { 1663,12}, { 3455,13}, { 1919,15}, \
+ { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \
+ { 2431,12}, { 4991,14}, { 1279,13}, { 2943,12}, \
+ { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \
+ { 3967,12}, { 7935,15}, { 1023,14}, { 2047,13}, \
+ { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \
+ { 2815,13}, { 5887,15}, { 1535,14}, { 3327,13}, \
+ { 6911,14}, { 3839,13}, { 7935,16} }
+#define MUL_FFT_TABLE3_SIZE 159
+#define MUL_FFT_THRESHOLD 6784
+
+#define SQR_FFT_MODF_THRESHOLD 565 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 565, 5}, { 29, 6}, { 15, 5}, { 32, 6}, \
+ { 17, 5}, { 35, 6}, { 29, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \
+ { 27, 7}, { 55, 8}, { 43, 9}, { 23, 8}, \
+ { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
+ { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95,11}, { 63,10}, { 159,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543,11}, { 159,10}, { 319, 9}, { 639,10}, \
+ { 335, 9}, { 671,11}, { 191,10}, { 415,11}, \
+ { 223,12}, { 127,11}, { 255,10}, { 543,11}, \
+ { 287,10}, { 607,11}, { 319,10}, { 671,12}, \
+ { 191,11}, { 383,10}, { 799,11}, { 415,10}, \
+ { 831,13}, { 127,12}, { 255,11}, { 543,10}, \
+ { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \
+ { 1343,11}, { 735,12}, { 383,11}, { 863,12}, \
+ { 447,11}, { 959,13}, { 255,12}, { 511,11}, \
+ { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \
+ { 1343,12}, { 703,13}, { 383,12}, { 767,11}, \
+ { 1535,12}, { 831,11}, { 1727,12}, { 895,11}, \
+ { 1791,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1087,11}, { 2239,10}, { 4479,12}, { 1215,13}, \
+ { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \
+ { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \
+ { 1023,12}, { 2239,11}, { 4479,13}, { 1151,12}, \
+ { 2495,11}, { 4991,13}, { 1279,12}, { 2623,13}, \
+ { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \
+ { 3455,13}, { 1919,15}, { 511,14}, { 1023,13}, \
+ { 2175,12}, { 4479,13}, { 2431,12}, { 4991,14}, \
+ { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \
+ { 3455,14}, { 1791,13}, { 3967,15}, { 1023,14}, \
+ { 2047,13}, { 4479,14}, { 2303,13}, { 4991,12}, \
+ { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \
+ { 3327,13}, { 6783,14}, { 3839,13}, { 7679,16} }
+#define SQR_FFT_TABLE3_SIZE 152
+#define SQR_FFT_THRESHOLD 5760
+
+#define MULLO_BASECASE_THRESHOLD 3
+#define MULLO_DC_THRESHOLD 31
+#define MULLO_MUL_N_THRESHOLD 13463
+#define SQRLO_BASECASE_THRESHOLD 0 /* always */
+#define SQRLO_DC_THRESHOLD 33
+#define SQRLO_SQR_THRESHOLD 11278
+
+#define DC_DIV_QR_THRESHOLD 52
+#define DC_DIVAPPR_Q_THRESHOLD 198
+#define DC_BDIV_QR_THRESHOLD 48
+#define DC_BDIV_Q_THRESHOLD 126
+
+#define INV_MULMOD_BNM1_THRESHOLD 82
+#define INV_NEWTON_THRESHOLD 212
+#define INV_APPR_THRESHOLD 202
+
+#define BINV_NEWTON_THRESHOLD 238
+#define REDC_1_TO_REDC_N_THRESHOLD 55
+
+#define MU_DIV_QR_THRESHOLD 1652
+#define MU_DIVAPPR_Q_THRESHOLD 1528
+#define MUPI_DIV_QR_THRESHOLD 110
+#define MU_BDIV_QR_THRESHOLD 1442
+#define MU_BDIV_Q_THRESHOLD 1528
+
+#define POWM_SEC_TABLE 1,20,96,386,1221,2698
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 21
+#define SET_STR_DC_THRESHOLD 100
+#define SET_STR_PRECOMPUTE_THRESHOLD 762
+
+#define FAC_DSC_THRESHOLD 118
+#define FAC_ODD_THRESHOLD 34
+
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD2_DIV1_METHOD 4 /* 1.22% faster than 3 */
+#define HGCD_THRESHOLD 67
+#define HGCD_APPR_THRESHOLD 150
+#define HGCD_REDUCE_THRESHOLD 3389
+#define GCD_DC_THRESHOLD 483
+#define GCDEXT_DC_THRESHOLD 345
+#define JACOBI_BASE_METHOD 4 /* 5.07% faster than 1 */
+
+/* Tuneup completed successfully, took 65358 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/bd2/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/bd2/gmp-mparam.h
new file mode 100644
index 0000000..6893da7
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/bd2/gmp-mparam.h
@@ -0,0 +1,214 @@
+/* AMD bd2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 4000-4200 MHz Piledriver Vishera */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 3
+#define MOD_1_UNNORM_THRESHOLD 4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1 /* 40.87% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD 5
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 24
+
+#define DIV_1_VS_MUL_1_PERCENT 254
+
+#define MUL_TOOM22_THRESHOLD 32
+#define MUL_TOOM33_THRESHOLD 73
+#define MUL_TOOM44_THRESHOLD 151
+#define MUL_TOOM6H_THRESHOLD 222
+#define MUL_TOOM8H_THRESHOLD 351
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 85
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 100
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 110
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 44
+#define SQR_TOOM3_THRESHOLD 93
+#define SQR_TOOM4_THRESHOLD 212
+#define SQR_TOOM6_THRESHOLD 318
+#define SQR_TOOM8_THRESHOLD 466
+
+#define MULMID_TOOM42_THRESHOLD 66
+
+#define MULMOD_BNM1_THRESHOLD 20
+#define SQRMOD_BNM1_THRESHOLD 23
+
+#define MUL_FFT_MODF_THRESHOLD 595 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 595, 5}, { 27, 6}, { 29, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 49, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \
+ { 191,11}, { 63,10}, { 143, 7}, { 1215, 9}, \
+ { 319, 8}, { 639, 9}, { 335, 8}, { 671, 9}, \
+ { 351,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 271,11}, { 159,10}, { 319, 9}, { 639,10}, \
+ { 335,11}, { 191,10}, { 399,11}, { 223,12}, \
+ { 127,11}, { 255,10}, { 543,11}, { 287,10}, \
+ { 607,11}, { 319,10}, { 671,12}, { 191,11}, \
+ { 383,10}, { 799,11}, { 415,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \
+ { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \
+ { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \
+ { 863,12}, { 447,11}, { 895,13}, { 255,12}, \
+ { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \
+ { 639,11}, { 1343,12}, { 703,11}, { 1471,13}, \
+ { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
+ { 1727,12}, { 895,14}, { 255,13}, { 511,12}, \
+ { 1087,11}, { 2239,12}, { 1215,13}, { 639,12}, \
+ { 1471,11}, { 2943,13}, { 767,12}, { 1727,13}, \
+ { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \
+ { 2239,13}, { 1151,12}, { 2431,13}, { 1279,12}, \
+ { 2623,13}, { 1407,12}, { 2943,14}, { 767,13}, \
+ { 1535,12}, { 3135,13}, { 1663,12}, { 3455,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \
+ { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \
+ { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \
+ { 3967,12}, { 7935,11}, { 15871,15}, { 1023,14}, \
+ { 2047,13}, { 4479,14}, { 2303,13}, { 4991,12}, \
+ { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \
+ { 3839,13}, { 7935,12}, { 15871,16} }
+#define MUL_FFT_TABLE3_SIZE 155
+#define MUL_FFT_THRESHOLD 6784
+
+#define SQR_FFT_MODF_THRESHOLD 555 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 555, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \
+ { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \
+ { 23, 7}, { 49, 8}, { 31, 7}, { 63, 8}, \
+ { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95,11}, \
+ { 63,10}, { 143, 9}, { 287,10}, { 159,11}, \
+ { 95,10}, { 191, 6}, { 3071, 5}, { 6399, 6}, \
+ { 3455, 7}, { 1791, 8}, { 959,10}, { 255, 9}, \
+ { 511,10}, { 271,11}, { 159,10}, { 319, 9}, \
+ { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \
+ { 191,10}, { 399, 9}, { 799,10}, { 415,11}, \
+ { 223,12}, { 127,11}, { 255,10}, { 543,11}, \
+ { 287,10}, { 607,11}, { 319,10}, { 671,11}, \
+ { 351,12}, { 191,11}, { 383,10}, { 799,11}, \
+ { 415,13}, { 127,12}, { 255,11}, { 543,10}, \
+ { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \
+ { 1343,11}, { 735,12}, { 383,11}, { 799,10}, \
+ { 1599,11}, { 863,12}, { 447,11}, { 927,13}, \
+ { 255,12}, { 511,11}, { 1055,10}, { 2111,11}, \
+ { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \
+ { 1343,12}, { 703,13}, { 383,12}, { 767,11}, \
+ { 1599,12}, { 831,11}, { 1727,10}, { 3455,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1023,11}, \
+ { 2111,12}, { 1087,11}, { 2239,10}, { 4479,12}, \
+ { 1215,13}, { 639,12}, { 1471,11}, { 2943,13}, \
+ { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \
+ { 1855,14}, { 511,13}, { 1023,12}, { 2239,13}, \
+ { 1151,12}, { 2495,13}, { 1279,12}, { 2623,13}, \
+ { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \
+ { 3455,13}, { 1791,15}, { 511,14}, { 1023,13}, \
+ { 2175,12}, { 4479,13}, { 2431,14}, { 1279,13}, \
+ { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \
+ { 1791,13}, { 3967,12}, { 7935,15}, { 1023,14}, \
+ { 2047,13}, { 4479,14}, { 2303,13}, { 4991,12}, \
+ { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \
+ { 3839,13}, { 7935,16} }
+#define SQR_FFT_TABLE3_SIZE 166
+#define SQR_FFT_THRESHOLD 5760
+
+#define MULLO_BASECASE_THRESHOLD 3
+#define MULLO_DC_THRESHOLD 34
+#define MULLO_MUL_N_THRESHOLD 13463
+#define SQRLO_BASECASE_THRESHOLD 8
+#define SQRLO_DC_THRESHOLD 43
+#define SQRLO_SQR_THRESHOLD 11278
+
+#define DC_DIV_QR_THRESHOLD 75
+#define DC_DIVAPPR_Q_THRESHOLD 200
+#define DC_BDIV_QR_THRESHOLD 71
+#define DC_BDIV_Q_THRESHOLD 119
+
+#define INV_MULMOD_BNM1_THRESHOLD 74
+#define INV_NEWTON_THRESHOLD 266
+#define INV_APPR_THRESHOLD 214
+
+#define BINV_NEWTON_THRESHOLD 278
+#define REDC_1_TO_REDC_N_THRESHOLD 71
+
+#define MU_DIV_QR_THRESHOLD 1652
+#define MU_DIVAPPR_Q_THRESHOLD 1589
+#define MUPI_DIV_QR_THRESHOLD 122
+#define MU_BDIV_QR_THRESHOLD 1442
+#define MU_BDIV_Q_THRESHOLD 1597
+
+#define POWM_SEC_TABLE 1,22,96,289,1259
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 20
+#define SET_STR_DC_THRESHOLD 173
+#define SET_STR_PRECOMPUTE_THRESHOLD 454
+
+#define FAC_DSC_THRESHOLD 90
+#define FAC_ODD_THRESHOLD 34
+
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD2_DIV1_METHOD 1 /* 5.80% faster than 3 */
+#define HGCD_THRESHOLD 74
+#define HGCD_APPR_THRESHOLD 50
+#define HGCD_REDUCE_THRESHOLD 3389
+#define GCD_DC_THRESHOLD 456
+#define GCDEXT_DC_THRESHOLD 345
+#define JACOBI_BASE_METHOD 4 /* 17.07% faster than 1 */
+
+/* Tuneup completed successfully, took 53914 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/bd4/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/bd4/gmp-mparam.h
new file mode 100644
index 0000000..6c20d0f
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/bd4/gmp-mparam.h
@@ -0,0 +1,225 @@
+/* AMD bd4 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3800-4200 MHz Excavator/Bristol Ridge */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 27
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 50
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1 /* 28.45% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD 4
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 13
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 28
+
+#define DIV_1_VS_MUL_1_PERCENT 314
+
+#define MUL_TOOM22_THRESHOLD 32
+#define MUL_TOOM33_THRESHOLD 73
+#define MUL_TOOM44_THRESHOLD 166
+#define MUL_TOOM6H_THRESHOLD 270
+#define MUL_TOOM8H_THRESHOLD 357
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 69
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 103
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 121
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 154
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 42
+#define SQR_TOOM3_THRESHOLD 89
+#define SQR_TOOM4_THRESHOLD 208
+#define SQR_TOOM6_THRESHOLD 306
+#define SQR_TOOM8_THRESHOLD 454
+
+#define MULMID_TOOM42_THRESHOLD 68
+
+#define MULMOD_BNM1_THRESHOLD 19
+#define SQRMOD_BNM1_THRESHOLD 18
+
+#define MUL_FFT_MODF_THRESHOLD 570 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 570, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
+ { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
+ { 47, 8}, { 95,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \
+ { 191,11}, { 63,10}, { 143, 6}, { 2303, 5}, \
+ { 4735, 4}, { 9471, 5}, { 4863, 7}, { 1279, 9}, \
+ { 335, 8}, { 671, 9}, { 351, 8}, { 703,10}, \
+ { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335, 9}, { 671, 8}, \
+ { 1343,10}, { 351, 9}, { 703,10}, { 367, 9}, \
+ { 735,11}, { 191,10}, { 383, 9}, { 767,10}, \
+ { 399, 9}, { 799, 8}, { 1599,10}, { 415,11}, \
+ { 223,12}, { 127,11}, { 255,10}, { 543, 9}, \
+ { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \
+ { 319,10}, { 671, 9}, { 1343,11}, { 351,12}, \
+ { 191,11}, { 383,10}, { 799,11}, { 415,10}, \
+ { 863,13}, { 127,12}, { 255,11}, { 543,10}, \
+ { 1087,11}, { 607,10}, { 1215, 9}, { 2431,12}, \
+ { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \
+ { 1471, 9}, { 2943,12}, { 383,11}, { 799,10}, \
+ { 1599,11}, { 863,10}, { 1727,12}, { 447,11}, \
+ { 959,10}, { 1919,13}, { 255,12}, { 511,11}, \
+ { 1087,12}, { 575,11}, { 1215,10}, { 2431,12}, \
+ { 639,11}, { 1343,12}, { 703,11}, { 1471,10}, \
+ { 2943,13}, { 383,12}, { 767,11}, { 1599,12}, \
+ { 831,11}, { 1727,10}, { 3455,12}, { 959,11}, \
+ { 1919,10}, { 3839,13}, { 511,12}, { 1087,11}, \
+ { 2239,12}, { 1215,11}, { 2431,13}, { 639,12}, \
+ { 1471,11}, { 2943,10}, { 5887,13}, { 767,12}, \
+ { 1727,11}, { 3455,13}, { 895,12}, { 1919,11}, \
+ { 3839,14}, { 511,13}, { 1023,12}, { 2239,13}, \
+ { 1151,12}, { 2431,13}, { 1279,12}, { 2559,13}, \
+ { 1407,12}, { 2943,11}, { 5887,14}, { 767,13}, \
+ { 1663,12}, { 3455,13}, { 1919,12}, { 3839,15}, \
+ { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \
+ { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \
+ { 1535,13}, { 3455,14}, { 1791,13}, { 3967,12}, \
+ { 7935,15}, { 1023,14}, { 2047,13}, { 4479,14}, \
+ { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \
+ { 5887,15}, { 1535,14}, { 3839,13}, { 7935,16} }
+#define MUL_FFT_TABLE3_SIZE 192
+#define MUL_FFT_THRESHOLD 5760
+
+#define SQR_FFT_MODF_THRESHOLD 476 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 476, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \
+ { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \
+ { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
+ { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \
+ { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
+ { 63, 9}, { 135,10}, { 95, 9}, { 191,10}, \
+ { 111,11}, { 63,10}, { 127, 9}, { 255,10}, \
+ { 143, 9}, { 287, 8}, { 575,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,10}, { 287, 9}, \
+ { 575,11}, { 159,10}, { 319, 9}, { 639,10}, \
+ { 335, 9}, { 671,10}, { 351, 9}, { 735,11}, \
+ { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
+ { 799,10}, { 415, 9}, { 863,12}, { 127,11}, \
+ { 255,10}, { 511, 9}, { 1023,10}, { 543,11}, \
+ { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \
+ { 671, 9}, { 1343,11}, { 351,10}, { 735,12}, \
+ { 191,11}, { 383,10}, { 799,11}, { 415,10}, \
+ { 863,13}, { 127,12}, { 255,11}, { 511,10}, \
+ { 1055,11}, { 543,10}, { 1087,11}, { 607,10}, \
+ { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \
+ { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \
+ { 1599,11}, { 863,10}, { 1727,12}, { 447,11}, \
+ { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \
+ { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \
+ { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \
+ { 1599,12}, { 831,11}, { 1727,12}, { 959,11}, \
+ { 1919,14}, { 255,13}, { 511,12}, { 1023,11}, \
+ { 2047,12}, { 1087,11}, { 2239,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \
+ { 767,12}, { 1727,13}, { 895,12}, { 1983,14}, \
+ { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \
+ { 2431,13}, { 1279,12}, { 2559,13}, { 1407,12}, \
+ { 2943,14}, { 767,13}, { 1663,12}, { 3455,13}, \
+ { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \
+ { 2175,12}, { 4479,13}, { 2431,14}, { 1279,13}, \
+ { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \
+ { 1791,13}, { 3967,15}, { 1023,14}, { 2047,13}, \
+ { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \
+ { 2815,13}, { 5887,15}, { 1535,14}, { 3839,16} }
+#define SQR_FFT_TABLE3_SIZE 176
+#define SQR_FFT_THRESHOLD 4736
+
+#define MULLO_BASECASE_THRESHOLD 3
+#define MULLO_DC_THRESHOLD 54
+#define MULLO_MUL_N_THRESHOLD 10950
+#define SQRLO_BASECASE_THRESHOLD 10
+#define SQRLO_DC_THRESHOLD 77
+#define SQRLO_SQR_THRESHOLD 9449
+
+#define DC_DIV_QR_THRESHOLD 84
+#define DC_DIVAPPR_Q_THRESHOLD 252
+#define DC_BDIV_QR_THRESHOLD 79
+#define DC_BDIV_Q_THRESHOLD 80
+
+#define INV_MULMOD_BNM1_THRESHOLD 71
+#define INV_NEWTON_THRESHOLD 254
+#define INV_APPR_THRESHOLD 266
+
+#define BINV_NEWTON_THRESHOLD 294
+#define REDC_1_TO_REDC_N_THRESHOLD 79
+
+#define MU_DIV_QR_THRESHOLD 1652
+#define MU_DIVAPPR_Q_THRESHOLD 1528
+#define MUPI_DIV_QR_THRESHOLD 122
+#define MU_BDIV_QR_THRESHOLD 1387
+#define MU_BDIV_Q_THRESHOLD 1528
+
+#define POWM_SEC_TABLE 1,16,96,480,960
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 19
+#define SET_STR_DC_THRESHOLD 264
+#define SET_STR_PRECOMPUTE_THRESHOLD 542
+
+#define FAC_DSC_THRESHOLD 91
+#define FAC_ODD_THRESHOLD 29
+
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD2_DIV1_METHOD 1 /* 9.73% faster than 3 */
+#define HGCD_THRESHOLD 55
+#define HGCD_APPR_THRESHOLD 50
+#define HGCD_REDUCE_THRESHOLD 3389
+#define GCD_DC_THRESHOLD 562
+#define GCDEXT_DC_THRESHOLD 416
+#define JACOBI_BASE_METHOD 4 /* 16.50% faster than 1 */
+
+/* Tuneup completed successfully, took 49179 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/bdiv_dbm1c.asm b/vendor/gmp-6.3.0/mpn/x86/bdiv_dbm1c.asm
new file mode 100644
index 0000000..0288c47
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/bdiv_dbm1c.asm
@@ -0,0 +1,129 @@
+dnl x86 mpn_bdiv_dbm1.
+
+dnl Copyright 2008, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12)
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan) 5.1
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood) 13.67
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom
+C AMD K6
+C AMD K7 3.5
+C AMD K8
+C AMD K10
+
+
+C TODO
+C * Optimize for more x86 processors
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_bdiv_dbm1c)
+ mov 16(%esp), %ecx C d
+ push %esi
+ mov 12(%esp), %esi C ap
+ push %edi
+ mov 12(%esp), %edi C qp
+ push %ebp
+ mov 24(%esp), %ebp C n
+ push %ebx
+
+ mov (%esi), %eax
+ mul %ecx
+ mov 36(%esp), %ebx
+ sub %eax, %ebx
+ mov %ebx, (%edi)
+ sbb %edx, %ebx
+
+ mov %ebp, %eax
+ and $3, %eax
+ jz L(b0)
+ cmp $2, %eax
+ jc L(b1)
+ jz L(b2)
+
+L(b3): lea -8(%esi), %esi
+ lea 8(%edi), %edi
+ add $-3, %ebp
+ jmp L(3)
+
+L(b0): mov 4(%esi), %eax
+ lea -4(%esi), %esi
+ lea 12(%edi), %edi
+ add $-4, %ebp
+ jmp L(0)
+
+L(b2): mov 4(%esi), %eax
+ lea 4(%esi), %esi
+ lea 4(%edi), %edi
+ add $-2, %ebp
+ jmp L(2)
+
+ ALIGN(8)
+L(top): mov 4(%esi), %eax
+ mul %ecx
+ lea 16(%edi), %edi
+ sub %eax, %ebx
+ mov 8(%esi), %eax
+ mov %ebx, -12(%edi)
+ sbb %edx, %ebx
+L(0): mul %ecx
+ sub %eax, %ebx
+ mov %ebx, -8(%edi)
+ sbb %edx, %ebx
+L(3): mov 12(%esi), %eax
+ mul %ecx
+ sub %eax, %ebx
+ mov %ebx, -4(%edi)
+ mov 16(%esi), %eax
+ lea 16(%esi), %esi
+ sbb %edx, %ebx
+L(2): mul %ecx
+ sub %eax, %ebx
+ mov %ebx, 0(%edi)
+ sbb %edx, %ebx
+L(b1): add $-4, %ebp
+ jns L(top)
+
+ mov %ebx, %eax
+ pop %ebx
+ pop %ebp
+ pop %edi
+ pop %esi
+ ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/x86/bdiv_q_1.asm
new file mode 100644
index 0000000..132de06
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/bdiv_q_1.asm
@@ -0,0 +1,208 @@
+dnl x86 mpn_bdiv_q_1 -- mpn by limb exact division.
+
+dnl Rearranged from mpn/x86/dive_1.asm by Marco Bodrato.
+
+dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P54 30.0
+C P55 29.0
+C P6 13.0 odd divisor, 12.0 even (strangely)
+C K6 14.0
+C K7 12.0
+C P4 42.0
+
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+
+defframe(PARAM_SHIFT, 24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(VAR_INVERSE,`PARAM_SRC')
+
+ TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse, int shift)
+
+ ALIGN(16)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SHIFT, %ecx
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_INVERSE, %eax
+ movl PARAM_SIZE, %ebp
+ pushl %ebx FRAME_pushl()
+L(common):
+ pushl %edi FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_SRC, %esi
+ movl PARAM_DST, %edi
+
+ leal (%esi,%ebp,4), %esi C src end
+ leal (%edi,%ebp,4), %edi C dst end
+ negl %ebp C -size
+
+ movl %eax, VAR_INVERSE
+ movl (%esi,%ebp,4), %eax C src[0]
+
+ xorl %ebx, %ebx
+ xorl %edx, %edx
+
+ incl %ebp
+ jz L(one)
+
+ movl (%esi,%ebp,4), %edx C src[1]
+
+ shrdl( %cl, %edx, %eax)
+
+ movl VAR_INVERSE, %edx
+ jmp L(entry)
+
+
+ ALIGN(8)
+ nop C k6 code alignment
+ nop
+L(top):
+ C eax q
+ C ebx carry bit, 0 or -1
+ C ecx shift
+ C edx carry limb
+ C esi src end
+ C edi dst end
+ C ebp counter, limbs, negative
+
+ movl -4(%esi,%ebp,4), %eax
+ subl %ebx, %edx C accumulate carry bit
+
+ movl (%esi,%ebp,4), %ebx
+
+ shrdl( %cl, %ebx, %eax)
+
+ subl %edx, %eax C apply carry limb
+ movl VAR_INVERSE, %edx
+
+ sbbl %ebx, %ebx
+
+L(entry):
+ imull %edx, %eax
+
+ movl %eax, -4(%edi,%ebp,4)
+ movl PARAM_DIVISOR, %edx
+
+ mull %edx
+
+ incl %ebp
+ jnz L(top)
+
+
+ movl -4(%esi), %eax C src high limb
+L(one):
+ shrl %cl, %eax
+ popl %esi FRAME_popl()
+
+ addl %ebx, %eax C apply carry bit
+
+ subl %edx, %eax C apply carry limb
+
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi)
+
+ popl %edi
+ popl %ebx
+ popl %ebp
+
+ ret
+
+EPILOGUE()
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+
+ ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ pushl %ebp FRAME_pushl()
+
+ movl $-1, %ecx C shift count
+ movl PARAM_SIZE, %ebp
+
+ pushl %ebx FRAME_pushl()
+
+L(strip_twos):
+ incl %ecx
+
+ shrl %eax
+ jnc L(strip_twos)
+
+ leal 1(%eax,%eax), %ebx C d without twos
+ andl $127, %eax C d/2, 7 bits
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %edx)
+ movzbl (%eax,%edx), %eax C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ leal (%eax,%eax), %edx C 2*inv
+ movl %ebx, PARAM_DIVISOR C d without twos
+ imull %eax, %eax C inv*inv
+ imull %ebx, %eax C inv*inv*d
+ subl %eax, %edx C inv = 2*inv - inv*inv*d
+
+ leal (%edx,%edx), %eax C 2*inv
+ imull %edx, %edx C inv*inv
+ imull %ebx, %edx C inv*inv*d
+ subl %edx, %eax C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ imull PARAM_DIVISOR, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ jmp L(common)
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/bt1/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/bt1/gmp-mparam.h
new file mode 100644
index 0000000..302dbc6
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/bt1/gmp-mparam.h
@@ -0,0 +1,218 @@
+/* x86/bobcat gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be greater than
+ the value in mpn/x86/k7/gmp-mparam.h. The latter is used as a hard limit in
+ k7/sqr_basecase.asm. */
+
+/* 1600 MHz AMD Bobcat Zacate E-350 */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-17, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 16
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 21
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1 /* 57.16% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD 3
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 36
+
+#define DIV_1_VS_MUL_1_PERCENT 199
+
+#define MUL_TOOM22_THRESHOLD 28
+#define MUL_TOOM33_THRESHOLD 93
+#define MUL_TOOM44_THRESHOLD 166
+#define MUL_TOOM6H_THRESHOLD 270
+#define MUL_TOOM8H_THRESHOLD 478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 102
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 177
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 169
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 143
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 50
+#define SQR_TOOM3_THRESHOLD 89
+#define SQR_TOOM4_THRESHOLD 248
+#define SQR_TOOM6_THRESHOLD 342
+#define SQR_TOOM8_THRESHOLD 470
+
+#define MULMID_TOOM42_THRESHOLD 72
+
+#define MULMOD_BNM1_THRESHOLD 20
+#define SQRMOD_BNM1_THRESHOLD 21
+
+#define MUL_FFT_MODF_THRESHOLD 630 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 630, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 15, 5}, { 31, 6}, { 27, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 49, 8}, { 27, 7}, { 55, 9}, \
+ { 15, 8}, { 31, 7}, { 63, 8}, { 43, 9}, \
+ { 23, 8}, { 55, 9}, { 31, 8}, { 67, 9}, \
+ { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
+ { 55,10}, { 31, 9}, { 79,10}, { 47, 6}, \
+ { 767, 7}, { 399, 6}, { 799, 7}, { 415, 8}, \
+ { 235, 7}, { 479, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \
+ { 159,11}, { 95,10}, { 191,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543,11}, { 159,10}, { 319, 9}, { 639,10}, \
+ { 335, 9}, { 671,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 399, 9}, { 799,11}, { 223,12}, \
+ { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \
+ { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \
+ { 671,11}, { 351,12}, { 191,11}, { 383,10}, \
+ { 799,11}, { 415,13}, { 127,12}, { 255,11}, \
+ { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \
+ { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \
+ { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \
+ { 863,12}, { 447,11}, { 991,13}, { 255,12}, \
+ { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \
+ { 639,11}, { 1343,12}, { 703,11}, { 1471,13}, \
+ { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
+ { 1727,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1215,13}, { 639,12}, { 1471,13}, { 767,12}, \
+ { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \
+ { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \
+ { 1407,14}, { 767,13}, { 1663,12}, { 3455,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \
+ { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \
+ { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \
+ { 3967,15}, { 1023,14}, { 2047,13}, { 4479,14}, \
+ { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \
+ { 5887,15}, { 1535,14}, { 3839,16} }
+#define MUL_FFT_TABLE3_SIZE 159
+#define MUL_FFT_THRESHOLD 7424
+
+#define SQR_FFT_MODF_THRESHOLD 500 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 500, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
+ { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 127, 6}, { 1087, 7}, { 575, 8}, { 303, 9}, \
+ { 159,10}, { 95,11}, { 63,10}, { 127, 9}, \
+ { 255,10}, { 143, 9}, { 287,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \
+ { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \
+ { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \
+ { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
+ { 799,10}, { 415, 9}, { 831,12}, { 127,11}, \
+ { 255,10}, { 543,11}, { 287,10}, { 607,11}, \
+ { 319,10}, { 671,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 799,11}, { 415,10}, \
+ { 831,13}, { 127,12}, { 255,11}, { 543,10}, \
+ { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \
+ { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \
+ { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \
+ { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \
+ { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \
+ { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \
+ { 1599,12}, { 831,11}, { 1727,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1215,13}, { 639,12}, \
+ { 1471,13}, { 767,12}, { 1727,13}, { 895,12}, \
+ { 1919,14}, { 511,13}, { 1023,12}, { 2111,13}, \
+ { 1151,12}, { 2431,13}, { 1407,14}, { 767,13}, \
+ { 1663,12}, { 3455,13}, { 1919,15}, { 511,14}, \
+ { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \
+ { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \
+ { 3455,14}, { 1791,13}, { 3839,15}, { 1023,14}, \
+ { 2047,13}, { 4479,14}, { 2303,13}, { 4991,12}, \
+ { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \
+ { 3839,16} }
+#define SQR_FFT_TABLE3_SIZE 161
+#define SQR_FFT_THRESHOLD 5760
+
+#define MULLO_BASECASE_THRESHOLD 9
+#define MULLO_DC_THRESHOLD 48
+#define MULLO_MUL_N_THRESHOLD 14281
+#define SQRLO_BASECASE_THRESHOLD 7
+#define SQRLO_DC_THRESHOLD 146
+#define SQRLO_SQR_THRESHOLD 11278
+
+#define DC_DIV_QR_THRESHOLD 77
+#define DC_DIVAPPR_Q_THRESHOLD 240
+#define DC_BDIV_QR_THRESHOLD 83
+#define DC_BDIV_Q_THRESHOLD 182
+
+#define INV_MULMOD_BNM1_THRESHOLD 74
+#define INV_NEWTON_THRESHOLD 252
+#define INV_APPR_THRESHOLD 252
+
+#define BINV_NEWTON_THRESHOLD 252
+#define REDC_1_TO_REDC_N_THRESHOLD 79
+
+#define MU_DIV_QR_THRESHOLD 1787
+#define MU_DIVAPPR_Q_THRESHOLD 1718
+#define MUPI_DIV_QR_THRESHOLD 122
+#define MU_BDIV_QR_THRESHOLD 1470
+#define MU_BDIV_Q_THRESHOLD 1713
+
+#define POWM_SEC_TABLE 1,16,96,563,1317,1867
+
+#define GET_STR_DC_THRESHOLD 19
+#define GET_STR_PRECOMPUTE_THRESHOLD 32
+#define SET_STR_DC_THRESHOLD 254
+#define SET_STR_PRECOMPUTE_THRESHOLD 907
+
+#define FAC_DSC_THRESHOLD 224
+#define FAC_ODD_THRESHOLD 55
+
+#define MATRIX22_STRASSEN_THRESHOLD 23
+#define HGCD2_DIV1_METHOD 3 /* 3.59% faster than 5 */
+#define HGCD_THRESHOLD 85
+#define HGCD_APPR_THRESHOLD 152
+#define HGCD_REDUCE_THRESHOLD 3389
+#define GCD_DC_THRESHOLD 531
+#define GCDEXT_DC_THRESHOLD 386
+#define JACOBI_BASE_METHOD 3 /* 0.92% faster than 1 */
+
+/* Tuneup completed successfully, took 159946 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/bt2/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/bt2/gmp-mparam.h
new file mode 100644
index 0000000..f936cb7
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/bt2/gmp-mparam.h
@@ -0,0 +1,214 @@
+/* x86/bobcat gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be greater than
+ the value in mpn/x86/k7/gmp-mparam.h. The latter is used as a hard limit in
+ k7/sqr_basecase.asm. */
+
+/* 2050 MHz AMD Jaguar/Kabini */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-24, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 4
+#define MOD_1_UNNORM_THRESHOLD 6
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1 /* 47.53% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 27
+
+#define DIV_1_VS_MUL_1_PERCENT 243
+
+#define MUL_TOOM22_THRESHOLD 32
+#define MUL_TOOM33_THRESHOLD 90
+#define MUL_TOOM44_THRESHOLD 154
+#define MUL_TOOM6H_THRESHOLD 286
+#define MUL_TOOM8H_THRESHOLD 478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 152
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 103
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 154
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 38
+#define SQR_TOOM3_THRESHOLD 126
+#define SQR_TOOM4_THRESHOLD 220
+#define SQR_TOOM6_THRESHOLD 318
+#define SQR_TOOM8_THRESHOLD 502
+
+#define MULMID_TOOM42_THRESHOLD 68
+
+#define MULMOD_BNM1_THRESHOLD 19
+#define SQRMOD_BNM1_THRESHOLD 25
+
+#define MUL_FFT_MODF_THRESHOLD 570 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 570, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 15, 5}, { 31, 6}, { 28, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 49, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95,11}, \
+ { 63,10}, { 159,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
+ { 271, 9}, { 543,10}, { 287,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335, 9}, { 671,11}, \
+ { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
+ { 799,10}, { 415,11}, { 223,12}, { 127,11}, \
+ { 255,10}, { 543,11}, { 287,10}, { 607, 9}, \
+ { 1215,11}, { 319,10}, { 671,11}, { 351,12}, \
+ { 191,11}, { 383,10}, { 799,11}, { 415,13}, \
+ { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \
+ { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \
+ { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \
+ { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \
+ { 991,13}, { 255,12}, { 511,11}, { 1087,12}, \
+ { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \
+ { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \
+ { 1599,12}, { 831,11}, { 1727,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1215,13}, { 639,12}, \
+ { 1471,13}, { 767,12}, { 1727,13}, { 895,12}, \
+ { 1919,14}, { 511,13}, { 1023,12}, { 2111,13}, \
+ { 1151,12}, { 2431,13}, { 1407,14}, { 767,13}, \
+ { 1663,12}, { 3455,13}, { 1919,15}, { 511,14}, \
+ { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \
+ { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \
+ { 3455,14}, { 1791,13}, { 3967,15}, { 1023,14}, \
+ { 2047,13}, { 4479,14}, { 2303,13}, { 4991,12}, \
+ { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \
+ { 3839,16} }
+#define MUL_FFT_TABLE3_SIZE 153
+#define MUL_FFT_THRESHOLD 5760
+
+#define SQR_FFT_MODF_THRESHOLD 530 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 530, 5}, { 27, 6}, { 15, 5}, { 31, 6}, \
+ { 28, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
+ { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \
+ { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \
+ { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
+ { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 95,11}, \
+ { 63,10}, { 143, 9}, { 287,10}, { 159,11}, \
+ { 95,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \
+ { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \
+ { 671,10}, { 351,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 399, 9}, { 799,12}, { 127,11}, \
+ { 255,10}, { 543,11}, { 287,10}, { 607, 9}, \
+ { 1215,11}, { 319,10}, { 671,11}, { 351,12}, \
+ { 191,11}, { 383,10}, { 799,11}, { 415,10}, \
+ { 831,13}, { 127,12}, { 255,11}, { 543,10}, \
+ { 1087,11}, { 607,10}, { 1215,12}, { 319,11}, \
+ { 671,10}, { 1343,11}, { 735,10}, { 1471,12}, \
+ { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \
+ { 447,11}, { 991,13}, { 255,12}, { 511,11}, \
+ { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \
+ { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \
+ { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \
+ { 1215,13}, { 639,12}, { 1471,13}, { 767,12}, \
+ { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \
+ { 1023,12}, { 2111,13}, { 1151,12}, { 2495,13}, \
+ { 1407,14}, { 767,13}, { 1663,12}, { 3455,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \
+ { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \
+ { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \
+ { 3967,15}, { 1023,14}, { 2047,13}, { 4479,14}, \
+ { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \
+ { 5887,15}, { 1535,14}, { 3839,16} }
+#define SQR_FFT_TABLE3_SIZE 151
+#define SQR_FFT_THRESHOLD 4736
+
+#define MULLO_BASECASE_THRESHOLD 8
+#define MULLO_DC_THRESHOLD 44
+#define MULLO_MUL_N_THRESHOLD 11278
+#define SQRLO_BASECASE_THRESHOLD 13
+#define SQRLO_DC_THRESHOLD 62
+#define SQRLO_SQR_THRESHOLD 8907
+
+#define DC_DIV_QR_THRESHOLD 79
+#define DC_DIVAPPR_Q_THRESHOLD 228
+#define DC_BDIV_QR_THRESHOLD 75
+#define DC_BDIV_Q_THRESHOLD 136
+
+#define INV_MULMOD_BNM1_THRESHOLD 90
+#define INV_NEWTON_THRESHOLD 260
+#define INV_APPR_THRESHOLD 236
+
+#define BINV_NEWTON_THRESHOLD 294
+#define REDC_1_TO_REDC_N_THRESHOLD 80
+
+#define MU_DIV_QR_THRESHOLD 1787
+#define MU_DIVAPPR_Q_THRESHOLD 1718
+#define MUPI_DIV_QR_THRESHOLD 118
+#define MU_BDIV_QR_THRESHOLD 1442
+#define MU_BDIV_Q_THRESHOLD 1652
+
+#define POWM_SEC_TABLE 1,16,96,615,865,1442
+
+#define GET_STR_DC_THRESHOLD 16
+#define GET_STR_PRECOMPUTE_THRESHOLD 27
+#define SET_STR_DC_THRESHOLD 252
+#define SET_STR_PRECOMPUTE_THRESHOLD 638
+
+#define FAC_DSC_THRESHOLD 141
+#define FAC_ODD_THRESHOLD 39
+
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD2_DIV1_METHOD 1 /* 13.65% faster than 3 */
+#define HGCD_THRESHOLD 81
+#define HGCD_APPR_THRESHOLD 66
+#define HGCD_REDUCE_THRESHOLD 3389
+#define GCD_DC_THRESHOLD 531
+#define GCDEXT_DC_THRESHOLD 345
+#define JACOBI_BASE_METHOD 1 /* 0.84% faster than 4 */
+
+/* Tuneup completed successfully, took 103818 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/cnd_aors_n.asm b/vendor/gmp-6.3.0/mpn/x86/cnd_aors_n.asm
new file mode 100644
index 0000000..74f4917
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/cnd_aors_n.asm
@@ -0,0 +1,124 @@
+dnl X86 mpn_cnd_add_n, mpn_cnd_sub_n
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C P5 ?
+C P6 model 0-8,10-12 ?
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 5.4
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) 14.5
+C P4 model 3-4 (Prescott) 21
+C Intel atom 11
+C AMD K6 ?
+C AMD K7 3.4
+C AMD K8 ?
+
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebp')
+define(`n', `%ecx')
+define(`cnd', `20(%esp)')
+define(`cy', `%edx')
+
+ifdef(`OPERATION_cnd_add_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func, mpn_cnd_add_n)')
+ifdef(`OPERATION_cnd_sub_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func, mpn_cnd_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ add $-16, %esp
+ mov %ebp, (%esp)
+ mov %ebx, 4(%esp)
+ mov %esi, 8(%esp)
+ mov %edi, 12(%esp)
+
+ C make cnd into a full mask
+ mov cnd, %eax
+ neg %eax
+ sbb %eax, %eax
+ mov %eax, cnd
+
+ C load parameters into registers
+ mov 24(%esp), rp
+ mov 28(%esp), up
+ mov 32(%esp), vp
+ mov 36(%esp), n
+
+ mov (vp), %eax
+ mov (up), %ebx
+
+ C put operand pointers just beyond their last limb
+ lea (vp,n,4), vp
+ lea (up,n,4), up
+ lea -4(rp,n,4), rp
+ neg n
+
+ and cnd, %eax
+ ADDSUB %eax, %ebx
+ sbb cy, cy
+ inc n
+ je L(end)
+
+ ALIGN(16)
+L(top): mov (vp,n,4), %eax
+ and cnd, %eax
+ mov %ebx, (rp,n,4)
+ mov (up,n,4), %ebx
+ add cy, cy
+ ADCSBB %eax, %ebx
+ sbb cy, cy
+ inc n
+ jne L(top)
+
+L(end): mov %ebx, (rp)
+ xor %eax, %eax
+ sub cy, %eax
+
+ mov (%esp), %ebp
+ mov 4(%esp), %ebx
+ mov 8(%esp), %esi
+ mov 12(%esp), %edi
+ add $16, %esp
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/copyd.asm b/vendor/gmp-6.3.0/mpn/x86/copyd.asm
new file mode 100644
index 0000000..51fa195
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/copyd.asm
@@ -0,0 +1,91 @@
+dnl x86 mpn_copyd -- copy limb vector, decrementing.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb startup (approx)
+C P5 1.0 40
+C P6 2.4 70
+C K6 1.0 55
+C K7 1.3 75
+C P4 2.6 175
+C
+C (Startup time includes some function call overheads.)
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size, working from high to low addresses.
+C
+C The code here is very generic and can be expected to be reasonable on all
+C the x86 family.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(mpn_copyd)
+ C eax saved esi
+ C ebx
+ C ecx counter
+ C edx saved edi
+ C esi src
+ C edi dst
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+ movl %esi, %eax
+
+ movl PARAM_SRC, %esi
+ movl %edi, %edx
+
+ movl PARAM_DST, %edi
+ leal -4(%esi,%ecx,4), %esi
+
+ leal -4(%edi,%ecx,4), %edi
+
+ std
+
+ rep
+ movsl
+
+ cld
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/copyi.asm b/vendor/gmp-6.3.0/mpn/x86/copyi.asm
new file mode 100644
index 0000000..f6b0354
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/copyi.asm
@@ -0,0 +1,99 @@
+dnl x86 mpn_copyi -- copy limb vector, incrementing.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb startup (approx)
+C P5 1.0 35
+C P6 0.75 45
+C K6 1.0 30
+C K7 1.3 65
+C P4 1.0 120
+C
+C (Startup time includes some function call overheads.)
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size, working from low to high addresses.
+C
+C The code here is very generic and can be expected to be reasonable on all
+C the x86 family.
+C
+C P6 - An MMX based copy was tried, but was found to be slower than a rep
+C movs in all cases. The fastest MMX found was 0.8 cycles/limb (when
+C fully aligned). A rep movs seems to have a startup time of about 15
+C cycles, but doing something special for small sizes could lead to a
+C branch misprediction that would destroy any saving. For now a plain
+C rep movs seems ok.
+C
+C K62 - We used to have a big chunk of code doing an MMX copy at 0.56 c/l if
+C aligned or a 1.0 rep movs if not. But that seemed excessive since
+C it only got an advantage half the time, and even then only showed it
+C above 50 limbs or so.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ TEXT
+ ALIGN(32)
+
+ C eax saved esi
+ C ebx
+ C ecx counter
+ C edx saved edi
+ C esi src
+ C edi dst
+ C ebp
+
+PROLOGUE(mpn_copyi)
+
+ movl PARAM_SIZE, %ecx
+ movl %esi, %eax
+
+ movl PARAM_SRC, %esi
+ movl %edi, %edx
+
+ movl PARAM_DST, %edi
+
+ cld C better safe than sorry, see mpn/x86/README
+
+ rep
+ movsl
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/core2/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/core2/gmp-mparam.h
new file mode 100644
index 0000000..8a44ad1
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/core2/gmp-mparam.h
@@ -0,0 +1,210 @@
+/* x86/core2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3000 MHz Penryn */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 8
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 3
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 2 /* 22.20% faster than 1 */
+#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 9
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 18
+
+#define DIV_1_VS_MUL_1_PERCENT 277
+
+#define MUL_TOOM22_THRESHOLD 24
+#define MUL_TOOM33_THRESHOLD 93
+#define MUL_TOOM44_THRESHOLD 136
+#define MUL_TOOM6H_THRESHOLD 300
+#define MUL_TOOM8H_THRESHOLD 478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 91
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 93
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 94
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 34
+#define SQR_TOOM3_THRESHOLD 117
+#define SQR_TOOM4_THRESHOLD 184
+#define SQR_TOOM6_THRESHOLD 262
+#define SQR_TOOM8_THRESHOLD 597
+
+#define MULMID_TOOM42_THRESHOLD 70
+
+#define MULMOD_BNM1_THRESHOLD 17
+#define SQRMOD_BNM1_THRESHOLD 25
+
+#define MUL_FFT_MODF_THRESHOLD 505 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 505, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \
+ { 29, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 29, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
+ { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
+ { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95,11}, \
+ { 63, 9}, { 255,10}, { 159,11}, { 95,10}, \
+ { 191,12}, { 63,11}, { 127,10}, { 271, 9}, \
+ { 543,10}, { 287,11}, { 159,10}, { 335, 9}, \
+ { 671,10}, { 351,11}, { 191,10}, { 399, 9}, \
+ { 799,11}, { 223,12}, { 127,11}, { 255,10}, \
+ { 543,11}, { 287,10}, { 607,11}, { 319,10}, \
+ { 671,11}, { 351,12}, { 191,11}, { 383,10}, \
+ { 799,11}, { 415,13}, { 127,12}, { 255,11}, \
+ { 543,10}, { 1087,11}, { 607,12}, { 319,11}, \
+ { 671,10}, { 1343,11}, { 735,10}, { 1471,12}, \
+ { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \
+ { 447,11}, { 959,13}, { 255,12}, { 511,11}, \
+ { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \
+ { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1087,11}, \
+ { 2239,12}, { 1215,13}, { 639,12}, { 1471,11}, \
+ { 2943,13}, { 767,12}, { 1727,13}, { 895,12}, \
+ { 1919,14}, { 511,13}, { 1023,12}, { 2239,13}, \
+ { 1151,12}, { 2431,13}, { 1407,12}, { 2943,14}, \
+ { 767,13}, { 1663,12}, { 3455,13}, { 1919,15}, \
+ { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \
+ { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \
+ { 1535,13}, { 3455,14}, { 1791,13}, { 3967,12}, \
+ { 7935,15}, { 1023,14}, { 2047,13}, { 4479,14}, \
+ { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \
+ { 5887,15}, { 1535,14}, { 3839,16} }
+#define MUL_FFT_TABLE3_SIZE 147
+#define MUL_FFT_THRESHOLD 6784
+
+#define SQR_FFT_MODF_THRESHOLD 464 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 464, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \
+ { 29, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 29, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \
+ { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
+ { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
+ { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \
+ { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
+ { 63, 9}, { 127,10}, { 79, 9}, { 159,10}, \
+ { 95,11}, { 63,10}, { 127, 9}, { 255,10}, \
+ { 143, 9}, { 287, 5}, { 4863, 6}, { 2495, 7}, \
+ { 1343, 8}, { 703, 9}, { 367,12}, { 63,11}, \
+ { 127,10}, { 303,11}, { 159,10}, { 319, 9}, \
+ { 639,10}, { 335, 9}, { 671,10}, { 351, 9}, \
+ { 703,10}, { 367,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 399, 9}, { 799,10}, { 415, 9}, \
+ { 831,12}, { 127,11}, { 255,10}, { 543,11}, \
+ { 287,10}, { 607,11}, { 319,10}, { 671,11}, \
+ { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
+ { 799,11}, { 415,10}, { 863,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \
+ { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \
+ { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \
+ { 863,12}, { 447,11}, { 959,13}, { 255,12}, \
+ { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \
+ { 639,11}, { 1343,12}, { 703,11}, { 1407,13}, \
+ { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
+ { 1727,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1215,13}, { 639,12}, { 1471,11}, { 2943,13}, \
+ { 767,12}, { 1727,13}, { 895,12}, { 1919,14}, \
+ { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \
+ { 2431,13}, { 1407,12}, { 2943,14}, { 767,13}, \
+ { 1663,12}, { 3455,13}, { 1919,15}, { 511,14}, \
+ { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \
+ { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \
+ { 3455,14}, { 1791,13}, { 3967,15}, { 1023,14}, \
+ { 2047,13}, { 4479,14}, { 2303,13}, { 4991,12}, \
+ { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \
+ { 3839,16} }
+#define SQR_FFT_TABLE3_SIZE 157
+#define SQR_FFT_THRESHOLD 5312
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 36
+#define MULLO_MUL_N_THRESHOLD 13463
+#define SQRLO_BASECASE_THRESHOLD 0 /* always */
+#define SQRLO_DC_THRESHOLD 140
+#define SQRLO_SQR_THRESHOLD 10393
+
+#define DC_DIV_QR_THRESHOLD 32
+#define DC_DIVAPPR_Q_THRESHOLD 116
+#define DC_BDIV_QR_THRESHOLD 76
+#define DC_BDIV_Q_THRESHOLD 180
+
+#define INV_MULMOD_BNM1_THRESHOLD 46
+#define INV_NEWTON_THRESHOLD 138
+#define INV_APPR_THRESHOLD 123
+
+#define BINV_NEWTON_THRESHOLD 306
+#define REDC_1_TO_REDC_N_THRESHOLD 82
+
+#define MU_DIV_QR_THRESHOLD 1499
+#define MU_DIVAPPR_Q_THRESHOLD 1442
+#define MUPI_DIV_QR_THRESHOLD 63
+#define MU_BDIV_QR_THRESHOLD 1442
+#define MU_BDIV_Q_THRESHOLD 1589
+
+#define POWM_SEC_TABLE 1,22,66,428,1035
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 18
+#define SET_STR_DC_THRESHOLD 732
+#define SET_STR_PRECOMPUTE_THRESHOLD 1118
+
+#define FAC_DSC_THRESHOLD 115
+#define FAC_ODD_THRESHOLD 50
+
+#define MATRIX22_STRASSEN_THRESHOLD 25
+#define HGCD2_DIV1_METHOD 1 /* 5.78% faster than 3 */
+#define HGCD_THRESHOLD 121
+#define HGCD_APPR_THRESHOLD 151
+#define HGCD_REDUCE_THRESHOLD 3259
+#define GCD_DC_THRESHOLD 368
+#define GCDEXT_DC_THRESHOLD 306
+#define JACOBI_BASE_METHOD 4 /* 14.19% faster than 1 */
+
+/* Tuneup completed successfully, took 67142 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/coreibwl/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/coreibwl/gmp-mparam.h
new file mode 100644
index 0000000..7b58cad
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/coreibwl/gmp-mparam.h
@@ -0,0 +1,216 @@
+/* x86/coreibwl gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3400-3800 MHz Intel Xeon E3-1285Lv4 Broadwell */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 15
+#define MOD_1_UNNORM_THRESHOLD 16
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 8
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1 /* 21.34% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD 14
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 29
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 19
+
+#define DIV_1_VS_MUL_1_PERCENT 295
+
+#define MUL_TOOM22_THRESHOLD 26
+#define MUL_TOOM33_THRESHOLD 97
+#define MUL_TOOM44_THRESHOLD 220
+#define MUL_TOOM6H_THRESHOLD 306
+#define MUL_TOOM8H_THRESHOLD 454
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 93
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 154
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 169
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 44
+#define SQR_TOOM3_THRESHOLD 134
+#define SQR_TOOM4_THRESHOLD 242
+#define SQR_TOOM6_THRESHOLD 342
+#define SQR_TOOM8_THRESHOLD 502
+
+#define MULMID_TOOM42_THRESHOLD 98
+
+#define MULMOD_BNM1_THRESHOLD 20
+#define SQRMOD_BNM1_THRESHOLD 23
+
+#define MUL_FFT_MODF_THRESHOLD 540 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 540, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \
+ { 16, 5}, { 33, 6}, { 17, 5}, { 36, 6}, \
+ { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \
+ { 39, 7}, { 21, 6}, { 43, 7}, { 23, 6}, \
+ { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 43, 8}, { 23, 7}, { 49, 8}, \
+ { 27, 7}, { 55, 9}, { 15, 8}, { 31, 7}, \
+ { 63, 8}, { 43, 9}, { 23, 8}, { 55,10}, \
+ { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 83, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
+ { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95, 9}, { 191,10}, { 111,11}, \
+ { 63,10}, { 143, 9}, { 287,10}, { 159,11}, \
+ { 95, 7}, { 1599, 8}, { 831, 9}, { 431, 8}, \
+ { 863, 9}, { 447,10}, { 239, 9}, { 479,10}, \
+ { 255, 9}, { 511,10}, { 287,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335, 9}, { 671,11}, \
+ { 191,10}, { 383, 9}, { 767,10}, { 399,11}, \
+ { 223,12}, { 127,11}, { 255,10}, { 511, 9}, \
+ { 1023,11}, { 287,10}, { 607,11}, { 319,10}, \
+ { 671,11}, { 351,12}, { 191,11}, { 383,10}, \
+ { 799,11}, { 415,13}, { 127,12}, { 255,11}, \
+ { 543,10}, { 1119,11}, { 607,12}, { 319,11}, \
+ { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \
+ { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \
+ { 959,13}, { 255,12}, { 511,11}, { 1119,12}, \
+ { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \
+ { 703,11}, { 1407,13}, { 383,12}, { 767,11}, \
+ { 1599,12}, { 831,11}, { 1727,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1215,13}, { 639,12}, \
+ { 1471,13}, { 767,12}, { 1727,13}, { 895,12}, \
+ { 1919,14}, { 511,13}, { 1023,12}, { 2239,13}, \
+ { 1151,12}, { 2431,13}, { 1279,12}, { 2623,13}, \
+ { 1407,12}, { 2815,14}, { 767,13}, { 1535,12}, \
+ { 3135,13}, { 1663,12}, { 3455,13}, { 1919,15}, \
+ { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \
+ { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \
+ { 1535,13}, { 3455,14}, { 1791,13}, { 3839,15}, \
+ { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \
+ { 4991,12}, { 9983,14}, { 2559,13}, { 5247,14}, \
+ { 2815,13}, { 5887,15}, { 1535,14}, { 3839,16} }
+#define MUL_FFT_TABLE3_SIZE 172
+#define MUL_FFT_THRESHOLD 7424
+
+#define SQR_FFT_MODF_THRESHOLD 472 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 472, 5}, { 29, 6}, { 15, 5}, { 33, 6}, \
+ { 37, 7}, { 19, 6}, { 40, 7}, { 29, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \
+ { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \
+ { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
+ { 55,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
+ { 39, 8}, { 83, 9}, { 47, 8}, { 95, 9}, \
+ { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \
+ { 159,11}, { 95,12}, { 63,11}, { 127,10}, \
+ { 271, 9}, { 543, 6}, { 4479, 7}, { 2431, 8}, \
+ { 1247, 7}, { 2495, 8}, { 1279,10}, { 351,11}, \
+ { 191,10}, { 399, 9}, { 799,10}, { 415,12}, \
+ { 127,11}, { 255,10}, { 543,11}, { 287,10}, \
+ { 607,11}, { 319,10}, { 639,11}, { 351,12}, \
+ { 191,11}, { 383,10}, { 799,11}, { 415,10}, \
+ { 831,13}, { 127,12}, { 255,11}, { 511,10}, \
+ { 1023,11}, { 543,10}, { 1087,11}, { 607,12}, \
+ { 319,11}, { 671,10}, { 1343,11}, { 735,12}, \
+ { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \
+ { 447,11}, { 927,13}, { 255,12}, { 511,11}, \
+ { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \
+ { 767,11}, { 1599,12}, { 831,11}, { 1663,12}, \
+ { 895,11}, { 1855,14}, { 255,13}, { 511,12}, \
+ { 1023,11}, { 2047,12}, { 1087,11}, { 2239,12}, \
+ { 1215,13}, { 639,12}, { 1471,13}, { 767,12}, \
+ { 1663,13}, { 895,12}, { 1983,14}, { 511,13}, \
+ { 1023,12}, { 2239,13}, { 1151,12}, { 2495,13}, \
+ { 1279,12}, { 2623,13}, { 1407,14}, { 767,13}, \
+ { 1535,12}, { 3135,13}, { 1663,12}, { 3455,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \
+ { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \
+ { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \
+ { 3839,15}, { 1023,14}, { 2047,13}, { 4479,14}, \
+ { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \
+ { 5887,15}, { 1535,14}, { 3327,13}, { 6783,14}, \
+ { 3839,16} }
+#define SQR_FFT_TABLE3_SIZE 157
+#define SQR_FFT_THRESHOLD 5568
+
+#define MULLO_BASECASE_THRESHOLD 16
+#define MULLO_DC_THRESHOLD 37
+#define MULLO_MUL_N_THRESHOLD 14281
+#define SQRLO_BASECASE_THRESHOLD 0 /* always */
+#define SQRLO_DC_THRESHOLD 137
+#define SQRLO_SQR_THRESHOLD 10821
+
+#define DC_DIV_QR_THRESHOLD 54
+#define DC_DIVAPPR_Q_THRESHOLD 146
+#define DC_BDIV_QR_THRESHOLD 98
+#define DC_BDIV_Q_THRESHOLD 218
+
+#define INV_MULMOD_BNM1_THRESHOLD 50
+#define INV_NEWTON_THRESHOLD 173
+#define INV_APPR_THRESHOLD 165
+
+#define BINV_NEWTON_THRESHOLD 278
+#define REDC_1_TO_REDC_N_THRESHOLD 79
+
+#define MU_DIV_QR_THRESHOLD 1787
+#define MU_DIVAPPR_Q_THRESHOLD 1787
+#define MUPI_DIV_QR_THRESHOLD 78
+#define MU_BDIV_QR_THRESHOLD 1589
+#define MU_BDIV_Q_THRESHOLD 1830
+
+#define POWM_SEC_TABLE 1,16,126,416,932
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 17
+#define SET_STR_DC_THRESHOLD 306
+#define SET_STR_PRECOMPUTE_THRESHOLD 894
+
+#define FAC_DSC_THRESHOLD 141
+#define FAC_ODD_THRESHOLD 34
+
+#define MATRIX22_STRASSEN_THRESHOLD 20
+#define HGCD2_DIV1_METHOD 3 /* 5.97% faster than 1 */
+#define HGCD_THRESHOLD 73
+#define HGCD_APPR_THRESHOLD 123
+#define HGCD_REDUCE_THRESHOLD 3664
+#define GCD_DC_THRESHOLD 562
+#define GCDEXT_DC_THRESHOLD 465
+#define JACOBI_BASE_METHOD 1 /* 31.16% faster than 3 */
+
+/* Tuneup completed successfully, took 35114 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/coreihwl/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/coreihwl/gmp-mparam.h
new file mode 100644
index 0000000..4c1b388
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/coreihwl/gmp-mparam.h
@@ -0,0 +1,216 @@
+/* x86/coreihwl gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3600-4000 MHz Intel Xeon E3-1271v3 Haswell */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-21, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 17
+#define MOD_1_UNNORM_THRESHOLD 17
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 9
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5
+#define USE_PREINV_DIVREM_1 1 /* native */
+/* From sky.gmplib.org, 2023-07-20 */
+#define DIV_QR_1N_PI1_METHOD 3 /* 5.86% faster than 1 */
+#define DIV_QR_1_NORM_THRESHOLD 13
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 21
+
+#define DIV_1_VS_MUL_1_PERCENT 296
+
+#define MUL_TOOM22_THRESHOLD 28
+#define MUL_TOOM33_THRESHOLD 108
+#define MUL_TOOM44_THRESHOLD 232
+#define MUL_TOOM6H_THRESHOLD 306
+#define MUL_TOOM8H_THRESHOLD 478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 109
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 183
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 113
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 44
+#define SQR_TOOM3_THRESHOLD 141
+#define SQR_TOOM4_THRESHOLD 384
+#define SQR_TOOM6_THRESHOLD 517
+#define SQR_TOOM8_THRESHOLD 698
+
+#define MULMID_TOOM42_THRESHOLD 98
+
+#define MULMOD_BNM1_THRESHOLD 20
+#define SQRMOD_BNM1_THRESHOLD 23
+
+#define MUL_FFT_MODF_THRESHOLD 565 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 565, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \
+ { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \
+ { 39, 7}, { 21, 6}, { 43, 7}, { 23, 6}, \
+ { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 43, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 7}, { 55, 9}, { 15, 8}, { 31, 7}, \
+ { 63, 8}, { 43, 9}, { 23, 8}, { 55, 9}, \
+ { 31, 8}, { 71, 9}, { 39, 8}, { 83, 9}, \
+ { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \
+ { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
+ { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \
+ { 95, 9}, { 191,10}, { 111,11}, { 63,10}, \
+ { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \
+ { 191, 6}, { 3199, 7}, { 1727, 9}, { 447,10}, \
+ { 239, 9}, { 479,10}, { 287,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335, 9}, { 671,11}, \
+ { 191,10}, { 399, 9}, { 799,10}, { 415,11}, \
+ { 223,12}, { 127,11}, { 255,10}, { 511, 9}, \
+ { 1023,10}, { 527,11}, { 287,10}, { 607,11}, \
+ { 319,10}, { 671,12}, { 191,11}, { 383,10}, \
+ { 799,11}, { 415,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,11}, { 543,10}, { 1087,11}, \
+ { 607,12}, { 319,11}, { 671,10}, { 1343,11}, \
+ { 735,12}, { 383,11}, { 799,10}, { 1599,11}, \
+ { 863,12}, { 447,11}, { 991,13}, { 255,12}, \
+ { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \
+ { 639,11}, { 1343,12}, { 703,11}, { 1407,13}, \
+ { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
+ { 1727,12}, { 959,11}, { 1919,14}, { 255,13}, \
+ { 511,12}, { 1087,11}, { 2239,12}, { 1215,13}, \
+ { 639,12}, { 1471,13}, { 767,12}, { 1727,13}, \
+ { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \
+ { 2239,13}, { 1151,12}, { 2431,13}, { 1279,12}, \
+ { 2623,13}, { 1407,14}, { 767,13}, { 1663,12}, \
+ { 3455,13}, { 1919,15}, { 511,14}, { 1023,13}, \
+ { 2175,12}, { 4479,13}, { 2431,14}, { 1279,13}, \
+ { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \
+ { 1791,13}, { 3967,15}, { 1023,14}, { 2047,13}, \
+ { 4479,14}, { 2303,13}, { 4991,14}, { 2559,13}, \
+ { 5375,14}, { 2815,13}, { 5887,15}, { 1535,14}, \
+ { 3839,16} }
+#define MUL_FFT_TABLE3_SIZE 165
+#define MUL_FFT_THRESHOLD 7808
+
+#define SQR_FFT_MODF_THRESHOLD 560 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 560, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \
+ { 16, 5}, { 33, 6}, { 17, 5}, { 36, 6}, \
+ { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 36, 7}, { 19, 6}, { 40, 7}, { 21, 6}, \
+ { 43, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \
+ { 23, 7}, { 49, 8}, { 27, 7}, { 55, 9}, \
+ { 15, 8}, { 31, 7}, { 63, 8}, { 43, 9}, \
+ { 23, 8}, { 55,10}, { 15, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95,11}, \
+ { 63,10}, { 143, 9}, { 287,10}, { 159,11}, \
+ { 95,12}, { 63,11}, { 127, 9}, { 511, 5}, \
+ { 8959, 7}, { 2431, 8}, { 1247, 7}, { 2495, 8}, \
+ { 1279, 9}, { 671,10}, { 367,11}, { 191,10}, \
+ { 399, 9}, { 799,10}, { 415,12}, { 127,11}, \
+ { 255,10}, { 527,11}, { 287,10}, { 607,11}, \
+ { 319,10}, { 671,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 799,11}, { 415,10}, \
+ { 831,13}, { 127,11}, { 543,10}, { 1119,11}, \
+ { 607,12}, { 319,11}, { 671,10}, { 1343,11}, \
+ { 735,12}, { 383,11}, { 863,12}, { 447,11}, \
+ { 991,12}, { 511,11}, { 1119,12}, { 575,11}, \
+ { 1215,12}, { 639,11}, { 1343,12}, { 703,13}, \
+ { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
+ { 1727,12}, { 959,11}, { 1983,13}, { 511,12}, \
+ { 1087,11}, { 2239,12}, { 1215,13}, { 639,12}, \
+ { 1471,13}, { 767,12}, { 1727,13}, { 895,12}, \
+ { 1983,14}, { 511,13}, { 1023,12}, { 2239,13}, \
+ { 1151,12}, { 2495,13}, { 1279,12}, { 2623,13}, \
+ { 1407,14}, { 767,13}, { 1663,12}, { 3455,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \
+ { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \
+ { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \
+ { 3967,15}, { 1023,14}, { 2047,13}, { 4479,14}, \
+ { 2303,13}, { 4991,12}, { 9983,14}, { 2559,13}, \
+ { 5119,14}, { 2815,13}, { 5887,15}, { 1535,14}, \
+ { 3327,13}, { 6911,14}, { 3839,16} }
+#define SQR_FFT_TABLE3_SIZE 159
+#define SQR_FFT_THRESHOLD 5568
+
+#define MULLO_BASECASE_THRESHOLD 17
+#define MULLO_DC_THRESHOLD 40
+#define MULLO_MUL_N_THRESHOLD 14281
+#define SQRLO_BASECASE_THRESHOLD 0 /* always */
+#define SQRLO_DC_THRESHOLD 141
+#define SQRLO_SQR_THRESHOLD 10821
+
+#define DC_DIV_QR_THRESHOLD 30
+#define DC_DIVAPPR_Q_THRESHOLD 190
+#define DC_BDIV_QR_THRESHOLD 67
+#define DC_BDIV_Q_THRESHOLD 254
+
+#define INV_MULMOD_BNM1_THRESHOLD 54
+#define INV_NEWTON_THRESHOLD 157
+#define INV_APPR_THRESHOLD 163
+
+#define BINV_NEWTON_THRESHOLD 236
+#define REDC_1_TO_REDC_N_THRESHOLD 79
+
+#define MU_DIV_QR_THRESHOLD 1895
+#define MU_DIVAPPR_Q_THRESHOLD 1718
+#define MUPI_DIV_QR_THRESHOLD 54
+#define MU_BDIV_QR_THRESHOLD 1589
+#define MU_BDIV_Q_THRESHOLD 1898
+
+#define POWM_SEC_TABLE 1,16,95,480,1442
+
+#define GET_STR_DC_THRESHOLD 10
+#define GET_STR_PRECOMPUTE_THRESHOLD 16
+#define SET_STR_DC_THRESHOLD 372
+#define SET_STR_PRECOMPUTE_THRESHOLD 1037
+
+#define FAC_DSC_THRESHOLD 141
+#define FAC_ODD_THRESHOLD 34
+
+#define MATRIX22_STRASSEN_THRESHOLD 21
+#define HGCD2_DIV1_METHOD 3 /* 6.26% faster than 1 */
+#define HGCD_THRESHOLD 70
+#define HGCD_APPR_THRESHOLD 129
+#define HGCD_REDUCE_THRESHOLD 3664
+#define GCD_DC_THRESHOLD 573
+#define GCDEXT_DC_THRESHOLD 483
+#define JACOBI_BASE_METHOD 1 /* 27.01% faster than 3 */
+
+/* Tuneup completed successfully, took 35232 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/coreinhm/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/coreinhm/gmp-mparam.h
new file mode 100644
index 0000000..4428b4b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/coreinhm/gmp-mparam.h
@@ -0,0 +1,223 @@
+/* x86/coreinhm gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2933-3200 MHz Intel Xeon X3470 Nehalem */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 36
+#define MOD_1_UNNORM_THRESHOLD 40
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 3
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1 /* 42.59% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 9
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 17
+
+#define DIV_1_VS_MUL_1_PERCENT 288
+
+#define MUL_TOOM22_THRESHOLD 24
+#define MUL_TOOM33_THRESHOLD 93
+#define MUL_TOOM44_THRESHOLD 214
+#define MUL_TOOM6H_THRESHOLD 306
+#define MUL_TOOM8H_THRESHOLD 430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 93
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 134
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 145
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 94
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 118
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 38
+#define SQR_TOOM3_THRESHOLD 133
+#define SQR_TOOM4_THRESHOLD 212
+#define SQR_TOOM6_THRESHOLD 318
+#define SQR_TOOM8_THRESHOLD 620
+
+#define MULMID_TOOM42_THRESHOLD 68
+
+#define MULMOD_BNM1_THRESHOLD 17
+#define SQRMOD_BNM1_THRESHOLD 23
+
+#define MUL_FFT_MODF_THRESHOLD 595 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 595, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \
+ { 17, 5}, { 35, 6}, { 28, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \
+ { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \
+ { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
+ { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 99, 9}, { 55,10}, \
+ { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \
+ { 191,11}, { 63, 9}, { 255,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \
+ { 159,10}, { 335,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 399,12}, { 127,11}, { 255,10}, \
+ { 511, 9}, { 1023,10}, { 543, 9}, { 1087,11}, \
+ { 287,10}, { 607,11}, { 319,10}, { 671,12}, \
+ { 191,11}, { 383,10}, { 767,13}, { 127,12}, \
+ { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \
+ { 1119,11}, { 607,12}, { 319,11}, { 671,10}, \
+ { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \
+ { 799,10}, { 1599,11}, { 863,10}, { 1727,12}, \
+ { 447,11}, { 959,13}, { 255,12}, { 511,11}, \
+ { 1119,12}, { 575,11}, { 1215,10}, { 2431,12}, \
+ { 639,11}, { 1343,12}, { 703,11}, { 1471,10}, \
+ { 2943,13}, { 383,12}, { 767,11}, { 1599,12}, \
+ { 831,11}, { 1727,10}, { 3455,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,11}, { 2239,10}, \
+ { 4479,12}, { 1215,11}, { 2431,13}, { 639,12}, \
+ { 1471,11}, { 2943,13}, { 767,12}, { 1727,11}, \
+ { 3455,13}, { 895,12}, { 1983,14}, { 511,13}, \
+ { 1023,12}, { 2239,11}, { 4479,13}, { 1151,12}, \
+ { 2431,13}, { 1279,12}, { 2559,13}, { 1407,12}, \
+ { 2943,11}, { 5887,14}, { 767,13}, { 1663,12}, \
+ { 3455,13}, { 1919,12}, { 3839,15}, { 511,14}, \
+ { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \
+ { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \
+ { 3455,14}, { 1791,13}, { 3967,15}, { 1023,14}, \
+ { 2047,13}, { 4479,14}, { 2303,13}, { 4991,12}, \
+ { 9983,14}, { 2815,13}, { 6015,15}, { 1535,14}, \
+ { 3839,13}, { 7679,16} }
+#define MUL_FFT_TABLE3_SIZE 170
+#define MUL_FFT_THRESHOLD 6784
+
+#define SQR_FFT_MODF_THRESHOLD 525 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 525, 5}, { 29, 6}, { 15, 5}, { 33, 6}, \
+ { 17, 5}, { 35, 6}, { 29, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 49, 8}, { 27, 7}, { 55, 9}, \
+ { 15, 8}, { 31, 7}, { 63, 8}, { 39, 9}, \
+ { 23, 8}, { 55, 9}, { 31, 8}, { 67, 9}, \
+ { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
+ { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
+ { 143, 9}, { 287,10}, { 159, 6}, { 2687, 7}, \
+ { 1407, 9}, { 367, 8}, { 735, 9}, { 383,10}, \
+ { 207, 9}, { 415,11}, { 127,10}, { 271, 9}, \
+ { 543,10}, { 287,11}, { 159,10}, { 319, 9}, \
+ { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \
+ { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
+ { 799,10}, { 415,12}, { 127,11}, { 255,10}, \
+ { 511, 9}, { 1023,10}, { 543,11}, { 287,10}, \
+ { 607,11}, { 319,10}, { 671,11}, { 351,12}, \
+ { 191,11}, { 383,10}, { 799,11}, { 415,13}, \
+ { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \
+ { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \
+ { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \
+ { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \
+ { 863,10}, { 1727,12}, { 447,11}, { 991,10}, \
+ { 1983,13}, { 255,12}, { 511,11}, { 1119,12}, \
+ { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \
+ { 767,11}, { 1599,12}, { 831,11}, { 1727,10}, \
+ { 3455,12}, { 895,11}, { 1791,12}, { 959,11}, \
+ { 1983,14}, { 255,13}, { 511,12}, { 1023,11}, \
+ { 2047,12}, { 1087,11}, { 2239,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \
+ { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \
+ { 1983,11}, { 3967,14}, { 511,13}, { 1023,12}, \
+ { 2239,13}, { 1151,12}, { 2495,13}, { 1279,12}, \
+ { 2623,13}, { 1407,12}, { 2943,14}, { 767,13}, \
+ { 1663,12}, { 3455,13}, { 1919,12}, { 3967,15}, \
+ { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \
+ { 2431,12}, { 4863,14}, { 1279,13}, { 2943,12}, \
+ { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \
+ { 3967,12}, { 7935,15}, { 1023,14}, { 2047,13}, \
+ { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \
+ { 2815,13}, { 5887,15}, { 1535,14}, { 3327,13}, \
+ { 6655,14}, { 3839,13}, { 7935,16} }
+#define SQR_FFT_TABLE3_SIZE 187
+#define SQR_FFT_THRESHOLD 5312
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 43
+#define MULLO_MUL_N_THRESHOLD 13463
+#define SQRLO_BASECASE_THRESHOLD 9
+#define SQRLO_DC_THRESHOLD 42
+#define SQRLO_SQR_THRESHOLD 10323
+
+#define DC_DIV_QR_THRESHOLD 43
+#define DC_DIVAPPR_Q_THRESHOLD 132
+#define DC_BDIV_QR_THRESHOLD 83
+#define DC_BDIV_Q_THRESHOLD 130
+
+#define INV_MULMOD_BNM1_THRESHOLD 46
+#define INV_NEWTON_THRESHOLD 189
+#define INV_APPR_THRESHOLD 167
+
+#define BINV_NEWTON_THRESHOLD 372
+#define REDC_1_TO_REDC_N_THRESHOLD 83
+
+#define MU_DIV_QR_THRESHOLD 1589
+#define MU_DIVAPPR_Q_THRESHOLD 1589
+#define MUPI_DIV_QR_THRESHOLD 97
+#define MU_BDIV_QR_THRESHOLD 1589
+#define MU_BDIV_Q_THRESHOLD 1718
+
+#define POWM_SEC_TABLE 1,28,96,473,803
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 16
+#define SET_STR_DC_THRESHOLD 145
+#define SET_STR_PRECOMPUTE_THRESHOLD 419
+
+#define FAC_DSC_THRESHOLD 114
+#define FAC_ODD_THRESHOLD 57
+
+#define MATRIX22_STRASSEN_THRESHOLD 20
+#define HGCD2_DIV1_METHOD 1 /* 1.03% faster than 3 */
+#define HGCD_THRESHOLD 117
+#define HGCD_APPR_THRESHOLD 137
+#define HGCD_REDUCE_THRESHOLD 3524
+#define GCD_DC_THRESHOLD 389
+#define GCDEXT_DC_THRESHOLD 318
+#define JACOBI_BASE_METHOD 4 /* 6.10% faster than 1 */
+
+/* Tuneup completed successfully, took 67994 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/coreisbr/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/coreisbr/gmp-mparam.h
new file mode 100644
index 0000000..23d708a
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/coreisbr/gmp-mparam.h
@@ -0,0 +1,215 @@
+/* x86/coreisbr gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3400-3800 MHz Intel Xeon E3-1270 Sandy Bridge */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-24, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 28
+#define MOD_1_UNNORM_THRESHOLD 26
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 4
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 2 /* 88.29% faster than 1 */
+#define DIV_QR_1_NORM_THRESHOLD 21
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 14
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 20
+
+#define DIV_1_VS_MUL_1_PERCENT 297
+
+#define MUL_TOOM22_THRESHOLD 32
+#define MUL_TOOM33_THRESHOLD 105
+#define MUL_TOOM44_THRESHOLD 190
+#define MUL_TOOM6H_THRESHOLD 294
+#define MUL_TOOM8H_THRESHOLD 478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 109
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 144
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 116
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 129
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 160
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 48
+#define SQR_TOOM3_THRESHOLD 163
+#define SQR_TOOM4_THRESHOLD 250
+#define SQR_TOOM6_THRESHOLD 354
+#define SQR_TOOM8_THRESHOLD 502
+
+#define MULMID_TOOM42_THRESHOLD 98
+
+#define MULMOD_BNM1_THRESHOLD 19
+#define SQRMOD_BNM1_THRESHOLD 23
+
+#define MUL_FFT_MODF_THRESHOLD 666 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 666, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \
+ { 28, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 36, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
+ { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \
+ { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \
+ { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \
+ { 71, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 99, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \
+ { 191,11}, { 63,10}, { 159, 7}, { 1343, 8}, \
+ { 703, 9}, { 367, 8}, { 735, 9}, { 383,10}, \
+ { 207,11}, { 127,10}, { 255, 9}, { 511,10}, \
+ { 271, 9}, { 543,10}, { 287,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335,11}, { 191,10}, \
+ { 383, 9}, { 767,11}, { 223,12}, { 127,11}, \
+ { 255,10}, { 543,11}, { 287,10}, { 607, 9}, \
+ { 1215,11}, { 319,10}, { 671,12}, { 191,11}, \
+ { 383,10}, { 799,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,11}, { 543,10}, { 1087,11}, \
+ { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \
+ { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \
+ { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \
+ { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \
+ { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \
+ { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \
+ { 1599,12}, { 831,11}, { 1727,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,11}, { 2239,12}, \
+ { 1215,13}, { 639,12}, { 1471,11}, { 2943,13}, \
+ { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \
+ { 1983,14}, { 511,13}, { 1023,12}, { 2239,13}, \
+ { 1151,12}, { 2495,13}, { 1279,12}, { 2623,13}, \
+ { 1407,12}, { 2943,14}, { 767,13}, { 1535,12}, \
+ { 3071,13}, { 1663,12}, { 3455,13}, { 1919,15}, \
+ { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \
+ { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \
+ { 1535,13}, { 3455,14}, { 1791,13}, { 3967,15}, \
+ { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \
+ { 4991,12}, { 9983,14}, { 2815,13}, { 5887,15}, \
+ { 1535,14}, { 3839,13}, { 7679,16} }
+#define MUL_FFT_TABLE3_SIZE 163
+#define MUL_FFT_THRESHOLD 7552
+
+#define SQR_FFT_MODF_THRESHOLD 570 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 570, 5}, { 28, 6}, { 15, 5}, { 32, 6}, \
+ { 17, 5}, { 35, 6}, { 29, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \
+ { 40, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \
+ { 23, 7}, { 49, 8}, { 27, 7}, { 55, 8}, \
+ { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
+ { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
+ { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 95,11}, { 63,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63, 8}, { 1023, 9}, \
+ { 543,11}, { 159,10}, { 319, 9}, { 639,10}, \
+ { 335,11}, { 191,10}, { 383, 9}, { 767,10}, \
+ { 399, 9}, { 799,12}, { 127,11}, { 255,10}, \
+ { 511, 9}, { 1023,10}, { 543,11}, { 287,10}, \
+ { 607,11}, { 319,10}, { 671,11}, { 351,12}, \
+ { 191,11}, { 383,10}, { 799,13}, { 127,12}, \
+ { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \
+ { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \
+ { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \
+ { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \
+ { 991,13}, { 255,12}, { 511,11}, { 1087,12}, \
+ { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \
+ { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \
+ { 1599,12}, { 831,11}, { 1727,12}, { 959,11}, \
+ { 1919,14}, { 255,13}, { 511,12}, { 1023,11}, \
+ { 2047,12}, { 1087,11}, { 2239,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1471,13}, { 767,12}, \
+ { 1727,13}, { 895,12}, { 1983,14}, { 511,13}, \
+ { 1023,12}, { 2239,13}, { 1151,12}, { 2495,13}, \
+ { 1279,12}, { 2623,13}, { 1407,12}, { 2943,14}, \
+ { 767,13}, { 1663,12}, { 3455,13}, { 1919,12}, \
+ { 3967,15}, { 511,14}, { 1023,13}, { 2175,12}, \
+ { 4479,13}, { 2431,12}, { 4863,14}, { 1279,13}, \
+ { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \
+ { 1791,13}, { 3967,15}, { 1023,14}, { 2047,13}, \
+ { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \
+ { 2559,13}, { 5119,14}, { 2815,13}, { 5887,15}, \
+ { 1535,14}, { 3839,13}, { 7679,16} }
+#define SQR_FFT_TABLE3_SIZE 163
+#define SQR_FFT_THRESHOLD 5760
+
+#define MULLO_BASECASE_THRESHOLD 16
+#define MULLO_DC_THRESHOLD 46
+#define MULLO_MUL_N_THRESHOLD 14281
+#define SQRLO_BASECASE_THRESHOLD 0 /* always */
+#define SQRLO_DC_THRESHOLD 159
+#define SQRLO_SQR_THRESHOLD 11317
+
+#define DC_DIV_QR_THRESHOLD 47
+#define DC_DIVAPPR_Q_THRESHOLD 191
+#define DC_BDIV_QR_THRESHOLD 107
+#define DC_BDIV_Q_THRESHOLD 232
+
+#define INV_MULMOD_BNM1_THRESHOLD 62
+#define INV_NEWTON_THRESHOLD 181
+#define INV_APPR_THRESHOLD 182
+
+#define BINV_NEWTON_THRESHOLD 378
+#define REDC_1_TO_REDC_N_THRESHOLD 91
+
+#define MU_DIV_QR_THRESHOLD 1858
+#define MU_DIVAPPR_Q_THRESHOLD 1858
+#define MUPI_DIV_QR_THRESHOLD 77
+#define MU_BDIV_QR_THRESHOLD 1830
+#define MU_BDIV_Q_THRESHOLD 2166
+
+#define POWM_SEC_TABLE 1,16,126,428,1442
+
+#define GET_STR_DC_THRESHOLD 10
+#define GET_STR_PRECOMPUTE_THRESHOLD 16
+#define SET_STR_DC_THRESHOLD 418
+#define SET_STR_PRECOMPUTE_THRESHOLD 1104
+
+#define FAC_DSC_THRESHOLD 149
+#define FAC_ODD_THRESHOLD 34
+
+#define MATRIX22_STRASSEN_THRESHOLD 21
+#define HGCD2_DIV1_METHOD 1 /* 5.54% faster than 4 */
+#define HGCD_THRESHOLD 66
+#define HGCD_APPR_THRESHOLD 135
+#define HGCD_REDUCE_THRESHOLD 4284
+#define GCD_DC_THRESHOLD 642
+#define GCDEXT_DC_THRESHOLD 465
+#define JACOBI_BASE_METHOD 3 /* 14.76% faster than 4 */
+
+/* Tuneup completed successfully, took 44241 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/darwin.m4 b/vendor/gmp-6.3.0/mpn/x86/darwin.m4
new file mode 100644
index 0000000..c449216
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/darwin.m4
@@ -0,0 +1,102 @@
+divert(-1)
+dnl Copyright 2007, 2011, 2012, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+define(`DARWIN')
+
+
+dnl Usage LEA(symbol,reg)
+dnl Usage LEAL(symbol_local_to_file,reg)
+dnl
+dnl We maintain lists of stuff to append in load_eip and darwin_bd. The
+dnl `index' stuff is needed to suppress repeated definitions. To avoid
+dnl getting fooled by "var" and "var1", we add 'bol ' (the end of
+dnl 'indirect_symbol') at the beginning and and a newline at the end. This
+dnl might be a bit fragile.
+
+define(`LEA',
+m4_assert_numargs(2)
+`ifdef(`PIC',`
+ifelse(index(defn(`load_eip'), `$2'),-1,
+`m4append(`load_eip',
+` TEXT
+ ALIGN(16)
+L(movl_eip_`'substr($2,1)):
+ movl (%esp), $2
+ ret_internal
+')')
+ifelse(index(defn(`darwin_bd'), `bol $1
+'),-1,
+`m4append(`darwin_bd',
+` .section __IMPORT,__pointers,non_lazy_symbol_pointers
+L($1`'$non_lazy_ptr):
+ .indirect_symbol $1
+ .long 0
+')')
+ call L(movl_eip_`'substr($2,1))
+ movl L($1`'$non_lazy_ptr)-.($2), $2
+',`
+ movl `$'$1, $2
+')')
+
+define(`LEAL',
+m4_assert_numargs(2)
+`ifdef(`PIC',`
+ifelse(index(defn(`load_eip'), `$2'),-1,
+`m4append(`load_eip',
+` TEXT
+ ALIGN(16)
+L(movl_eip_`'substr($2,1)):
+ movl (%esp), $2
+ ret_internal
+')')
+ call L(movl_eip_`'substr($2,1))
+ leal $1-.($2), $2
+',`
+ movl `$'$1, $2
+')')
+
+
+dnl ASM_END
+
+define(`ASM_END',`load_eip`'darwin_bd')
+
+define(`load_eip', `') dnl updated in LEA
+define(`darwin_bd', `') dnl updated in LEA
+
+
+dnl Usage: CALL(funcname)
+dnl
+
+define(`CALL',
+m4_assert_numargs(1)
+`call GSYM_PREFIX`'$1')
+
+undefine(`PIC_WITH_EBX')
+
+divert`'dnl
diff --git a/vendor/gmp-6.3.0/mpn/x86/dive_1.asm b/vendor/gmp-6.3.0/mpn/x86/dive_1.asm
new file mode 100644
index 0000000..5bb0f45
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/dive_1.asm
@@ -0,0 +1,190 @@
+dnl x86 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P54 30.0
+C P55 29.0
+C P6 13.0 odd divisor, 12.0 even (strangely)
+C K6 14.0
+C K7 12.0
+C P4 42.0
+
+
+C mp_limb_t mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(VAR_INVERSE,`PARAM_SRC')
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_SIZE, %ebp
+ pushl %edi FRAME_pushl()
+
+ pushl %ebx FRAME_pushl()
+ movl $-1, %ecx C shift count
+
+ pushl %esi FRAME_pushl()
+
+L(strip_twos):
+ incl %ecx
+
+ shrl %eax
+ jnc L(strip_twos)
+
+ leal 1(%eax,%eax), %ebx C d without twos
+ andl $127, %eax C d/2, 7 bits
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %edx)
+ movzbl (%eax,%edx), %eax C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ leal (%eax,%eax), %edx C 2*inv
+ movl %ebx, PARAM_DIVISOR C d without twos
+
+ imull %eax, %eax C inv*inv
+
+ movl PARAM_SRC, %esi
+ movl PARAM_DST, %edi
+
+ imull %ebx, %eax C inv*inv*d
+
+ subl %eax, %edx C inv = 2*inv - inv*inv*d
+ leal (%edx,%edx), %eax C 2*inv
+
+ imull %edx, %edx C inv*inv
+
+ leal (%esi,%ebp,4), %esi C src end
+ leal (%edi,%ebp,4), %edi C dst end
+ negl %ebp C -size
+
+ imull %ebx, %edx C inv*inv*d
+
+ subl %edx, %eax C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ imull PARAM_DIVISOR, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ movl %eax, VAR_INVERSE
+ movl (%esi,%ebp,4), %eax C src[0]
+
+ xorl %ebx, %ebx
+ xorl %edx, %edx
+
+ incl %ebp
+ jz L(one)
+
+ movl (%esi,%ebp,4), %edx C src[1]
+
+ shrdl( %cl, %edx, %eax)
+
+ movl VAR_INVERSE, %edx
+ jmp L(entry)
+
+
+ ALIGN(8)
+ nop C k6 code alignment
+ nop
+L(top):
+ C eax q
+ C ebx carry bit, 0 or -1
+ C ecx shift
+ C edx carry limb
+ C esi src end
+ C edi dst end
+ C ebp counter, limbs, negative
+
+ movl -4(%esi,%ebp,4), %eax
+ subl %ebx, %edx C accumulate carry bit
+
+ movl (%esi,%ebp,4), %ebx
+
+ shrdl( %cl, %ebx, %eax)
+
+ subl %edx, %eax C apply carry limb
+ movl VAR_INVERSE, %edx
+
+ sbbl %ebx, %ebx
+
+L(entry):
+ imull %edx, %eax
+
+ movl %eax, -4(%edi,%ebp,4)
+ movl PARAM_DIVISOR, %edx
+
+ mull %edx
+
+ incl %ebp
+ jnz L(top)
+
+
+ movl -4(%esi), %eax C src high limb
+L(one):
+ shrl %cl, %eax
+ popl %esi FRAME_popl()
+
+ addl %ebx, %eax C apply carry bit
+ popl %ebx FRAME_popl()
+
+ subl %edx, %eax C apply carry limb
+
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi)
+
+ popl %edi
+ popl %ebp
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/divrem_1.asm b/vendor/gmp-6.3.0/mpn/x86/divrem_1.asm
new file mode 100644
index 0000000..255d493
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/divrem_1.asm
@@ -0,0 +1,233 @@
+dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
+
+dnl Copyright 1999-2003, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C 486 approx 43 maybe
+C P5 44
+C P6 39
+C P6MMX 39
+C K6 22
+C K7 42
+C P4 58
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C
+C Divide src,size by divisor and store the quotient in dst+xsize,size.
+C Extend the division to fractional quotient limbs in dst,xsize. Return the
+C remainder. Either or both xsize and size can be 0.
+C
+C mpn_divrem_1c takes a carry parameter which is an initial high limb,
+C effectively one extra limb at the top of src,size. Must have
+C carry<divisor.
+C
+C
+C Essentially the code is the same as the division based part of
+C mpn/generic/divrem_1.c, but has the advantage that we get the desired divl
+C instruction even when gcc is not being used (when longlong.h only has the
+C rather slow generic C udiv_qrnnd().
+C
+C A test is done to see if the high limb is less than the divisor, and if so
+C one less div is done. A div is between 20 and 40 cycles on the various
+C x86s, so assuming high<divisor about half the time, then this test saves
+C half that amount. The branch misprediction penalty on each chip is less
+C than half a div.
+C
+C
+C Notes for P5:
+C
+C It might be thought that moving the load down to pair with the store would
+C save 1 cycle, but that doesn't seem to happen in practice, and in any case
+C would be a mere 2.2% saving, so it's hardly worth bothering about.
+C
+C A mul-by-inverse might be a possibility for P5, as done in
+C mpn/x86/pentium/mod_1.asm. The number of auxiliary instructions required
+C is a hinderance, but there could be a 10-15% speedup available.
+C
+C
+C Notes for K6:
+C
+C K6 has its own version of this code, using loop and paying attention to
+C cache line boundary crossings. The target 20 c/l can be had with the
+C decl+jnz of the present code by pairing up the load and store in the
+C loops. But it's considered easier not to introduce complexity just for
+C that, but instead let k6 have its own code.
+C
+
+defframe(PARAM_CARRY, 24)
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC, 12)
+defframe(PARAM_XSIZE, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %edi FRAME_pushl()
+
+ movl PARAM_SRC, %edi
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_DIVISOR, %esi
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_DST, %ebx
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_XSIZE, %ebp
+ orl %ecx, %ecx
+
+ movl PARAM_CARRY, %edx
+ jz L(fraction)
+
+ leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
+ jmp L(integer_top)
+
+EPILOGUE()
+
+
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %edi FRAME_pushl()
+
+ movl PARAM_SRC, %edi
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_DIVISOR, %esi
+ orl %ecx,%ecx
+
+ jz L(size_zero)
+ pushl %ebx FRAME_pushl()
+
+ movl -4(%edi,%ecx,4), %eax C src high limb
+ xorl %edx, %edx
+
+ movl PARAM_DST, %ebx
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_XSIZE, %ebp
+ cmpl %esi, %eax
+
+ leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
+ jae L(integer_entry)
+
+
+ C high<divisor, so high of dst is zero, and avoid one div
+
+ movl %edx, (%ebx,%ecx,4)
+ decl %ecx
+
+ movl %eax, %edx
+ jz L(fraction)
+
+
+L(integer_top):
+ C eax scratch (quotient)
+ C ebx dst+4*xsize-4
+ C ecx counter
+ C edx scratch (remainder)
+ C esi divisor
+ C edi src
+ C ebp xsize
+
+ movl -4(%edi,%ecx,4), %eax
+L(integer_entry):
+
+ divl %esi
+
+ movl %eax, (%ebx,%ecx,4)
+ decl %ecx
+ jnz L(integer_top)
+
+
+L(fraction):
+ orl %ebp, %ecx
+ jz L(done)
+
+ movl PARAM_DST, %ebx
+
+
+L(fraction_top):
+ C eax scratch (quotient)
+ C ebx dst
+ C ecx counter
+ C edx scratch (remainder)
+ C esi divisor
+ C edi
+ C ebp
+
+ xorl %eax, %eax
+
+ divl %esi
+
+ movl %eax, -4(%ebx,%ecx,4)
+ decl %ecx
+ jnz L(fraction_top)
+
+
+L(done):
+ popl %ebp
+ movl %edx, %eax
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+
+L(size_zero):
+deflit(`FRAME',8)
+ movl PARAM_XSIZE, %ecx
+ xorl %eax, %eax
+
+ movl PARAM_DST, %edi
+
+ cld C better safe than sorry, see mpn/x86/README
+
+ rep
+ stosl
+
+ popl %esi
+ popl %edi
+ ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/divrem_2.asm b/vendor/gmp-6.3.0/mpn/x86/divrem_2.asm
new file mode 100644
index 0000000..4c38ad0
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/divrem_2.asm
@@ -0,0 +1,199 @@
+dnl x86 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl Copyright 2007, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C norm frac
+C 486
+C P5
+C P6-13 29.2
+C P6-15 *26
+C K6
+C K7 22
+C K8 *19
+C P4-f1
+C P4-f2 *65
+C P4-f3
+C P4-f4 *72
+
+C A star means numbers not updated for the latest version of the code.
+
+
+C TODO
+C * Perhaps keep ecx or esi in stack slot, freeing up a reg for q0.
+C * The loop has not been carefully tuned. We should at the very least do
+C some local insn swapping.
+C * The code outside the main loop is what gcc generated. Clean up!
+C * Clean up stack slot usage.
+
+C INPUT PARAMETERS
+C qp
+C fn
+C up_param
+C un_param
+C dp
+
+
+C eax ebx ecx edx esi edi ebp
+C cnt qp
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_divrem_2)
+ push %ebp
+ push %edi
+ push %esi
+ push %ebx
+ sub $36, %esp
+ mov 68(%esp), %ecx C un
+ mov 72(%esp), %esi C dp
+ movl $0, 32(%esp)
+ lea 0(,%ecx,4), %edi
+ add 64(%esp), %edi C up
+ mov (%esi), %ebx
+ mov 4(%esi), %eax
+ mov %ebx, 20(%esp)
+ sub $12, %edi
+ mov %eax, 24(%esp)
+ mov %edi, 12(%esp)
+ mov 8(%edi), %ebx
+ mov 4(%edi), %ebp
+ cmp %eax, %ebx
+ jb L(8)
+ seta %dl
+ cmp 20(%esp), %ebp
+ setae %al
+ orb %dl, %al C "orb" form to placate Sun tools
+ jne L(35)
+L(8):
+ mov 60(%esp), %esi C fn
+ lea -3(%esi,%ecx), %edi
+ test %edi, %edi
+ js L(9)
+ mov 24(%esp), %edx
+ mov $-1, %esi
+ mov %esi, %eax
+ mov %esi, %ecx
+ not %edx
+ divl 24(%esp)
+ mov %eax, %esi
+ imul 24(%esp), %eax
+ mov %eax, (%esp)
+ mov %esi, %eax
+ mull 20(%esp)
+ mov (%esp), %eax
+ add 20(%esp), %eax
+ adc $0, %ecx
+ add %eax, %edx
+ adc $0, %ecx
+ mov %ecx, %eax
+ js L(32)
+L(36): dec %esi
+ sub 24(%esp), %edx
+ sbb $0, %eax
+ jns L(36)
+L(32):
+ mov %esi, 16(%esp) C di
+ mov %edi, %ecx C un
+ mov 12(%esp), %esi C up
+ mov 24(%esp), %eax
+ neg %eax
+ mov %eax, 4(%esp) C -d1
+ ALIGN(16)
+ nop
+
+C eax ebx ecx edx esi edi ebp 0 4 8 12 16 20 24 28 32 56 60
+C n2 un up n1 q0 -d1 di d0 d1 msl qp fn
+
+L(loop):
+ mov 16(%esp), %eax C di
+ mul %ebx
+ add %ebp, %eax
+ mov %eax, (%esp) C q0
+ adc %ebx, %edx
+ mov %edx, %edi C q
+ imul 4(%esp), %edx
+ mov 20(%esp), %eax
+ lea (%edx, %ebp), %ebx C n1 -= ...
+ mul %edi
+ xor %ebp, %ebp
+ cmp 60(%esp), %ecx
+ jl L(19)
+ mov (%esi), %ebp
+ sub $4, %esi
+L(19): sub 20(%esp), %ebp
+ sbb 24(%esp), %ebx
+ sub %eax, %ebp
+ sbb %edx, %ebx
+ mov 20(%esp), %eax C d1
+ inc %edi
+ xor %edx, %edx
+ cmp (%esp), %ebx
+ adc $-1, %edx C mask
+ add %edx, %edi C q--
+ and %edx, %eax C d0 or 0
+ and 24(%esp), %edx C d1 or 0
+ add %eax, %ebp
+ adc %edx, %ebx
+ cmp 24(%esp), %ebx
+ jae L(fix)
+L(bck): mov 56(%esp), %edx
+ mov %edi, (%edx, %ecx, 4)
+ dec %ecx
+ jns L(loop)
+
+L(9): mov 64(%esp), %esi C up
+ mov %ebp, (%esi)
+ mov %ebx, 4(%esi)
+ mov 32(%esp), %eax
+ add $36, %esp
+ pop %ebx
+ pop %esi
+ pop %edi
+ pop %ebp
+ ret
+
+L(fix): seta %dl
+ cmp 20(%esp), %ebp
+ setae %al
+ orb %dl, %al C "orb" form to placate Sun tools
+ je L(bck)
+ inc %edi
+ sub 20(%esp), %ebp
+ sbb 24(%esp), %ebx
+ jmp L(bck)
+
+L(35): sub 20(%esp), %ebp
+ sbb 24(%esp), %ebx
+ movl $1, 32(%esp)
+ jmp L(8)
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/fat/com.c b/vendor/gmp-6.3.0/mpn/x86/fat/com.c
new file mode 100644
index 0000000..d359d4c
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/fat/com.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_com.
+
+Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#include "mpn/generic/com.c"
diff --git a/vendor/gmp-6.3.0/mpn/x86/fat/fat.c b/vendor/gmp-6.3.0/mpn/x86/fat/fat.c
new file mode 100644
index 0000000..4c8cf3c
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/fat/fat.c
@@ -0,0 +1,530 @@
+/* x86 fat binary initializers.
+
+ THE FUNCTIONS AND VARIABLES IN THIS FILE ARE FOR INTERNAL USE ONLY.
+ THEY'RE ALMOST CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR
+ COMPLETELY IN FUTURE GNU MP RELEASES.
+
+Copyright 2003, 2004, 2011-2013, 2015, 2017, 2018 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#include <stdio.h> /* for printf */
+#include <stdlib.h> /* for getenv */
+#include <string.h>
+
+#include "gmp-impl.h"
+
+/* Change this to "#define TRACE(x) x" for some traces. */
+#define TRACE(x)
+
+
+/* fat_entry.asm */
+long __gmpn_cpuid (char [12], int);
+int __gmpn_cpuid_available (void);
+
+
+#if WANT_FAKE_CPUID
+/* The "name"s in the table are values for the GMP_CPU_TYPE environment
+ variable. Anything can be used, but for now it's the canonical cpu types
+ as per config.guess/config.sub. */
+
+#define __gmpn_cpuid fake_cpuid
+#define __gmpn_cpuid_available fake_cpuid_available
+
+#define MAKE_FMS(family, model) \
+ ((((family) & 0xf) << 8) + (((family) & 0xff0) << 20) \
+ + (((model) & 0xf) << 4) + (((model) & 0xf0) << 12))
+
+static struct {
+ const char *name;
+ const char *vendor;
+ unsigned fms;
+} fake_cpuid_table[] = {
+ { "i386", "" },
+ { "i486", "GenuineIntel", MAKE_FMS (4, 0) },
+ { "pentium", "GenuineIntel", MAKE_FMS (5, 0) },
+ { "pentiummmx", "GenuineIntel", MAKE_FMS (5, 4) },
+ { "pentiumpro", "GenuineIntel", MAKE_FMS (6, 0) },
+ { "pentium2", "GenuineIntel", MAKE_FMS (6, 2) },
+ { "pentium3", "GenuineIntel", MAKE_FMS (6, 7) },
+ { "pentium4", "GenuineIntel", MAKE_FMS (15, 2) },
+ { "prescott", "GenuineIntel", MAKE_FMS (15, 3) },
+ { "nocona", "GenuineIntel", MAKE_FMS (15, 4) },
+ { "core2", "GenuineIntel", MAKE_FMS (6, 0xf) },
+ { "nehalem", "GenuineIntel", MAKE_FMS (6, 0x1a) },
+ { "nhm", "GenuineIntel", MAKE_FMS (6, 0x1a) },
+ { "atom", "GenuineIntel", MAKE_FMS (6, 0x1c) },
+ { "westmere", "GenuineIntel", MAKE_FMS (6, 0x25) },
+ { "wsm", "GenuineIntel", MAKE_FMS (6, 0x25) },
+ { "sandybridge","GenuineIntel", MAKE_FMS (6, 0x2a) },
+ { "sbr", "GenuineIntel", MAKE_FMS (6, 0x2a) },
+ { "silvermont", "GenuineIntel", MAKE_FMS (6, 0x37) },
+ { "slm", "GenuineIntel", MAKE_FMS (6, 0x37) },
+ { "haswell", "GenuineIntel", MAKE_FMS (6, 0x3c) },
+ { "hwl", "GenuineIntel", MAKE_FMS (6, 0x3c) },
+ { "broadwell", "GenuineIntel", MAKE_FMS (6, 0x3d) },
+ { "bwl", "GenuineIntel", MAKE_FMS (6, 0x3d) },
+ { "skylake", "GenuineIntel", MAKE_FMS (6, 0x5e) },
+ { "sky", "GenuineIntel", MAKE_FMS (6, 0x5e) },
+
+ { "k5", "AuthenticAMD", MAKE_FMS (5, 0) },
+ { "k6", "AuthenticAMD", MAKE_FMS (5, 3) },
+ { "k62", "AuthenticAMD", MAKE_FMS (5, 8) },
+ { "k63", "AuthenticAMD", MAKE_FMS (5, 9) },
+ { "athlon", "AuthenticAMD", MAKE_FMS (6, 0) },
+ { "k8", "AuthenticAMD", MAKE_FMS (15, 0) },
+ { "k10", "AuthenticAMD", MAKE_FMS (16, 0) },
+ { "bobcat", "AuthenticAMD", MAKE_FMS (20, 1) },
+ { "bulldozer", "AuthenticAMD", MAKE_FMS (21, 1) },
+ { "piledriver", "AuthenticAMD", MAKE_FMS (21, 2) },
+ { "steamroller","AuthenticAMD", MAKE_FMS (21, 0x30) },
+ { "excavator", "AuthenticAMD", MAKE_FMS (21, 0x60) },
+ { "jaguar", "AuthenticAMD", MAKE_FMS (22, 1) },
+ { "zen", "AuthenticAMD", MAKE_FMS (23, 1) },
+
+ { "viac3", "CentaurHauls", MAKE_FMS (6, 0) },
+ { "viac32", "CentaurHauls", MAKE_FMS (6, 9) },
+ { "nano", "CentaurHauls", MAKE_FMS (6, 15) },
+};
+
+static int
+fake_cpuid_lookup (void)
+{
+ char *s;
+ int i;
+
+ s = getenv ("GMP_CPU_TYPE");
+ if (s == NULL)
+ {
+ printf ("Need GMP_CPU_TYPE environment variable for fake cpuid\n");
+ abort ();
+ }
+
+ for (i = 0; i < numberof (fake_cpuid_table); i++)
+ if (strcmp (s, fake_cpuid_table[i].name) == 0)
+ return i;
+
+ printf ("GMP_CPU_TYPE=%s unknown\n", s);
+ abort ();
+}
+
+static int
+fake_cpuid_available (void)
+{
+ return fake_cpuid_table[fake_cpuid_lookup()].vendor[0] != '\0';
+}
+
+static long
+fake_cpuid (char dst[12], int id)
+{
+ int i = fake_cpuid_lookup();
+
+ switch (id) {
+ case 0:
+ memcpy (dst, fake_cpuid_table[i].vendor, 12);
+ return 0;
+ case 1:
+ return fake_cpuid_table[i].fms;
+ default:
+ printf ("fake_cpuid(): oops, unknown id %d\n", id);
+ abort ();
+ }
+}
+#endif
+
+
+typedef DECL_preinv_divrem_1 ((*preinv_divrem_1_t));
+typedef DECL_preinv_mod_1 ((*preinv_mod_1_t));
+
+struct cpuvec_t __gmpn_cpuvec = {
+ __MPN(add_n_init),
+ 0,
+ 0,
+ __MPN(addmul_1_init),
+ 0,
+ __MPN(bdiv_dbm1c_init),
+ __MPN(cnd_add_n_init),
+ __MPN(cnd_sub_n_init),
+ __MPN(com_init),
+ __MPN(copyd_init),
+ __MPN(copyi_init),
+ __MPN(divexact_1_init),
+ __MPN(divrem_1_init),
+ __MPN(gcd_11_init),
+ __MPN(lshift_init),
+ __MPN(lshiftc_init),
+ __MPN(mod_1_init),
+ __MPN(mod_1_1p_init),
+ __MPN(mod_1_1p_cps_init),
+ __MPN(mod_1s_2p_init),
+ __MPN(mod_1s_2p_cps_init),
+ __MPN(mod_1s_4p_init),
+ __MPN(mod_1s_4p_cps_init),
+ __MPN(mod_34lsub1_init),
+ __MPN(modexact_1c_odd_init),
+ __MPN(mul_1_init),
+ __MPN(mul_basecase_init),
+ __MPN(mullo_basecase_init),
+ __MPN(preinv_divrem_1_init),
+ __MPN(preinv_mod_1_init),
+ __MPN(redc_1_init),
+ __MPN(redc_2_init),
+ __MPN(rshift_init),
+ __MPN(sqr_basecase_init),
+ __MPN(sub_n_init),
+ 0,
+ __MPN(submul_1_init),
+ 0
+};
+
+int __gmpn_cpuvec_initialized = 0;
+
+/* The following setups start with generic x86, then overwrite with
+ specifics for a chip, and higher versions of that chip.
+
+ The arrangement of the setups here will normally be the same as the $path
+ selections in configure.in for the respective chips.
+
+ This code is reentrant and thread safe. We always calculate the same
+ decided_cpuvec, so if two copies of the code are running it doesn't
+ matter which completes first, both write the same to __gmpn_cpuvec.
+
+ We need to go via decided_cpuvec because if one thread has completed
+ __gmpn_cpuvec then it may be making use of the threshold values in that
+ vector. If another thread is still running __gmpn_cpuvec_init then we
+ don't want it to write different values to those fields since some of the
+ asm routines only operate correctly up to their own defined threshold,
+ not an arbitrary value. */
+
+void
+__gmpn_cpuvec_init (void)
+{
+ struct cpuvec_t decided_cpuvec;
+
+ TRACE (printf ("__gmpn_cpuvec_init:\n"));
+
+ memset (&decided_cpuvec, '\0', sizeof (decided_cpuvec));
+
+ CPUVEC_SETUP_x86;
+ CPUVEC_SETUP_fat;
+
+ if (! __gmpn_cpuid_available ())
+ {
+ TRACE (printf (" 80386, or early 80486 without cpuid\n"));
+ }
+ else
+ {
+ char vendor_string[13];
+ char dummy_string[12];
+ long fms;
+ int family, model;
+
+ __gmpn_cpuid (vendor_string, 0);
+ vendor_string[12] = 0;
+
+ fms = __gmpn_cpuid (dummy_string, 1);
+ family = ((fms >> 8) & 0xf) + ((fms >> 20) & 0xff);
+ model = ((fms >> 4) & 0xf) + ((fms >> 12) & 0xf0);
+
+ if (strcmp (vendor_string, "GenuineIntel") == 0)
+ {
+ switch (family)
+ {
+ case 4:
+ TRACE (printf (" 80486 with cpuid\n"));
+ break;
+
+ case 5:
+ TRACE (printf (" pentium\n"));
+ CPUVEC_SETUP_pentium;
+ if (model == 4 || model == 8)
+ {
+ TRACE (printf (" pentiummmx\n"));
+ CPUVEC_SETUP_pentium_mmx;
+ }
+ break;
+
+ case 6:
+ TRACE (printf (" p6\n"));
+ CPUVEC_SETUP_p6;
+ switch (model)
+ {
+ case 0x00:
+ case 0x01:
+ TRACE (printf (" pentiumpro\n"));
+ break;
+
+ case 0x02:
+ case 0x03:
+ case 0x04:
+ case 0x05:
+ case 0x06:
+ TRACE (printf (" pentium2\n"));
+ CPUVEC_SETUP_p6_mmx;
+ break;
+
+ case 0x07:
+ case 0x08:
+ case 0x0a:
+ case 0x0b:
+ case 0x0c:
+ TRACE (printf (" pentium3\n"));
+ CPUVEC_SETUP_p6_mmx;
+ CPUVEC_SETUP_p6_p3mmx;
+ break;
+
+ case 0x09: /* Banias */
+ case 0x0d: /* Dothan */
+ case 0x0e: /* Yonah */
+ TRACE (printf (" Banias/Dothan/Yonah\n"));
+ CPUVEC_SETUP_p6_mmx;
+ CPUVEC_SETUP_p6_p3mmx;
+ CPUVEC_SETUP_p6_sse2;
+ break;
+
+ case 0x0f: /* Conroe Merom Kentsfield Allendale */
+ case 0x10:
+ case 0x11:
+ case 0x12:
+ case 0x13:
+ case 0x14:
+ case 0x15:
+ case 0x16:
+ case 0x17: /* PNR Wolfdale Yorkfield */
+ case 0x18:
+ case 0x19:
+ case 0x1d: /* PNR Dunnington */
+ TRACE (printf (" Conroe\n"));
+ CPUVEC_SETUP_p6_mmx;
+ CPUVEC_SETUP_p6_p3mmx;
+ CPUVEC_SETUP_p6_sse2;
+ CPUVEC_SETUP_core2;
+ break;
+
+ case 0x1c: /* Atom Silverthorne */
+ case 0x26: /* Atom Lincroft */
+ case 0x27: /* Atom Saltwell */
+ case 0x36: /* Atom Cedarview/Saltwell */
+ TRACE (printf (" atom\n"));
+ CPUVEC_SETUP_atom;
+ CPUVEC_SETUP_atom_mmx;
+ CPUVEC_SETUP_atom_sse2;
+ break;
+
+ case 0x37: /* Silvermont */
+ case 0x4a: /* Silvermont */
+ case 0x4c: /* Airmont */
+ case 0x4d: /* Silvermont/Avoton */
+ case 0x5a: /* Silvermont */
+ TRACE (printf (" silvermont\n"));
+ CPUVEC_SETUP_atom;
+ CPUVEC_SETUP_atom_mmx;
+ CPUVEC_SETUP_atom_sse2;
+ CPUVEC_SETUP_silvermont;
+ break;
+
+ case 0x5c: /* Goldmont */
+ case 0x5f: /* Goldmont */
+ case 0x7a: /* Goldmont Plus */
+ TRACE (printf (" goldmont\n"));
+ CPUVEC_SETUP_atom;
+ CPUVEC_SETUP_atom_mmx;
+ CPUVEC_SETUP_atom_sse2;
+ CPUVEC_SETUP_goldmont;
+ break;
+
+ case 0x1a: /* NHM Gainestown */
+ case 0x1b:
+ case 0x1e: /* NHM Lynnfield/Jasper */
+ case 0x1f:
+ case 0x20:
+ case 0x21:
+ case 0x22:
+ case 0x23:
+ case 0x24:
+ case 0x25: /* WSM Clarkdale/Arrandale */
+ case 0x28:
+ case 0x29:
+ case 0x2b:
+ case 0x2c: /* WSM Gulftown */
+ case 0x2e: /* NHM Beckton */
+ case 0x2f: /* WSM Eagleton */
+ TRACE (printf (" nehalem/westmere\n"));
+ CPUVEC_SETUP_p6_mmx;
+ CPUVEC_SETUP_p6_p3mmx;
+ CPUVEC_SETUP_p6_sse2;
+ CPUVEC_SETUP_core2;
+ CPUVEC_SETUP_coreinhm;
+ break;
+
+ case 0x2a: /* SBR */
+ case 0x2d: /* SBR-EP */
+ case 0x3a: /* IBR */
+ case 0x3e: /* IBR Ivytown */
+ case 0x3c: /* Haswell client */
+ case 0x3f: /* Haswell server */
+ case 0x45: /* Haswell ULT */
+ case 0x46: /* Crystal Well */
+ case 0x3d: /* Broadwell */
+ case 0x47: /* Broadwell */
+ case 0x4f: /* Broadwell server */
+ case 0x56: /* Broadwell microserver */
+ case 0x4e: /* Skylake client */
+ case 0x55: /* Skylake server */
+ case 0x5e: /* Skylake */
+ case 0x8e: /* Kabylake */
+ case 0x9e: /* Kabylake */
+ TRACE (printf (" sandybridge\n"));
+ CPUVEC_SETUP_p6_mmx;
+ CPUVEC_SETUP_p6_p3mmx;
+ CPUVEC_SETUP_p6_sse2;
+ CPUVEC_SETUP_core2;
+ CPUVEC_SETUP_coreinhm;
+ CPUVEC_SETUP_coreisbr;
+ break;
+ }
+ break;
+
+ case 15:
+ TRACE (printf (" pentium4\n"));
+ CPUVEC_SETUP_pentium4;
+ CPUVEC_SETUP_pentium4_mmx;
+ CPUVEC_SETUP_pentium4_sse2;
+ break;
+ }
+ }
+ else if (strcmp (vendor_string, "AuthenticAMD") == 0)
+ {
+ switch (family)
+ {
+ case 5:
+ if (model <= 3)
+ {
+ TRACE (printf (" k5\n"));
+ }
+ else
+ {
+ TRACE (printf (" k6\n"));
+ CPUVEC_SETUP_k6;
+ CPUVEC_SETUP_k6_mmx;
+ if (model >= 8)
+ {
+ TRACE (printf (" k62\n"));
+ CPUVEC_SETUP_k6_k62mmx;
+ }
+ if (model >= 9)
+ {
+ TRACE (printf (" k63\n"));
+ }
+ }
+ break;
+ case 6:
+ TRACE (printf (" athlon\n"));
+ CPUVEC_SETUP_k7;
+ CPUVEC_SETUP_k7_mmx;
+ break;
+
+ case 0x0f: /* k8 */
+ case 0x11: /* "fam 11h", mix of k8 and k10 */
+ case 0x13: /* unknown, conservatively assume k8 */
+ TRACE (printf (" k8\n"));
+ CPUVEC_SETUP_k7;
+ CPUVEC_SETUP_k7_mmx;
+ CPUVEC_SETUP_k8;
+ break;
+
+ case 0x10: /* k10 */
+ case 0x12: /* k10 (llano) */
+ TRACE (printf (" k10\n"));
+ CPUVEC_SETUP_k7;
+ CPUVEC_SETUP_k7_mmx;
+ break;
+
+ case 0x14: /* bobcat */
+ case 0x16: /* jaguar */
+ TRACE (printf (" bobcat\n"));
+ CPUVEC_SETUP_k7;
+ CPUVEC_SETUP_k7_mmx;
+ CPUVEC_SETUP_bt1;
+ break;
+
+ case 0x15: /* bulldozer */
+ TRACE (printf (" bulldozer\n"));
+ CPUVEC_SETUP_k7;
+ CPUVEC_SETUP_k7_mmx;
+ CPUVEC_SETUP_bd1;
+ break;
+
+ case 0x17: /* zen */
+ case 0x19: /* zen3 */
+ TRACE (printf (" zen\n"));
+ CPUVEC_SETUP_k7;
+ CPUVEC_SETUP_k7_mmx;
+ break;
+ }
+ }
+ else if (strcmp (vendor_string, "CentaurHauls") == 0)
+ {
+ switch (family)
+ {
+ case 6:
+ TRACE (printf (" viac3\n"));
+ if (model >= 9)
+ {
+ TRACE (printf (" viac32\n"));
+ }
+ if (model >= 15)
+ {
+ TRACE (printf (" nano\n"));
+ CPUVEC_SETUP_nano;
+ }
+ break;
+ }
+ }
+ else if (strcmp (vendor_string, "CyrixInstead") == 0)
+ {
+ /* Should recognize Cyrix' processors too. */
+ TRACE (printf (" cyrix something\n"));
+ }
+ }
+
+ /* There's no x86 generic mpn_preinv_divrem_1 or mpn_preinv_mod_1.
+ Instead default to the plain versions from whichever CPU we detected.
+ The function arguments are compatible, no need for any glue code. */
+ if (decided_cpuvec.preinv_divrem_1 == NULL)
+ decided_cpuvec.preinv_divrem_1 =(preinv_divrem_1_t)decided_cpuvec.divrem_1;
+ if (decided_cpuvec.preinv_mod_1 == NULL)
+ decided_cpuvec.preinv_mod_1 =(preinv_mod_1_t) decided_cpuvec.mod_1;
+
+ ASSERT_CPUVEC (decided_cpuvec);
+ CPUVEC_INSTALL (decided_cpuvec);
+
+ /* Set this once the threshold fields are ready.
+ Use volatile to prevent it getting moved. */
+ *((volatile int *) &__gmpn_cpuvec_initialized) = 1;
+}
diff --git a/vendor/gmp-6.3.0/mpn/x86/fat/fat_entry.asm b/vendor/gmp-6.3.0/mpn/x86/fat/fat_entry.asm
new file mode 100644
index 0000000..25655cf
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/fat/fat_entry.asm
@@ -0,0 +1,243 @@
+dnl x86 fat binary entrypoints.
+
+dnl Copyright 2003, 2012, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+dnl Forcibly disable profiling.
+dnl
+dnl The entrypoints and inits are small enough not to worry about, the real
+dnl routines arrived at will have any profiling. Also, the way the code
+dnl here ends with a jump means we won't work properly with the
+dnl "instrument" profiling scheme anyway.
+
+define(`WANT_PROFILING',no)
+
+
+ TEXT
+
+
+dnl Usage: FAT_ENTRY(name, offset)
+dnl
+dnl Emit a fat binary entrypoint function of the given name. This is the
+dnl normal entry for applications, eg. __gmpn_add_n.
+dnl
+dnl The code simply jumps through the function pointer in __gmpn_cpuvec at
+dnl the given "offset" (in bytes).
+dnl
+dnl For non-PIC, the jumps are 5 bytes each, aligning them to 8 should be
+dnl fine for all x86s.
+dnl
+dnl For PIC, the jumps are 20 bytes each, and are best aligned to 16 to
+dnl ensure at least the first two instructions don't cross a cache line
+dnl boundary.
+dnl
+dnl Note the extra `' ahead of PROLOGUE obscures it from the HAVE_NATIVE
+dnl grepping in configure, stopping that code trying to eval something with
+dnl $1 in it.
+
+define(FAT_ENTRY,
+m4_assert_numargs(2)
+` ALIGN(ifdef(`PIC',16,8))
+`'PROLOGUE($1)dnl
+ifdef(`PIC',`dnl
+ifdef(`DARWIN',`
+ call L(movl_eip_edx)
+ movl L(___gmpn_cpuvec)$non_lazy_ptr-.(%edx), %edx
+ jmp *m4_empty_if_zero($2)(%edx)
+',`dnl
+ call L(movl_eip_edx)
+L(entry_here$2):
+ addl $_GLOBAL_OFFSET_TABLE_+[.-L(entry_here$2)], %edx
+ movl GSYM_PREFIX`'__gmpn_cpuvec@GOT(%edx), %edx
+ jmp *m4_empty_if_zero($2)(%edx)
+')
+',`dnl non-PIC
+ jmp *GSYM_PREFIX`'__gmpn_cpuvec+$2
+')
+EPILOGUE()
+')
+
+
+dnl FAT_ENTRY for each CPUVEC_FUNCS_LIST
+dnl
+
+define(`CPUVEC_offset',0)
+foreach(i,
+`FAT_ENTRY(MPN(i),CPUVEC_offset)
+define(`CPUVEC_offset',eval(CPUVEC_offset + 4))',
+CPUVEC_FUNCS_LIST)
+
+ifdef(`PIC',`
+ ALIGN(8)
+L(movl_eip_edx):
+ movl (%esp), %edx
+ ret_internal
+ifdef(`DARWIN',`
+ .section __IMPORT,__pointers,non_lazy_symbol_pointers
+L(___gmpn_cpuvec)$non_lazy_ptr:
+ .indirect_symbol ___gmpn_cpuvec
+ .long 0
+ TEXT
+')
+')
+
+
+dnl Usage: FAT_INIT(name, offset)
+dnl
+dnl Emit a fat binary initializer function of the given name. These
+dnl functions are the initial values for the pointers in __gmpn_cpuvec.
+dnl
+dnl The code simply calls __gmpn_cpuvec_init, and then jumps back through
+dnl the __gmpn_cpuvec pointer, at the given "offset" (in bytes).
+dnl __gmpn_cpuvec_init will have stored the address of the selected
+dnl implementation there.
+dnl
+dnl Only one of these routines will be executed, and only once, since after
+dnl that all the __gmpn_cpuvec pointers go to real routines. So there's no
+dnl need for anything special here, just something small and simple. To
+dnl keep code size down, "fat_init" is a shared bit of code, arrived at
+dnl with the offset in %al. %al is used since the movb instruction is 2
+dnl bytes where %eax would be 4.
+dnl
+dnl Note having `PROLOGUE in FAT_INIT obscures that PROLOGUE from the
+dnl HAVE_NATIVE grepping in configure, preventing that code trying to eval
+dnl something with $1 in it.
+
+define(FAT_INIT,
+m4_assert_numargs(2)
+`PROLOGUE($1)dnl
+ movb $`'$2, %al
+ jmp L(fat_init)
+EPILOGUE()
+')
+
+L(fat_init):
+ C al __gmpn_cpuvec byte offset
+
+ movzbl %al, %eax
+ pushl %eax
+
+ifdef(`PIC',`dnl
+ifdef(`DARWIN',`
+ sub $8, %esp
+ CALL( __gmpn_cpuvec_init)
+ add $8, %esp
+ call L(movl_eip_edx)
+ movl L(___gmpn_cpuvec)$non_lazy_ptr-.(%edx), %edx
+',`dnl
+ pushl %ebx
+ call L(movl_eip_ebx)
+L(init_here):
+ addl $_GLOBAL_OFFSET_TABLE_+[.-L(init_here)], %ebx
+ CALL( __gmpn_cpuvec_init)
+ movl GSYM_PREFIX`'__gmpn_cpuvec@GOT(%ebx), %edx
+ popl %ebx
+')
+ popl %eax
+ jmp *(%edx,%eax)
+
+L(movl_eip_ebx):
+ movl (%esp), %ebx
+ ret_internal
+',`dnl non-PIC
+ sub $8, %esp C needed on Darwin, harmless elsewhere
+ CALL( __gmpn_cpuvec_init)
+ add $8, %esp C needed on Darwin, harmless elsewhere
+ popl %eax
+ jmp *GSYM_PREFIX`'__gmpn_cpuvec(%eax)
+')
+
+dnl FAT_INIT for each CPUVEC_FUNCS_LIST
+dnl
+
+define(`CPUVEC_offset',0)
+foreach(i,
+`FAT_INIT(MPN(i`'_init),CPUVEC_offset)
+define(`CPUVEC_offset',eval(CPUVEC_offset + 4))',
+CPUVEC_FUNCS_LIST)
+
+
+
+C long __gmpn_cpuid (char dst[12], int id);
+C
+C This is called only once, so just something simple and compact is fine.
+
+defframe(PARAM_ID, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+PROLOGUE(__gmpn_cpuid)
+ pushl %esi FRAME_pushl()
+ pushl %ebx FRAME_pushl()
+ movl PARAM_ID, %eax
+ cpuid
+ movl PARAM_DST, %esi
+ movl %ebx, (%esi)
+ movl %edx, 4(%esi)
+ movl %ecx, 8(%esi)
+ popl %ebx
+ popl %esi
+ ret
+EPILOGUE()
+
+
+C int __gmpn_cpuid_available (void);
+C
+C Return non-zero if the cpuid instruction is available, which means late
+C model 80486 and higher. 80386 and early 80486 don't have cpuid.
+C
+C The test follows Intel AP-485 application note, namely that if bit 21 is
+C modifiable then cpuid is supported. This test is reentrant and thread
+C safe, since of course any interrupt or context switch will preserve the
+C flags while we're tinkering with them.
+C
+C This is called only once, so just something simple and compact is fine.
+
+PROLOGUE(__gmpn_cpuid_available)
+ pushf
+ popl %ecx C old flags
+
+ movl %ecx, %edx
+ xorl $0x200000, %edx
+ pushl %edx
+ popf
+ pushf
+ popl %edx C tweaked flags
+
+ movl $1, %eax
+ cmpl %ecx, %edx
+ jne L(available)
+ xorl %eax, %eax C not changed, so cpuid not available
+
+L(available):
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/fat/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/fat/gmp-mparam.h
new file mode 100644
index 0000000..3641a6b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/fat/gmp-mparam.h
@@ -0,0 +1,71 @@
+/* Fat binary x86 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2003, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* mpn_divexact_1 is faster than mpn_divrem_1 at all sizes. The only time
+ this might not be true currently is for actual 80386 and 80486 chips,
+ where mpn/x86/dive_1.asm might be slower than mpn/x86/divrem_1.asm, but
+ that's not worth worrying about. */
+#define DIVEXACT_1_THRESHOLD 0
+
+/* Only some of the x86s have an mpn_preinv_divrem_1, but we set
+ USE_PREINV_DIVREM_1 so that all callers use it, and then let the
+ __gmpn_cpuvec pointer go to plain mpn_divrem_1 if there's not an actual
+ preinv. */
+#define USE_PREINV_DIVREM_1 1
+
+#define BMOD_1_TO_MOD_1_THRESHOLD 20
+
+/* mpn_sqr_basecase is faster than mpn_mul_basecase at all sizes, no need
+ for mpn_sqr to call the latter. */
+#define SQR_BASECASE_THRESHOLD 0
+
+/* Sensible fallbacks for these, when not taken from a cpu-specific
+ gmp-mparam.h. */
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 130
+#define SQR_TOOM2_THRESHOLD 30
+#define SQR_TOOM3_THRESHOLD 200
+
+/* These are values more or less in the middle of what the typical x86 chips
+ come out as. For a fat binary it's necessary to have values for these,
+ since the defaults for MUL_FFT_TABLE and SQR_FFT_TABLE otherwise come out
+ as non-constant array initializers. FIXME: Perhaps these should be done
+ in the cpuvec structure like other thresholds. */
+#define MUL_FFT_TABLE { 464, 928, 1920, 3584, 10240, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD 400
+#define MUL_FFT_THRESHOLD 2000
+
+#define SQR_FFT_TABLE { 528, 1184, 1920, 4608, 14336, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD 500
+#define SQR_FFT_THRESHOLD 3000
diff --git a/vendor/gmp-6.3.0/mpn/x86/fat/lshiftc.c b/vendor/gmp-6.3.0/mpn/x86/fat/lshiftc.c
new file mode 100644
index 0000000..9ecf489
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/fat/lshiftc.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_lshiftc.
+
+Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#include "mpn/generic/lshiftc.c"
diff --git a/vendor/gmp-6.3.0/mpn/x86/fat/mod_1.c b/vendor/gmp-6.3.0/mpn/x86/fat/mod_1.c
new file mode 100644
index 0000000..4f149cc
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/fat/mod_1.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_mod_1.
+
+Copyright 2003, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#include "mpn/generic/mod_1.c"
diff --git a/vendor/gmp-6.3.0/mpn/x86/fat/mod_1_1.c b/vendor/gmp-6.3.0/mpn/x86/fat/mod_1_1.c
new file mode 100644
index 0000000..92eaa7a
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/fat/mod_1_1.c
@@ -0,0 +1,36 @@
+/* Fat binary fallback mpn_mod_1_1p.
+
+Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+/*
+PROLOGUE(mpn_mod_1_1p_cps)
+*/
+
+#define OPERATION_mod_1_1_cps 1
+#include "mpn/generic/mod_1_1.c"
diff --git a/vendor/gmp-6.3.0/mpn/x86/fat/mod_1_2.c b/vendor/gmp-6.3.0/mpn/x86/fat/mod_1_2.c
new file mode 100644
index 0000000..9095a61
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/fat/mod_1_2.c
@@ -0,0 +1,36 @@
+/* Fat binary fallback mpn_mod_1s_2p.
+
+Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+/*
+PROLOGUE(mpn_mod_1s_2p_cps)
+*/
+
+#define OPERATION_mod_1_2_cps 1
+#include "mpn/generic/mod_1_2.c"
diff --git a/vendor/gmp-6.3.0/mpn/x86/fat/mod_1_4.c b/vendor/gmp-6.3.0/mpn/x86/fat/mod_1_4.c
new file mode 100644
index 0000000..51c0def
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/fat/mod_1_4.c
@@ -0,0 +1,36 @@
+/* Fat binary fallback mpn_mod_1s_4p.
+
+Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+/*
+PROLOGUE(mpn_mod_1s_4p_cps)
+*/
+
+#define OPERATION_mod_1_4_cps 1
+#include "mpn/generic/mod_1_4.c"
diff --git a/vendor/gmp-6.3.0/mpn/x86/fat/mode1o.c b/vendor/gmp-6.3.0/mpn/x86/fat/mode1o.c
new file mode 100644
index 0000000..870ddb8
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/fat/mode1o.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_modexact_1c_odd.
+
+Copyright 2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#include "mpn/generic/mode1o.c"
diff --git a/vendor/gmp-6.3.0/mpn/x86/fat/mullo_basecase.c b/vendor/gmp-6.3.0/mpn/x86/fat/mullo_basecase.c
new file mode 100644
index 0000000..7f86be6
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/fat/mullo_basecase.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_mullo_basecase.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#include "mpn/generic/mullo_basecase.c"
diff --git a/vendor/gmp-6.3.0/mpn/x86/fat/redc_1.c b/vendor/gmp-6.3.0/mpn/x86/fat/redc_1.c
new file mode 100644
index 0000000..0025403
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/fat/redc_1.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_redc_1.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#include "mpn/generic/redc_1.c"
diff --git a/vendor/gmp-6.3.0/mpn/x86/fat/redc_2.c b/vendor/gmp-6.3.0/mpn/x86/fat/redc_2.c
new file mode 100644
index 0000000..1932d58
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/fat/redc_2.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_redc_2.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#include "mpn/generic/redc_2.c"
diff --git a/vendor/gmp-6.3.0/mpn/x86/gcd_11.asm b/vendor/gmp-6.3.0/mpn/x86/gcd_11.asm
new file mode 100644
index 0000000..af69135
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/gcd_11.asm
@@ -0,0 +1,126 @@
+dnl x86 mpn_gcd_11 optimised for processors with slow BSF.
+
+dnl Based on C version.
+
+dnl Copyright 2019 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl Rudimentary code for x86-32, i.e. for CPUs without cmov. Also, the bsf
+dnl instruction is assumed to be so slow it is useless. Instead a teble is
+dnl used.
+dnl
+dnl The loop benefits from OoO, in-order CPUs might want a different loop.
+dnl The ebx and ecx registers could be combined if the assigment of ecx were
+dnl postponed until ebx died, but that would at least hurt in-order CPUs.
+
+C cycles/bit (approx)
+C AMD K7 ?
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bd1 ?
+C AMD bd2 ?
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD bt1 ?
+C AMD bt2 ?
+C AMD zn1 ?
+C AMD zn2 ?
+C Intel P4-2 ?
+C Intel P4-3/4 ?
+C Intel P6/13 ?
+C Intel CNR ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom ?
+C Intel SLM ?
+C Intel GLM ?
+C Intel GLM+ ?
+C VIA nano ?
+C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
+
+deflit(MAXSHIFT, 6)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+DEF_OBJECT(ctz_table,64)
+ .byte MAXSHIFT
+forloop(i,1,MASK,
+` .byte m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
+
+define(`u0', `%eax')
+define(`v0', `%edx')
+
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_gcd_11)
+ push %edi
+ push %esi
+ push %ebx
+
+ mov 16(%esp), u0
+ mov 20(%esp), v0
+ LEAL( ctz_table, %esi)
+ sub v0, u0 C u = u - v 0
+ jz L(end)
+
+ ALIGN(16)
+L(top): sbb %ebx, %ebx C mask 1
+ mov u0, %edi C 1
+ mov u0, %ecx C 1
+ and %ebx, %edi C 2
+ xor %ebx, u0 C 2
+ add %edi, v0 C v = min(u.v) 3
+ sub %ebx, u0 C u = |u - v| 3
+L(mid): and $MASK, %ecx C 2
+ movzbl (%esi,%ecx), %ecx C 3
+ jz L(shift_alot)
+ shr %cl, u0 C 4
+ sub v0, u0 C u = u - v 0,5
+ jnz L(top)
+
+L(end): mov v0, %eax
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+
+L(shift_alot):
+ shr $MAXSHIFT, u0
+ mov u0, %ecx
+ jmp L(mid)
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/geode/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/geode/gmp-mparam.h
new file mode 100644
index 0000000..cc9c9f1
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/geode/gmp-mparam.h
@@ -0,0 +1,141 @@
+/* Generic x86 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2002, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* Generated by tuneup.c, 2011-01-30, gcc 3.4 */
+
+#define MOD_1_NORM_THRESHOLD 6
+#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 17
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 9
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 14
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
+#define USE_PREINV_DIVREM_1 0
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 42
+
+#define MUL_TOOM22_THRESHOLD 18
+#define MUL_TOOM33_THRESHOLD 66
+#define MUL_TOOM44_THRESHOLD 105
+#define MUL_TOOM6H_THRESHOLD 141
+#define MUL_TOOM8H_THRESHOLD 212
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 62
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 69
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 65
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 67
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 33
+#define SQR_TOOM3_THRESHOLD 60
+#define SQR_TOOM4_THRESHOLD 136
+#define SQR_TOOM6_THRESHOLD 196
+#define SQR_TOOM8_THRESHOLD 292
+
+#define MULMOD_BNM1_THRESHOLD 14
+#define SQRMOD_BNM1_THRESHOLD 16
+
+#define MUL_FFT_MODF_THRESHOLD 468 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 468, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
+ { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
+ { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \
+ { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
+ { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \
+ { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \
+ { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \
+ { 287,10}, { 159,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 61
+#define MUL_FFT_THRESHOLD 5504
+
+#define SQR_FFT_MODF_THRESHOLD 396 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 396, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
+ { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
+ { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
+ { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
+ { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
+ { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \
+ { 255, 9}, { 135,10}, { 79, 9}, { 159, 8}, \
+ { 319,10}, { 95, 9}, { 191,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \
+ { 287, 8}, { 575,10}, { 159,11}, { 95,10}, \
+ { 191,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 61
+#define SQR_FFT_THRESHOLD 3712
+
+#define MULLO_BASECASE_THRESHOLD 3
+#define MULLO_DC_THRESHOLD 37
+#define MULLO_MUL_N_THRESHOLD 10950
+
+#define DC_DIV_QR_THRESHOLD 59
+#define DC_DIVAPPR_Q_THRESHOLD 189
+#define DC_BDIV_QR_THRESHOLD 55
+#define DC_BDIV_Q_THRESHOLD 136
+
+#define INV_MULMOD_BNM1_THRESHOLD 50
+#define INV_NEWTON_THRESHOLD 183
+#define INV_APPR_THRESHOLD 181
+
+#define BINV_NEWTON_THRESHOLD 204
+#define REDC_1_TO_REDC_N_THRESHOLD 54
+
+#define MU_DIV_QR_THRESHOLD 1142
+#define MU_DIVAPPR_Q_THRESHOLD 1142
+#define MUPI_DIV_QR_THRESHOLD 81
+#define MU_BDIV_QR_THRESHOLD 889
+#define MU_BDIV_Q_THRESHOLD 998
+
+#define MATRIX22_STRASSEN_THRESHOLD 13
+#define HGCD_THRESHOLD 133
+#define GCD_DC_THRESHOLD 451
+#define GCDEXT_DC_THRESHOLD 318
+#define JACOBI_BASE_METHOD 1
+
+#define GET_STR_DC_THRESHOLD 15
+#define GET_STR_PRECOMPUTE_THRESHOLD 30
+#define SET_STR_DC_THRESHOLD 547
+#define SET_STR_PRECOMPUTE_THRESHOLD 1049
diff --git a/vendor/gmp-6.3.0/mpn/x86/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/gmp-mparam.h
new file mode 100644
index 0000000..2cb1984
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/gmp-mparam.h
@@ -0,0 +1,38 @@
+/* Generic x86 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* Generic x86 mpn_divexact_1 is faster than generic x86 mpn_divrem_1 on all
+ of p5, p6, k6 and k7, so use it always. It's probably slower on 386 and
+ 486, but that's too bad. */
+#define DIVEXACT_1_THRESHOLD 0
diff --git a/vendor/gmp-6.3.0/mpn/x86/goldmont/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/goldmont/gmp-mparam.h
new file mode 100644
index 0000000..3d37fa3
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/goldmont/gmp-mparam.h
@@ -0,0 +1,219 @@
+/* Intel Goldmont/32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2200 MHz Intel Atom C3758 Goldmont/Denverton */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-22, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 7
+#define MOD_1_UNNORM_THRESHOLD 12
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1 /* 32.79% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD 32
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 23
+
+#define DIV_1_VS_MUL_1_PERCENT 228
+
+#define MUL_TOOM22_THRESHOLD 18
+#define MUL_TOOM33_THRESHOLD 81
+#define MUL_TOOM44_THRESHOLD 193
+#define MUL_TOOM6H_THRESHOLD 286
+#define MUL_TOOM8H_THRESHOLD 399
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 125
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 137
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 185
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 113
+#define SQR_TOOM4_THRESHOLD 280
+#define SQR_TOOM6_THRESHOLD 399
+#define SQR_TOOM8_THRESHOLD 547
+
+#define MULMID_TOOM42_THRESHOLD 60
+
+#define MULMOD_BNM1_THRESHOLD 13
+#define SQRMOD_BNM1_THRESHOLD 15
+
+#define MUL_FFT_MODF_THRESHOLD 368 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 368, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
+ { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
+ { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
+ { 47,10}, { 15, 9}, { 31, 8}, { 63, 9}, \
+ { 39, 8}, { 79, 9}, { 47,10}, { 31, 9}, \
+ { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
+ { 63, 9}, { 127, 8}, { 255, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \
+ { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \
+ { 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \
+ { 159,11}, { 95,10}, { 191,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543,10}, { 287, 9}, { 575,10}, { 303, 9}, \
+ { 607,11}, { 159,10}, { 319, 9}, { 639,10}, \
+ { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \
+ { 447,12}, { 127,11}, { 255,10}, { 543, 9}, \
+ { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \
+ { 319,10}, { 671,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \
+ { 319,11}, { 671,10}, { 1343,11}, { 703,10}, \
+ { 1407,11}, { 735,12}, { 383,11}, { 831,12}, \
+ { 447,11}, { 959,13}, { 255,12}, { 511,11}, \
+ { 1087,12}, { 575,11}, { 1215,10}, { 2431,12}, \
+ { 639,11}, { 1343,12}, { 703,11}, { 1407,13}, \
+ { 383,12}, { 831,11}, { 1663,12}, { 959,11}, \
+ { 1919,14}, { 255,13}, { 511,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \
+ { 767,12}, { 1727,13}, { 895,12}, { 1919,11}, \
+ { 3839,14}, { 511,13}, { 1023,12}, { 2111,13}, \
+ { 1151,12}, { 2431,13}, { 1407,12}, { 2943,14}, \
+ { 767,13}, { 1663,12}, { 3455,13}, { 1919,12}, \
+ { 3839,15}, { 511,14}, { 1023,13}, { 2431,14}, \
+ { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \
+ { 3455,14}, { 1791,13}, { 3839,12}, { 7679,15}, \
+ { 1023,14}, { 2303,13}, { 4991,12}, { 9983,14}, \
+ { 2559,13}, { 5119,14}, { 2815,13}, { 5887,15}, \
+ { 1535,14}, { 3839,13}, { 7679,16} }
+#define MUL_FFT_TABLE3_SIZE 171
+#define MUL_FFT_THRESHOLD 3712
+
+#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 340, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 21, 7}, { 11, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
+ { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
+ { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \
+ { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
+ { 47,10}, { 31, 9}, { 79,10}, { 47,11}, \
+ { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \
+ { 79, 9}, { 159, 8}, { 319,10}, { 95, 9}, \
+ { 191,11}, { 63,10}, { 127, 9}, { 255, 8}, \
+ { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \
+ { 575, 9}, { 303, 8}, { 607, 9}, { 319,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \
+ { 287, 9}, { 575,10}, { 303, 9}, { 607,11}, \
+ { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \
+ { 671,10}, { 351, 9}, { 703,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 415, 9}, { 831,11}, \
+ { 223,10}, { 479,12}, { 127,11}, { 255,10}, \
+ { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \
+ { 1215,11}, { 319,10}, { 671,11}, { 351,10}, \
+ { 703,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,10}, { 831,11}, { 479,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \
+ { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \
+ { 735,12}, { 383,11}, { 831,12}, { 447,11}, \
+ { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \
+ { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \
+ { 703,11}, { 1471,13}, { 383,12}, { 831,11}, \
+ { 1663,12}, { 959,11}, { 1919,14}, { 255,13}, \
+ { 511,12}, { 1215,13}, { 639,12}, { 1471,11}, \
+ { 2943,13}, { 767,12}, { 1727,13}, { 895,12}, \
+ { 1919,14}, { 511,13}, { 1023,12}, { 2111,13}, \
+ { 1151,12}, { 2431,13}, { 1407,12}, { 2943,14}, \
+ { 767,13}, { 1663,12}, { 3455,13}, { 1919,15}, \
+ { 511,14}, { 1023,13}, { 2431,14}, { 1279,13}, \
+ { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \
+ { 1791,13}, { 3839,12}, { 7679,15}, { 1023,14}, \
+ { 2047,13}, { 4095,14}, { 2303,13}, { 4991,12}, \
+ { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \
+ { 3839,13}, { 7679,16} }
+#define SQR_FFT_TABLE3_SIZE 170
+#define SQR_FFT_THRESHOLD 3520
+
+#define MULLO_BASECASE_THRESHOLD 5
+#define MULLO_DC_THRESHOLD 50
+#define MULLO_MUL_N_THRESHOLD 6633
+#define SQRLO_BASECASE_THRESHOLD 0 /* always */
+#define SQRLO_DC_THRESHOLD 95
+#define SQRLO_SQR_THRESHOLD 6633
+
+#define DC_DIV_QR_THRESHOLD 68
+#define DC_DIVAPPR_Q_THRESHOLD 204
+#define DC_BDIV_QR_THRESHOLD 64
+#define DC_BDIV_Q_THRESHOLD 108
+
+#define INV_MULMOD_BNM1_THRESHOLD 34
+#define INV_NEWTON_THRESHOLD 276
+#define INV_APPR_THRESHOLD 226
+
+#define BINV_NEWTON_THRESHOLD 298
+#define REDC_1_TO_REDC_N_THRESHOLD 65
+
+#define MU_DIV_QR_THRESHOLD 1528
+#define MU_DIVAPPR_Q_THRESHOLD 1589
+#define MUPI_DIV_QR_THRESHOLD 140
+#define MU_BDIV_QR_THRESHOLD 1334
+#define MU_BDIV_Q_THRESHOLD 1499
+
+#define POWM_SEC_TABLE 3,16,96,428,1317
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 18
+#define SET_STR_DC_THRESHOLD 704
+#define SET_STR_PRECOMPUTE_THRESHOLD 1358
+
+#define FAC_DSC_THRESHOLD 95
+#define FAC_ODD_THRESHOLD 29
+
+#define MATRIX22_STRASSEN_THRESHOLD 15
+#define HGCD2_DIV1_METHOD 1 /* 5.53% faster than 3 */
+#define HGCD_THRESHOLD 172
+#define HGCD_APPR_THRESHOLD 204
+#define HGCD_REDUCE_THRESHOLD 2479
+#define GCD_DC_THRESHOLD 610
+#define GCDEXT_DC_THRESHOLD 443
+#define JACOBI_BASE_METHOD 4 /* 6.53% faster than 3 */
+
+/* Tuneup completed successfully, took 101563 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/i486/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/i486/gmp-mparam.h
new file mode 100644
index 0000000..aa7dbad
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/i486/gmp-mparam.h
@@ -0,0 +1,69 @@
+/* 80486 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2001-2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* 100MHz DX4 */
+
+/* Generated by tuneup.c, 2003-02-13, gcc 2.95 */
+
+#define MUL_TOOM22_THRESHOLD 18
+#define MUL_TOOM33_THRESHOLD 228
+
+#define SQR_BASECASE_THRESHOLD 13
+#define SQR_TOOM2_THRESHOLD 49
+#define SQR_TOOM3_THRESHOLD 238
+
+#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_DC_THRESHOLD 72
+#define POWM_THRESHOLD 38
+
+#define GCD_ACCEL_THRESHOLD 3
+#define JACOBI_BASE_METHOD 2
+
+#define USE_PREINV_DIVREM_1 0
+#define USE_PREINV_MOD_1 0
+#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 17
+
+#define GET_STR_DC_THRESHOLD 32
+#define GET_STR_PRECOMPUTE_THRESHOLD 82
+#define SET_STR_THRESHOLD 3524
+
+#define MUL_FFT_TABLE { 464, 928, 1920, 4608, 10240, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD 392
+#define MUL_FFT_THRESHOLD 2816
+
+#define SQR_FFT_TABLE { 432, 928, 1920, 4608, 14336, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD 392
+#define SQR_FFT_THRESHOLD 2816
diff --git a/vendor/gmp-6.3.0/mpn/x86/k10/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/k10/gmp-mparam.h
new file mode 100644
index 0000000..eceaaae
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k10/gmp-mparam.h
@@ -0,0 +1,217 @@
+/* x86/k10 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2011, 2014-2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3200-3600 MHz K10 Thuban */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-19, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 14
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 22
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1 /* 29.33% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD 2
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 35
+
+#define DIV_1_VS_MUL_1_PERCENT 258
+
+#define MUL_TOOM22_THRESHOLD 22
+#define MUL_TOOM33_THRESHOLD 73
+#define MUL_TOOM44_THRESHOLD 124
+#define MUL_TOOM6H_THRESHOLD 274
+#define MUL_TOOM8H_THRESHOLD 430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 113
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 26
+#define SQR_TOOM3_THRESHOLD 105
+#define SQR_TOOM4_THRESHOLD 154
+#define SQR_TOOM6_THRESHOLD 238
+#define SQR_TOOM8_THRESHOLD 309
+
+#define MULMID_TOOM42_THRESHOLD 50
+
+#define MULMOD_BNM1_THRESHOLD 15
+#define SQRMOD_BNM1_THRESHOLD 18
+
+#define MUL_FFT_MODF_THRESHOLD 570 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 570, 5}, { 21, 6}, { 11, 5}, { 25, 6}, \
+ { 13, 5}, { 27, 6}, { 15, 5}, { 31, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \
+ { 191,11}, { 63,10}, { 143, 9}, { 287,10}, \
+ { 159,11}, { 95,10}, { 191,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543,10}, { 287,11}, { 159,10}, { 319, 9}, \
+ { 639,10}, { 335, 9}, { 671,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 399, 9}, { 799,11}, \
+ { 223,12}, { 127,11}, { 255,10}, { 543,11}, \
+ { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \
+ { 671,12}, { 191,11}, { 383,10}, { 799,11}, \
+ { 415,13}, { 127,12}, { 255,11}, { 543,10}, \
+ { 1087,11}, { 607,10}, { 1215,12}, { 319,11}, \
+ { 671,10}, { 1343,11}, { 735,10}, { 1471, 9}, \
+ { 2943,12}, { 383,11}, { 799,10}, { 1599,11}, \
+ { 863,12}, { 447,11}, { 959,13}, { 255,12}, \
+ { 511,11}, { 1087,12}, { 575,11}, { 1215,10}, \
+ { 2431,12}, { 639,11}, { 1343,12}, { 703,11}, \
+ { 1471,10}, { 2943,13}, { 383,12}, { 767,11}, \
+ { 1599,12}, { 831,11}, { 1727,10}, { 3455,12}, \
+ { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \
+ { 1087,11}, { 2239,12}, { 1215,11}, { 2431,13}, \
+ { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \
+ { 1727,11}, { 3455,13}, { 895,12}, { 1983,14}, \
+ { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \
+ { 2431,13}, { 1407,12}, { 2943,14}, { 767,13}, \
+ { 1663,12}, { 3455,13}, { 1919,15}, { 511,14}, \
+ { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \
+ { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \
+ { 3455,14}, { 1791,13}, { 3967,15}, { 1023,14}, \
+ { 2047,13}, { 4479,14}, { 2303,13}, { 4991,14}, \
+ { 2815,13}, { 5887,15}, { 1535,14}, { 3839,16} }
+#define MUL_FFT_TABLE3_SIZE 168
+#define MUL_FFT_THRESHOLD 7424
+
+#define SQR_FFT_MODF_THRESHOLD 525 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 525, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 55,10}, \
+ { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 127,10}, { 79, 9}, \
+ { 159,10}, { 95,11}, { 63,10}, { 143, 9}, \
+ { 287,10}, { 159,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
+ { 271, 9}, { 543,10}, { 287,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \
+ { 351,11}, { 191,10}, { 383, 9}, { 767,10}, \
+ { 399, 9}, { 799,10}, { 415,12}, { 127,11}, \
+ { 255,10}, { 543,11}, { 287,10}, { 607,11}, \
+ { 319,10}, { 671, 9}, { 1343,11}, { 351,10}, \
+ { 703,12}, { 191,11}, { 383,10}, { 799, 9}, \
+ { 1599,11}, { 415,10}, { 831,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \
+ { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \
+ { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \
+ { 863,10}, { 1727,12}, { 447,11}, { 959,10}, \
+ { 1919,11}, { 991,12}, { 511,11}, { 1087,12}, \
+ { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \
+ { 767,11}, { 1599,12}, { 831,11}, { 1727,10}, \
+ { 3455,12}, { 959,11}, { 1919,13}, { 511,12}, \
+ { 1087,11}, { 2239,12}, { 1215,11}, { 2431,13}, \
+ { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \
+ { 1727,11}, { 3455,13}, { 895,12}, { 1919,14}, \
+ { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \
+ { 2495,13}, { 1279,12}, { 2623,13}, { 1407,12}, \
+ { 2943,14}, { 767,13}, { 1663,12}, { 3455,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \
+ { 4351,13}, { 2431,14}, { 1279,13}, { 2943,12}, \
+ { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \
+ { 3967,15}, { 1023,14}, { 2047,13}, { 4351,14}, \
+ { 2303,13}, { 4991,14}, { 2815,13}, { 5887,15}, \
+ { 1535,14}, { 3839,16} }
+#define SQR_FFT_TABLE3_SIZE 166
+#define SQR_FFT_THRESHOLD 5312
+
+#define MULLO_BASECASE_THRESHOLD 6
+#define MULLO_DC_THRESHOLD 40
+#define MULLO_MUL_N_THRESHOLD 14281
+#define SQRLO_BASECASE_THRESHOLD 8
+#define SQRLO_DC_THRESHOLD 113
+#define SQRLO_SQR_THRESHOLD 10323
+
+#define DC_DIV_QR_THRESHOLD 56
+#define DC_DIVAPPR_Q_THRESHOLD 248
+#define DC_BDIV_QR_THRESHOLD 55
+#define DC_BDIV_Q_THRESHOLD 158
+
+#define INV_MULMOD_BNM1_THRESHOLD 42
+#define INV_NEWTON_THRESHOLD 254
+#define INV_APPR_THRESHOLD 252
+
+#define BINV_NEWTON_THRESHOLD 292
+#define REDC_1_TO_REDC_N_THRESHOLD 67
+
+#define MU_DIV_QR_THRESHOLD 1589
+#define MU_DIVAPPR_Q_THRESHOLD 1558
+#define MUPI_DIV_QR_THRESHOLD 114
+#define MU_BDIV_QR_THRESHOLD 1442
+#define MU_BDIV_Q_THRESHOLD 1524
+
+#define POWM_SEC_TABLE 1,16,102,416,1378
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 21
+#define SET_STR_DC_THRESHOLD 270
+#define SET_STR_PRECOMPUTE_THRESHOLD 1105
+
+#define FAC_DSC_THRESHOLD 159
+#define FAC_ODD_THRESHOLD 34
+
+#define MATRIX22_STRASSEN_THRESHOLD 17
+#define HGCD2_DIV1_METHOD 3 /* 0.70% faster than 4 */
+#define HGCD_THRESHOLD 130
+#define HGCD_APPR_THRESHOLD 163
+#define HGCD_REDUCE_THRESHOLD 3389
+#define GCD_DC_THRESHOLD 573
+#define GCDEXT_DC_THRESHOLD 393
+#define JACOBI_BASE_METHOD 4 /* 9.13% faster than 1 */
+
+/* Tuneup completed successfully, took 52901 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/README b/vendor/gmp-6.3.0/mpn/x86/k6/README
new file mode 100644
index 0000000..1d65af3
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/README
@@ -0,0 +1,251 @@
+Copyright 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+ AMD K6 MPN SUBROUTINES
+
+
+
+This directory contains code optimized for AMD K6 CPUs, meaning K6, K6-2 and
+K6-3.
+
+The mmx subdirectory has MMX code suiting plain K6, the k62mmx subdirectory
+has MMX code suiting K6-2 and K6-3. All chips in the K6 family have MMX,
+the separate directories are just so that ./configure can omit them if the
+assembler doesn't support MMX.
+
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache, are as follows.
+
+ cycles/limb
+
+ mpn_add_n/sub_n 3.25 normal, 2.75 in-place
+
+ mpn_mul_1 6.25
+ mpn_add/submul_1 7.65-8.4 (varying with data values)
+
+ mpn_mul_basecase 9.25 cycles/crossproduct (approx)
+ mpn_sqr_basecase 4.7 cycles/crossproduct (approx)
+ or 9.2 cycles/triangleproduct (approx)
+
+ mpn_l/rshift 3.0
+
+ mpn_divrem_1 20.0
+ mpn_mod_1 20.0
+ mpn_divexact_by3 11.0
+
+ mpn_copyi 1.0
+ mpn_copyd 1.0
+
+
+K6-2 and K6-3 have dual-issue MMX and get the following improvements.
+
+ mpn_l/rshift 1.75
+
+
+Prefetching of sources hasn't yet given any joy. With the 3DNow "prefetch"
+instruction, code seems to run slower, and with just "mov" loads it doesn't
+seem faster. Results so far are inconsistent. The K6 does a hardware
+prefetch of the second cache line in a sector, so the penalty for not
+prefetching in software is reduced.
+
+
+
+
+NOTES
+
+All K6 family chips have MMX, but only K6-2 and K6-3 have 3DNow.
+
+Plain K6 executes MMX instructions only in the X pipe, but K6-2 and K6-3 can
+execute them in both X and Y (and in both together).
+
+Branch misprediction penalty is 1 to 4 cycles (Optimization Manual
+chapter 6 table 12).
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+Store queue is 7 entries of 64 bits each.
+
+Floating point multiplications can be done in parallel with integer
+multiplications, but there doesn't seem to be any way to make use of this.
+
+
+
+OPTIMIZATIONS
+
+Unrolled loops are used to reduce looping overhead. The unrolling is
+configurable up to 32 limbs/loop for most routines, up to 64 for some.
+
+Sometimes computed jumps into the unrolling are used to handle sizes not a
+multiple of the unrolling. An attractive feature of this is that times
+smoothly increase with operand size, but an indirect jump is about 6 cycles
+and the setups about another 6, so it depends on how much the unrolled code
+is faster than a simple loop as to whether a computed jump ought to be used.
+
+Position independent code is implemented using a call to get eip for
+computed jumps and a ret is always done, rather than an addl $4,%esp or a
+popl, so the CPU return address branch prediction stack stays synchronised
+with the actual stack in memory. Such a call however still costs 4 to 7
+cycles.
+
+Branch prediction, in absence of any history, will guess forward jumps are
+not taken and backward jumps are taken. Where possible it's arranged that
+the less likely or less important case is under a taken forward jump.
+
+
+
+MMX
+
+Putting emms or femms as late as possible in a routine seems to be fastest.
+Perhaps an emms or femms stalls until all outstanding MMX instructions have
+completed, so putting it later gives them a chance to complete on their own,
+in parallel with other operations (like register popping).
+
+The Optimization Manual chapter 5 recommends using a femms on K6-2 and K6-3
+at the start of a routine, in case it's been preceded by x87 floating point
+operations. This isn't done because in gmp programs it's expected that x87
+floating point won't be much used and that chances are an mpn routine won't
+have been preceded by any x87 code.
+
+
+
+CODING
+
+Instructions in general code are shown paired if they can decode and execute
+together, meaning two short decode instructions with the second not
+depending on the first, only the first using the shifter, no more than one
+load, and no more than one store.
+
+K6 does some out of order execution so the pairings aren't essential, they
+just show what slots might be available. When decoding is the limiting
+factor things can be scheduled that might not execute until later.
+
+
+
+NOTES
+
+Code alignment
+
+- if an opcode/modrm or 0Fh/opcode/modrm crosses a cache line boundary,
+ short decode is inhibited. The cross.pl script detects this.
+
+- loops and branch targets should be aligned to 16 bytes, or ensure at least
+ 2 instructions before a 32 byte boundary. This makes use of the 16 byte
+ cache in the BTB.
+
+Addressing modes
+
+- (%esi) degrades decoding from short to vector. 0(%esi) doesn't have this
+ problem, and can be used as an equivalent, or easier is just to use a
+ different register, like %ebx.
+
+- K6 and pre-CXT core K6-2 have the following problem. (K6-2 CXT and K6-3
+ have it fixed, these being cpuid function 1 signatures 0x588 to 0x58F).
+
+ If more than 3 bytes are needed to determine instruction length then
+ decoding degrades from direct to long, or from long to vector. This
+ happens with forms like "0F opcode mod/rm" with mod/rm=00-xxx-100 since
+ with mod=00 the sib determines whether there's a displacement.
+
+ This affects all MMX and 3DNow instructions, and others with an 0F prefix,
+ like movzbl. The modes affected are anything with an index and no
+ displacement, or an index but no base, and this includes (%esp) which is
+ really (,%esp,1).
+
+ The cross.pl script detects problem cases. The workaround is to always
+ use a displacement, and to do this with Zdisp if it's zero so the
+ assembler doesn't discard it.
+
+ See Optimization Manual rev D page 67 and 3DNow Porting Guide rev B pages
+ 13-14 and 36-37.
+
+Calls
+
+- indirect jumps and calls are not branch predicted, they measure about 6
+ cycles.
+
+Various
+
+- adcl 2 cycles of decode, maybe 2 cycles executing in the X pipe
+- bsf 12-27 cycles
+- emms 5 cycles
+- femms 3 cycles
+- jecxz 2 cycles taken, 13 not taken (optimization manual says 7 not taken)
+- divl 20 cycles back-to-back
+- imull 2 decode, 3 execute
+- mull 2 decode, 3 execute (optimization manual decoding sample)
+- prefetch 2 cycles
+- rcll/rcrl implicit by one bit: 2 cycles
+ immediate or %cl count: 11 + 2 per bit for dword
+ 13 + 4 per bit for byte
+- setCC 2 cycles
+- xchgl %eax,reg 1.5 cycles, back-to-back (strange)
+ reg,reg 2 cycles, back-to-back
+
+
+
+
+REFERENCES
+
+"AMD-K6 Processor Code Optimization Application Note", AMD publication
+number 21924, revision D amendment 0, January 2000. This describes K6-2 and
+K6-3. Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21924.pdf
+
+"AMD-K6 MMX Enhanced Processor x86 Code Optimization Application Note", AMD
+publication number 21828, revision A amendment 0, August 1997. This is an
+older edition of the above document, describing plain K6. Available
+on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21828.pdf
+
+"3DNow Technology Manual", AMD publication number 21928G/0-March 2000.
+This describes the femms and prefetch instructions, but nothing else from
+3DNow has been used. Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21928.pdf
+
+"3DNow Instruction Porting Guide", AMD publication number 22621, revision B,
+August 1999. This has some notes on general K6 optimizations as well as
+3DNow. Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22621.pdf
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/aors_n.asm b/vendor/gmp-6.3.0/mpn/x86/k6/aors_n.asm
new file mode 100644
index 0000000..168f9b4
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/aors_n.asm
@@ -0,0 +1,337 @@
+dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
+
+
+ifdef(`OPERATION_add_n', `
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+ define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+ define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size. The return value is the carry bit from the top of the result
+C (1 or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation. Note values other than 1 or 0 here will lead to garbage
+C results.
+C
+C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
+C an in-place dst+=src to 2.5 c/l. The unrolled loops have 1 cycle/loop of
+C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
+
+define(PARAM_CARRY, `FRAME+20(%esp)')
+define(PARAM_SIZE, `FRAME+16(%esp)')
+define(PARAM_SRC2, `FRAME+12(%esp)')
+define(PARAM_SRC1, `FRAME+8(%esp)')
+define(PARAM_DST, `FRAME+4(%esp)')
+deflit(`FRAME',0)
+
+dnl minimum 5 because the unrolled code can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(M4_function_nc)
+ movl PARAM_CARRY, %eax
+ jmp L(start)
+EPILOGUE()
+
+
+PROLOGUE(M4_function_n)
+ xorl %eax, %eax
+L(start):
+ movl PARAM_SIZE, %ecx
+ pushl %ebx
+FRAME_pushl()
+
+ movl PARAM_SRC1, %ebx
+ pushl %edi
+FRAME_pushl()
+
+ movl PARAM_SRC2, %edx
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_DST, %edi
+ jae L(unroll)
+
+
+ shrl %eax C initial carry flag
+
+ C offset 0x21 here, close enough to aligned
+L(simple):
+ C eax scratch
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+ C
+ C The store to (%edi) could be done with a stosl; it'd be smaller
+ C code, but there's no speed gain and a cld would have to be added
+ C (per mpn/x86/README).
+
+ movl (%ebx), %eax
+ leal 4(%ebx), %ebx
+
+ M4_inst (%edx), %eax
+
+ movl %eax, (%edi)
+ leal 4(%edi), %edi
+
+ leal 4(%edx), %edx
+ loop L(simple)
+
+
+ movl $0, %eax
+ popl %edi
+
+ setc %al
+
+ popl %ebx
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(unroll):
+ C eax carry
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+
+ cmpl %edi, %ebx
+ pushl %esi
+
+ je L(inplace)
+
+ifdef(`OPERATION_add_n',`
+ cmpl %edi, %edx
+
+ je L(inplace_reverse)
+')
+
+ movl %ecx, %esi
+
+ andl $-4, %ecx
+ andl $3, %esi
+
+ leal (%ebx,%ecx,4), %ebx
+ leal (%edx,%ecx,4), %edx
+ leal (%edi,%ecx,4), %edi
+
+ negl %ecx
+ shrl %eax
+
+ ALIGN(32)
+L(normal_top):
+ C eax counter, qwords, negative
+ C ebx src1
+ C ecx scratch
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+
+ movl (%ebx,%ecx,4), %eax
+ leal 5(%ecx), %ecx
+ M4_inst -20(%edx,%ecx,4), %eax
+ movl %eax, -20(%edi,%ecx,4)
+
+ movl 4-20(%ebx,%ecx,4), %eax
+ M4_inst 4-20(%edx,%ecx,4), %eax
+ movl %eax, 4-20(%edi,%ecx,4)
+
+ movl 8-20(%ebx,%ecx,4), %eax
+ M4_inst 8-20(%edx,%ecx,4), %eax
+ movl %eax, 8-20(%edi,%ecx,4)
+
+ movl 12-20(%ebx,%ecx,4), %eax
+ M4_inst 12-20(%edx,%ecx,4), %eax
+ movl %eax, 12-20(%edi,%ecx,4)
+
+ loop L(normal_top)
+
+
+ decl %esi
+ jz L(normal_finish_one)
+ js L(normal_done)
+
+ C two or three more limbs
+
+ movl (%ebx), %eax
+ M4_inst (%edx), %eax
+ movl %eax, (%edi)
+
+ movl 4(%ebx), %eax
+ M4_inst 4(%edx), %eax
+ decl %esi
+ movl %eax, 4(%edi)
+
+ jz L(normal_done)
+ movl $2, %ecx
+
+L(normal_finish_one):
+ movl (%ebx,%ecx,4), %eax
+ M4_inst (%edx,%ecx,4), %eax
+ movl %eax, (%edi,%ecx,4)
+
+L(normal_done):
+ popl %esi
+ popl %edi
+
+ movl $0, %eax
+ popl %ebx
+
+ setc %al
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+
+ifdef(`OPERATION_add_n',`
+L(inplace_reverse):
+ C dst==src2
+
+ movl %ebx, %edx
+')
+
+L(inplace):
+ C eax initial carry
+ C ebx
+ C ecx size
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ leal -1(%ecx), %esi
+ decl %ecx
+
+ andl $-4, %ecx
+ andl $3, %esi
+
+ movl (%edx), %ebx C src low limb
+ leal (%edx,%ecx,4), %edx
+
+ leal (%edi,%ecx,4), %edi
+ negl %ecx
+
+ shrl %eax
+
+
+ ALIGN(32)
+L(inplace_top):
+ C eax
+ C ebx next src limb
+ C ecx size
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ M4_inst %ebx, (%edi,%ecx,4)
+
+ movl 4(%edx,%ecx,4), %eax
+ leal 5(%ecx), %ecx
+
+ M4_inst %eax, 4-20(%edi,%ecx,4)
+
+ movl 8-20(%edx,%ecx,4), %eax
+ movl 12-20(%edx,%ecx,4), %ebx
+
+ M4_inst %eax, 8-20(%edi,%ecx,4)
+ M4_inst %ebx, 12-20(%edi,%ecx,4)
+
+ movl 16-20(%edx,%ecx,4), %ebx
+ loop L(inplace_top)
+
+
+ C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
+
+ M4_inst %ebx, (%edi)
+
+ decl %esi
+ jz L(inplace_finish_one)
+ js L(inplace_done)
+
+ C two or three more limbs
+
+ movl 4(%edx), %eax
+ movl 8(%edx), %ebx
+ M4_inst %eax, 4(%edi)
+ M4_inst %ebx, 8(%edi)
+
+ decl %esi
+ movl $2, %ecx
+
+ jz L(normal_done)
+
+L(inplace_finish_one):
+ movl 4(%edx,%ecx,4), %eax
+ M4_inst %eax, 4(%edi,%ecx,4)
+
+L(inplace_done):
+ popl %esi
+ popl %edi
+
+ movl $0, %eax
+ popl %ebx
+
+ setc %al
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/aorsmul_1.asm b/vendor/gmp-6.3.0/mpn/x86/k6/aorsmul_1.asm
new file mode 100644
index 0000000..eaa92eb
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/aorsmul_1.asm
@@ -0,0 +1,391 @@
+dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+
+dnl Copyright 1999-2003, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12 5.94
+C P6 model 9 (Banias) 5.51
+C P6 model 13 (Dothan) 5.57
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C AMD K6 7.65-8.5 (data dependent)
+C AMD K7
+C AMD K8
+
+
+dnl K6: large multipliers small multipliers
+dnl UNROLL_COUNT cycles/limb cycles/limb
+dnl 4 9.5 7.78
+dnl 8 9.0 7.78
+dnl 16 8.4 7.65
+dnl 32 8.4 8.2
+dnl
+dnl Maximum possible unrolling with the current code is 32.
+dnl
+dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
+dnl byte block, which might explain the good speed at that unrolling.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1', `
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+ define(M4_function_1c, mpn_addmul_1c)
+',`ifdef(`OPERATION_submul_1', `
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+ define(M4_function_1c, mpn_submul_1c)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C The jadcl0()s in the unrolled loop makes the speed data dependent. Small
+C multipliers (most significant few bits clear) result in few carry bits and
+C speeds up to 7.65 cycles/limb are attained. Large multipliers (most
+C significant few bits set) make the carry bits 50/50 and lead to something
+C more like 8.4 c/l. With adcl's both of these would be 9.3 c/l.
+C
+C It's important that the gains for jadcl0 on small multipliers don't come
+C at the cost of slowing down other data. Tests on uniformly distributed
+C random data, designed to confound branch prediction, show about a 7%
+C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
+C overheads included).
+C
+C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
+C 11.0 cycles/limb), and hence isn't used.
+C
+C In the simple loop, note that running ecx from negative to zero and using
+C it as an index in the two movs wouldn't help. It would save one
+C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
+C that would be collapsed by this.
+C
+C Attempts at a simpler main loop, with less unrolling, haven't yielded much
+C success, generally running over 9 c/l.
+C
+C
+C jadcl0
+C ------
+C
+C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
+C firstly the instruction decoding and secondly the fact that there's a
+C carry bit for the jadcl0 only on average about 1/4 of the time.
+C
+C The code in the unrolled loop decodes something like the following.
+C
+C decode cycles
+C mull %ebp 2
+C M4_inst %esi, disp(%edi) 1
+C adcl %eax, %ecx 2
+C movl %edx, %esi \ 1
+C jnc 1f /
+C incl %esi \ 1
+C 1: movl disp(%ebx), %eax /
+C ---
+C 7
+C
+C In a back-to-back style test this measures 7 with the jnc not taken, or 8
+C with it taken (both when correctly predicted). This is opposite to the
+C measurements showing small multipliers running faster than large ones.
+C Don't really know why.
+C
+C It's not clear how much branch misprediction might be costing. The K6
+C doco says it will be 1 to 4 cycles, but presumably it's near the low end
+C of that range to get the measured results.
+C
+C
+C In the code the two carries are more or less the preceding mul product and
+C the calculation is roughly
+C
+C x*y + u*b+v
+C
+C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
+C v are the two limbs it's added to (being the low of the next mul, and a
+C limb from the destination).
+C
+C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
+C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
+C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0
+C and b-1, then the total probability can be summed over x and y,
+C
+C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1)
+C --- * sum sum --- = --- * ------- * ------- = 1/4
+C b^2 x=0 y=1 b^2 b^4 2 2
+C
+C Actually it's a very tiny bit less than 1/4 of course. If y is fixed,
+C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
+
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 9)
+',`
+deflit(UNROLL_THRESHOLD, 6)
+')
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(M4_function_1c)
+ pushl %esi
+deflit(`FRAME',4)
+ movl PARAM_CARRY, %esi
+ jmp L(start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function_1)
+ push %esi
+deflit(`FRAME',4)
+ xorl %esi, %esi C initial carry
+
+L(start_nc):
+ movl PARAM_SIZE, %ecx
+ pushl %ebx
+deflit(`FRAME',8)
+
+ movl PARAM_SRC, %ebx
+ pushl %edi
+deflit(`FRAME',12)
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_DST, %edi
+
+ pushl %ebp
+deflit(`FRAME',16)
+ jae L(unroll)
+
+
+ C simple loop
+
+ movl PARAM_MULTIPLIER, %ebp
+
+L(simple):
+ C eax scratch
+ C ebx src
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi dst
+ C ebp multiplier
+
+ movl (%ebx), %eax
+ addl $4, %ebx
+
+ mull %ebp
+
+ addl $4, %edi
+ addl %esi, %eax
+
+ adcl $0, %edx
+
+ M4_inst %eax, -4(%edi)
+
+ adcl $0, %edx
+
+ movl %edx, %esi
+ loop L(simple)
+
+
+ popl %ebp
+ popl %edi
+
+ popl %ebx
+ movl %esi, %eax
+
+ popl %esi
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop
+C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
+C For the computed jump an odd size means they start one way around, an even
+C size the other.
+C
+C VAR_JUMP holds the computed jump temporarily because there's not enough
+C registers at the point of doing the mul for the initial two carry limbs.
+C
+C The add/adc for the initial carry in %esi is necessary only for the
+C mpn_addmul/submul_1c entry points. Duplicating the startup code to
+C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
+C idea.
+
+dnl overlapping with parameters already fetched
+define(VAR_COUNTER, `PARAM_SIZE')
+define(VAR_JUMP, `PARAM_DST')
+
+L(unroll):
+ C eax
+ C ebx src
+ C ecx size
+ C edx
+ C esi initial carry
+ C edi dst
+ C ebp
+
+ movl %ecx, %edx
+ decl %ecx
+
+ subl $2, %edx
+ negl %ecx
+
+ shrl $UNROLL_LOG2, %edx
+ andl $UNROLL_MASK, %ecx
+
+ movl %edx, VAR_COUNTER
+ movl %ecx, %edx
+
+ shll $4, %edx
+ negl %ecx
+
+ C 15 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%edx,%ecx,1), %edx
+')
+ movl (%ebx), %eax C src low limb
+
+ movl PARAM_MULTIPLIER, %ebp
+ movl %edx, VAR_JUMP
+
+ mull %ebp
+
+ addl %esi, %eax C initial carry (from _1c)
+ jadcl0( %edx)
+
+
+ leal 4(%ebx,%ecx,4), %ebx
+ movl %edx, %esi C high carry
+
+ movl VAR_JUMP, %edx
+ leal (%edi,%ecx,4), %edi
+
+ testl $1, %ecx
+ movl %eax, %ecx C low carry
+
+ jz L(noswap)
+ movl %esi, %ecx C high,low carry other way around
+
+ movl %eax, %esi
+L(noswap):
+
+ jmp *%edx
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ leal (%edx,%ecx,1), %edx
+ addl $L(entry)-L(here), %edx
+ addl (%esp), %edx
+ ret_internal
+')
+
+
+C -----------------------------------------------------------
+ ALIGN(32)
+L(top):
+deflit(`FRAME',16)
+ C eax scratch
+ C ebx src
+ C ecx carry lo
+ C edx scratch
+ C esi carry hi
+ C edi dst
+ C ebp multiplier
+ C
+ C 15 code bytes per limb
+
+ leal UNROLL_BYTES(%edi), %edi
+
+L(entry):
+forloop(`i', 0, UNROLL_COUNT/2-1, `
+ deflit(`disp0', eval(2*i*4))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%ebx), %eax)
+ mull %ebp
+Zdisp( M4_inst,%ecx, disp0,(%edi))
+ adcl %eax, %esi
+ movl %edx, %ecx
+ jadcl0( %ecx)
+
+ movl disp1(%ebx), %eax
+ mull %ebp
+ M4_inst %esi, disp1(%edi)
+ adcl %eax, %ecx
+ movl %edx, %esi
+ jadcl0( %esi)
+')
+
+ decl VAR_COUNTER
+
+ leal UNROLL_BYTES(%ebx), %ebx
+ jns L(top)
+
+
+ popl %ebp
+ M4_inst %ecx, UNROLL_BYTES(%edi)
+
+ popl %edi
+ movl %esi, %eax
+
+ popl %ebx
+ jadcl0( %eax)
+
+ popl %esi
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/cross.pl b/vendor/gmp-6.3.0/mpn/x86/k6/cross.pl
new file mode 100755
index 0000000..fc921a5
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/cross.pl
@@ -0,0 +1,182 @@
+#! /usr/bin/perl
+
+# Copyright 2000, 2001 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of either:
+#
+# * the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# or
+#
+# * the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any
+# later version.
+#
+# or both in parallel, as here.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received copies of the GNU General Public License and the
+# GNU Lesser General Public License along with the GNU MP Library. If not,
+# see https://www.gnu.org/licenses/.
+
+
+# Usage: cross.pl [filename.o]...
+#
+# Produce an annotated disassembly of the given object files, indicating
+# certain code alignment and addressing mode problems afflicting K6 chips.
+# "ZZ" is used on all annotations, so this can be searched for.
+#
+# With no arguments, all .o files corresponding to .asm files are processed.
+# This is good in the mpn object directory of a k6*-*-* build.
+#
+# Code alignments of 8 bytes or more are handled. When 32 is used, cache
+# line boundaries will fall in at offsets 0x20,0x40,etc and problems are
+# flagged at those locations. When 16 is used, the line boundaries can also
+# fall at offsets 0x10,0x30,0x50,etc, depending where the file is loaded, so
+# problems are identified there too. Likewise when 8 byte alignment is used
+# problems are flagged additionally at 0x08,0x18,0x28,etc.
+#
+# Usually 32 byte alignment is used for k6 routines, but less is certainly
+# possible if through good luck, or a little tweaking, cache line crossing
+# problems can be avoided at the extra locations.
+#
+# Bugs:
+#
+# Instructions without mod/rm bytes or which are already vector decoded are
+# unaffected by cache line boundary crossing, but not all of these have yet
+# been put in as exceptions. All that occur in practice in GMP are present
+# though.
+#
+# There's no messages for using the vector decoded addressing mode (%esi),
+# but that's easy to avoid when coding.
+#
+# Future:
+#
+# Warn about jump targets that are poorly aligned (less than 2 instructions
+# before a cache line boundary).
+
+use strict;
+
+sub disassemble {
+ my ($file) = @_;
+ my ($addr,$b1,$b2,$b3, $prefix,$opcode,$modrm);
+ my $align;
+
+ open (IN, "objdump -Srfh $file |")
+ || die "Cannot open pipe from objdump\n";
+ while (<IN>) {
+ print;
+
+ if (/^[ \t]*[0-9]+[ \t]+\.text[ \t]/ && /2\*\*([0-9]+)$/) {
+ $align = 1 << $1;
+ if ($align < 8) {
+ print "ZZ cross.pl cannot handle alignment < 2**3\n";
+ $align = 8
+ }
+ }
+
+ if (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
+ ($addr,$b1,$b2,$b3) = ($1,$2,$3,$4);
+
+ } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
+ ($addr,$b1,$b2,$b3) = ($1,$2,$3,'');
+
+ } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)/) {
+ ($addr,$b1,$b2,$b3) = ($1,$2,'','');
+
+ } else {
+ next;
+ }
+
+ if ($b1 =~ /0f/) {
+ $prefix = $b1;
+ $opcode = $b2;
+ $modrm = $b3;
+ } else {
+ $prefix = '';
+ $opcode = $b1;
+ $modrm = $b2;
+ }
+
+ # modrm of the form 00-xxx-100 with an 0F prefix is the problem case
+ # for K6 and pre-CXT K6-2
+ if ($prefix =~ /0f/
+ && $opcode !~ /^8/ # jcond disp32
+ && $modrm =~ /^[0-3][4c]/) {
+ print "ZZ ($file) >3 bytes to determine instruction length [K6]\n";
+ }
+
+ # with just an opcode, starting 1f mod 20h
+ if (($align==32 && $addr =~ /[13579bdf]f$/
+ || $align==16 && $addr =~ /f$/
+ || $align==8 && $addr =~ /[7f]$/)
+ && $prefix !~ /0f/
+ && $opcode !~ /1[012345]/ # adc
+ && $opcode !~ /1[89abcd]/ # sbb
+ && $opcode !~ /^4/ # inc/dec reg
+ && $opcode !~ /^5/ # push/pop reg
+ && $opcode !~ /68/ # push $imm32
+ && $opcode !~ /^7/ # jcond disp8
+ && $opcode !~ /a[89]/ # test+imm
+ && $opcode !~ /a[a-f]/ # stos/lods/scas
+ && $opcode !~ /b8/ # movl $imm32,%eax
+ && $opcode !~ /d[0123]/ # rcl
+ && $opcode !~ /e[0123]/ # loop/loopz/loopnz/jcxz
+ && $opcode !~ /e8/ # call disp32
+ && $opcode !~ /e[9b]/ # jmp disp32/disp8
+ && $opcode !~ /f[89abcd]/ # clc,stc,cli,sti,cld,std
+ && !($opcode =~ /f[67]/ # grp 1
+ && $modrm =~ /^[2367abef]/) # mul, imul, div, idiv
+ && $modrm !~ /^$/) {
+ print "ZZ ($file) opcode/modrm cross 32-byte boundary\n";
+ }
+
+ # with an 0F prefix, anything starting at 1f mod 20h
+ if (($align==32 && $addr =~ /[13579bdf][f]$/
+ || $align==16 && $addr =~ /f$/
+ || $align==8 && $addr =~ /[7f]$/)
+ && $prefix =~ /0f/
+ && $opcode !~ /af/ # imul
+ && $opcode !~ /a[45]/ # shldl
+ && $opcode !~ /a[cd]/ # shrdl
+ ) {
+ print "ZZ ($file) prefix/opcode cross 32-byte boundary\n";
+ }
+
+ # with an 0F prefix, anything with mod/rm starting at 1e mod 20h
+ if (($align==32 && $addr =~ /[13579bdf][e]$/
+ || $align==16 && $addr =~ /[e]$/
+ || $align==8 && $addr =~ /[6e]$/)
+ && $prefix =~ /0f/
+ && $opcode !~ /^8/ # jcond disp32
+ && $opcode !~ /af/ # imull reg,reg
+ && $opcode !~ /a[45]/ # shldl
+ && $opcode !~ /a[cd]/ # shrdl
+ && $modrm !~ /^$/) {
+ print "ZZ ($file) prefix/opcode/modrm cross 32-byte boundary\n";
+ }
+ }
+ close IN || die "Error from objdump (or objdump not available)\n";
+}
+
+
+my @files;
+if ($#ARGV >= 0) {
+ @files = @ARGV;
+} else {
+ @files = glob "*.asm";
+ map {s/.asm/.o/} @files;
+}
+
+foreach (@files) {
+ disassemble($_);
+}
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/divrem_1.asm b/vendor/gmp-6.3.0/mpn/x86/k6/divrem_1.asm
new file mode 100644
index 0000000..b4cea4f
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/divrem_1.asm
@@ -0,0 +1,203 @@
+dnl AMD K6 mpn_divrem_1 -- mpn by limb division.
+
+dnl Copyright 1999-2003, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 20 cycles/limb
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C
+C The code here is basically the same as mpn/x86/divrem_1.asm, but uses loop
+C instead of decl+jnz, since it comes out 2 cycles/limb faster.
+C
+C A test is done to see if the high limb is less than the divisor, and if so
+C one less div is done. A div is 20 cycles, so assuming high<divisor about
+C half the time, then this test saves half that amount. The branch
+C misprediction penalty is less than that.
+C
+C Back-to-back div instructions run at 20 cycles, the same as the loop here,
+C so it seems there's nothing to gain by rearranging the loop. Pairing the
+C mov and loop instructions was found to gain nothing.
+C
+C Enhancements:
+C
+C The low-latency K6 multiply might be thought to suit a mul-by-inverse, but
+C that algorithm has been found to suffer from the relatively poor carry
+C handling on K6 and too many auxiliary instructions. The fractional part
+C however could be done at about 13 c/l, if it mattered enough.
+
+defframe(PARAM_CARRY, 24)
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC, 12)
+defframe(PARAM_XSIZE, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+
+ ALIGN(32)
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %edi FRAME_pushl()
+
+ movl PARAM_SRC, %edi
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_DIVISOR, %esi
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_DST, %ebx
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_XSIZE, %ebp
+ orl %ecx, %ecx C size
+
+ movl PARAM_CARRY, %edx
+ jz L(fraction) C if size==0
+
+ leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
+ jmp L(integer_top)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %edi FRAME_pushl()
+
+ movl PARAM_SRC, %edi
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_DIVISOR, %esi
+ orl %ecx,%ecx C size
+
+ jz L(size_zero)
+ pushl %ebx FRAME_pushl()
+
+ movl -4(%edi,%ecx,4), %eax C src high limb
+ xorl %edx, %edx
+
+ movl PARAM_DST, %ebx
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_XSIZE, %ebp
+ cmpl %esi, %eax
+
+ leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
+ jae L(integer_entry)
+
+
+ C high<divisor, so high of dst is zero, and avoid one div
+
+ movl %edx, (%ebx,%ecx,4)
+ decl %ecx
+
+ movl %eax, %edx
+ jz L(fraction)
+
+
+L(integer_top):
+ C eax scratch (quotient)
+ C ebx dst+4*xsize-4
+ C ecx counter
+ C edx scratch (remainder)
+ C esi divisor
+ C edi src
+ C ebp xsize
+
+ movl -4(%edi,%ecx,4), %eax
+L(integer_entry):
+
+ divl %esi
+
+ movl %eax, (%ebx,%ecx,4)
+ loop L(integer_top)
+
+
+L(fraction):
+ orl %ebp, %ecx
+ jz L(done)
+
+ movl PARAM_DST, %ebx
+
+
+L(fraction_top):
+ C eax scratch (quotient)
+ C ebx dst
+ C ecx counter
+ C edx scratch (remainder)
+ C esi divisor
+ C edi
+ C ebp
+
+ xorl %eax, %eax
+
+ divl %esi
+
+ movl %eax, -4(%ebx,%ecx,4)
+ loop L(fraction_top)
+
+
+L(done):
+ popl %ebp
+ movl %edx, %eax
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+
+L(size_zero):
+deflit(`FRAME',8)
+ movl PARAM_XSIZE, %ecx
+ xorl %eax, %eax
+
+ movl PARAM_DST, %edi
+
+ cld C better safe than sorry, see mpn/x86/README
+
+ rep
+ stosl
+
+ popl %esi
+ popl %edi
+ ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/k6/gmp-mparam.h
new file mode 100644
index 0000000..f03f1b2
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/gmp-mparam.h
@@ -0,0 +1,166 @@
+/* AMD K6 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2004, 2009, 2010 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* 450MHz K6-2 */
+
+#define MOD_1_NORM_THRESHOLD 12
+#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 41
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 32
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 3
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 128
+#define USE_PREINV_DIVREM_1 0
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 69
+#define MUL_TOOM44_THRESHOLD 106
+#define MUL_TOOM6H_THRESHOLD 157
+#define MUL_TOOM8H_THRESHOLD 199
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 69
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 65
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 64
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 97
+#define SQR_TOOM4_THRESHOLD 143
+#define SQR_TOOM6_THRESHOLD 222
+#define SQR_TOOM8_THRESHOLD 272
+
+#define MULMOD_BNM1_THRESHOLD 13
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 476, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 11, 5}, { 23, 6}, { 17, 7}, { 9, 6}, \
+ { 19, 7}, { 11, 6}, { 23, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 31, 7}, { 17, 6}, \
+ { 35, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
+ { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \
+ { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
+ { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 167,10}, { 95, 9}, { 191,10}, \
+ { 111,11}, { 63,10}, { 127, 9}, { 255,10}, \
+ { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \
+ { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \
+ { 287,11}, { 159,10}, { 351,11}, { 191,10}, \
+ { 415, 9}, { 831,11}, { 223,12}, { 127,11}, \
+ { 255,10}, { 543,11}, { 287,10}, { 575,11}, \
+ { 351,10}, { 703,12}, { 191,11}, { 415,10}, \
+ { 831,13}, { 127,12}, { 255,11}, { 543,10}, \
+ { 1087,11}, { 575,12}, { 319,11}, { 703,12}, \
+ { 383,11}, { 831,12}, { 447,11}, { 895,13}, \
+ { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
+ { 1151,12}, { 703,13}, { 383,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1215,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 106
+#define MUL_FFT_THRESHOLD 7424
+
+#define SQR_FFT_MODF_THRESHOLD 432 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 432, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
+ { 24, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 21, 8}, { 11, 7}, { 29, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \
+ { 39, 9}, { 23, 7}, { 93, 8}, { 47, 7}, \
+ { 95, 8}, { 51,10}, { 15, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 71, 8}, \
+ { 143, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 167,10}, { 95, 9}, { 191,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 143, 9}, { 287, 8}, \
+ { 575,10}, { 159, 9}, { 319,11}, { 95,10}, \
+ { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \
+ { 159,10}, { 319, 9}, { 639,10}, { 351, 9}, \
+ { 703,11}, { 191,10}, { 415,11}, { 223,12}, \
+ { 127,11}, { 255,10}, { 543,11}, { 287,10}, \
+ { 607,11}, { 319,10}, { 639,11}, { 351,10}, \
+ { 703,12}, { 191,11}, { 415,10}, { 831,13}, \
+ { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \
+ { 607,12}, { 319,11}, { 703,12}, { 383,11}, \
+ { 831,12}, { 447,13}, { 255,12}, { 511,11}, \
+ { 1087,12}, { 575,11}, { 1215,12}, { 703,13}, \
+ { 383,12}, { 895,14}, { 255,13}, { 511,12}, \
+ { 1215,13}, { 8192,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 112
+#define SQR_FFT_THRESHOLD 7040
+
+#define MULLO_BASECASE_THRESHOLD 3
+#define MULLO_DC_THRESHOLD 60
+#define MULLO_MUL_N_THRESHOLD 13463
+
+#define DC_DIV_QR_THRESHOLD 78
+#define DC_DIVAPPR_Q_THRESHOLD 252
+#define DC_BDIV_QR_THRESHOLD 84
+#define DC_BDIV_Q_THRESHOLD 171
+
+#define INV_MULMOD_BNM1_THRESHOLD 55
+#define INV_NEWTON_THRESHOLD 234
+#define INV_APPR_THRESHOLD 236
+
+#define BINV_NEWTON_THRESHOLD 268
+#define REDC_1_TO_REDC_N_THRESHOLD 67
+
+#define MU_DIV_QR_THRESHOLD 1308
+#define MU_DIVAPPR_Q_THRESHOLD 1142
+#define MUPI_DIV_QR_THRESHOLD 134
+#define MU_BDIV_QR_THRESHOLD 1164
+#define MU_BDIV_Q_THRESHOLD 1164
+
+#define MATRIX22_STRASSEN_THRESHOLD 15
+#define HGCD_THRESHOLD 182
+#define GCD_DC_THRESHOLD 591
+#define GCDEXT_DC_THRESHOLD 472
+#define JACOBI_BASE_METHOD 2
+
+#define GET_STR_DC_THRESHOLD 24
+#define GET_STR_PRECOMPUTE_THRESHOLD 40
+#define SET_STR_DC_THRESHOLD 834
+#define SET_STR_PRECOMPUTE_THRESHOLD 2042
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/copyd.asm b/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/copyd.asm
new file mode 100644
index 0000000..f80a5a1
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/copyd.asm
@@ -0,0 +1,118 @@
+dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6-2: 1.0 cycles/limb
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30
+C cycle startup time, which amounts for instance to a 2x speedup at 15
+C limbs.
+C
+C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by
+C processing one limb separately to make it aligned. This and a final odd
+C limb are handled in a branch-free fashion, ending up re-copying if the
+C special case isn't needed.
+C
+C Alternatives:
+C
+C There used to be a big unrolled version of this, running at 0.56 c/l if
+C the destination was aligned, but that seemed rather excessive for the
+C relative importance of copyd.
+C
+C If the destination alignment is ignored and just left to run at 1.17 c/l
+C some code size and a fixed few cycles can be saved. Considering how few
+C uses copyd finds perhaps that should be favoured. The current code has
+C the attraction of being no slower than a basic rep movsl though.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-using parameter space
+define(SAVE_EBX,`PARAM_SIZE')
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl %ebx, SAVE_EBX
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+
+ subl $1, %ecx C better code alignment than decl
+ jb L(zero)
+
+ jz L(one_more)
+ leal 4(%edx,%ecx,4), %ebx
+
+Zdisp( movd, 0,(%eax,%ecx,4), %mm0) C high limb
+Zdisp( movd, %mm0, 0,(%edx,%ecx,4)) C Zdisp for good code alignment
+
+ cmpl $1, %ecx
+ je L(one_more)
+
+ shrl $2, %ebx
+ andl $1, %ebx C 1 if dst[size-2] unaligned
+
+ subl %ebx, %ecx
+ nop C code alignment
+
+L(top):
+ C eax src
+ C ebx
+ C ecx counter
+ C edx dst
+
+ movq -4(%eax,%ecx,4), %mm0
+ subl $2, %ecx
+
+ movq %mm0, 4(%edx,%ecx,4)
+ ja L(top)
+
+
+L(one_more):
+ movd (%eax), %mm0
+ movd %mm0, (%edx)
+
+ movl SAVE_EBX, %ebx
+ emms_or_femms
+L(zero):
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/lshift.asm
new file mode 100644
index 0000000..c86575f
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/lshift.asm
@@ -0,0 +1,294 @@
+dnl AMD K6-2 mpn_lshift -- mpn left shift.
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6-2: 1.75 cycles/limb
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl used after src has been fetched
+define(VAR_RETVAL,`PARAM_SRC')
+
+dnl minimum 9, because unrolled loop can't handle less
+deflit(UNROLL_THRESHOLD, 9)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shldl( %cl, %edx, %eax) C return value
+
+ shll %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16) C avoid offset 0x1f
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx,%eax,4), %edx C src high limb
+ negl %ecx
+
+ movd PARAM_SHIFT, %mm6
+ addl $32, %ecx C 32-shift
+
+ shrl %cl, %edx
+ cmpl $UNROLL_THRESHOLD-1, %eax
+
+ movl %edx, VAR_RETVAL
+ jae L(unroll)
+
+
+ movd %ecx, %mm7
+ movl %eax, %ecx
+
+ movl PARAM_DST, %eax
+
+L(simple):
+ C eax dst
+ C ebx src
+ C ecx counter, size-1 to 1
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%ebx,%ecx,4), %mm0
+
+ psrlq %mm7, %mm0
+
+Zdisp( movd, %mm0, 0,(%eax,%ecx,4))
+ loop L(simple)
+
+
+ movd (%ebx), %mm0
+ popl %ebx
+
+ psllq %mm6, %mm0
+
+ movd %mm0, (%eax)
+ movl %edx, %eax
+
+ femms
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx src
+ C ecx 32-shift
+ C edx retval (but instead VAR_RETVAL is used)
+ C
+ C mm6 shift
+
+ addl $32, %ecx
+ movl PARAM_DST, %edx
+
+ movd %ecx, %mm7
+ subl $7, %eax C size-8
+
+ leal (%edx,%eax,4), %ecx C alignment of dst
+
+ movq 32-8(%ebx,%eax,4), %mm2 C src high qword
+ testb $4, %cl
+
+ jz L(dst_aligned)
+ psllq %mm6, %mm2
+
+ psrlq $32, %mm2
+ decl %eax
+
+ movd %mm2, 32(%edx,%eax,4) C dst high limb
+ movq 32-8(%ebx,%eax,4), %mm2 C new src high qword
+L(dst_aligned):
+
+ movq 32-16(%ebx,%eax,4), %mm0 C src second highest qword
+
+
+ C This loop is the important bit, the rest is just support for it.
+ C Four src limbs are held at the start, and four more will be read.
+ C Four dst limbs will be written. This schedule seems necessary for
+ C full speed.
+ C
+ C The use of size-8 lets the loop stop when %eax goes negative and
+ C leaves -4 to -1 which can be tested with test $1 and $2.
+
+L(top):
+ C eax counter, size-8 step by -4 until <0
+ C ebx src
+ C ecx
+ C edx dst
+ C
+ C mm0 src next qword
+ C mm1 scratch
+ C mm2 src prev qword
+ C mm6 shift
+ C mm7 64-shift
+
+ psllq %mm6, %mm2
+ subl $4, %eax
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm0, %mm2
+ movq 24(%ebx,%eax,4), %mm0
+
+ psllq %mm6, %mm1
+ movq %mm2, 40(%edx,%eax,4)
+
+ movq %mm0, %mm2
+ psrlq %mm7, %mm0
+
+ por %mm0, %mm1
+ movq 16(%ebx,%eax,4), %mm0
+
+ movq %mm1, 32(%edx,%eax,4)
+ jnc L(top)
+
+
+ C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
+ C
+ C 8(%ebx) is the next source, and 24(%edx) is the next destination.
+ C %eax is between -4 and -1, representing respectively 0 to 3 extra
+ C limbs that must be read.
+
+
+ testl $2, %eax C testl to avoid bad cache line crossing
+ jz L(finish_nottwo)
+
+ C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
+ C new mm2 and a new mm0 is loaded.
+
+ psllq %mm6, %mm2
+ movq %mm0, %mm1
+
+ psrlq %mm7, %mm0
+ subl $2, %eax
+
+ por %mm0, %mm2
+ movq 16(%ebx,%eax,4), %mm0
+
+ movq %mm2, 32(%edx,%eax,4)
+ movq %mm1, %mm2
+L(finish_nottwo):
+
+
+ C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0
+
+ testb $1, %al
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm0, %mm2
+ psllq %mm6, %mm1
+
+ movq %mm2, 24(%edx,%eax,4)
+ jz L(finish_even)
+
+
+ C Size is odd, so mm1 and one extra limb to process.
+
+ movd (%ebx), %mm0 C src[0]
+ popl %ebx
+deflit(`FRAME',0)
+
+ movq %mm0, %mm2
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ psllq %mm6, %mm2
+ por %mm0, %mm1
+
+ movq %mm1, 4(%edx) C dst[1,2]
+ movd %mm2, (%edx) C dst[0]
+
+ movl VAR_RETVAL, %eax
+
+ femms
+ ret
+
+
+ nop C avoid bad cache line crossing
+L(finish_even):
+deflit(`FRAME',4)
+ C Size is even, so only mm1 left to process.
+
+ movq %mm1, (%edx) C dst[0,1]
+ movl VAR_RETVAL, %eax
+
+ popl %ebx
+ femms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/rshift.asm
new file mode 100644
index 0000000..f604a7b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/rshift.asm
@@ -0,0 +1,293 @@
+dnl AMD K6-2 mpn_rshift -- mpn right shift.
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6-2: 1.75 cycles/limb
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl Minimum 9, because the unrolled loop can't handle less.
+dnl
+deflit(UNROLL_THRESHOLD, 9)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shrdl( %cl, %edx, %eax) C return value
+
+ shrl %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16) C avoid offset 0x1f
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx), %edx C src low limb
+ negl %ecx
+
+ addl $32, %ecx
+ movd PARAM_SHIFT, %mm6
+
+ shll %cl, %edx
+ cmpl $UNROLL_THRESHOLD-1, %eax
+
+ jae L(unroll)
+
+
+ C eax size-1
+ C ebx src
+ C ecx 32-shift
+ C edx retval
+ C
+ C mm6 shift
+
+ movl PARAM_DST, %ecx
+ leal (%ebx,%eax,4), %ebx
+
+ leal -4(%ecx,%eax,4), %ecx
+ negl %eax
+
+ C This loop runs at about 3 cycles/limb, which is the amount of
+ C decoding, and this is despite every second access being unaligned.
+
+L(simple):
+ C eax counter, -(size-1) to -1
+ C ebx &src[size-1]
+ C ecx &dst[size-1]
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+
+Zdisp( movq, 0,(%ebx,%eax,4), %mm0)
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+Zdisp( movd, %mm0, 0,(%ecx,%eax,4))
+ jnz L(simple)
+
+
+ movq %mm0, (%ecx)
+ movl %edx, %eax
+
+ popl %ebx
+
+ femms
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx src
+ C ecx 32-shift
+ C edx retval
+ C
+ C mm6 shift
+
+ addl $32, %ecx
+ subl $7, %eax C size-8
+
+ movd %ecx, %mm7
+ movl PARAM_DST, %ecx
+
+ movq (%ebx), %mm2 C src low qword
+ leal (%ebx,%eax,4), %ebx C src end - 32
+
+ testb $4, %cl
+ leal (%ecx,%eax,4), %ecx C dst end - 32
+
+ notl %eax C -(size-7)
+ jz L(dst_aligned)
+
+ psrlq %mm6, %mm2
+ incl %eax
+
+Zdisp( movd, %mm2, 0,(%ecx,%eax,4)) C dst low limb
+ movq 4(%ebx,%eax,4), %mm2 C new src low qword
+L(dst_aligned):
+
+ movq 12(%ebx,%eax,4), %mm0 C src second lowest qword
+ nop C avoid bad cache line crossing
+
+
+ C This loop is the important bit, the rest is just support for it.
+ C Four src limbs are held at the start, and four more will be read.
+ C Four dst limbs will be written. This schedule seems necessary for
+ C full speed.
+ C
+ C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and
+ C and leaves 0 to 3 which can be tested with test $1 and $2.
+
+L(top):
+ C eax counter, -(size-7) step by +4 until >=0
+ C ebx src end - 32
+ C ecx dst end - 32
+ C edx retval
+ C
+ C mm0 src next qword
+ C mm1 scratch
+ C mm2 src prev qword
+ C mm6 shift
+ C mm7 64-shift
+
+ psrlq %mm6, %mm2
+ addl $4, %eax
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ por %mm0, %mm2
+ movq 4(%ebx,%eax,4), %mm0
+
+ psrlq %mm6, %mm1
+ movq %mm2, -12(%ecx,%eax,4)
+
+ movq %mm0, %mm2
+ psllq %mm7, %mm0
+
+ por %mm0, %mm1
+ movq 12(%ebx,%eax,4), %mm0
+
+ movq %mm1, -4(%ecx,%eax,4)
+ ja L(top) C jump if no carry and not zero
+
+
+
+ C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
+ C to 3 representing respectively 3 to 0 further limbs.
+
+ testl $2, %eax C testl to avoid bad cache line crossings
+ jnz L(finish_nottwo)
+
+ C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
+ C becomes new mm2 and a new mm0 is loaded.
+
+ psrlq %mm6, %mm2
+ movq %mm0, %mm1
+
+ psllq %mm7, %mm0
+ addl $2, %eax
+
+ por %mm0, %mm2
+ movq 12(%ebx,%eax,4), %mm0
+
+ movq %mm2, -4(%ecx,%eax,4)
+ movq %mm1, %mm2
+L(finish_nottwo):
+
+
+ testb $1, %al
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ por %mm0, %mm2
+ psrlq %mm6, %mm1
+
+ movq %mm2, 4(%ecx,%eax,4)
+ jnz L(finish_even)
+
+
+ C one further extra limb to process
+
+ movd 32-4(%ebx), %mm0 C src[size-1], most significant limb
+ popl %ebx
+
+ movq %mm0, %mm2
+ psllq %mm7, %mm0
+
+ por %mm0, %mm1
+ psrlq %mm6, %mm2
+
+ movq %mm1, 32-12(%ecx) C dst[size-3,size-2]
+ movd %mm2, 32-4(%ecx) C dst[size-1]
+
+ movl %edx, %eax C retval
+
+ femms
+ ret
+
+
+ nop C avoid bad cache line crossing
+L(finish_even):
+ C no further extra limbs
+
+ movq %mm1, 32-8(%ecx) C dst[size-2,size-1]
+ movl %edx, %eax C retval
+
+ popl %ebx
+
+ femms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/com.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/com.asm
new file mode 100644
index 0000000..b747454
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/com.asm
@@ -0,0 +1,103 @@
+dnl AMD K6-2 mpn_com -- mpn bitwise one's complement.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+NAILS_SUPPORT(0-31)
+
+
+C alignment dst/src, A=0mod8 N=4mod8
+C A/A A/N N/A N/N
+C K6-2 1.0 1.18 1.18 1.18 cycles/limb
+C K6 1.5 1.85 1.75 1.85
+
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Take the bitwise ones-complement of src,size and write it to dst,size.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_com)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+ shrl %ecx
+ jnz L(two_or_more)
+
+ movl (%eax), %eax
+ notl_or_xorl_GMP_NUMB_MASK( %eax)
+ movl %eax, (%edx)
+ ret
+
+
+L(two_or_more):
+ pushl %ebx FRAME_pushl()
+ pcmpeqd %mm7, %mm7 C all ones
+
+ movl %ecx, %ebx
+ifelse(GMP_NAIL_BITS,0,,
+` psrld $GMP_NAIL_BITS, %mm7') C clear nails
+
+
+
+ ALIGN(8)
+L(top):
+ C eax src
+ C ebx floor(size/2)
+ C ecx counter
+ C edx dst
+ C
+ C mm0 scratch
+ C mm7 mask
+
+ movq -8(%eax,%ecx,8), %mm0
+ pxor %mm7, %mm0
+ movq %mm0, -8(%edx,%ecx,8)
+ loop L(top)
+
+
+ jnc L(no_extra)
+ movl (%eax,%ebx,8), %eax
+ notl_or_xorl_GMP_NUMB_MASK( %eax)
+ movl %eax, (%edx,%ebx,8)
+L(no_extra):
+
+ popl %ebx
+ emms_or_femms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm
new file mode 100644
index 0000000..1bbad3a
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm
@@ -0,0 +1,282 @@
+dnl AMD K6 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C divisor
+C odd even
+C K6: 10.0 12.0 cycles/limb
+C K6-2: 10.0 11.5
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+C A simple divl is used for size==1. This is about 10 cycles faster for an
+C odd divisor or 20 cycles for an even divisor.
+C
+C The loops are quite sensitive to code alignment, speeds should be
+C rechecked (odd and even divisor, pic and non-pic) if contemplating
+C changing anything.
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(VAR_INVERSE,`PARAM_DST')
+
+ TEXT
+
+ ALIGN(32)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+
+ movl PARAM_SRC, %eax
+ xorl %edx, %edx
+
+ cmpl $1, %ecx
+ jnz L(two_or_more)
+
+ movl (%eax), %eax
+
+ divl PARAM_DIVISOR
+
+ movl PARAM_DST, %ecx
+ movl %eax, (%ecx)
+
+ ret
+
+
+L(two_or_more):
+ movl PARAM_DIVISOR, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ pushl %ebp FRAME_pushl()
+
+L(strip_twos):
+ shrl %eax
+ incl %edx C will get shift+1
+
+ jnc L(strip_twos)
+ pushl %esi FRAME_pushl()
+
+ leal 1(%eax,%eax), %esi C d without twos
+ andl $127, %eax C d/2, 7 bits
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %ebp)
+Zdisp( movzbl, 0,(%eax,%ebp), %eax)
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+ pushl %edi FRAME_pushl()
+
+ leal (%eax,%eax), %ebp C 2*inv
+
+ imull %eax, %eax C inv*inv
+
+ movl PARAM_DST, %edi
+
+ imull %esi, %eax C inv*inv*d
+
+ subl %eax, %ebp C inv = 2*inv - inv*inv*d
+ leal (%ebp,%ebp), %eax C 2*inv
+
+ imull %ebp, %ebp C inv*inv
+
+ movl %esi, PARAM_DIVISOR C d without twos
+ leal (%ebx,%ecx,4), %ebx C src end
+
+ imull %esi, %ebp C inv*inv*d
+
+ leal (%edi,%ecx,4), %edi C dst end
+ negl %ecx C -size
+
+ subl %ebp, %eax C inv = 2*inv - inv*inv*d
+ subl $1, %edx C shift amount, and clear carry
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ imull PARAM_DIVISOR, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ movl %eax, VAR_INVERSE
+ jnz L(even)
+
+ movl (%ebx,%ecx,4), %esi C src low limb
+ jmp L(odd_entry)
+
+
+ ALIGN(16)
+ nop C code alignment
+L(odd_top):
+ C eax scratch
+ C ebx src end
+ C ecx counter, limbs, negative
+ C edx inverse
+ C esi next limb, adjusted for carry
+ C edi dst end
+ C ebp carry bit, 0 or -1
+
+ imull %edx, %esi
+
+ movl PARAM_DIVISOR, %eax
+ movl %esi, -4(%edi,%ecx,4)
+
+ mull %esi C carry limb in edx
+
+ subl %ebp, %edx C apply carry bit
+ movl (%ebx,%ecx,4), %esi
+
+L(odd_entry):
+ subl %edx, %esi C apply carry limb
+ movl VAR_INVERSE, %edx
+
+ sbbl %ebp, %ebp C 0 or -1
+
+ incl %ecx
+ jnz L(odd_top)
+
+
+ imull %edx, %esi
+
+ movl %esi, -4(%edi,%ecx,4)
+
+ popl %edi
+ popl %esi
+
+ popl %ebp
+ popl %ebx
+
+ ret
+
+
+L(even):
+ C eax
+ C ebx src end
+ C ecx -size
+ C edx twos
+ C esi
+ C edi dst end
+ C ebp
+
+ xorl %ebp, %ebp
+Zdisp( movq, 0,(%ebx,%ecx,4), %mm0) C src[0,1]
+
+ movd %edx, %mm7
+ movl VAR_INVERSE, %edx
+
+ addl $2, %ecx
+ psrlq %mm7, %mm0
+
+ movd %mm0, %esi
+ jz L(even_two) C if only two limbs
+
+
+C Out-of-order execution is good enough to hide the load/rshift/movd
+C latency. Having imul at the top of the loop gives 11.5 c/l instead of 12,
+C on K6-2. In fact there's only 11 of decode, but nothing running at 11 has
+C been found. Maybe the fact every second movq is unaligned costs the extra
+C 0.5.
+
+L(even_top):
+ C eax scratch
+ C ebx src end
+ C ecx counter, limbs, negative
+ C edx inverse
+ C esi next limb, adjusted for carry
+ C edi dst end
+ C ebp carry bit, 0 or -1
+ C
+ C mm0 scratch, source limbs
+ C mm7 twos
+
+ imull %edx, %esi
+
+ movl %esi, -8(%edi,%ecx,4)
+ movl PARAM_DIVISOR, %eax
+
+ mull %esi C carry limb in edx
+
+ movq -4(%ebx,%ecx,4), %mm0
+ psrlq %mm7, %mm0
+
+ movd %mm0, %esi
+ subl %ebp, %edx C apply carry bit
+
+ subl %edx, %esi C apply carry limb
+ movl VAR_INVERSE, %edx
+
+ sbbl %ebp, %ebp C 0 or -1
+
+ incl %ecx
+ jnz L(even_top)
+
+
+L(even_two):
+ movd -4(%ebx), %mm0 C src high limb
+ psrlq %mm7, %mm0
+
+ imull %edx, %esi
+
+ movl %esi, -8(%edi)
+ movl PARAM_DIVISOR, %eax
+
+ mull %esi C carry limb in edx
+
+ movd %mm0, %esi
+ subl %ebp, %edx C apply carry bit
+
+ movl VAR_INVERSE, %eax
+ subl %edx, %esi C apply carry limb
+
+ imull %eax, %esi
+
+ movl %esi, -4(%edi)
+
+ popl %edi
+ popl %esi
+
+ popl %ebp
+ popl %ebx
+
+ emms_or_femms
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm
new file mode 100644
index 0000000..e17930b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm
@@ -0,0 +1,226 @@
+dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
+dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+NAILS_SUPPORT(0-31)
+
+
+C alignment dst/src1/src2, A=0mod8, N=4mod8
+C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
+C
+C K6-2 1.2 1.5 1.5 1.2 1.2 1.5 1.5 1.2 and,andn,ior,xor
+C K6-2 1.5 1.75 2.0 1.75 1.75 2.0 1.75 1.5 iorn,xnor
+C K6-2 1.75 2.0 2.0 2.0 2.0 2.0 2.0 1.75 nand,nior
+C
+C K6 1.5 1.68 1.75 1.2 1.75 1.75 1.68 1.5 and,andn,ior,xor
+C K6 2.0 2.0 2.25 2.25 2.25 2.25 2.0 2.0 iorn,xnor
+C K6 2.0 2.25 2.25 2.25 2.25 2.25 2.25 2.0 nand,nior
+
+
+dnl M4_p and M4_i are the MMX and integer instructions
+dnl M4_*_neg_dst means whether to negate the final result before writing
+dnl M4_*_neg_src2 means whether to negate the src2 values before using them
+
+define(M4_choose_op,
+m4_assert_numargs(7)
+`ifdef(`OPERATION_$1',`
+define(`M4_function', `mpn_$1')
+define(`M4_operation', `$1')
+define(`M4_p', `$2')
+define(`M4_p_neg_dst', `$3')
+define(`M4_p_neg_src2',`$4')
+define(`M4_i', `$5')
+define(`M4_i_neg_dst', `$6')
+define(`M4_i_neg_src2',`$7')
+')')
+
+dnl xnor is done in "iorn" style because it's a touch faster than "nior"
+dnl style (the two are equivalent for xor).
+dnl
+dnl pandn can't be used with nails.
+
+M4_choose_op( and_n, pand,0,0, andl,0,0)
+ifelse(GMP_NAIL_BITS,0,
+`M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
+`M4_choose_op(andn_n, pand,0,1, andl,0,1)')
+M4_choose_op( nand_n, pand,1,0, andl,1,0)
+M4_choose_op( ior_n, por,0,0, orl,0,0)
+M4_choose_op( iorn_n, por,0,1, orl,0,1)
+M4_choose_op( nior_n, por,1,0, orl,1,0)
+M4_choose_op( xor_n, pxor,0,0, xorl,0,0)
+M4_choose_op( xnor_n, pxor,0,1, xorl,0,1)
+
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+
+C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C
+C Do src1,size M4_operation src2,size, storing the result in dst,size.
+C
+C Unaligned movq loads and stores are a bit slower than aligned ones. The
+C test at the start of the routine checks the alignment of src1 and if
+C necessary processes one limb separately at the low end to make it aligned.
+C
+C The raw speeds without this alignment switch are as follows.
+C
+C alignment dst/src1/src2, A=0mod8, N=4mod8
+C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
+C
+C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor
+C K6 1.75 2.2 2.0 2.28 iorn,xnor
+C K6 2.0 2.25 2.35 2.28 nand,nior
+C
+C
+C Future:
+C
+C K6 can do one 64-bit load per cycle so each of these routines should be
+C able to approach 1.0 c/l, if aligned. The basic and/andn/ior/xor might be
+C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
+C The others are 4 instructions per 2 limbs, and so can only approach 1.0
+C because there's nowhere to hide some loop control.
+
+defframe(PARAM_SIZE,16)
+defframe(PARAM_SRC2,12)
+defframe(PARAM_SRC1,8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(M4_function)
+ movl PARAM_SIZE, %ecx
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC1, %eax
+
+ movl PARAM_SRC2, %ebx
+ cmpl $1, %ecx
+
+ movl PARAM_DST, %edx
+ ja L(two_or_more)
+
+
+ movl (%ebx), %ecx
+ popl %ebx
+ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)')
+ M4_i (%eax), %ecx
+ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ecx)')
+ movl %ecx, (%edx)
+
+ ret
+
+
+L(two_or_more):
+ C eax src1
+ C ebx src2
+ C ecx size
+ C edx dst
+ C esi
+ C edi
+ C ebp
+
+ pushl %esi FRAME_pushl()
+ testl $4, %eax
+ jz L(alignment_ok)
+
+ movl (%ebx), %esi
+ addl $4, %ebx
+ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %esi)')
+ M4_i (%eax), %esi
+ addl $4, %eax
+ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %esi)')
+ movl %esi, (%edx)
+ addl $4, %edx
+ decl %ecx
+
+L(alignment_ok):
+ movl %ecx, %esi
+ shrl %ecx
+ jnz L(still_two_or_more)
+
+ movl (%ebx), %ecx
+ popl %esi
+ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)')
+ M4_i (%eax), %ecx
+ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ecx)')
+ popl %ebx
+ movl %ecx, (%edx)
+ ret
+
+
+L(still_two_or_more):
+ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
+ pcmpeqd %mm7, %mm7 C all ones
+ifelse(GMP_NAIL_BITS,0,,`psrld $GMP_NAIL_BITS, %mm7') C clear nails
+')
+
+ ALIGN(16)
+L(top):
+ C eax src1
+ C ebx src2
+ C ecx counter
+ C edx dst
+ C esi
+ C edi
+ C ebp
+ C
+ C carry bit is low of size
+
+ movq -8(%ebx,%ecx,8), %mm0
+ifelse(M4_p_neg_src2,1,`pxor %mm7, %mm0')
+ M4_p -8(%eax,%ecx,8), %mm0
+ifelse(M4_p_neg_dst,1,` pxor %mm7, %mm0')
+ movq %mm0, -8(%edx,%ecx,8)
+
+ loop L(top)
+
+
+ jnc L(no_extra)
+
+ movl -4(%ebx,%esi,4), %ebx
+ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ebx)')
+ M4_i -4(%eax,%esi,4), %ebx
+ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ebx)')
+ movl %ebx, -4(%edx,%esi,4)
+L(no_extra):
+
+ popl %esi
+ popl %ebx
+ emms_or_femms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm
new file mode 100644
index 0000000..45be582
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm
@@ -0,0 +1,130 @@
+dnl AMD K6 mpn_lshift -- mpn left shift.
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 3.0 cycles/limb
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
+C instructions. This is despite every second fetch being unaligned.
+
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shldl( %cl, %edx, %eax) C return value
+
+ shll %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+ ALIGN(16) C avoid offset 0x1f
+ nop C avoid bad cache line crossing
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx,%eax,4), %edx C src high limb
+ negl %ecx
+
+ movd PARAM_SHIFT, %mm6
+ addl $32, %ecx C 32-shift
+
+ shrl %cl, %edx
+
+ movd %ecx, %mm7
+ movl PARAM_DST, %ecx
+
+L(top):
+ C eax counter, size-1 to 1
+ C ebx src
+ C ecx dst
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%ebx,%eax,4), %mm0
+ decl %eax
+
+ psrlq %mm7, %mm0
+
+ movd %mm0, 4(%ecx,%eax,4)
+ jnz L(top)
+
+
+ movd (%ebx), %mm0
+ popl %ebx
+
+ psllq %mm6, %mm0
+ movl %edx, %eax
+
+ movd %mm0, (%ecx)
+
+ emms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm
new file mode 100644
index 0000000..2b19d0b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm
@@ -0,0 +1,236 @@
+dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
+dnl hamming distance.
+
+dnl Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C popcount hamdist
+C K6-2: 9.0 11.5 cycles/limb
+C K6: 12.5 13.0
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C The code here isn't optimal, but it's already a 2x speedup over the plain
+C integer mpn/generic/popcount.c,hamdist.c.
+
+
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist
+')m4exit(1)')')
+
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+
+HAM(`
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+ dnl non-PIC
+
+ RODATA
+ ALIGN(8)
+
+L(rodata_AAAAAAAAAAAAAAAA):
+ .long 0xAAAAAAAA
+ .long 0xAAAAAAAA
+
+L(rodata_3333333333333333):
+ .long 0x33333333
+ .long 0x33333333
+
+L(rodata_0F0F0F0F0F0F0F0F):
+ .long 0x0F0F0F0F
+ .long 0x0F0F0F0F
+
+L(rodata_000000FF000000FF):
+ .long 0x000000FF
+ .long 0x000000FF
+')
+
+ TEXT
+ ALIGN(32)
+
+POP(`ifdef(`PIC', `
+ C avoid shrl crossing a 32-byte boundary
+ nop')')
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+
+ifdef(`PIC',`
+ movl $0xAAAAAAAA, %eax
+ movl $0x33333333, %edx
+
+ movd %eax, %mm7
+ movd %edx, %mm6
+
+ movl $0x0F0F0F0F, %eax
+ movl $0x000000FF, %edx
+
+ punpckldq %mm7, %mm7
+ punpckldq %mm6, %mm6
+
+ movd %eax, %mm5
+ movd %edx, %mm4
+
+ punpckldq %mm5, %mm5
+ punpckldq %mm4, %mm4
+',`
+
+ movq L(rodata_AAAAAAAAAAAAAAAA), %mm7
+ movq L(rodata_3333333333333333), %mm6
+ movq L(rodata_0F0F0F0F0F0F0F0F), %mm5
+ movq L(rodata_000000FF000000FF), %mm4
+')
+
+define(REG_AAAAAAAAAAAAAAAA, %mm7)
+define(REG_3333333333333333, %mm6)
+define(REG_0F0F0F0F0F0F0F0F, %mm5)
+define(REG_000000FF000000FF, %mm4)
+
+
+ movl PARAM_SRC, %eax
+HAM(` movl PARAM_SRC2, %edx')
+
+ pxor %mm2, %mm2 C total
+
+ shrl %ecx
+ jnc L(top)
+
+Zdisp( movd, 0,(%eax,%ecx,8), %mm1)
+
+HAM(`
+Zdisp( movd, 0,(%edx,%ecx,8), %mm0)
+ pxor %mm0, %mm1
+')
+
+ incl %ecx
+ jmp L(loaded)
+
+
+ ALIGN(16)
+POP(` nop C alignment to avoid crossing 32-byte boundaries')
+
+L(top):
+ C eax src
+ C ebx
+ C ecx counter, qwords, decrementing
+ C edx [hamdist] src2
+ C
+ C mm0 (scratch)
+ C mm1 (scratch)
+ C mm2 total (low dword)
+ C mm3
+ C mm4 \
+ C mm5 | special constants
+ C mm6 |
+ C mm7 /
+
+ movq -8(%eax,%ecx,8), %mm1
+HAM(` pxor -8(%edx,%ecx,8), %mm1')
+
+L(loaded):
+ movq %mm1, %mm0
+ pand REG_AAAAAAAAAAAAAAAA, %mm1
+
+ psrlq $1, %mm1
+HAM(` nop C code alignment')
+
+ psubd %mm1, %mm0 C bit pairs
+HAM(` nop C code alignment')
+
+
+ movq %mm0, %mm1
+ psrlq $2, %mm0
+
+ pand REG_3333333333333333, %mm0
+ pand REG_3333333333333333, %mm1
+
+ paddd %mm1, %mm0 C nibbles
+
+
+ movq %mm0, %mm1
+ psrlq $4, %mm0
+
+ pand REG_0F0F0F0F0F0F0F0F, %mm0
+ pand REG_0F0F0F0F0F0F0F0F, %mm1
+
+ paddd %mm1, %mm0 C bytes
+
+ movq %mm0, %mm1
+ psrlq $8, %mm0
+
+
+ paddb %mm1, %mm0 C words
+
+
+ movq %mm0, %mm1
+ psrlq $16, %mm0
+
+ paddd %mm1, %mm0 C dwords
+
+ pand REG_000000FF000000FF, %mm0
+
+ paddd %mm0, %mm2 C low to total
+ psrlq $32, %mm0
+
+ paddd %mm0, %mm2 C high to total
+ loop L(top)
+
+
+
+ movd %mm2, %eax
+ emms_or_femms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm
new file mode 100644
index 0000000..cd0382f
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm
@@ -0,0 +1,130 @@
+dnl AMD K6 mpn_rshift -- mpn right shift.
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 3.0 cycles/limb
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
+C instructions. This is despite every second fetch being unaligned.
+
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shrdl( %cl, %edx, %eax) C return value
+
+ shrl %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+ ALIGN(16) C avoid offset 0x1f
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx), %edx C src low limb
+ negl %ecx
+
+ addl $32, %ecx C 32-shift
+ movd PARAM_SHIFT, %mm6
+
+ shll %cl, %edx C retval
+ movl PARAM_DST, %ecx
+
+ leal (%ebx,%eax,4), %ebx
+
+ leal -4(%ecx,%eax,4), %ecx
+ negl %eax
+
+
+L(simple):
+ C eax counter (negative)
+ C ebx &src[size-1]
+ C ecx &dst[size-1]
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+
+Zdisp( movq, 0,(%ebx,%eax,4), %mm0)
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+Zdisp( movd, %mm0, 0,(%ecx,%eax,4))
+ jnz L(simple)
+
+
+ movq %mm0, (%ecx)
+ movl %edx, %eax
+
+ popl %ebx
+
+ emms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mod_34lsub1.asm
new file mode 100644
index 0000000..7e30503
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/mod_34lsub1.asm
@@ -0,0 +1,190 @@
+dnl AMD K6 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
+
+dnl Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 2.66 cycles/limb
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C An attempt was made to use a loop like
+C
+C L(top):
+C adcl (%edx), %eax
+C adcl 4(%edx), %ebx
+C adcl 8(%edx), %esi
+C leal 12(%edx), %edx
+C loop L(top)
+C
+C with %ecx starting from floor(size/3), but it still measured 2.66 c/l.
+C The form used instead can save about 6 cycles by not dividing by 3.
+C
+C In the code used, putting the "leal"s at the top of the loop is necessary
+C for the claimed speed, anywhere else costs an extra cycle per loop.
+C Perhaps a tight loop like this needs short decode instructions at the
+C branch target, which would explain the leal/loop form above taking 8
+C cycles instead of 7 too.
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX, `PARAM_SIZE')
+define(SAVE_ESI, `PARAM_SRC')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_SRC, %edx
+
+ subl $2, %eax
+ ja L(three_or_more)
+
+Zdisp( movl, 0,(%edx), %eax) C avoid code cache line boundary
+ jne L(one)
+
+ movl %eax, %ecx
+ movl 4(%edx), %edx
+
+ shrl $24, %eax C src[0] high
+ andl $0x00FFFFFF, %ecx C src[0] low
+
+ addl %ecx, %eax
+ movl %edx, %ecx
+
+ shll $8, %edx
+ andl $0x00FFFF00, %edx C src[1] high
+
+ shrl $16, %ecx C src[1] low
+ addl %ecx, %eax
+
+ addl %edx, %eax
+
+L(one):
+ ret
+
+
+L(three_or_more):
+ C eax size-2
+ C ebx
+ C ecx
+ C edx src
+
+ movl %ebx, SAVE_EBX
+ xorl %ebx, %ebx
+
+ movl %esi, SAVE_ESI
+ pushl %edi FRAME_pushl()
+
+ xorl %esi, %esi
+ xorl %edi, %edi C and clear carry flag
+
+L(top):
+ C eax counter, limbs
+ C ebx acc 0mod3
+ C ecx
+ C edx src, incrementing
+ C esi acc 1mod3
+ C edi acc 2mod3
+ C ebp
+
+ leal -2(%eax), %eax
+ leal 12(%edx), %edx
+
+ adcl -12(%edx), %ebx
+ adcl -8(%edx), %esi
+ adcl -4(%edx), %edi
+
+ decl %eax
+ jg L(top)
+
+
+ C ecx is -3, -2 or -1 representing 0, 1 or 2 more limbs, respectively
+
+ movb $0, %cl
+ incl %eax
+
+ js L(combine) C 0 more
+
+Zdisp( adcl, 0,(%edx), %ebx) C avoid code cache line crossings
+
+ movb $8, %cl
+ decl %eax
+
+ js L(combine) C 1 more
+
+ adcl 4(%edx), %esi
+
+ movb $16, %cl
+
+
+L(combine):
+ sbbl %edx, %edx
+
+ shll %cl, %edx C carry
+ movl %ebx, %eax C 0mod3
+
+ shrl $24, %eax C 0mod3 high
+ andl $0x00FFFFFF, %ebx C 0mod3 low
+
+ subl %edx, %eax C apply carry
+ movl %esi, %ecx C 1mod3
+
+ shrl $16, %esi C 1mod3 high
+ addl %ebx, %eax C apply 0mod3 low
+
+ andl $0x0000FFFF, %ecx
+ addl %esi, %eax C apply 1mod3 high
+
+ shll $8, %ecx C 1mod3 low
+ movl %edi, %edx C 2mod3
+
+ shrl $8, %edx C 2mod3 high
+ addl %ecx, %eax C apply 1mod3 low
+
+ addl %edx, %eax C apply 2mod3 high
+ andl $0x000000FF, %edi
+
+ shll $16, %edi C 2mod3 low
+ movl SAVE_EBX, %ebx
+
+ addl %edi, %eax C apply 2mod3 low
+ movl SAVE_ESI, %esi
+
+ popl %edi
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mode1o.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mode1o.asm
new file mode 100644
index 0000000..4a338bd
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/mode1o.asm
@@ -0,0 +1,176 @@
+dnl AMD K6 mpn_modexact_1_odd -- exact division style remainder.
+
+dnl Copyright 2000-2003, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 10.0 cycles/limb
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C
+C A special case for high<divisor at the end measured only about 4 cycles
+C faster, and so isn't used.
+C
+C A special case for size==1 using a divl rather than the inverse measured
+C only about 5 cycles faster, and so isn't used. When size==1 and
+C high<divisor it can skip a division and be a full 24 cycles faster, but
+C this isn't an important case.
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+ TEXT
+
+ ALIGN(32)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %ecx
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_CARRY, %edx
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %ecx
+ pushl %esi FRAME_pushl()
+
+ xorl %edx, %edx
+L(start_1c):
+ pushl %edi FRAME_pushl()
+
+ shrl %ecx C d/2
+ movl PARAM_DIVISOR, %esi
+
+ andl $127, %ecx C d/2, 7 bits
+ pushl %ebp FRAME_pushl()
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %edi)
+Zdisp( movzbl, 0,(%ecx,%edi), %edi) C inv 8 bits
+',`
+ movzbl binvert_limb_table(%ecx), %edi C inv 8 bits
+')
+ leal (%edi,%edi), %ecx C 2*inv
+
+ imull %edi, %edi C inv*inv
+
+ movl PARAM_SRC, %eax
+ movl PARAM_SIZE, %ebp
+
+ imull %esi, %edi C inv*inv*d
+
+ pushl %ebx FRAME_pushl()
+ leal (%eax,%ebp,4), %ebx C src end
+
+ subl %edi, %ecx C inv = 2*inv - inv*inv*d
+ leal (%ecx,%ecx), %edi C 2*inv
+
+ imull %ecx, %ecx C inv*inv
+
+ movl (%eax), %eax C src low limb
+ negl %ebp C -size
+
+ imull %esi, %ecx C inv*inv*d
+
+ subl %ecx, %edi C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax
+ movl %esi, %eax
+ imull %edi, %eax
+ cmpl $1, %eax
+ popl %eax')
+
+ jmp L(entry)
+
+
+C Rotating the mul to the top of the loop saves 1 cycle, presumably by
+C hiding the loop control under the imul latency.
+C
+C The run time is 10 cycles, but decoding is only 9 (and the dependent chain
+C only 8). It's not clear how to get down to 9 cycles.
+C
+C The xor and rcl to handle the carry bit could be an sbb instead, with the
+C the carry bit add becoming a sub, but that doesn't save anything.
+
+L(top):
+ C eax (low product)
+ C ebx src end
+ C ecx carry bit, 0 or 1
+ C edx (high product, being carry limb)
+ C esi divisor
+ C edi inverse
+ C ebp counter, limbs, negative
+
+ mull %esi
+
+ movl (%ebx,%ebp,4), %eax
+ addl %ecx, %edx C apply carry bit to carry limb
+
+L(entry):
+ xorl %ecx, %ecx
+ subl %edx, %eax C apply carry limb
+
+ rcll %ecx
+
+ imull %edi, %eax
+
+ incl %ebp
+ jnz L(top)
+
+
+
+ popl %ebx
+ popl %ebp
+
+ mull %esi
+
+ popl %edi
+ popl %esi
+
+ leal (%ecx,%edx), %eax
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mul_1.asm
new file mode 100644
index 0000000..3ef7ec2
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/mul_1.asm
@@ -0,0 +1,292 @@
+dnl AMD K6 mpn_mul_1 -- mpn by limb multiply.
+
+dnl Copyright 1999, 2000, 2002, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12 5.5
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan) 4.87
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C AMD K6 6.25
+C AMD K7
+C AMD K8
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier, mp_limb_t carry);
+C
+C Multiply src,size by mult and store the result in dst,size.
+C Return the carry limb from the top of the result.
+C
+C mpn_mul_1c() accepts an initial carry for the calculation, it's added into
+C the low limb of the result.
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl minimum 5 because the unrolled code can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(mpn_mul_1c)
+ pushl %esi
+deflit(`FRAME',4)
+ movl PARAM_CARRY, %esi
+ jmp L(start_nc)
+EPILOGUE()
+
+
+PROLOGUE(mpn_mul_1)
+ push %esi
+deflit(`FRAME',4)
+ xorl %esi, %esi C initial carry
+
+L(start_nc):
+ mov PARAM_SIZE, %ecx
+ push %ebx
+FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ push %edi
+FRAME_pushl()
+
+ movl PARAM_DST, %edi
+ pushl %ebp
+FRAME_pushl()
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_MULTIPLIER, %ebp
+
+ jae L(unroll)
+
+
+ C code offset 0x22 here, close enough to aligned
+L(simple):
+ C eax scratch
+ C ebx src
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi dst
+ C ebp multiplier
+ C
+ C this loop 8 cycles/limb
+
+ movl (%ebx), %eax
+ addl $4, %ebx
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi)
+ addl $4, %edi
+
+ loop L(simple)
+
+
+ popl %ebp
+
+ popl %edi
+ popl %ebx
+
+ movl %esi, %eax
+ popl %esi
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C The code for each limb is 6 cycles, with instruction decoding being the
+C limiting factor. At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
+C cycles/limb in total.
+C
+C The secret ingredient to get 6.25 is to start the loop with the mul and
+C have the load/store pair at the end. Rotating the load/store to the top
+C is an 0.5 c/l slowdown. (Some address generation effect probably.)
+C
+C The whole unrolled loop fits nicely in exactly 80 bytes.
+
+
+ ALIGN(16) C already aligned to 16 here actually
+L(unroll):
+ movl (%ebx), %eax
+ leal -16(%ebx,%ecx,4), %ebx
+
+ leal -16(%edi,%ecx,4), %edi
+ subl $4, %ecx
+
+ negl %ecx
+
+
+ ALIGN(16) C one byte nop for this alignment
+L(top):
+ C eax scratch
+ C ebx &src[size-4]
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi &dst[size-4]
+ C ebp multiplier
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi,%ecx,4)
+ movl 4(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 4(%edi,%ecx,4)
+ movl 8(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 8(%edi,%ecx,4)
+ movl 12(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 12(%edi,%ecx,4)
+ movl 16(%ebx,%ecx,4), %eax
+
+
+ addl $4, %ecx
+ js L(top)
+
+
+
+ C eax next src limb
+ C ebx &src[size-4]
+ C ecx 0 to 3 representing respectively 4 to 1 further limbs
+ C edx
+ C esi carry
+ C edi &dst[size-4]
+
+ testb $2, %cl
+ jnz L(finish_not_two)
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi,%ecx,4)
+ movl 4(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 4(%edi,%ecx,4)
+ movl 8(%ebx,%ecx,4), %eax
+
+ addl $2, %ecx
+L(finish_not_two):
+
+
+ testb $1, %cl
+ jnz L(finish_not_one)
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 8(%edi)
+ movl 12(%ebx), %eax
+L(finish_not_one):
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ popl %ebp
+
+ adcl $0, %edx
+
+ movl %eax, 12(%edi)
+ popl %edi
+
+ popl %ebx
+ movl %edx, %eax
+
+ popl %esi
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mul_basecase.asm
new file mode 100644
index 0000000..7030001
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/mul_basecase.asm
@@ -0,0 +1,612 @@
+dnl AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
+
+dnl Copyright 1999-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
+C unrolling).
+
+
+
+dnl K6: UNROLL_COUNT cycles/product (approx)
+dnl 8 9.75
+dnl 16 9.3
+dnl 32 9.3
+dnl Maximum possible with the current code is 32.
+dnl
+dnl With 16 the inner unrolled loop fits exactly in a 256 byte block, which
+dnl might explain it's good performance.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+C
+C Calculate xp,xsize multiplied by yp,ysize, storing the result in
+C wp,xsize+ysize.
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() entry code only
+C once. The saving is about 10-20% on typical sizes coming from the
+C Karatsuba multiply code.
+C
+C Enhancements:
+C
+C The mul_1 loop is about 8.5 c/l, which is slower than mpn_mul_1 at 6.25
+C c/l. Could call mpn_mul_1 when ysize is big enough to make it worthwhile.
+C
+C The main unrolled addmul loop could be shared by mpn_addmul_1, using some
+C extra stack setups and maybe 2 or 3 wasted cycles at the end. Code saving
+C would be 256 bytes.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 8)
+',`
+deflit(UNROLL_THRESHOLD, 8)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_XSIZE, %ecx
+ movl PARAM_YP, %eax
+
+ movl PARAM_XP, %edx
+ movl (%eax), %eax C yp low limb
+
+ cmpl $2, %ecx
+ ja L(xsize_more_than_two_limbs)
+ je L(two_by_something)
+
+
+ C one limb by one limb
+
+ movl (%edx), %edx C xp low limb
+ movl PARAM_WP, %ecx
+
+ mull %edx
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+ decl PARAM_YSIZE
+ pushl %ebx
+deflit(`FRAME',4)
+
+ movl PARAM_WP, %ebx
+ pushl %esi
+deflit(`FRAME',8)
+
+ movl %eax, %ecx C yp low limb
+ movl (%edx), %eax C xp low limb
+
+ movl %edx, %esi C xp
+ jnz L(two_by_two)
+
+
+ C two limbs by one limb
+
+ mull %ecx
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+
+ movl %edx, %esi C carry
+
+ mull %ecx
+
+ addl %eax, %esi
+ movl %esi, 4(%ebx)
+
+ adcl $0, %edx
+
+ movl %edx, 8(%ebx)
+ popl %esi
+
+ popl %ebx
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(two_by_two):
+ C eax xp low limb
+ C ebx wp
+ C ecx yp low limb
+ C edx
+ C esi xp
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ mull %ecx C xp[0] * yp[0]
+
+ push %edi
+deflit(`FRAME',12)
+ movl %eax, (%ebx)
+
+ movl 4(%esi), %eax
+ movl %edx, %edi C carry, for wp[1]
+
+ mull %ecx C xp[1] * yp[0]
+
+ addl %eax, %edi
+ movl PARAM_YP, %ecx
+
+ adcl $0, %edx
+
+ movl %edi, 4(%ebx)
+ movl 4(%ecx), %ecx C yp[1]
+
+ movl 4(%esi), %eax C xp[1]
+ movl %edx, %edi C carry, for wp[2]
+
+ mull %ecx C xp[1] * yp[1]
+
+ addl %eax, %edi
+
+ adcl $0, %edx
+
+ movl (%esi), %eax C xp[0]
+ movl %edx, %esi C carry, for wp[3]
+
+ mull %ecx C xp[0] * yp[1]
+
+ addl %eax, 4(%ebx)
+ adcl %edx, %edi
+ adcl $0, %esi
+
+ movl %edi, 8(%ebx)
+ popl %edi
+
+ movl %esi, 12(%ebx)
+ popl %esi
+
+ popl %ebx
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(xsize_more_than_two_limbs):
+
+C The first limb of yp is processed with a simple mpn_mul_1 style loop
+C inline. Unrolling this doesn't seem worthwhile since it's only run once
+C (whereas the addmul below is run ysize-1 many times). A call to the
+C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
+C popping, and doesn't seem likely to be worthwhile on the typical 10-20
+C limb operations the Karatsuba code calls here with.
+
+ C eax yp[0]
+ C ebx
+ C ecx xsize
+ C edx xp
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',0)
+
+ pushl %edi defframe_pushl(SAVE_EDI)
+ pushl %ebp defframe_pushl(SAVE_EBP)
+
+ movl PARAM_WP, %edi
+ pushl %esi defframe_pushl(SAVE_ESI)
+
+ movl %eax, %ebp
+ pushl %ebx defframe_pushl(SAVE_EBX)
+
+ leal (%edx,%ecx,4), %ebx C xp end
+ xorl %esi, %esi
+
+ leal (%edi,%ecx,4), %edi C wp end of mul1
+ negl %ecx
+
+
+L(mul1):
+ C eax scratch
+ C ebx xp end
+ C ecx counter, negative
+ C edx scratch
+ C esi carry
+ C edi wp end of mul1
+ C ebp multiplier
+
+ movl (%ebx,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi,%ecx,4)
+ incl %ecx
+
+ jnz L(mul1)
+
+
+ movl PARAM_YSIZE, %edx
+ movl %esi, (%edi) C final carry
+
+ movl PARAM_XSIZE, %ecx
+ decl %edx
+
+ jnz L(ysize_more_than_one_limb)
+
+ popl %ebx
+ popl %esi
+ popl %ebp
+ popl %edi
+ ret
+
+
+L(ysize_more_than_one_limb):
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_YP, %eax
+
+ jae L(unroll)
+
+
+C -----------------------------------------------------------------------------
+C Simple addmul loop.
+C
+C Using ebx and edi pointing at the ends of their respective locations saves
+C a couple of instructions in the outer loop. The inner loop is still 11
+C cycles, the same as the simple loop in aorsmul_1.asm.
+
+ C eax yp
+ C ebx xp end
+ C ecx xsize
+ C edx ysize-1
+ C esi
+ C edi wp end of mul1
+ C ebp
+
+ movl 4(%eax), %ebp C multiplier
+ negl %ecx
+
+ movl %ecx, PARAM_XSIZE C -xsize
+ xorl %esi, %esi C initial carry
+
+ leal 4(%eax,%edx,4), %eax C yp end
+ negl %edx
+
+ movl %eax, PARAM_YP
+ movl %edx, PARAM_YSIZE
+
+ jmp L(simple_outer_entry)
+
+
+ C aligning here saves a couple of cycles
+ ALIGN(16)
+L(simple_outer_top):
+ C edx ysize counter, negative
+
+ movl PARAM_YP, %eax C yp end
+ xorl %esi, %esi C carry
+
+ movl PARAM_XSIZE, %ecx C -xsize
+ movl %edx, PARAM_YSIZE
+
+ movl (%eax,%edx,4), %ebp C yp limb multiplier
+L(simple_outer_entry):
+ addl $4, %edi
+
+
+L(simple_inner):
+ C eax scratch
+ C ebx xp end
+ C ecx counter, negative
+ C edx scratch
+ C esi carry
+ C edi wp end of this addmul
+ C ebp multiplier
+
+ movl (%ebx,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl $0, %edx
+ addl %eax, (%edi,%ecx,4)
+ adcl %edx, %esi
+
+ incl %ecx
+ jnz L(simple_inner)
+
+
+ movl PARAM_YSIZE, %edx
+ movl %esi, (%edi)
+
+ incl %edx
+ jnz L(simple_outer_top)
+
+
+ popl %ebx
+ popl %esi
+ popl %ebp
+ popl %edi
+ ret
+
+
+C -----------------------------------------------------------------------------
+C Unrolled loop.
+C
+C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
+C some comments.
+C
+C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
+C 0, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
+C is entered.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop. This can't just be fetched through the xp
+C pointer because of the offset applied to it.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
+C to give the starting point in the destination for each unrolled loop (this
+C point is one limb upwards for each limb of yp processed).
+C
+C Having PARAM_YSIZE count negative to zero means it's not necessary to
+C store new values of PARAM_YP and PARAM_WP on each loop. Those values on
+C the stack remain constant and on each loop an leal adjusts them with the
+C PARAM_YSIZE counter value.
+
+
+defframe(VAR_COUNTER, -20)
+defframe(VAR_COUNTER_INIT, -24)
+defframe(VAR_JMP, -28)
+defframe(VAR_XP_LOW, -32)
+deflit(VAR_STACK_SPACE, 16)
+
+dnl For some strange reason using (%esp) instead of 0(%esp) is a touch
+dnl slower in this code, hence the defframe empty-if-zero feature is
+dnl disabled.
+dnl
+dnl If VAR_COUNTER is at (%esp), the effect is worse. In this case the
+dnl unrolled loop is 255 instead of 256 bytes, but quite how this affects
+dnl anything isn't clear.
+dnl
+define(`defframe_empty_if_zero_disabled',1)
+
+L(unroll):
+ C eax yp (not used)
+ C ebx xp end (not used)
+ C ecx xsize
+ C edx ysize-1
+ C esi
+ C edi wp end of mul1 (not used)
+ C ebp
+deflit(`FRAME', 16)
+
+ leal -2(%ecx), %ebp C one limb processed at start,
+ decl %ecx C and ebp is one less
+
+ shrl $UNROLL_LOG2, %ebp
+ negl %ecx
+
+ subl $VAR_STACK_SPACE, %esp
+deflit(`FRAME', 16+VAR_STACK_SPACE)
+ andl $UNROLL_MASK, %ecx
+
+ movl %ecx, %esi
+ shll $4, %ecx
+
+ movl %ebp, VAR_COUNTER_INIT
+ negl %esi
+
+ C 15 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(unroll_here):
+',`
+ leal L(unroll_entry) (%ecx,%esi,1), %ecx
+')
+
+ movl PARAM_XP, %ebx
+ movl %ebp, VAR_COUNTER
+
+ movl PARAM_WP, %edi
+ movl %ecx, VAR_JMP
+
+ movl (%ebx), %eax
+ leal 4(%edi,%esi,4), %edi C wp adjust for unrolling and mul1
+
+ leal (%ebx,%esi,4), %ebx C xp adjust for unrolling
+
+ movl %eax, VAR_XP_LOW
+
+ movl %ebx, PARAM_XP
+ movl PARAM_YP, %ebx
+
+ leal (%edi,%edx,4), %ecx C wp adjust for ysize indexing
+ movl 4(%ebx), %ebp C multiplier (yp second limb)
+
+ leal 4(%ebx,%edx,4), %ebx C yp adjust for ysize indexing
+
+ movl %ecx, PARAM_WP
+
+ leal 1(%esi), %ecx C adjust parity for decl %ecx above
+
+ movl %ebx, PARAM_YP
+ negl %edx
+
+ movl %edx, PARAM_YSIZE
+ jmp L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ leal (%ecx,%esi,1), %ecx
+ addl $L(unroll_entry)-L(unroll_here), %ecx
+ addl (%esp), %ecx
+ ret_internal
+')
+
+
+C -----------------------------------------------------------------------------
+ C Aligning here saves a couple of cycles per loop. Using 32 doesn't
+ C cost any extra space, since the inner unrolled loop below is
+ C aligned to 32.
+ ALIGN(32)
+L(unroll_outer_top):
+ C edx ysize
+
+ movl PARAM_YP, %eax
+ movl %edx, PARAM_YSIZE C incremented ysize counter
+
+ movl PARAM_WP, %edi
+
+ movl VAR_COUNTER_INIT, %ebx
+ movl (%eax,%edx,4), %ebp C next multiplier
+
+ movl PARAM_XSIZE, %ecx
+ leal (%edi,%edx,4), %edi C adjust wp for where we are in yp
+
+ movl VAR_XP_LOW, %eax
+ movl %ebx, VAR_COUNTER
+
+L(unroll_outer_entry):
+ mull %ebp
+
+ C using testb is a tiny bit faster than testl
+ testb $1, %cl
+
+ movl %eax, %ecx C low carry
+ movl VAR_JMP, %eax
+
+ movl %edx, %esi C high carry
+ movl PARAM_XP, %ebx
+
+ jnz L(unroll_noswap)
+ movl %ecx, %esi C high,low carry other way around
+
+ movl %edx, %ecx
+L(unroll_noswap):
+
+ jmp *%eax
+
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(unroll_top):
+ C eax scratch
+ C ebx xp
+ C ecx carry low
+ C edx scratch
+ C esi carry high
+ C edi wp
+ C ebp multiplier
+ C VAR_COUNTER loop counter
+ C
+ C 15 code bytes each limb
+
+ leal UNROLL_BYTES(%edi), %edi
+
+L(unroll_entry):
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4))
+ deflit(`disp1', eval(disp0 + 4))
+ deflit(`disp2', eval(disp1 + 4))
+
+ movl disp1(%ebx), %eax
+ mull %ebp
+Zdisp( addl, %ecx, disp0,(%edi))
+ adcl %eax, %esi
+ movl %edx, %ecx
+ jadcl0( %ecx)
+
+ movl disp2(%ebx), %eax
+ mull %ebp
+ addl %esi, disp1(%edi)
+ adcl %eax, %ecx
+ movl %edx, %esi
+ jadcl0( %esi)
+')
+
+ decl VAR_COUNTER
+ leal UNROLL_BYTES(%ebx), %ebx
+
+ jns L(unroll_top)
+
+
+ movl PARAM_YSIZE, %edx
+ addl %ecx, UNROLL_BYTES(%edi)
+
+ adcl $0, %esi
+
+ incl %edx
+ movl %esi, UNROLL_BYTES+4(%edi)
+
+ jnz L(unroll_outer_top)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+
+ addl $FRAME, %esp
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/pre_mod_1.asm b/vendor/gmp-6.3.0/mpn/x86/k6/pre_mod_1.asm
new file mode 100644
index 0000000..34db20d
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/pre_mod_1.asm
@@ -0,0 +1,146 @@
+dnl AMD K6 mpn_preinv_mod_1 -- mpn by 1 remainder, with pre-inverted divisor.
+
+dnl Copyright 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 18.0 cycles/limb
+
+
+C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse);
+C
+C This code is only 2 c/l faster than a simple divl, but that's 10% so it's
+C considered worthwhile (just).
+
+defframe(PARAM_INVERSE,16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_preinv_mod_1)
+deflit(`FRAME',0)
+
+ ASSERT(ae,`cmpl $1, PARAM_SIZE')
+ ASSERT(nz,`testl $0x80000000, PARAM_DIVISOR')
+
+ movl PARAM_SIZE, %ecx
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_SRC, %ebp
+ pushl %edi FRAME_pushl()
+
+ movl PARAM_DIVISOR, %eax
+ pushl %esi FRAME_pushl()
+
+ movl -4(%ebp,%ecx,4), %esi C src high limb
+ pushl %ebx FRAME_pushl()
+
+ movl %edx, %edi C first n2 to cancel
+ subl %eax, %esi C first n1 = high-divisor
+
+ decl %ecx
+ jz L(done_sbbl)
+
+L(top):
+ C eax scratch
+ C ebx n10, nadj, q1
+ C ecx counter, size to 1
+ C edx scratch
+ C esi n2
+ C edi old high, for underflow test
+ C ebp src
+
+ sbbl %edx, %edi C high n-(q1+1)*d, 0 or -1
+
+L(entry):
+ andl PARAM_DIVISOR, %edi
+L(q1_ff_top):
+ movl -4(%ebp,%ecx,4), %ebx
+
+ addl %esi, %edi C possible addback
+ movl %ebx, %esi C n10
+
+ sarl $31, %ebx C -n1 = 0 or -1
+ movl %edi, %eax C n2
+
+ movl PARAM_INVERSE, %edx
+ subl %ebx, %eax C n2+n1
+
+ mull %edx C m*(n2+n1)
+
+ andl PARAM_DIVISOR, %ebx C -n1 & d
+ addl %esi, %ebx C nadj = n10 + (-n1&d), ignoring overflow
+
+ addl %ebx, %eax C low m*(n2+n1) + nadj, giving carry flag
+ leal 1(%edi), %ebx C n2+1
+
+ adcl %ebx, %edx C 1+high(n2<<32+m*(n2+n1)+nadj) = q1+1
+
+ movl PARAM_DIVISOR, %eax C d
+ jz L(q1_ff)
+
+ mull %edx C (q1+1)*d
+
+ subl %eax, %esi C low n-(q1+1)*d
+ loop L(top)
+
+
+
+L(done_sbbl):
+ sbbl %edx, %edi C high n-(q1+1)*d, 0 or -1
+
+ andl PARAM_DIVISOR, %edi
+L(done_esi_edi):
+ popl %ebx
+
+ leal (%esi,%edi), %eax
+ popl %esi
+
+ popl %edi
+ popl %ebp
+
+ ret
+
+
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d. This is rarely
+C reached.
+
+L(q1_ff):
+ movl PARAM_DIVISOR, %edi
+ loop L(q1_ff_top)
+
+ jmp L(done_esi_edi)
+
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/k6/sqr_basecase.asm
new file mode 100644
index 0000000..b7ecb5c
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k6/sqr_basecase.asm
@@ -0,0 +1,680 @@
+dnl AMD K6 mpn_sqr_basecase -- square an mpn number.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
+C product (measured on the speed difference between 17 and 33 limbs,
+C which is roughly the Karatsuba recursing range).
+
+
+dnl SQR_TOOM2_THRESHOLD_MAX is the maximum SQR_TOOM2_THRESHOLD this
+dnl code supports. This value is used only by the tune program to know
+dnl what it can go up to. (An attempt to compile with a bigger value will
+dnl trigger some m4_assert()s in the code, making the build fail.)
+dnl
+dnl The value is determined by requiring the displacements in the unrolled
+dnl addmul to fit in single bytes. This means a maximum UNROLL_COUNT of
+dnl 63, giving a maximum SQR_TOOM2_THRESHOLD of 66.
+
+deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
+
+
+dnl Allow a value from the tune program to override config.m4.
+
+ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
+`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
+
+
+dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul. The
+dnl number required is determined by SQR_TOOM2_THRESHOLD, since
+dnl mpn_sqr_basecase only needs to handle sizes < SQR_TOOM2_THRESHOLD.
+dnl
+dnl The first addmul is the biggest, and this takes the second least
+dnl significant limb and multiplies it by the third least significant and
+dnl up. Hence for a maximum operand size of SQR_TOOM2_THRESHOLD-1
+dnl limbs, UNROLL_COUNT needs to be SQR_TOOM2_THRESHOLD-3.
+
+m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
+deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the given size
+C is small.
+C
+C The code size might look a bit excessive, but not all of it is executed
+C and so won't fill up the code cache. The 1x1, 2x2 and 3x3 special cases
+C clearly apply only to those sizes; mid sizes like 10x10 only need part of
+C the unrolled addmul; and big sizes like 35x35 that do need all of it will
+C at least be getting value for money, because 35x35 spends something like
+C 5780 cycles here.
+C
+C Different values of UNROLL_COUNT give slightly different speeds, between
+C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
+C This isn't a big difference, but it's presumably some alignment effect
+C which if understood could give a simple speedup.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %eax
+
+ cmpl $2, %ecx
+ je L(two_limbs)
+
+ movl PARAM_DST, %edx
+ ja L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+
+ movl (%eax), %eax
+ movl %edx, %ecx
+
+ mull %eax
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+
+ pushl %ebx
+ movl %eax, %ebx C src
+deflit(`FRAME',4)
+
+ movl (%ebx), %eax
+ movl PARAM_DST, %ecx
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx)
+ movl 4(%ebx), %eax
+
+ movl %edx, 4(%ecx)
+
+ mull %eax C src[1]^2
+
+ movl %eax, 8(%ecx)
+ movl (%ebx), %eax
+
+ movl %edx, 12(%ecx)
+ movl 4(%ebx), %edx
+
+ mull %edx C src[0]*src[1]
+
+ addl %eax, 4(%ecx)
+
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+
+ popl %ebx
+ addl %eax, 4(%ecx)
+
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(three_or_more):
+deflit(`FRAME',0)
+ cmpl $4, %ecx
+ jae L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+ C eax src
+ C ecx size
+ C edx dst
+
+ pushl %ebx
+ movl %eax, %ebx C src
+
+ movl (%ebx), %eax
+ movl %edx, %ecx C dst
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl 4(%ebx), %eax
+
+ movl %edx, 4(%ecx)
+ pushl %esi
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl 8(%ebx), %eax
+
+ movl %edx, 12(%ecx)
+ pushl %edi
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl (%ebx), %eax
+
+ movl %edx, 20(%ecx)
+ movl 4(%ebx), %edx
+
+ mull %edx C src[0] * src[1]
+
+ movl %eax, %esi
+ movl (%ebx), %eax
+
+ movl %edx, %edi
+ movl 8(%ebx), %edx
+
+ pushl %ebp
+ xorl %ebp, %ebp
+
+ mull %edx C src[0] * src[2]
+
+ addl %eax, %edi
+ movl 4(%ebx), %eax
+
+ adcl %edx, %ebp
+
+ movl 8(%ebx), %edx
+
+ mull %edx C src[1] * src[2]
+
+ addl %eax, %ebp
+
+ adcl $0, %edx
+
+
+ C eax will be dst[5]
+ C ebx
+ C ecx dst
+ C edx dst[4]
+ C esi dst[1]
+ C edi dst[2]
+ C ebp dst[3]
+
+ xorl %eax, %eax
+ addl %esi, %esi
+ adcl %edi, %edi
+ adcl %ebp, %ebp
+ adcl %edx, %edx
+ adcl $0, %eax
+
+ addl %esi, 4(%ecx)
+ adcl %edi, 8(%ecx)
+ adcl %ebp, 12(%ecx)
+
+ popl %ebp
+ popl %edi
+
+ adcl %edx, 16(%ecx)
+
+ popl %esi
+ popl %ebx
+
+ adcl %eax, 20(%ecx)
+ ASSERT(nc)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+defframe(VAR_COUNTER,-20)
+defframe(VAR_JMP, -24)
+deflit(STACK_SPACE, 24)
+
+ ALIGN(16)
+L(four_or_more):
+
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+ C esi
+ C edi
+ C ebp
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+C
+C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
+C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
+C a 5780 cycle operation, which is not surprising since the loop here is 8
+C c/l and mpn_mul_1 is 6.25 c/l.
+
+ subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
+
+ movl %edi, SAVE_EDI
+ leal 4(%edx), %edi
+
+ movl %ebx, SAVE_EBX
+ leal 4(%eax), %ebx
+
+ movl %esi, SAVE_ESI
+ xorl %esi, %esi
+
+ movl %ebp, SAVE_EBP
+
+ C eax
+ C ebx src+4
+ C ecx size
+ C edx
+ C esi
+ C edi dst+4
+ C ebp
+
+ movl (%eax), %ebp C multiplier
+ leal -1(%ecx), %ecx C size-1, and pad to a 16 byte boundary
+
+
+ ALIGN(16)
+L(mul_1):
+ C eax scratch
+ C ebx src ptr
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi dst ptr
+ C ebp multiplier
+
+ movl (%ebx), %eax
+ addl $4, %ebx
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi)
+ addl $4, %edi
+
+ loop L(mul_1)
+
+
+C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two addmuls, which are the bottom right corner of the product
+C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as mpn_addmul_1(), see that routine for some
+C comments.
+C
+C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+C
+C K6 doesn't do any branch prediction on indirect jumps, which is good
+C actually because it's a different target each time. The unrolled addmul
+C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
+C the indirect jump is quickly recovered.
+
+
+dnl This value is also implicitly encoded in a shift and add.
+dnl
+deflit(CODE_BYTES_PER_LIMB, 15)
+
+dnl With the unmodified &src[size] and &dst[size] pointers, the
+dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
+dnl values up to 31. Above that an offset must be added to them.
+dnl
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>31),1,
+eval((UNROLL_COUNT-31)*4),
+0))
+
+ C eax
+ C ebx &src[size]
+ C ecx
+ C edx
+ C esi carry
+ C edi &dst[size]
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+ movl %esi, (%edi)
+
+ subl $4, %ecx
+ jz L(corner)
+
+ movl %ecx, %edx
+ifelse(OFFSET,0,,
+` subl $OFFSET, %ebx')
+
+ shll $4, %ecx
+ifelse(OFFSET,0,,
+` subl $OFFSET, %edi')
+
+ negl %ecx
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+ negl %edx
+
+
+ C The calculated jump mustn't be before the start of the available
+ C code. This is the limitation UNROLL_COUNT puts on the src operand
+ C size, but checked here using the jump address directly.
+ C
+ ASSERT(ae,`
+ movl_text_address( L(unroll_inner_start), %eax)
+ cmpl %eax, %ecx
+ ')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll_outer_top):
+ C eax
+ C ebx &src[size], constant
+ C ecx VAR_JMP
+ C edx VAR_COUNTER, limbs, negative
+ C esi high limb to store
+ C edi dst ptr, high of last addmul
+ C ebp
+
+ movl -12+OFFSET(%ebx,%edx,4), %ebp C multiplier
+ movl %edx, VAR_COUNTER
+
+ movl -8+OFFSET(%ebx,%edx,4), %eax C first limb of multiplicand
+
+ mull %ebp
+
+ testb $1, %cl
+
+ movl %edx, %esi C high carry
+ movl %ecx, %edx C jump
+
+ movl %eax, %ecx C low carry
+ leal CODE_BYTES_PER_LIMB(%edx), %edx
+
+ movl %edx, VAR_JMP
+ leal 4(%edi), %edi
+
+ C A branch-free version of this using some xors was found to be a
+ C touch slower than just a conditional jump, despite the jump
+ C switching between taken and not taken on every loop.
+
+ifelse(eval(UNROLL_COUNT%2),0,
+ jz,jnz) L(unroll_noswap)
+ movl %esi, %eax C high,low carry other way around
+
+ movl %ecx, %esi
+ movl %eax, %ecx
+L(unroll_noswap):
+
+ jmp *%edx
+
+
+ C Must be on an even address here so the low bit of the jump address
+ C will indicate which way around ecx/esi should start.
+ C
+ C An attempt was made at padding here to get the end of the unrolled
+ C code to come out on a good alignment, to save padding before
+ C L(corner). This worked, but turned out to run slower than just an
+ C ALIGN(2). The reason for this is not clear, it might be related
+ C to the different speeds on different UNROLL_COUNTs noted above.
+
+ ALIGN(2)
+
+L(unroll_inner_start):
+ C eax scratch
+ C ebx src
+ C ecx carry low
+ C edx scratch
+ C esi carry high
+ C edi dst
+ C ebp multiplier
+ C
+ C 15 code bytes each limb
+ C ecx/esi swapped on each chunk
+
+forloop(`i', UNROLL_COUNT, 1, `
+ deflit(`disp_src', eval(-i*4 + OFFSET))
+ deflit(`disp_dst', eval(disp_src - 4))
+
+ m4_assert(`disp_src>=-128 && disp_src<128')
+ m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp( movl, disp_src,(%ebx), %eax)
+ mull %ebp
+Zdisp( addl, %esi, disp_dst,(%edi))
+ adcl %eax, %ecx
+ movl %edx, %esi
+ jadcl0( %esi)
+',`
+ dnl this one comes out last
+Zdisp( movl, disp_src,(%ebx), %eax)
+ mull %ebp
+Zdisp( addl, %ecx, disp_dst,(%edi))
+ adcl %eax, %esi
+ movl %edx, %ecx
+ jadcl0( %ecx)
+')
+')
+L(unroll_inner_end):
+
+ addl %esi, -4+OFFSET(%edi)
+
+ movl VAR_COUNTER, %edx
+ jadcl0( %ecx)
+
+ movl %ecx, m4_empty_if_zero(OFFSET)(%edi)
+ movl VAR_JMP, %ecx
+
+ incl %edx
+ jnz L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+ addl $OFFSET, %ebx
+ addl $OFFSET, %edi
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(corner):
+ C ebx &src[size]
+ C edi &dst[2*size-5]
+
+ movl -12(%ebx), %ebp
+
+ movl -8(%ebx), %eax
+ movl %eax, %ecx
+
+ mull %ebp
+
+ addl %eax, -4(%edi)
+ adcl $0, %edx
+
+ movl -4(%ebx), %eax
+ movl %edx, %esi
+ movl %eax, %ebx
+
+ mull %ebp
+
+ addl %esi, %eax
+ adcl $0, %edx
+
+ addl %eax, (%edi)
+ adcl $0, %edx
+
+ movl %edx, %esi
+ movl %ebx, %eax
+
+ mull %ecx
+
+ addl %esi, %eax
+ movl %eax, 4(%edi)
+
+ adcl $0, %edx
+
+ movl %edx, 8(%edi)
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
+C The loop measures about 6 cycles/iteration, though it looks like it should
+C decode in 5.
+
+L(lshift_start):
+ movl PARAM_SIZE, %ecx
+
+ movl PARAM_DST, %edi
+ subl $1, %ecx C size-1 and clear carry
+
+ movl PARAM_SRC, %ebx
+ movl %ecx, %edx
+
+ xorl %eax, %eax C ready for adcl
+
+
+ ALIGN(16)
+L(lshift):
+ C eax
+ C ebx src (for later use)
+ C ecx counter, decrementing
+ C edx size-1 (for later use)
+ C esi
+ C edi dst, incrementing
+ C ebp
+
+ rcll 4(%edi)
+ rcll 8(%edi)
+ leal 8(%edi), %edi
+ loop L(lshift)
+
+
+ adcl %eax, %eax
+
+ movl %eax, 4(%edi) C dst most significant limb
+ movl (%ebx), %eax C src[0]
+
+ leal 4(%ebx,%edx,4), %ebx C &src[size]
+ subl %edx, %ecx C -(size-1)
+
+
+C -----------------------------------------------------------------------------
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+
+ mull %eax
+
+ movl %eax, (%edi,%ecx,8) C dst[0]
+
+
+ ALIGN(16)
+L(diag):
+ C eax scratch
+ C ebx &src[size]
+ C ecx counter, negative
+ C edx carry
+ C esi scratch
+ C edi dst[2*size-2]
+ C ebp
+
+ movl (%ebx,%ecx,4), %eax
+ movl %edx, %esi
+
+ mull %eax
+
+ addl %esi, 4(%edi,%ecx,8)
+ adcl %eax, 8(%edi,%ecx,8)
+ adcl $0, %edx
+
+ incl %ecx
+ jnz L(diag)
+
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+
+ addl %edx, 4(%edi) C dst most significant limb
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ addl (%esp), %ecx
+ addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
+ addl %edx, %ecx
+ ret_internal
+')
+
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/README b/vendor/gmp-6.3.0/mpn/x86/k7/README
new file mode 100644
index 0000000..5711b61
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/README
@@ -0,0 +1,174 @@
+Copyright 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+ AMD K7 MPN SUBROUTINES
+
+
+This directory contains code optimized for the AMD Athlon CPU.
+
+The mmx subdirectory has routines using MMX instructions. All Athlons have
+MMX, the separate directory is just so that configure can omit it if the
+assembler doesn't support MMX.
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache.
+
+ cycles/limb
+ mpn_add/sub_n 1.6
+
+ mpn_copyi 0.75 or 1.0 \ varying with data alignment
+ mpn_copyd 0.75 or 1.0 /
+
+ mpn_divrem_1 17.0 integer part, 15.0 fractional part
+ mpn_mod_1 17.0
+ mpn_divexact_by3 8.0
+
+ mpn_l/rshift 1.2
+
+ mpn_mul_1 3.4
+ mpn_addmul/submul_1 3.9
+
+ mpn_mul_basecase 4.42 cycles/crossproduct (approx)
+ mpn_sqr_basecase 2.3 cycles/crossproduct (approx)
+ or 4.55 cycles/triangleproduct (approx)
+
+Prefetching of sources hasn't yet been tried.
+
+
+
+NOTES
+
+cmov, MMX, 3DNow and some extensions to MMX and 3DNow are available.
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+
+Floating point multiplications can be done in parallel with integer
+multiplications, but there doesn't seem to be any way to make use of this.
+
+Unsigned "mul"s can be issued every 3 cycles. This suggests 3 is a limit on
+the speed of the multiplication routines. The documentation shows mul
+executing in IEU0 (or maybe in IEU0 and IEU1 together), so it might be that,
+to get near 3 cycles code has to be arranged so that nothing else is issued
+to IEU0. A busy IEU0 could explain why some code takes 4 cycles and other
+apparently equivalent code takes 5.
+
+
+
+OPTIMIZATIONS
+
+Unrolled loops are used to reduce looping overhead. The unrolling is
+configurable up to 32 limbs/loop for most routines and up to 64 for some.
+The K7 has 64k L1 code cache so quite big unrolling is allowable.
+
+Computed jumps into the unrolling are used to handle sizes not a multiple of
+the unrolling. An attractive feature of this is that times increase
+smoothly with operand size, but it may be that some routines should just
+have simple loops to finish up, especially when PIC adds between 2 and 16
+cycles to get %eip.
+
+Position independent code is implemented using a call to get %eip for the
+computed jumps and a ret is always done, rather than an addl $4,%esp or a
+popl, so the CPU return address branch prediction stack stays synchronised
+with the actual stack in memory.
+
+Branch prediction, in absence of any history, will guess forward jumps are
+not taken and backward jumps are taken. Where possible it's arranged that
+the less likely or less important case is under a taken forward jump.
+
+
+
+CODING
+
+Instructions in general code have been shown grouped if they can execute
+together, which means up to three direct-path instructions which have no
+successive dependencies. K7 always decodes three and has out-of-order
+execution, but the groupings show what slots might be available and what
+dependency chains exist.
+
+When there's vector-path instructions an effort is made to get triplets of
+direct-path instructions in between them, even if there's dependencies,
+since this maximizes decoding throughput and might save a cycle or two if
+decoding is the limiting factor.
+
+
+
+INSTRUCTIONS
+
+adcl direct
+divl 39 cycles back-to-back
+lodsl,etc vector
+loop 1 cycle vector (decl/jnz opens up one decode slot)
+movd reg vector
+movd mem direct
+mull issue every 3 cycles, latency 4 cycles low word, 6 cycles high word
+popl vector (use movl for more than one pop)
+pushl direct, will pair with a load
+shrdl %cl vector, 3 cycles, seems to be 3 decode too
+xorl r,r false read dependency recognised
+
+
+
+REFERENCES
+
+"AMD Athlon Processor X86 Code Optimization Guide", AMD publication number
+22007, revision K, February 2002. Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22007.pdf
+
+"3DNow Technology Manual", AMD publication number 21928G/0-March 2000.
+This describes the femms and prefetch instructions. Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21928.pdf
+
+"AMD Extensions to the 3DNow and MMX Instruction Sets Manual", AMD
+publication number 22466, revision D, March 2000. This describes
+instructions added in the Athlon processor, such as pswapd and the extra
+prefetch forms. Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22466.pdf
+
+"3DNow Instruction Porting Guide", AMD publication number 22621, revision B,
+August 1999. This has some notes on general Athlon optimizations as well as
+3DNow. Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22621.pdf
+
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/addlsh1_n.asm b/vendor/gmp-6.3.0/mpn/x86/k7/addlsh1_n.asm
new file mode 100644
index 0000000..2cba1eb
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/addlsh1_n.asm
@@ -0,0 +1,196 @@
+dnl AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns.
+C The innerloop is 2*3-way unrolled, which is best we can do with the available
+C registers. It seems tricky to use the same structure for rsblsh1_n, since we
+C cannot feed carry between operations there.
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan) 5.4 (worse than add_n + lshift)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 6
+C AMD K6 ?
+C AMD K7 2.5
+C AMD K8
+
+C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
+C processors. It uses 2*3-way unrolling, for good reasons. Unfortunately,
+C that means we need an initial magic multiply.
+C
+C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern. We
+C cannot do rsblsh1_n since we feed carry from the shift blocks to the
+C add/subtract blocks, which is right for addition but reversed for
+C subtraction. We could perhaps do sublsh1_n, with some extra move insns,
+C without losing any time, since we're not issue limited but carry recurrency
+C latency.
+C
+C Breaking carry recurrency might be a good idea. We would then need separate
+C registers for the shift carry and add/subtract carry, which in turn would
+C force us to 2*2-way unrolling.
+
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_DBLD, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(VAR_COUNT,`PARAM_DST')
+define(VAR_TMP,`PARAM_DBLD')
+
+ASM_START()
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_addlsh1_n)
+deflit(`FRAME',0)
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebp')
+
+ mov $0x2aaaaaab, %eax
+
+ push %ebx FRAME_pushl()
+ mov PARAM_SIZE, %ebx C size
+
+ push rp FRAME_pushl()
+ mov PARAM_DST, rp
+
+ mul %ebx
+
+ push up FRAME_pushl()
+ mov PARAM_SRC, up
+
+ not %edx C count = -(size\8)-1
+ mov %edx, VAR_COUNT
+
+ push vp FRAME_pushl()
+ mov PARAM_DBLD, vp
+
+ lea 3(%edx,%edx,2), %ecx C count*3+3 = -(size\6)*3
+ xor %edx, %edx
+ lea (%ebx,%ecx,2), %ebx C size + (count*3+3)*2 = size % 6
+ or %ebx, %ebx
+ jz L(exact)
+
+L(oop):
+ifdef(`CPU_P6',`
+ shr %edx ') C restore 2nd saved carry bit
+ mov (vp), %eax
+ adc %eax, %eax
+ rcr %edx C restore 1st saved carry bit
+ lea 4(vp), vp
+ adc (up), %eax
+ lea 4(up), up
+ adc %edx, %edx C save a carry bit in edx
+ifdef(`CPU_P6',`
+ adc %edx, %edx ') C save another carry bit in edx
+ dec %ebx
+ mov %eax, (rp)
+ lea 4(rp), rp
+ jnz L(oop)
+ mov vp, VAR_TMP
+L(exact):
+ incl VAR_COUNT
+ jz L(end)
+
+ ALIGN(16)
+L(top):
+ifdef(`CPU_P6',`
+ shr %edx ') C restore 2nd saved carry bit
+ mov (vp), %eax
+ adc %eax, %eax
+ mov 4(vp), %ebx
+ adc %ebx, %ebx
+ mov 8(vp), %ecx
+ adc %ecx, %ecx
+
+ rcr %edx C restore 1st saved carry bit
+
+ adc (up), %eax
+ mov %eax, (rp)
+ adc 4(up), %ebx
+ mov %ebx, 4(rp)
+ adc 8(up), %ecx
+ mov %ecx, 8(rp)
+
+ mov 12(vp), %eax
+ adc %eax, %eax
+ mov 16(vp), %ebx
+ adc %ebx, %ebx
+ mov 20(vp), %ecx
+ adc %ecx, %ecx
+
+ lea 24(vp), vp
+ adc %edx, %edx C save a carry bit in edx
+
+ adc 12(up), %eax
+ mov %eax, 12(rp)
+ adc 16(up), %ebx
+ mov %ebx, 16(rp)
+ adc 20(up), %ecx
+
+ lea 24(up), up
+
+ifdef(`CPU_P6',`
+ adc %edx, %edx ') C save another carry bit in edx
+ mov %ecx, 20(rp)
+ incl VAR_COUNT
+ lea 24(rp), rp
+ jne L(top)
+
+L(end):
+ pop vp FRAME_popl()
+ pop up FRAME_popl()
+
+ifdef(`CPU_P6',`
+ xor %eax, %eax
+ shr $1, %edx
+ adc %edx, %eax
+',`
+ adc $0, %edx
+ mov %edx, %eax
+')
+ pop rp FRAME_popl()
+ pop %ebx FRAME_popl()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/aors_n.asm b/vendor/gmp-6.3.0/mpn/x86/k7/aors_n.asm
new file mode 100644
index 0000000..1a08072
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/aors_n.asm
@@ -0,0 +1,258 @@
+dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
+
+dnl Copyright 1999-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 1.64 cycles/limb (at 16 limbs/loop).
+
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 8 1.9
+dnl 16 1.64
+dnl 32 1.7
+dnl 64 2.0
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_add_n', `
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+ define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+ define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size. The return value is the carry bit from the top of the result (1
+C or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation. Note values other than 1 or 0 here will lead to garbage
+C results.
+C
+C This code runs at 1.64 cycles/limb, which might be the best possible with
+C plain integer operations. Each limb is 2 loads and 1 store, any 2 of
+C which can be done each cycle, leading to 1.5 c/l.
+
+dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 8)
+',`
+deflit(UNROLL_THRESHOLD, 8)
+')
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBP, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+defframe(SAVE_EDI, -16)
+deflit(STACK_SPACE, 16)
+
+ TEXT
+ ALIGN(32)
+deflit(`FRAME',0)
+
+PROLOGUE(M4_function_nc)
+ movl PARAM_CARRY, %eax
+ jmp L(start)
+EPILOGUE()
+
+PROLOGUE(M4_function_n)
+
+ xorl %eax, %eax C carry
+L(start):
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %edi, SAVE_EDI
+ movl %ebx, SAVE_EBX
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_SRC2, %edx
+ movl PARAM_SRC1, %ebx
+ jae L(unroll)
+
+ movl PARAM_DST, %edi
+ leal (%ebx,%ecx,4), %ebx
+ leal (%edx,%ecx,4), %edx
+
+ leal (%edi,%ecx,4), %edi
+ negl %ecx
+ shrl %eax
+
+ C This loop in in a single 16 byte code block already, so no
+ C alignment necessary.
+L(simple):
+ C eax scratch
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+
+ movl (%ebx,%ecx,4), %eax
+ M4_inst (%edx,%ecx,4), %eax
+ movl %eax, (%edi,%ecx,4)
+ incl %ecx
+ jnz L(simple)
+
+ movl $0, %eax
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBX, %ebx
+ setc %al
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ C This is at 0x55, close enough to aligned.
+L(unroll):
+deflit(`FRAME',STACK_SPACE)
+ movl %ebp, SAVE_EBP
+ andl $-2, %ecx C size low bit masked out
+ andl $1, PARAM_SIZE C size low bit kept
+
+ movl %ecx, %edi
+ decl %ecx
+ movl PARAM_DST, %ebp
+
+ shrl $UNROLL_LOG2, %ecx
+ negl %edi
+ movl %esi, SAVE_ESI
+
+ andl $UNROLL_MASK, %edi
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%edi,%edi,8), %esi C 9 bytes per
+')
+ negl %edi
+ shrl %eax
+
+ leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
+ leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
+ leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
+
+ jmp *%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ leal (%edi,%edi,8), %esi
+ addl $L(entry)-L(here), %esi
+ addl (%esp), %esi
+ ret_internal
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(top):
+ C eax zero
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi scratch (was computed jump)
+ C edi dst
+ C ebp scratch
+
+ leal UNROLL_BYTES(%edx), %edx
+
+L(entry):
+deflit(CHUNK_COUNT, 2)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%ebx), %esi)
+ movl disp1(%ebx), %ebp
+Zdisp( M4_inst,disp0,(%edx), %esi)
+Zdisp( movl, %esi, disp0,(%edi))
+ M4_inst disp1(%edx), %ebp
+ movl %ebp, disp1(%edi)
+')
+
+ decl %ecx
+ leal UNROLL_BYTES(%ebx), %ebx
+ leal UNROLL_BYTES(%edi), %edi
+ jns L(top)
+
+
+ mov PARAM_SIZE, %esi
+ movl SAVE_EBP, %ebp
+ movl $0, %eax
+
+ decl %esi
+ js L(even)
+
+ movl (%ebx), %ecx
+ M4_inst UNROLL_BYTES(%edx), %ecx
+ movl %ecx, (%edi)
+L(even):
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+ setc %al
+
+ movl SAVE_ESI, %esi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/aorsmul_1.asm b/vendor/gmp-6.3.0/mpn/x86/k7/aorsmul_1.asm
new file mode 100644
index 0000000..eec8df6
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/aorsmul_1.asm
@@ -0,0 +1,167 @@
+dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+
+dnl Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias) 6.5
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C AMD K6
+C AMD K7 3.75
+C AMD K8
+
+C TODO
+C * Improve feed-in and wind-down code. We beat the old code for all n != 1,
+C but lose by 2x for n == 1.
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`func', `mpn_submul_1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ add $-16, %esp
+ mov %ebp, (%esp)
+ mov %ebx, 4(%esp)
+ mov %esi, 8(%esp)
+ mov %edi, 12(%esp)
+
+ mov 20(%esp), %edi
+ mov 24(%esp), %esi
+ mov 28(%esp), %eax
+ mov 32(%esp), %ecx
+ mov %eax, %ebx
+ shr $2, %eax
+ mov %eax, 28(%esp)
+ mov (%esi), %eax
+ and $3, %ebx
+ jz L(b0)
+ cmp $2, %ebx
+ jz L(b2)
+ jg L(b3)
+
+L(b1): lea -4(%esi), %esi
+ lea -4(%edi), %edi
+ mul %ecx
+ mov %eax, %ebx
+ mov %edx, %ebp
+ cmpl $0, 28(%esp)
+ jz L(cj1)
+ mov 8(%esi), %eax
+ jmp L(1)
+
+L(b2): mul %ecx
+ mov %eax, %ebp
+ mov 4(%esi), %eax
+ mov %edx, %ebx
+ cmpl $0, 28(%esp)
+ jne L(2)
+ jmp L(cj2)
+
+L(b3): lea -12(%esi), %esi
+ lea -12(%edi), %edi
+ mul %ecx
+ mov %eax, %ebx
+ mov %edx, %ebp
+ mov 16(%esi), %eax
+ incl 28(%esp)
+ jmp L(3)
+
+L(b0): lea -8(%esi), %esi
+ lea -8(%edi), %edi
+ mul %ecx
+ mov %eax, %ebp
+ mov 12(%esi), %eax
+ mov %edx, %ebx
+ jmp L(0)
+
+ ALIGN(16)
+L(top): lea 16(%edi), %edi
+L(2): mul %ecx
+ ADDSUB %ebp, 0(%edi)
+ mov $0, %ebp
+ adc %eax, %ebx
+ mov 8(%esi), %eax
+ adc %edx, %ebp
+L(1): mul %ecx
+ ADDSUB %ebx, 4(%edi)
+ mov $0, %ebx
+ adc %eax, %ebp
+ mov 12(%esi), %eax
+ adc %edx, %ebx
+L(0): mul %ecx
+ ADDSUB %ebp, 8(%edi)
+ mov $0, %ebp
+ adc %eax, %ebx
+ adc %edx, %ebp
+ mov 16(%esi), %eax
+L(3): mul %ecx
+ ADDSUB %ebx, 12(%edi)
+ adc %eax, %ebp
+ mov 20(%esi), %eax
+ lea 16(%esi), %esi
+ mov $0, %ebx
+ adc %edx, %ebx
+ decl 28(%esp)
+ jnz L(top)
+
+L(end): lea 16(%edi), %edi
+L(cj2): mul %ecx
+ ADDSUB %ebp, (%edi)
+ adc %eax, %ebx
+ adc $0, %edx
+L(cj1): ADDSUB %ebx, 4(%edi)
+ adc $0, %edx
+ mov %edx, %eax
+ mov (%esp), %ebp
+ mov 4(%esp), %ebx
+ mov 8(%esp), %esi
+ mov 12(%esp), %edi
+ add $16, %esp
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/x86/k7/bdiv_q_1.asm
new file mode 100644
index 0000000..2af7bb9
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/bdiv_q_1.asm
@@ -0,0 +1,245 @@
+dnl AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division.
+
+dnl Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato.
+
+dnl Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C Athlon: 11.0
+C Hammer: 9.0
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+C The dependent chain is mul+imul+sub for 11 cycles and that speed is
+C achieved with no special effort. The load and shrld latencies are hidden
+C by out of order execution.
+C
+C It's a touch faster on size==1 to use the mul-by-inverse than divl.
+
+defframe(PARAM_SHIFT, 24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+defframe(VAR_INVERSE, -20)
+defframe(VAR_DST_END, -24)
+
+deflit(STACK_SPACE, 24)
+
+ TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse, int shift)
+ ALIGN(16)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+ subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
+ movl PARAM_SHIFT, %ecx C shift count
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_SIZE, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebx, SAVE_EBX
+
+ leal (%esi,%ebp,4), %esi C src end
+ leal (%edi,%ebp,4), %edi C dst end
+ negl %ebp C -size
+
+ movl PARAM_INVERSE, %eax C inv
+
+L(common):
+ movl %eax, VAR_INVERSE
+ movl (%esi,%ebp,4), %eax C src[0]
+
+ incl %ebp
+ jz L(one)
+
+ movl (%esi,%ebp,4), %edx C src[1]
+
+ shrdl( %cl, %edx, %eax)
+
+ movl %edi, VAR_DST_END
+ xorl %ebx, %ebx
+ jmp L(entry)
+
+ ALIGN(8)
+L(top):
+ C eax q
+ C ebx carry bit, 0 or 1
+ C ecx shift
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp counter, limbs, negative
+
+ mull PARAM_DIVISOR C carry limb in edx
+
+ movl -4(%esi,%ebp,4), %eax
+ movl (%esi,%ebp,4), %edi
+
+ shrdl( %cl, %edi, %eax)
+
+ subl %ebx, %eax C apply carry bit
+ setc %bl
+ movl VAR_DST_END, %edi
+
+ subl %edx, %eax C apply carry limb
+ adcl $0, %ebx
+
+L(entry):
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi,%ebp,4)
+ incl %ebp
+ jnz L(top)
+
+
+ mull PARAM_DIVISOR C carry limb in edx
+
+ movl -4(%esi), %eax C src high limb
+ shrl %cl, %eax
+ movl SAVE_ESI, %esi
+
+ subl %ebx, %eax C apply carry bit
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+
+ subl %edx, %eax C apply carry limb
+
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi)
+ movl SAVE_EDI, %edi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+L(one):
+ shrl %cl, %eax
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+
+ imull VAR_INVERSE, %eax
+
+ movl SAVE_EBP, %ebp
+
+ movl %eax, -4(%edi)
+ movl SAVE_EDI, %edi
+ addl $STACK_SPACE, %esp
+
+ ret
+EPILOGUE()
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+
+ ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
+ movl $-1, %ecx C shift count
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_SIZE, %ebp
+
+ movl %esi, SAVE_ESI
+ movl %edi, SAVE_EDI
+
+ C If there's usually only one or two trailing zero bits then this
+ C should be faster than bsfl.
+L(strip_twos):
+ incl %ecx
+ shrl %eax
+ jnc L(strip_twos)
+
+ movl %ebx, SAVE_EBX
+ leal 1(%eax,%eax), %ebx C d without twos
+ andl $127, %eax C d/2, 7 bits
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %edx)
+ movzbl (%eax,%edx), %eax C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ leal (%eax,%eax), %edx C 2*inv
+ movl %ebx, PARAM_DIVISOR C d without twos
+
+ imull %eax, %eax C inv*inv
+
+ movl PARAM_SRC, %esi
+ movl PARAM_DST, %edi
+
+ imull %ebx, %eax C inv*inv*d
+
+ subl %eax, %edx C inv = 2*inv - inv*inv*d
+ leal (%edx,%edx), %eax C 2*inv
+
+ imull %edx, %edx C inv*inv
+
+ leal (%esi,%ebp,4), %esi C src end
+ leal (%edi,%ebp,4), %edi C dst end
+ negl %ebp C -size
+
+ imull %ebx, %edx C inv*inv*d
+
+ subl %edx, %eax C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ imull PARAM_DIVISOR, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ jmp L(common)
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/dive_1.asm b/vendor/gmp-6.3.0/mpn/x86/k7/dive_1.asm
new file mode 100644
index 0000000..458bd02
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/dive_1.asm
@@ -0,0 +1,208 @@
+dnl AMD K7 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C Athlon: 11.0
+C Hammer: 9.0
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+C The dependent chain is mul+imul+sub for 11 cycles and that speed is
+C achieved with no special effort. The load and shrld latencies are hidden
+C by out of order execution.
+C
+C It's a touch faster on size==1 to use the mul-by-inverse than divl.
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+defframe(VAR_INVERSE, -20)
+defframe(VAR_DST_END, -24)
+
+deflit(STACK_SPACE, 24)
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
+ movl $-1, %ecx C shift count
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_SIZE, %ebp
+
+ movl %esi, SAVE_ESI
+ movl %edi, SAVE_EDI
+
+ C If there's usually only one or two trailing zero bits then this
+ C should be faster than bsfl.
+L(strip_twos):
+ incl %ecx
+ shrl %eax
+ jnc L(strip_twos)
+
+ movl %ebx, SAVE_EBX
+ leal 1(%eax,%eax), %ebx C d without twos
+ andl $127, %eax C d/2, 7 bits
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %edx)
+ movzbl (%eax,%edx), %eax C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ leal (%eax,%eax), %edx C 2*inv
+ movl %ebx, PARAM_DIVISOR C d without twos
+
+ imull %eax, %eax C inv*inv
+
+ movl PARAM_SRC, %esi
+ movl PARAM_DST, %edi
+
+ imull %ebx, %eax C inv*inv*d
+
+ subl %eax, %edx C inv = 2*inv - inv*inv*d
+ leal (%edx,%edx), %eax C 2*inv
+
+ imull %edx, %edx C inv*inv
+
+ leal (%esi,%ebp,4), %esi C src end
+ leal (%edi,%ebp,4), %edi C dst end
+ negl %ebp C -size
+
+ imull %ebx, %edx C inv*inv*d
+
+ subl %edx, %eax C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ imull PARAM_DIVISOR, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ movl %eax, VAR_INVERSE
+ movl (%esi,%ebp,4), %eax C src[0]
+
+ incl %ebp
+ jz L(one)
+
+ movl (%esi,%ebp,4), %edx C src[1]
+
+ shrdl( %cl, %edx, %eax)
+
+ movl %edi, VAR_DST_END
+ xorl %ebx, %ebx
+ jmp L(entry)
+
+ ALIGN(8)
+L(top):
+ C eax q
+ C ebx carry bit, 0 or 1
+ C ecx shift
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp counter, limbs, negative
+
+ mull PARAM_DIVISOR C carry limb in edx
+
+ movl -4(%esi,%ebp,4), %eax
+ movl (%esi,%ebp,4), %edi
+
+ shrdl( %cl, %edi, %eax)
+
+ subl %ebx, %eax C apply carry bit
+ setc %bl
+ movl VAR_DST_END, %edi
+
+ subl %edx, %eax C apply carry limb
+ adcl $0, %ebx
+
+L(entry):
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi,%ebp,4)
+ incl %ebp
+ jnz L(top)
+
+
+ mull PARAM_DIVISOR C carry limb in edx
+
+ movl -4(%esi), %eax C src high limb
+ shrl %cl, %eax
+ movl SAVE_ESI, %esi
+
+ subl %ebx, %eax C apply carry bit
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+
+ subl %edx, %eax C apply carry limb
+
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi)
+ movl SAVE_EDI, %edi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+L(one):
+ shrl %cl, %eax
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+
+ imull VAR_INVERSE, %eax
+
+ movl SAVE_EBP, %ebp
+ movl %eax, -4(%edi)
+
+ movl SAVE_EDI, %edi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/gcd_11.asm b/vendor/gmp-6.3.0/mpn/x86/k7/gcd_11.asm
new file mode 100644
index 0000000..2648dfd
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/gcd_11.asm
@@ -0,0 +1,107 @@
+dnl x86 mpn_gcd_11 optimised for AMD K7.
+
+dnl Contributed to the GNU project by by Kevin Ryde. Rehacked by Torbjorn
+dnl Granlund.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2014, 2015 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit (approx)
+C AMD K7 5.31
+C AMD K8,K9 5.33
+C AMD K10 5.30
+C AMD bd1 ?
+C AMD bobcat 7.02
+C Intel P4-2 10.1
+C Intel P4-3/4 10.0
+C Intel P6/13 5.88
+C Intel core2 6.26
+C Intel NHM 6.83
+C Intel SBR 8.50
+C Intel atom 8.90
+C VIA nano ?
+C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
+
+
+C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 6)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+DEF_OBJECT(ctz_table,64)
+ .byte MAXSHIFT
+forloop(i,1,MASK,
+` .byte m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
+
+
+define(`u0', `%eax')
+define(`v0', `%edx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_gcd_11)
+ push %edi
+ push %esi
+
+ mov 12(%esp), %eax
+ mov 16(%esp), %edx
+
+ LEAL( ctz_table, %esi)
+ jmp L(odd)
+
+ ALIGN(16) C
+L(top): cmovc( %ecx, %eax) C u = |v - u|
+ cmovc( %edi, %edx) C v = min(u,v)
+L(mid): and $MASK, %ecx C
+ movzbl (%esi,%ecx), %ecx C
+ jz L(shift_alot) C
+ shr %cl, %eax C
+L(odd): mov %eax, %edi C
+ mov %edx, %ecx C
+ sub %eax, %ecx C
+ sub %edx, %eax C
+ jnz L(top) C
+
+L(end): mov %edx, %eax
+ pop %esi
+ pop %edi
+ ret
+
+L(shift_alot):
+ shr $MAXSHIFT, %eax
+ mov %eax, %ecx
+ jmp L(mid)
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/k7/gmp-mparam.h
new file mode 100644
index 0000000..a09507d
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/gmp-mparam.h
@@ -0,0 +1,263 @@
+/* AMD K7 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2083 MHz K7 Barton */
+/* FFT tuning limit = 49,770,069 */
+/* Generated by tuneup.c, 2019-11-09, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 24
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13
+#define USE_PREINV_DIVREM_1 1 /* native */
+/* From mati.gmplib.org, 2023-07-21 */
+#define DIV_QR_1N_PI1_METHOD 3 /* 9.52% faster than 1 */
+#define DIV_QR_1_NORM_THRESHOLD 4
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 26
+
+#define DIV_1_VS_MUL_1_PERCENT 182
+
+#define MUL_TOOM22_THRESHOLD 28
+#define MUL_TOOM33_THRESHOLD 85
+#define MUL_TOOM44_THRESHOLD 154
+#define MUL_TOOM6H_THRESHOLD 208
+#define MUL_TOOM8H_THRESHOLD 309
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 102
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 121
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 50
+#define SQR_TOOM3_THRESHOLD 86
+#define SQR_TOOM4_THRESHOLD 220
+#define SQR_TOOM6_THRESHOLD 270
+#define SQR_TOOM8_THRESHOLD 446
+
+#define MULMID_TOOM42_THRESHOLD 50
+
+#define MULMOD_BNM1_THRESHOLD 18
+#define SQRMOD_BNM1_THRESHOLD 19
+
+#define MUL_FFT_MODF_THRESHOLD 606 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 606, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 15, 5}, { 31, 6}, { 28, 7}, { 15, 6}, \
+ { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 49, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \
+ { 63, 7}, { 127, 8}, { 71, 9}, { 39, 6}, \
+ { 319, 9}, { 47, 8}, { 99, 6}, { 399, 9}, \
+ { 55,10}, { 31, 9}, { 63, 8}, { 127, 9}, \
+ { 79,10}, { 47, 9}, { 95, 8}, { 191, 4}, \
+ { 3135, 5}, { 1599, 4}, { 3455, 6}, { 959, 8}, \
+ { 247,10}, { 79, 9}, { 167,10}, { 95, 9}, \
+ { 199,10}, { 111,11}, { 63,10}, { 127, 9}, \
+ { 255,10}, { 143, 9}, { 287, 8}, { 575,10}, \
+ { 159, 9}, { 319, 8}, { 639, 7}, { 1279,11}, \
+ { 95,10}, { 191, 9}, { 383, 8}, { 799,10}, \
+ { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511, 8}, { 1023,10}, { 271, 9}, { 543, 8}, \
+ { 1087, 9}, { 575,11}, { 159, 9}, { 639,10}, \
+ { 335, 9}, { 671, 8}, { 1343,10}, { 351, 9}, \
+ { 703,11}, { 191,10}, { 383, 9}, { 799, 8}, \
+ { 1599,11}, { 223,10}, { 447,12}, { 127,11}, \
+ { 255,10}, { 511, 9}, { 1023,10}, { 543, 9}, \
+ { 1087,10}, { 575, 9}, { 1151,10}, { 607, 9}, \
+ { 1215,11}, { 319,10}, { 639, 9}, { 1343,10}, \
+ { 703, 9}, { 1407,12}, { 191,11}, { 383,10}, \
+ { 767, 9}, { 1535,10}, { 799, 9}, { 1599,10}, \
+ { 831, 9}, { 1727, 8}, { 3455,11}, { 447,13}, \
+ { 127,12}, { 255,11}, { 511,10}, { 1023, 9}, \
+ { 2047,11}, { 543,10}, { 1087,11}, { 575,10}, \
+ { 1151, 9}, { 2303,11}, { 607,10}, { 1215,12}, \
+ { 319,11}, { 639,10}, { 1279,11}, { 671,10}, \
+ { 1343,11}, { 703,10}, { 1407,11}, { 735,10}, \
+ { 1471, 9}, { 2943,12}, { 383,11}, { 767,10}, \
+ { 1535,11}, { 799,10}, { 1599,11}, { 831,10}, \
+ { 1663,11}, { 863,10}, { 1727,12}, { 447,11}, \
+ { 895,10}, { 1791,11}, { 959,10}, { 1919,13}, \
+ { 255,12}, { 511,11}, { 1023,10}, { 2111,11}, \
+ { 1087,10}, { 2175,12}, { 575,11}, { 1151,10}, \
+ { 2303,11}, { 1215,10}, { 2431,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1407,10}, { 2815,11}, \
+ { 1471,10}, { 2943,13}, { 383,12}, { 767,11}, \
+ { 1599,12}, { 831,11}, { 1663,10}, { 3327,11}, \
+ { 1727,10}, { 3455,12}, { 895,11}, { 1855,12}, \
+ { 959,11}, { 1919,10}, { 3839,14}, { 255,13}, \
+ { 511,12}, { 1023,11}, { 2111,12}, { 1087,11}, \
+ { 2239,12}, { 1151,11}, { 2303,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1343,11}, { 2687,12}, \
+ { 1407,11}, { 2815,12}, { 1471,11}, { 2943,13}, \
+ { 767,12}, { 1663,11}, { 3327,12}, { 1727,11}, \
+ { 3455,13}, { 895,12}, { 1919,11}, { 3839,12}, \
+ { 1983,14}, { 511,13}, { 1023,12}, { 2239,13}, \
+ { 1151,12}, { 2495,13}, { 1279,12}, { 2687,13}, \
+ { 1407,12}, { 2943,14}, { 767,13}, { 1535,12}, \
+ { 3135,13}, { 1663,12}, { 3455,13}, { 1791,12}, \
+ { 3583,13}, { 1919,12}, { 3967,15}, { 511,14}, \
+ { 1023,13}, { 2047,12}, { 4095,13}, { 2175,12}, \
+ { 4479,13}, { 2431,12}, { 4863,14}, { 1279,13}, \
+ { 2559,12}, { 5119,13}, { 2943,12}, { 5887,14}, \
+ { 1535,13}, { 3455,14}, { 1791,13}, { 3967,15}, \
+ { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \
+ { 4991,14}, { 2559,13}, { 5119,14}, { 2815,13}, \
+ { 5887,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 254
+#define MUL_FFT_THRESHOLD 7552
+
+#define SQR_FFT_MODF_THRESHOLD 492 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 492, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
+ { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \
+ { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
+ { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \
+ { 31,10}, { 63, 9}, { 135, 8}, { 271, 9}, \
+ { 143,10}, { 79, 9}, { 167,10}, { 95, 9}, \
+ { 191, 8}, { 383,10}, { 111,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \
+ { 303,10}, { 159, 9}, { 319, 8}, { 639,11}, \
+ { 95,10}, { 191, 9}, { 383, 8}, { 767, 9}, \
+ { 399,10}, { 207,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \
+ { 1087,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \
+ { 671, 8}, { 1343, 9}, { 703,11}, { 191,10}, \
+ { 383, 9}, { 767, 8}, { 1535,10}, { 399, 9}, \
+ { 799, 8}, { 1599, 9}, { 863,11}, { 223,10}, \
+ { 447,12}, { 127,11}, { 255,10}, { 511, 9}, \
+ { 1087,10}, { 575, 9}, { 1215,10}, { 639, 9}, \
+ { 1279,10}, { 671, 9}, { 1343,11}, { 351,10}, \
+ { 703, 9}, { 1407,10}, { 735, 9}, { 1471,12}, \
+ { 191,11}, { 383,10}, { 767, 9}, { 1535,10}, \
+ { 799, 9}, { 1599,11}, { 415,10}, { 831, 9}, \
+ { 1663,10}, { 863, 9}, { 1727, 8}, { 3455,11}, \
+ { 447,10}, { 895,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023, 9}, { 2047,11}, { 543,10}, \
+ { 1087, 9}, { 2175,11}, { 575,10}, { 1151, 9}, \
+ { 2303,11}, { 607,10}, { 1215, 9}, { 2431,12}, \
+ { 319,11}, { 639,10}, { 1279,11}, { 671,10}, \
+ { 1343,11}, { 703,10}, { 1407, 9}, { 2815,11}, \
+ { 735,10}, { 1471, 9}, { 2943,12}, { 383,11}, \
+ { 767,10}, { 1599,11}, { 831,10}, { 1663, 9}, \
+ { 3327,10}, { 1727,12}, { 447,11}, { 895,10}, \
+ { 1791,11}, { 959,10}, { 1919,13}, { 255,12}, \
+ { 511,11}, { 1023,10}, { 2111,11}, { 1087,10}, \
+ { 2175,12}, { 575,11}, { 1151,10}, { 2303,11}, \
+ { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \
+ { 703,11}, { 1407,10}, { 2815,11}, { 1471,10}, \
+ { 2943,13}, { 383,12}, { 767,11}, { 1599,12}, \
+ { 831,11}, { 1663,10}, { 3327,11}, { 1727,10}, \
+ { 3455,12}, { 895,11}, { 1791,12}, { 959,11}, \
+ { 1919,10}, { 3839,14}, { 255,13}, { 511,12}, \
+ { 1023,11}, { 2111,12}, { 1087,11}, { 2239,12}, \
+ { 1151,11}, { 2303,12}, { 1215,11}, { 2431,13}, \
+ { 639,12}, { 1343,11}, { 2687,12}, { 1407,11}, \
+ { 2815,12}, { 1471,11}, { 2943,13}, { 767,12}, \
+ { 1599,11}, { 3199,12}, { 1663,11}, { 3327,12}, \
+ { 1727,11}, { 3455,13}, { 895,12}, { 1791,11}, \
+ { 3583,12}, { 1919,11}, { 3839,12}, { 1983,14}, \
+ { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \
+ { 2431,13}, { 1279,12}, { 2687,13}, { 1407,12}, \
+ { 2943,14}, { 767,13}, { 1535,12}, { 3199,13}, \
+ { 1663,12}, { 3455,13}, { 1791,12}, { 3583,13}, \
+ { 1919,12}, { 3967,15}, { 511,14}, { 1023,13}, \
+ { 2047,12}, { 4095,13}, { 2175,12}, { 4351,13}, \
+ { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \
+ { 1535,13}, { 3455,14}, { 1791,13}, { 3967,15}, \
+ { 1023,14}, { 2047,13}, { 4351,14}, { 2303,13}, \
+ { 4991,14}, { 2559,13}, { 5119,14}, { 2815,13}, \
+ { 5887,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 258
+#define SQR_FFT_THRESHOLD 5504
+
+#define MULLO_BASECASE_THRESHOLD 3
+#define MULLO_DC_THRESHOLD 34
+#define MULLO_MUL_N_THRESHOLD 14281
+#define SQRLO_BASECASE_THRESHOLD 6
+#define SQRLO_DC_THRESHOLD 137
+#define SQRLO_SQR_THRESHOLD 10821
+
+#define DC_DIV_QR_THRESHOLD 45
+#define DC_DIVAPPR_Q_THRESHOLD 206
+#define DC_BDIV_QR_THRESHOLD 39
+#define DC_BDIV_Q_THRESHOLD 144
+
+#define INV_MULMOD_BNM1_THRESHOLD 54
+#define INV_NEWTON_THRESHOLD 202
+#define INV_APPR_THRESHOLD 206
+
+#define BINV_NEWTON_THRESHOLD 224
+#define REDC_1_TO_REDC_N_THRESHOLD 63
+
+#define MU_DIV_QR_THRESHOLD 1442
+#define MU_DIVAPPR_Q_THRESHOLD 1387
+#define MUPI_DIV_QR_THRESHOLD 82
+#define MU_BDIV_QR_THRESHOLD 1308
+#define MU_BDIV_Q_THRESHOLD 1387
+
+#define POWM_SEC_TABLE 1,16,102,428,1221
+
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 28
+#define SET_STR_DC_THRESHOLD 254
+#define SET_STR_PRECOMPUTE_THRESHOLD 890
+
+#define FAC_DSC_THRESHOLD 206
+#define FAC_ODD_THRESHOLD 29
+
+#define MATRIX22_STRASSEN_THRESHOLD 17
+#define HGCD2_DIV1_METHOD 3 /* 3.84% faster than 4 */
+#define HGCD_THRESHOLD 123
+#define HGCD_APPR_THRESHOLD 151
+#define HGCD_REDUCE_THRESHOLD 3389
+#define GCD_DC_THRESHOLD 435
+#define GCDEXT_DC_THRESHOLD 318
+#define JACOBI_BASE_METHOD 4 /* 8.04% faster than 3 */
+
+/* Tuneup completed successfully, took 175382 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/invert_limb.asm b/vendor/gmp-6.3.0/mpn/x86/k7/invert_limb.asm
new file mode 100644
index 0000000..31a867e
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/invert_limb.asm
@@ -0,0 +1,194 @@
+dnl x86 mpn_invert_limb
+
+dnl Contributed to the GNU project by Niels Möller
+
+dnl Copyright 2009, 2011, 2015 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles (approx) div
+C P5 ?
+C P6 model 0-8,10-12 ?
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) ?
+C P4 model 0 (Willamette) ?
+C P4 model 1 (?) ?
+C P4 model 2 (Northwood) ?
+C P4 model 3 (Prescott) ?
+C P4 model 4 (Nocona) ?
+C AMD K6 ?
+C AMD K7 41 53
+C AMD K8 ?
+
+C TODO
+C * These c/l numbers are for a non-PIC build. Consider falling back to using
+C the 'div' instruction for PIC builds.
+C * Perhaps use this file--or at least the algorithm--for more machines than k7.
+
+C Register usage:
+C Input D in %edi
+C Current approximation is in %eax and/or %ecx
+C %ebx and %edx are temporaries
+C %esi and %ebp are unused
+
+defframe(PARAM_DIVISOR,4)
+
+ASM_START()
+
+C Make approx_tab global to work around Apple relocation bug.
+ifdef(`DARWIN',`
+ deflit(`approx_tab', MPN(invert_limb_tab))
+ GLOBL approx_tab')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_invert_limb)
+deflit(`FRAME', 0)
+ mov PARAM_DIVISOR, %eax
+ C Avoid push/pop on k7.
+ sub $8, %esp FRAME_subl_esp(8)
+ mov %ebx, (%esp)
+ mov %edi, 4(%esp)
+
+ mov %eax, %edi
+ shr $22, %eax
+ifdef(`PIC',`
+ LEAL( approx_tab, %ebx)
+ movzwl -1024(%ebx, %eax, 2), %eax
+',`
+ movzwl -1024+approx_tab(%eax, %eax), %eax C %eax = v0
+')
+
+ C v1 = (v0 << 4) - ((v0*v0*d_21) >> 32) - 1
+ mov %eax, %ecx
+ imul %eax, %eax
+ mov %edi, %ebx
+ shr $11, %ebx
+ inc %ebx
+ mul %ebx
+ mov %edi, %ebx C Prepare
+ shr %ebx
+ sbb %eax, %eax
+ sub %eax, %ebx C %ebx = d_31, %eax = mask
+ shl $4, %ecx
+ dec %ecx
+ sub %edx, %ecx C %ecx = v1
+
+ C v_2 = (v1 << 15) + ((v1 *(2^48 - v1 * d31 + (v1 >> 1) & mask)) >> 33)
+ imul %ecx, %ebx
+ and %ecx, %eax
+ shr %eax
+ sub %ebx, %eax
+ mul %ecx
+ mov %edi, %eax C Prepare for next mul
+ shl $15, %ecx
+ shr %edx
+ add %edx, %ecx C %ecx = v2
+
+ mul %ecx
+ add %edi, %eax
+ mov %ecx, %eax
+ adc %edi, %edx
+ sub %edx, %eax C %eax = v3
+
+ mov (%esp), %ebx
+ mov 4(%esp), %edi
+ add $8, %esp
+
+ ret
+
+EPILOGUE()
+
+DEF_OBJECT(approx_tab,2)
+ .value 0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27
+ .value 0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d
+ .value 0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61
+ .value 0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894
+ .value 0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3
+ .value 0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520
+ .value 0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379
+ .value 0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de
+ .value 0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e
+ .value 0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8
+ .value 0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e
+ .value 0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd
+ .value 0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76
+ .value 0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918
+ .value 0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3
+ .value 0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676
+ .value 0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532
+ .value 0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5
+ .value 0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1
+ .value 0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193
+ .value 0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d
+ .value 0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d
+ .value 0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35
+ .value 0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22
+ .value 0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16
+ .value 0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10
+ .value 0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f
+ .value 0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914
+ .value 0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f
+ .value 0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e
+ .value 0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643
+ .value 0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d
+ .value 0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b
+ .value 0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e
+ .value 0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6
+ .value 0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1
+ .value 0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121
+ .value 0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056
+ .value 0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e
+ .value 0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca
+ .value 0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09
+ .value 0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d
+ .value 0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93
+ .value 0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde
+ .value 0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b
+ .value 0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c
+ .value 0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0
+ .value 0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927
+ .value 0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881
+ .value 0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de
+ .value 0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e
+ .value 0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1
+ .value 0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606
+ .value 0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e
+ .value 0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8
+ .value 0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445
+ .value 0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5
+ .value 0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327
+ .value 0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b
+ .value 0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211
+ .value 0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a
+ .value 0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104
+ .value 0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081
+ .value 0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000
+END_OBJECT(approx_tab)
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mmx/com.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/com.asm
new file mode 100644
index 0000000..a258c22
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/com.asm
@@ -0,0 +1,125 @@
+dnl AMD Athlon mpn_com -- mpn bitwise one's complement.
+
+dnl Copyright 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 1.0 cycles/limb
+
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The loop form below is necessary for the claimed speed. It needs to be
+C aligned to a 16 byte boundary and only 16 bytes long. Maybe that's so it
+C fits in a BTB entry. The adjustments to %eax and %edx avoid offsets on
+C the movq's and achieve the necessary size.
+C
+C If both src and dst are 4mod8, the loop runs at 1.5 c/l. So long as one
+C of the two is 0mod8, it runs at 1.0 c/l. On that basis dst is checked
+C (offset by the size, as per the loop addressing) and one high limb
+C processed separately to get alignment.
+C
+C The padding for the nails case is unattractive, but shouldn't cost any
+C cycles. Explicit .byte's guarantee the desired instructions, at a point
+C where we're probably stalled waiting for loads anyway.
+C
+C Enhancements:
+C
+C The combination load/pxor/store might be able to be unrolled to approach
+C 0.5 c/l if desired.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_com)
+deflit(`FRAME',0)
+
+ movl PARAM_DST, %edx
+ movl PARAM_SIZE, %ecx
+ pcmpeqd %mm7, %mm7
+
+ leal (%edx,%ecx,4), %eax
+ andl $4, %eax
+ifelse(GMP_NAIL_BITS,0,,
+` psrld $GMP_NAIL_BITS, %mm7') C GMP_NUMB_MASK
+
+ movl PARAM_SRC, %eax
+ movd -4(%eax,%ecx,4), %mm0 C src high limb
+
+ifelse(GMP_NAIL_BITS,0,,
+` C padding for alignment below
+ .byte 0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00 C lea 0(%esi),%esi
+ .byte 0x8d, 0xbf, 0x00, 0x00, 0x00, 0x00 C lea 0(%edi),%edi
+')
+
+ jz L(aligned)
+
+ pxor %mm7, %mm0
+ movd %mm0, -4(%edx,%ecx,4) C dst high limb
+ decl %ecx
+ jz L(done)
+L(aligned):
+
+ addl $4, %eax
+ addl $4, %edx
+ decl %ecx
+ jz L(one)
+
+ C offset 0x30 for no nails, or 0x40 for nails
+ ALIGN(16)
+L(top):
+ C eax src
+ C ebx
+ C ecx counter
+ C edx dst
+
+ subl $2, %ecx
+ movq (%eax,%ecx,4), %mm0
+ pxor %mm7, %mm0
+ movq %mm0, (%edx,%ecx,4)
+ jg L(top)
+
+ jnz L(done) C if size even
+
+L(one):
+ movd -4(%eax), %mm0 C src low limb
+ pxor %mm7, %mm0
+ movd %mm0, -4(%edx) C dst low limb
+
+L(done):
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mmx/copyd.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/copyd.asm
new file mode 100644
index 0000000..59ece40
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/copyd.asm
@@ -0,0 +1,144 @@
+dnl AMD K7 mpn_copyd -- copy limb vector, decrementing.
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C alignment dst/src, A=0mod8 N=4mod8
+C A/A A/N N/A N/N
+C K7 0.75 1.0 1.0 0.75
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The various comments in mpn/x86/k7/copyi.asm apply here too.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl parameter space reused
+define(SAVE_EBX,`PARAM_SIZE')
+define(SAVE_ESI,`PARAM_SRC')
+
+dnl minimum 5 since the unrolled code can't handle less than 5
+deflit(UNROLL_THRESHOLD, 5)
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_copyd)
+
+ movl PARAM_SIZE, %ecx
+ movl %ebx, SAVE_EBX
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ jae L(unroll)
+
+ orl %ecx, %ecx
+ jz L(simple_done)
+
+L(simple):
+ C eax src
+ C ebx scratch
+ C ecx counter
+ C edx dst
+ C
+ C this loop is 2 cycles/limb
+
+ movl -4(%eax,%ecx,4), %ebx
+ movl %ebx, -4(%edx,%ecx,4)
+ decl %ecx
+ jnz L(simple)
+
+L(simple_done):
+ movl SAVE_EBX, %ebx
+ ret
+
+
+L(unroll):
+ movl %esi, SAVE_ESI
+ leal (%eax,%ecx,4), %ebx
+ leal (%edx,%ecx,4), %esi
+
+ andl %esi, %ebx
+ movl SAVE_ESI, %esi
+ subl $4, %ecx C size-4
+
+ testl $4, %ebx C testl to pad code closer to 16 bytes for L(top)
+ jz L(aligned)
+
+ C both src and dst unaligned, process one limb to align them
+ movl 12(%eax,%ecx,4), %ebx
+ movl %ebx, 12(%edx,%ecx,4)
+ decl %ecx
+L(aligned):
+
+
+ ALIGN(16)
+L(top):
+ C eax src
+ C ebx
+ C ecx counter, limbs
+ C edx dst
+
+ movq 8(%eax,%ecx,4), %mm0
+ movq (%eax,%ecx,4), %mm1
+ subl $4, %ecx
+ movq %mm0, 16+8(%edx,%ecx,4)
+ movq %mm1, 16(%edx,%ecx,4)
+ jns L(top)
+
+
+ C now %ecx is -4 to -1 representing respectively 0 to 3 limbs remaining
+
+ testb $2, %cl
+ jz L(finish_not_two)
+
+ movq 8(%eax,%ecx,4), %mm0
+ movq %mm0, 8(%edx,%ecx,4)
+L(finish_not_two):
+
+ testb $1, %cl
+ jz L(done)
+
+ movl (%eax), %ebx
+ movl %ebx, (%edx)
+
+L(done):
+ movl SAVE_EBX, %ebx
+ emms
+ ret
+
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mmx/copyi.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/copyi.asm
new file mode 100644
index 0000000..9a28f92
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/copyi.asm
@@ -0,0 +1,157 @@
+dnl AMD K7 mpn_copyi -- copy limb vector, incrementing.
+
+dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C alignment dst/src, A=0mod8 N=4mod8
+C A/A A/N N/A N/N
+C K7 0.75 1.0 1.0 0.75
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size.
+C
+C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
+C 1.33 c/l.
+C
+C The K7 can do a 64-bit load and 64-bit store in one cycle (optimization
+C guile 22007 appendix B), so 0.5 c/l should be possible, however nothing
+C under 0.7 c/l is known. Apparently only two 32-bit stores can be done in
+C one cycle, so perhaps some scheduling is needed to ensure it's a
+C load+store in each cycle, not store+store.
+C
+C If both source and destination are unaligned then one limb is processed at
+C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
+C used unaligned it would be 1.5 c/l.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl parameter space reused
+define(SAVE_EBX,`PARAM_SIZE')
+
+dnl minimum 5 since the unrolled code can't handle less than 5
+deflit(UNROLL_THRESHOLD, 5)
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_copyi)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl %ebx, SAVE_EBX
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ jae L(unroll)
+
+ orl %ecx, %ecx
+ jz L(simple_done)
+
+L(simple):
+ C eax src, incrementing
+ C ebx scratch
+ C ecx counter
+ C edx dst, incrementing
+ C
+ C this loop is 2 cycles/limb
+
+ movl (%eax), %ebx
+ movl %ebx, (%edx)
+ decl %ecx
+ leal 4(%eax), %eax
+ leal 4(%edx), %edx
+ jnz L(simple)
+
+L(simple_done):
+ movl SAVE_EBX, %ebx
+ ret
+
+
+L(unroll):
+ movl %eax, %ebx
+ leal -12(%eax,%ecx,4), %eax C src end - 12
+ subl $3, %ecx C size-3
+
+ andl %edx, %ebx
+ leal (%edx,%ecx,4), %edx C dst end - 12
+ negl %ecx
+
+ testl $4, %ebx C testl to pad code closer to 16 bytes for L(top)
+ jz L(aligned)
+
+ C both src and dst unaligned, process one limb to align them
+ movl (%eax,%ecx,4), %ebx
+ movl %ebx, (%edx,%ecx,4)
+ incl %ecx
+L(aligned):
+
+
+ ALIGN(16)
+L(top):
+ C eax src end - 12
+ C ebx
+ C ecx counter, negative, limbs
+ C edx dst end - 12
+
+ movq (%eax,%ecx,4), %mm0
+ movq 8(%eax,%ecx,4), %mm1
+ addl $4, %ecx
+ movq %mm0, -16(%edx,%ecx,4)
+ movq %mm1, -16+8(%edx,%ecx,4)
+ ja L(top) C jump no carry and not zero
+
+
+ C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
+
+ testb $2, %cl
+ jnz L(finish_not_two)
+
+ movq (%eax,%ecx,4), %mm0
+ movq %mm0, (%edx,%ecx,4)
+L(finish_not_two):
+
+ testb $1, %cl
+ jnz L(done)
+
+ movl 8(%eax), %ebx
+ movl %ebx, 8(%edx)
+
+L(done):
+ movl SAVE_EBX, %ebx
+ emms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mmx/divrem_1.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/divrem_1.asm
new file mode 100644
index 0000000..cf34328
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/divrem_1.asm
@@ -0,0 +1,832 @@
+dnl AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb
+dnl division.
+
+dnl Copyright 1999-2002, 2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t inverse,
+C unsigned shift);
+C
+C Algorithm:
+C
+C The method and nomenclature follow part 8 of "Division by Invariant
+C Integers using Multiplication" by Granlund and Montgomery, reference in
+C gmp.texi.
+C
+C The "and"s shown in the paper are done here with "cmov"s. "m" is written
+C for m', and "d" for d_norm, which won't cause any confusion since it's
+C only the normalized divisor that's of any use in the code. "b" is written
+C for 2^N, the size of a limb, N being 32 here.
+C
+C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
+C "n-(q1+1)*d"; this rearrangement gives the same two-limb answer. If
+C q1==0xFFFFFFFF, then q1+1 would overflow. We branch to a special case
+C "q1_ff" if this occurs. Since the true quotient is either q1 or q1+1 then
+C if q1==0xFFFFFFFF that must be the right value.
+C
+C For the last and second last steps q1==0xFFFFFFFF is instead handled by an
+C sbbl to go back to 0xFFFFFFFF if an overflow occurs when adding 1. This
+C then goes through as normal, and finding no addback required. sbbl costs
+C an extra cycle over what the main loop code does, but it keeps code size
+C and complexity down.
+C
+C Notes:
+C
+C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
+C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero
+C carry, since in normal circumstances that will be a very rare event.
+C
+C The test for skipping a division is branch free (once size>=1 is tested).
+C The store to the destination high limb is 0 when a divide is skipped, or
+C if it's not skipped then a copy of the src high limb is used. The latter
+C is in case src==dst.
+C
+C There's a small bias towards expecting xsize==0, by having code for
+C xsize==0 in a straight line and xsize!=0 under forward jumps.
+C
+C Alternatives:
+C
+C If the divisor is normalized (high bit set) then a division step can
+C always be skipped, since the high destination limb is always 0 or 1 in
+C that case. It doesn't seem worth checking for this though, since it
+C probably occurs infrequently, in particular note that big_base for a
+C decimal mpn_get_str is not normalized in a 32-bit limb.
+
+
+dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
+dnl
+dnl The inverse takes about 50 cycles to calculate, but after that the
+dnl multiply is 17 c/l versus division at 42 c/l.
+dnl
+dnl At 3 limbs the mul is a touch faster than div on the integer part, and
+dnl even more so on the fractional part.
+
+deflit(MUL_THRESHOLD, 3)
+
+
+defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1
+defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
+defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC, 12)
+defframe(PARAM_XSIZE, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC, -28)
+defframe(VAR_DST, -32)
+defframe(VAR_DST_STOP,-36)
+
+deflit(STACK_SPACE, 36)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(mpn_preinv_divrem_1)
+deflit(`FRAME',0)
+ movl PARAM_XSIZE, %ecx
+ movl PARAM_DST, %edx
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SIZE, %ebx
+
+ leal 8(%edx,%ecx,4), %edx C &dst[xsize+2]
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %edx, VAR_DST_STOP C &dst[xsize+2]
+ movl %edi, SAVE_EDI
+ xorl %edi, %edi C carry
+
+ movl -4(%esi,%ebx,4), %eax C src high limb
+ xor %ecx, %ecx
+
+ C
+
+ C
+
+ cmpl %ebp, %eax C high cmp divisor
+
+ cmovc( %eax, %edi) C high is carry if high<divisor
+ cmovnc( %eax, %ecx) C 0 if skip div, src high if not
+ C (the latter in case src==dst)
+
+ movl %ecx, -12(%edx,%ebx,4) C dst high limb
+ sbbl $0, %ebx C skip one division if high<divisor
+ movl PARAM_PREINV_SHIFT, %ecx
+
+ leal -8(%edx,%ebx,4), %edx C &dst[xsize+size]
+ movl $32, %eax
+
+ movl %edx, VAR_DST C &dst[xsize+size]
+
+ shll %cl, %ebp C d normalized
+ subl %ecx, %eax
+ movl %ecx, VAR_NORM
+
+ movd %eax, %mm7 C rshift
+ movl PARAM_PREINV_INVERSE, %eax
+ jmp L(start_preinv)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ C offset 0xa1, close enough to aligned
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl $0, %edx C initial carry (if can't skip a div)
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+ orl %ecx, %ecx C size
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+
+ jz L(no_skip_div) C if size==0
+ movl -4(%esi,%ecx,4), %eax C src high limb
+ xorl %esi, %esi
+
+ cmpl %ebp, %eax C high cmp divisor
+
+ cmovc( %eax, %edx) C high is carry if high<divisor
+ cmovnc( %eax, %esi) C 0 if skip div, src high if not
+
+ movl %esi, (%edi,%ecx,4) C dst high limb
+ sbbl $0, %ecx C size-1 if high<divisor
+ movl PARAM_SRC, %esi C reload
+L(no_skip_div):
+
+
+L(start_1c):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ leal (%ebx,%ecx), %eax C size+xsize
+ cmpl $MUL_THRESHOLD, %eax
+ jae L(mul_by_inverse)
+
+
+C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
+C It'd be possible to write them out without the looping, but no speedup
+C would be expected.
+C
+C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
+C integer part, but curiously not on the fractional part, where %ebp is a
+C (fixed) couple of cycles faster.
+
+ orl %ecx, %ecx
+ jz L(divide_no_integer)
+
+L(divide_integer):
+ C eax scratch (quotient)
+ C ebx xsize
+ C ecx counter
+ C edx scratch (remainder)
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl PARAM_DIVISOR
+
+ movl %eax, (%edi,%ecx,4)
+ decl %ecx
+ jnz L(divide_integer)
+
+
+L(divide_no_integer):
+ movl PARAM_DST, %edi
+ orl %ebx, %ebx
+ jnz L(divide_fraction)
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+ movl SAVE_EDI, %edi
+ movl %edx, %eax
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+L(divide_fraction):
+ C eax scratch (quotient)
+ C ebx counter
+ C ecx
+ C edx scratch (remainder)
+ C esi
+ C edi dst
+ C ebp divisor
+
+ movl $0, %eax
+
+ divl %ebp
+
+ movl %eax, -4(%edi,%ebx,4)
+ decl %ebx
+ jnz L(divide_fraction)
+
+ jmp L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ bsrl %ebp, %eax C 31-l
+
+ leal 12(%edi), %ebx C &dst[xsize+2], loop dst stop
+ leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
+
+ movl %edi, VAR_DST
+ movl %ebx, VAR_DST_STOP
+
+ movl %ecx, %ebx C size
+ movl $31, %ecx
+
+ movl %edx, %edi C carry
+ movl $-1, %edx
+
+ C
+
+ xorl %eax, %ecx C l
+ incl %eax C 32-l
+
+ shll %cl, %ebp C d normalized
+ movl %ecx, VAR_NORM
+
+ movd %eax, %mm7
+
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+L(start_preinv):
+ C eax inverse
+ C ebx size
+ C ecx shift
+ C edx
+ C esi src
+ C edi carry
+ C ebp divisor
+ C
+ C mm7 rshift
+
+ orl %ebx, %ebx C size
+ movl %eax, VAR_INVERSE
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ jz L(start_zero)
+ movl %eax, VAR_SRC
+ cmpl $1, %ebx
+
+ movl 8(%eax), %esi C src high limb
+ jz L(start_one)
+
+L(start_two_or_more):
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ cmpl $2, %ebx
+ je L(integer_two_left)
+ jmp L(integer_top)
+
+
+L(start_one):
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shll %cl, %esi C n10 = high << l
+ movl %eax, VAR_SRC
+ jmp L(integer_one_left)
+
+
+L(start_zero):
+ C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
+ C skipped a division.
+
+ shll %cl, %edi C n2 = carry << l
+ movl %edi, %eax C return value for zero_done
+ cmpl $0, PARAM_XSIZE
+
+ je L(zero_done)
+ jmp L(fraction_some)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
+C execution. The instruction scheduling is important, with various
+C apparently equivalent forms running 1 to 5 cycles slower.
+C
+C A lower bound for the time would seem to be 16 cycles, based on the
+C following successive dependencies.
+C
+C cycles
+C n2+n1 1
+C mul 6
+C q1+1 1
+C mul 6
+C sub 1
+C addback 1
+C ---
+C 16
+C
+C This chain is what the loop has already, but 16 cycles isn't achieved.
+C K7 has enough decode, and probably enough execute (depending maybe on what
+C a mul actually consumes), but nothing running under 17 has been found.
+C
+C In theory n2+n1 could be done in the sub and addback stages (by
+C calculating both n2 and n2+n1 there), but lack of registers makes this an
+C unlikely proposition.
+C
+C The jz in the loop keeps the q1+1 stage to 1 cycle. Handling an overflow
+C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
+C chain, and nothing better than 18 cycles has been found when using it.
+C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
+C be an extremely rare event.
+C
+C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
+C if some special data is coming out with this always, the q1_ff special
+C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to
+C induce the q1_ff case, for speed measurements or testing. Note that
+C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
+C
+C The instruction groupings and empty comments show the cycles for a naive
+C in-order view of the code (conveniently ignoring the load latency on
+C VAR_INVERSE). This shows some of where the time is going, but is nonsense
+C to the extent that out-of-order execution rearranges it. In this case
+C there's 19 cycles shown, but it executes at 17.
+
+ ALIGN(16)
+L(integer_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+ movl VAR_SRC, %ecx
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movq (%ecx), %mm0 C next limb and the one below it
+ subl $4, %ecx
+
+ movl %ecx, VAR_SRC
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2+1
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff)
+ movl VAR_DST, %ecx
+
+ mull %ebx C (q1+1)*d
+
+ psrlq %mm7, %mm0
+
+ leal -4(%ecx), %ecx
+
+ C
+
+ subl %eax, %esi
+ movl VAR_DST_STOP, %eax
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+ cmpl %eax, %ecx
+
+ movl %ebx, (%ecx)
+ movl %ecx, VAR_DST
+ jne L(integer_top)
+
+
+L(integer_loop_done):
+
+
+C -----------------------------------------------------------------------------
+C
+C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
+C q1_ff special case. This make the code a bit smaller and simpler, and
+C costs only 1 cycle (each).
+
+L(integer_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm7 rshift
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+ movl PARAM_SRC, %ecx
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd (%ecx), %mm0 C src low limb
+
+ movl VAR_DST_STOP, %ecx
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2+1
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ subl %eax, %esi
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+
+ movl %ebx, -4(%ecx)
+
+
+C -----------------------------------------------------------------------------
+L(integer_one_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx dst
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm7 rshift
+
+ movl VAR_DST_STOP, %ecx
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2+1
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx C q1 if q1+1 overflowed
+
+ mull %ebx
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+
+ movl %ebx, -8(%ecx)
+ subl $8, %ecx
+
+
+
+L(integer_none):
+ cmpl $0, PARAM_XSIZE
+ jne L(fraction_some)
+
+ movl %edi, %eax
+L(fraction_done):
+ movl VAR_NORM, %ecx
+L(zero_done):
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EDI, %edi
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EBX, %ebx
+ addl $STACK_SPACE, %esp
+
+ shrl %cl, %eax
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx
+ C edx
+ C esi n10
+ C edi n2
+ C ebp divisor
+
+ movl VAR_DST, %ecx
+ movl VAR_DST_STOP, %edx
+ subl $4, %ecx
+
+ psrlq %mm7, %mm0
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+ movl %ecx, VAR_DST
+
+ movd %mm0, %esi C next n10
+
+ movl $-1, (%ecx)
+ cmpl %ecx, %edx
+ jne L(integer_top)
+
+ jmp L(integer_loop_done)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C Being the fractional part, the "source" limbs are all zero, meaning
+C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
+C
+C The loop runs at 15 cycles. The dependent chain is the same as the
+C general case above, but without the n2+n1 stage (due to n1==0), so 15
+C would seem to be the lower bound.
+C
+C A not entirely obvious simplification is that q1+1 never overflows a limb,
+C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
+C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
+C rnd() means rounding down to a multiple of d.
+C
+C m*n2 + b*n2 <= m*(d-1) + b*(d-1)
+C = m*d + b*d - m - b
+C = floor((b(b-d)-1)/d)*d + b*d - m - b
+C = rnd(b(b-d)-1) + b*d - m - b
+C = rnd(b(b-d)-1 + b*d) - m - b
+C = rnd(b*b-1) - m - b
+C <= (b-2)*b
+C
+C Unchanged from the general case is that the final quotient limb q can be
+C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from
+C equation 8.4 of the paper which simplifies as follows when n1==0 and
+C n0==0.
+C
+C n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
+C
+C As before, the instruction groupings and empty comments show a naive
+C in-order view of the code, which is made a nonsense by out of order
+C execution. There's 17 cycles shown, but it executes at 15.
+C
+C Rotating the store q and remainder->n2 instructions up to the top of the
+C loop gets the run time down from 16 to 15.
+
+ ALIGN(16)
+L(fraction_some):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi
+ C edi carry
+ C ebp divisor
+
+ movl PARAM_DST, %esi
+ movl VAR_DST_STOP, %ecx C &dst[xsize+2]
+ movl %edi, %eax
+
+ subl $8, %ecx C &dst[xsize]
+ jmp L(fraction_entry)
+
+
+ ALIGN(16)
+L(fraction_top):
+ C eax n2 carry, then scratch
+ C ebx scratch (nadj, q1)
+ C ecx dst, decrementing
+ C edx scratch
+ C esi dst stop point
+ C edi (will be n2)
+ C ebp divisor
+
+ movl %ebx, (%ecx) C previous q
+ movl %eax, %edi C remainder->n2
+
+L(fraction_entry):
+ mull VAR_INVERSE C m*n2
+
+ movl %ebp, %eax C d
+ subl $4, %ecx C dst
+ leal 1(%edi), %ebx
+
+ C
+
+ C
+
+ C
+
+ C
+
+ addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
+
+ mull %ebx C (q1+1)*d
+
+ C
+
+ C
+
+ C
+
+ negl %eax C low of n - (q1+1)*d
+
+ C
+
+ sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
+ leal (%ebp,%eax), %edx
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+ cmpl %esi, %ecx
+
+ jne L(fraction_top)
+
+
+ movl %ebx, (%ecx)
+ jmp L(fraction_done)
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/lshift.asm
new file mode 100644
index 0000000..b3383cf
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/lshift.asm
@@ -0,0 +1,481 @@
+dnl AMD K7 mpn_lshift -- mpn left shift.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 1.21 cycles/limb (at 16 limbs/loop).
+
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 4 1.51
+dnl 8 1.26
+dnl 16 1.21
+dnl 32 1.2
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size left by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the right. The bits shifted out at the left are
+C the return value.
+C
+C The comments in mpn_rshift apply here too.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 10)
+',`
+deflit(UNROLL_THRESHOLD, 10)
+')
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EDI, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+deflit(SAVE_SIZE, 12)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_SRC, %edx
+ subl $SAVE_SIZE, %esp
+deflit(`FRAME',SAVE_SIZE)
+
+ movl PARAM_SHIFT, %ecx
+ movl %edi, SAVE_EDI
+
+ movl PARAM_DST, %edi
+ decl %eax
+ jnz L(more_than_one_limb)
+
+ movl (%edx), %edx
+
+ shldl( %cl, %edx, %eax) C eax was decremented to zero
+
+ shll %cl, %edx
+
+ movl %edx, (%edi)
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(more_than_one_limb):
+ C eax size-1
+ C ebx
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ movd PARAM_SHIFT, %mm6
+ movd (%edx,%eax,4), %mm5 C src high limb
+ cmp $UNROLL_THRESHOLD-1, %eax
+
+ jae L(unroll)
+ negl %ecx
+ movd (%edx), %mm4 C src low limb
+
+ addl $32, %ecx
+
+ movd %ecx, %mm7
+
+L(simple_top):
+ C eax loop counter, limbs
+ C ebx
+ C ecx
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+ C
+ C mm0 scratch
+ C mm4 src low limb
+ C mm5 src high limb
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%edx,%eax,4), %mm0
+ decl %eax
+
+ psrlq %mm7, %mm0
+
+ movd %mm0, 4(%edi,%eax,4)
+ jnz L(simple_top)
+
+
+ psllq %mm6, %mm5
+ psllq %mm6, %mm4
+
+ psrlq $32, %mm5
+ movd %mm4, (%edi) C dst low limb
+
+ movd %mm5, %eax C return value
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx (saved)
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+ C
+ C mm5 src high limb, for return value
+ C mm6 lshift
+
+ movl %esi, SAVE_ESI
+ movl %ebx, SAVE_EBX
+ leal -4(%edx,%eax,4), %edx C &src[size-2]
+
+ testb $4, %dl
+ movq (%edx), %mm1 C src high qword
+
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process high limb (marked xxx) separately to
+ C make it so
+ C
+ C source -4(edx,%eax,4)
+ C |
+ C +-------+-------+-------+--
+ C | xxx |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+ C
+ C dest -4(edi,%eax,4)
+ C |
+ C +-------+-------+--
+ C | xxx | |
+ C +-------+-------+--
+
+ psllq %mm6, %mm1
+ subl $4, %edx
+ movl %eax, PARAM_SIZE C size-1
+
+ psrlq $32, %mm1
+ decl %eax C size-2 is new size-1
+
+ movd %mm1, 4(%edi,%eax,4)
+ movq (%edx), %mm1 C new src high qword
+L(start_src_aligned):
+
+
+ leal -4(%edi,%eax,4), %edi C &dst[size-2]
+ psllq %mm6, %mm5
+
+ testl $4, %edi
+ psrlq $32, %mm5 C return value
+
+ jz L(start_dst_aligned)
+
+
+ C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
+ C shift is 32 bits extra. High limb of dst (marked xxx) handled
+ C here separately.
+ C
+ C source %edx
+ C +-------+-------+--
+ C | mm1 |
+ C +-------+-------+--
+ C 0mod8 4mod8
+ C
+ C dest %edi
+ C +-------+-------+-------+--
+ C | xxx |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+
+ movq %mm1, %mm0
+ psllq %mm6, %mm1
+ addl $32, %ecx C shift+32
+
+ psrlq $32, %mm1
+
+ movd %mm1, 4(%edi)
+ movq %mm0, %mm1
+ subl $4, %edi
+
+ movd %ecx, %mm6 C new lshift
+L(start_dst_aligned):
+
+ decl %eax C size-2, two last limbs handled at end
+ movq %mm1, %mm2 C copy of src high qword
+ negl %ecx
+
+ andl $-2, %eax C round size down to even
+ addl $64, %ecx
+
+ movl %eax, %ebx
+ negl %eax
+
+ andl $UNROLL_MASK, %eax
+ decl %ebx
+
+ shll %eax
+
+ movd %ecx, %mm7 C rshift = 64-lshift
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%eax,%eax,4), %esi
+')
+ shrl $UNROLL_LOG2, %ebx C loop counter
+
+ leal ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
+ movl PARAM_SIZE, %eax C for use at end
+ jmp *%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ leal (%eax,%eax,4), %esi
+ addl $L(entry)-L(here), %esi
+ addl (%esp), %esi
+
+ ret_internal
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(top):
+ C eax size (for use at end)
+ C ebx loop counter
+ C ecx rshift
+ C edx src
+ C esi computed jump
+ C edi dst
+ C ebp
+ C
+ C mm0 scratch
+ C mm1 \ carry (alternating, mm2 first)
+ C mm2 /
+ C mm6 lshift
+ C mm7 rshift
+ C
+ C 10 code bytes/limb
+ C
+ C The two chunks differ in whether mm1 or mm2 hold the carry.
+ C The computed jump puts the initial carry in both mm1 and mm2.
+
+L(entry):
+deflit(CHUNK_COUNT, 4)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 - 8))
+
+Zdisp( movq, disp0,(%edx), %mm0)
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm2, %mm0
+Zdisp( movq, %mm0, disp0,(%edi))
+
+
+Zdisp( movq, disp1,(%edx), %mm0)
+ psllq %mm6, %mm1
+
+ movq %mm0, %mm2
+ psrlq %mm7, %mm0
+
+ por %mm1, %mm0
+Zdisp( movq, %mm0, disp1,(%edi))
+')
+
+ subl $UNROLL_BYTES, %edx
+ subl $UNROLL_BYTES, %edi
+ decl %ebx
+
+ jns L(top)
+
+
+
+define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')
+
+L(end):
+ testb $1, %al
+ movl SAVE_EBX, %ebx
+ psllq %mm6, %mm2 C wanted left shifted in all cases below
+
+ movd %mm5, %eax
+
+ movl SAVE_ESI, %esi
+ jz L(end_even)
+
+
+L(end_odd):
+
+ C Size odd, destination was aligned.
+ C
+ C source edx+8 edx+4
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edi
+ C --+---------------+---------------+-------+
+ C | written | | |
+ C --+---------------+---------------+-------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size odd, destination was unaligned.
+ C
+ C source edx+8 edx+4
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edi
+ C --+---------------+---------------+
+ C | written | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword at (%edi), and in the aligned case
+ C there's an extra limb of dst to be formed from that extra src limb
+ C left shifted.
+
+ movd disp(4) (%edx), %mm0
+ testb $32, %cl
+
+ movq %mm0, %mm1
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+ psllq %mm6, %mm1
+
+ por %mm2, %mm0
+
+ movq %mm0, disp(0) (%edi)
+ jz L(end_odd_unaligned)
+ movd %mm1, disp(-4) (%edi)
+L(end_odd_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+L(end_even):
+
+ C Size even, destination was aligned.
+ C
+ C source edx+8
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edi
+ C --+---------------+---------------+
+ C | written | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size even, destination was unaligned.
+ C
+ C source edx+8
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edi+4
+ C --+---------------+-------+
+ C | written | |
+ C --+---------------+-------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C The movq for the aligned case overwrites the movd for the
+ C unaligned case.
+
+ movq %mm2, %mm0
+ psrlq $32, %mm2
+
+ testb $32, %cl
+ movd %mm2, disp(4) (%edi)
+
+ jz L(end_even_unaligned)
+ movq %mm0, disp(0) (%edi)
+L(end_even_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mmx/popham.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/popham.asm
new file mode 100644
index 0000000..95965b7
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/popham.asm
@@ -0,0 +1,213 @@
+dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
+dnl distance.
+
+dnl Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C popcount hamdist
+C P3 generic 6.5 7
+C P3 model 9 (Banias) 5.7 6.1
+C P3 model 13 (Dothan) 5.75 6
+C K7 5 6
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C The code here is almost certainly not optimal, but is already a 3x speedup
+C over the generic C code. The main improvement would be to interleave
+C processing of two qwords in the loop so as to fully exploit the available
+C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
+C
+C The loop is based on the example "Efficient 64-bit population count using
+C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
+C page 158 of rev E (reference in mpn/x86/k7/README).
+
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
+')')')
+
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+
+HAM(`
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+ dnl non-PIC
+
+ RODATA
+ ALIGN(8)
+
+L(rodata_AAAAAAAAAAAAAAAA):
+ .long 0xAAAAAAAA
+ .long 0xAAAAAAAA
+
+L(rodata_3333333333333333):
+ .long 0x33333333
+ .long 0x33333333
+
+L(rodata_0F0F0F0F0F0F0F0F):
+ .long 0x0F0F0F0F
+ .long 0x0F0F0F0F
+')
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+
+ifdef(`PIC',`
+ movl $0xAAAAAAAA, %eax
+ movl $0x33333333, %edx
+
+ movd %eax, %mm7
+ movd %edx, %mm6
+
+ movl $0x0F0F0F0F, %eax
+
+ punpckldq %mm7, %mm7
+ punpckldq %mm6, %mm6
+
+ movd %eax, %mm5
+ movd %edx, %mm4
+
+ punpckldq %mm5, %mm5
+
+',`
+ movq L(rodata_AAAAAAAAAAAAAAAA), %mm7
+ movq L(rodata_3333333333333333), %mm6
+ movq L(rodata_0F0F0F0F0F0F0F0F), %mm5
+')
+ pxor %mm4, %mm4
+
+define(REG_AAAAAAAAAAAAAAAA,%mm7)
+define(REG_3333333333333333,%mm6)
+define(REG_0F0F0F0F0F0F0F0F,%mm5)
+define(REG_0000000000000000,%mm4)
+
+
+ movl PARAM_SRC, %eax
+HAM(` movl PARAM_SRC2, %edx')
+
+ pxor %mm2, %mm2 C total
+
+ shrl %ecx
+ jnc L(top)
+
+ movd (%eax,%ecx,8), %mm1
+
+HAM(` movd (%edx,%ecx,8), %mm0
+ pxor %mm0, %mm1
+')
+ orl %ecx, %ecx
+ jmp L(loaded)
+
+
+ ALIGN(16)
+L(top):
+ C eax src
+ C ebx
+ C ecx counter, qwords, decrementing
+ C edx [hamdist] src2
+ C
+ C mm0 (scratch)
+ C mm1 (scratch)
+ C mm2 total (low dword)
+ C mm3
+ C mm4 \
+ C mm5 | special constants
+ C mm6 |
+ C mm7 /
+
+ movq -8(%eax,%ecx,8), %mm1
+
+HAM(` pxor -8(%edx,%ecx,8), %mm1')
+ decl %ecx
+
+L(loaded):
+ movq %mm1, %mm0
+ pand REG_AAAAAAAAAAAAAAAA, %mm1
+
+ psrlq $1, %mm1
+
+ psubd %mm1, %mm0 C bit pairs
+
+
+ movq %mm0, %mm1
+ psrlq $2, %mm0
+
+ pand REG_3333333333333333, %mm0
+ pand REG_3333333333333333, %mm1
+
+ paddd %mm1, %mm0 C nibbles
+
+
+ movq %mm0, %mm1
+ psrlq $4, %mm0
+
+ pand REG_0F0F0F0F0F0F0F0F, %mm0
+ pand REG_0F0F0F0F0F0F0F0F, %mm1
+
+ paddd %mm1, %mm0 C bytes
+
+
+ psadbw( %mm4, %mm0)
+
+ paddd %mm0, %mm2 C add to total
+ jnz L(top)
+
+
+ movd %mm2, %eax
+ emms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mmx/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/rshift.asm
new file mode 100644
index 0000000..345d23a
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/rshift.asm
@@ -0,0 +1,480 @@
+dnl AMD K7 mpn_rshift -- mpn right shift.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 1.21 cycles/limb (at 16 limbs/loop).
+
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 4 1.51
+dnl 8 1.26
+dnl 16 1.21
+dnl 32 1.2
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size right by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the left. The bits shifted out at the right are
+C the return value.
+C
+C This code uses 64-bit MMX operations, which makes it possible to handle
+C two limbs at a time, for a theoretical 1.0 cycles/limb. Plain integer
+C code, on the other hand, suffers from shrd being a vector path decode and
+C running at 3 cycles back-to-back.
+C
+C Full speed depends on source and destination being aligned, and some hairy
+C setups and finish-ups are done to arrange this for the loop.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 10)
+',`
+deflit(UNROLL_THRESHOLD, 10)
+')
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EDI, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+deflit(SAVE_SIZE, 12)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_SRC, %edx
+ subl $SAVE_SIZE, %esp
+deflit(`FRAME',SAVE_SIZE)
+
+ movl PARAM_SHIFT, %ecx
+ movl %edi, SAVE_EDI
+
+ movl PARAM_DST, %edi
+ decl %eax
+ jnz L(more_than_one_limb)
+
+ movl (%edx), %edx C src limb
+
+ shrdl( %cl, %edx, %eax) C eax was decremented to zero
+
+ shrl %cl, %edx
+
+ movl %edx, (%edi) C dst limb
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(more_than_one_limb):
+ C eax size-1
+ C ebx
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ movd PARAM_SHIFT, %mm6 C rshift
+ movd (%edx), %mm5 C src low limb
+ cmp $UNROLL_THRESHOLD-1, %eax
+
+ jae L(unroll)
+ leal (%edx,%eax,4), %edx C &src[size-1]
+ leal -4(%edi,%eax,4), %edi C &dst[size-2]
+
+ movd (%edx), %mm4 C src high limb
+ negl %eax
+
+
+L(simple_top):
+ C eax loop counter, limbs, negative
+ C ebx
+ C ecx shift
+ C edx carry
+ C edx &src[size-1]
+ C edi &dst[size-2]
+ C ebp
+ C
+ C mm0 scratch
+ C mm4 src high limb
+ C mm5 src low limb
+ C mm6 shift
+
+ movq (%edx,%eax,4), %mm0
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+ movd %mm0, (%edi,%eax,4)
+ jnz L(simple_top)
+
+
+ psllq $32, %mm5
+ psrlq %mm6, %mm4
+
+ psrlq %mm6, %mm5
+ movd %mm4, 4(%edi) C dst high limb
+
+ movd %mm5, %eax C return value
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+ C
+ C mm5 src low limb
+ C mm6 rshift
+
+ testb $4, %dl
+ movl %esi, SAVE_ESI
+ movl %ebx, SAVE_EBX
+
+ psllq $32, %mm5
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process low limb separately (marked xxx) and
+ C step src and dst by one limb, making src aligned.
+ C
+ C source edx
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+ C
+ C dest edi
+ C --+-------+-------+
+ C | | xxx |
+ C --+-------+-------+
+
+ movq (%edx), %mm0 C src low two limbs
+ addl $4, %edx
+ movl %eax, PARAM_SIZE C size-1
+
+ addl $4, %edi
+ decl %eax C size-2 is new size-1
+
+ psrlq %mm6, %mm0
+ movl %edi, PARAM_DST C new dst
+
+ movd %mm0, -4(%edi)
+L(start_src_aligned):
+
+
+ movq (%edx), %mm1 C src low two limbs
+ decl %eax C size-2, two last limbs handled at end
+ testl $4, %edi
+
+ psrlq %mm6, %mm5
+ jz L(start_dst_aligned)
+
+
+ C dst isn't aligned, add 4 to make it so, and pretend the shift is
+ C 32 bits extra. Low limb of dst (marked xxx) handled here separately.
+ C
+ C source edx
+ C --+-------+-------+
+ C | mm1 |
+ C --+-------+-------+
+ C 4mod8 0mod8
+ C
+ C dest edi
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+
+ movq %mm1, %mm0
+ psrlq %mm6, %mm1
+ addl $32, %ecx C shift+32
+
+ movd %mm1, (%edi)
+ movq %mm0, %mm1
+ addl $4, %edi C new dst
+
+ movd %ecx, %mm6
+L(start_dst_aligned):
+
+
+ movq %mm1, %mm2 C copy of src low two limbs
+ negl %ecx
+ andl $-2, %eax C round size down to even
+
+ movl %eax, %ebx
+ negl %eax
+ addl $64, %ecx
+
+ andl $UNROLL_MASK, %eax
+ decl %ebx
+
+ shll %eax
+
+ movd %ecx, %mm7 C lshift = 64-rshift
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%eax,%eax,4), %esi
+ negl %eax
+')
+ shrl $UNROLL_LOG2, %ebx C loop counter
+
+ leal ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
+ movl PARAM_SIZE, %eax C for use at end
+
+ jmp *%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ leal (%eax,%eax,4), %esi
+ addl $L(entry)-L(here), %esi
+ addl (%esp), %esi
+ negl %eax
+
+ ret_internal
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(64)
+L(top):
+ C eax size, for use at end
+ C ebx loop counter
+ C ecx lshift
+ C edx src
+ C esi was computed jump
+ C edi dst
+ C ebp
+ C
+ C mm0 scratch
+ C mm1 \ carry (alternating)
+ C mm2 /
+ C mm6 rshift
+ C mm7 lshift
+ C
+ C 10 code bytes/limb
+ C
+ C The two chunks differ in whether mm1 or mm2 hold the carry.
+ C The computed jump puts the initial carry in both mm1 and mm2.
+
+L(entry):
+deflit(CHUNK_COUNT, 4)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 8))
+
+Zdisp( movq, disp0,(%edx), %mm0)
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ por %mm2, %mm0
+Zdisp( movq, %mm0, disp0,(%edi))
+
+
+Zdisp( movq, disp1,(%edx), %mm0)
+ psrlq %mm6, %mm1
+
+ movq %mm0, %mm2
+ psllq %mm7, %mm0
+
+ por %mm1, %mm0
+Zdisp( movq, %mm0, disp1,(%edi))
+')
+
+ addl $UNROLL_BYTES, %edx
+ addl $UNROLL_BYTES, %edi
+ decl %ebx
+
+ jns L(top)
+
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 8))
+
+ testb $1, %al
+ psrlq %mm6, %mm2 C wanted rshifted in all cases below
+ movl SAVE_ESI, %esi
+
+ movd %mm5, %eax C return value
+
+ movl SAVE_EBX, %ebx
+ jz L(end_even)
+
+
+ C Size odd, destination was aligned.
+ C
+ C source
+ C edx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edi
+ C +-------+---------------+---------------+--
+ C | | | written |
+ C +-------+---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size odd, destination was unaligned.
+ C
+ C source
+ C edx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edi
+ C +---------------+---------------+--
+ C | | written |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword to store, and in the aligned case there's
+ C a further extra limb of dst to be formed.
+
+
+ movd disp0(%edx), %mm0
+ movq %mm0, %mm1
+
+ psllq %mm7, %mm0
+ testb $32, %cl
+
+ por %mm2, %mm0
+ psrlq %mm6, %mm1
+
+ movq %mm0, disp0(%edi)
+ jz L(finish_odd_unaligned)
+
+ movd %mm1, disp1(%edi)
+L(finish_odd_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+L(end_even):
+
+ C Size even, destination was aligned.
+ C
+ C source
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edi
+ C +---------------+---------------+--
+ C | | mm3 |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size even, destination was unaligned.
+ C
+ C source
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edi
+ C +-------+---------------+--
+ C | | mm3 |
+ C +-------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = 64-(shift+32)
+
+
+ C The movd for the unaligned case is the same data as the movq for
+ C the aligned case, it's just a choice between whether one or two
+ C limbs should be written.
+
+
+ testb $32, %cl
+ movd %mm2, disp0(%edi)
+
+ jz L(end_even_unaligned)
+
+ movq %mm2, disp0(%edi)
+L(end_even_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mod_1_1.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mod_1_1.asm
new file mode 100644
index 0000000..1bbe6f9
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/mod_1_1.asm
@@ -0,0 +1,221 @@
+dnl x86-32 mpn_mod_1_1p, requiring cmov.
+
+dnl Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
+
+dnl Copyright 2010, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C P5 ?
+C P6 model 0-8,10-12 ?
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) ?
+C P4 model 0 (Willamette) ?
+C P4 model 1 (?) ?
+C P4 model 2 (Northwood) ?
+C P4 model 3 (Prescott) ?
+C P4 model 4 (Nocona) ?
+C AMD K6 ?
+C AMD K7 7
+C AMD K8 ?
+
+define(`B2mb', `%ebx')
+define(`r0', `%esi')
+define(`r2', `%ebp')
+define(`t0', `%edi')
+define(`ap', `%ecx') C Also shift count
+
+C Stack frame
+C pre 36(%esp)
+C b 32(%esp)
+C n 28(%esp)
+C ap 24(%esp)
+C return 20(%esp)
+C %ebp 16(%esp)
+C %edi 12(%esp)
+C %esi 8(%esp)
+C %ebx 4(%esp)
+C B2mod (%esp)
+
+define(`B2modb', `(%esp)')
+define(`n', `28(%esp)')
+define(`b', `32(%esp)')
+define(`pre', `36(%esp)')
+
+C mp_limb_t
+C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4])
+C
+C The pre array contains bi, cnt, B1modb, B2modb
+C Note: This implementation needs B1modb only when cnt > 0
+
+ASM_START()
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_mod_1_1p)
+ push %ebp
+ push %edi
+ push %esi
+ push %ebx
+ mov 32(%esp), %ebp C pre[]
+
+ mov 12(%ebp), %eax C B2modb
+ push %eax C Put it on stack
+
+ mov n, %edx
+ mov 24(%esp), ap
+
+ lea (ap, %edx, 4), ap
+ mov -4(ap), %eax
+ cmp $3, %edx
+ jnc L(first)
+ mov -8(ap), r0
+ jmp L(reduce_two)
+
+L(first):
+ C First iteration, no r2
+ mull B2modb
+ mov -12(ap), r0
+ add %eax, r0
+ mov -8(ap), %eax
+ adc %edx, %eax
+ sbb r2, r2
+ subl $3, n
+ lea -16(ap), ap
+ jz L(reduce_three)
+
+ mov B2modb, B2mb
+ sub b, B2mb
+ lea (B2mb, r0), t0
+ jmp L(mid)
+
+ ALIGN(16)
+L(top): C Loopmixed to 7 c/l on k7
+ add %eax, r0
+ lea (B2mb, r0), t0
+ mov r2, %eax
+ adc %edx, %eax
+ sbb r2, r2
+L(mid): mull B2modb
+ and B2modb, r2
+ add r0, r2
+ decl n
+ mov (ap), r0
+ cmovc( t0, r2)
+ lea -4(ap), ap
+ jnz L(top)
+
+ add %eax, r0
+ mov r2, %eax
+ adc %edx, %eax
+ sbb r2, r2
+
+L(reduce_three):
+ C Eliminate r2
+ and b, r2
+ sub r2, %eax
+
+L(reduce_two):
+ mov pre, %ebp
+ movb 4(%ebp), %cl
+ test %cl, %cl
+ jz L(normalized)
+
+ C Unnormalized, use B1modb to reduce to size < B b
+ mull 8(%ebp)
+ xor t0, t0
+ add %eax, r0
+ adc %edx, t0
+ mov t0, %eax
+
+ C Left-shift to normalize
+ shld %cl, r0, %eax C Always use shld?
+
+ shl %cl, r0
+ jmp L(udiv)
+
+L(normalized):
+ mov %eax, t0
+ sub b, t0
+ cmovnc( t0, %eax)
+
+L(udiv):
+ lea 1(%eax), t0
+ mull (%ebp)
+ mov b, %ebx C Needed in register for lea
+ add r0, %eax
+ adc t0, %edx
+ imul %ebx, %edx
+ sub %edx, r0
+ cmp r0, %eax
+ lea (%ebx, r0), %eax
+ cmovnc( r0, %eax)
+ cmp %ebx, %eax
+ jnc L(fix)
+L(ok): shr %cl, %eax
+
+ add $4, %esp
+ pop %ebx
+ pop %esi
+ pop %edi
+ pop %ebp
+
+ ret
+L(fix): sub %ebx, %eax
+ jmp L(ok)
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1_1p_cps)
+ push %ebp
+ mov 12(%esp), %ebp
+ push %esi
+ bsr %ebp, %ecx
+ push %ebx
+ xor $31, %ecx
+ mov 16(%esp), %esi
+ sal %cl, %ebp
+ mov %ebp, %edx
+ not %edx
+ mov $-1, %eax
+ div %ebp C On K7, invert_limb would be a few cycles faster.
+ mov %eax, (%esi) C store bi
+ mov %ecx, 4(%esi) C store cnt
+ neg %ebp
+ mov $1, %edx
+ shld %cl, %eax, %edx
+ imul %ebp, %edx
+ shr %cl, %edx
+ imul %ebp, %eax
+ mov %edx, 8(%esi) C store B1modb
+ mov %eax, 12(%esi) C store B2modb
+ pop %ebx
+ pop %esi
+ pop %ebp
+ ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mod_1_4.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mod_1_4.asm
new file mode 100644
index 0000000..bb7597e
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/mod_1_4.asm
@@ -0,0 +1,260 @@
+dnl x86-32 mpn_mod_1s_4p, requiring cmov.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C P5 ?
+C P6 model 0-8,10-12 ?
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 6
+C P4 model 0 (Willamette) ?
+C P4 model 1 (?) ?
+C P4 model 2 (Northwood) 15.5
+C P4 model 3 (Prescott) ?
+C P4 model 4 (Nocona) ?
+C AMD K6 ?
+C AMD K7 4.75
+C AMD K8 ?
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p)
+ push %ebp
+ push %edi
+ push %esi
+ push %ebx
+ sub $28, %esp
+ mov 60(%esp), %edi C cps[]
+ mov 8(%edi), %eax
+ mov 12(%edi), %edx
+ mov 16(%edi), %ecx
+ mov 20(%edi), %esi
+ mov 24(%edi), %edi
+ mov %eax, 4(%esp)
+ mov %edx, 8(%esp)
+ mov %ecx, 12(%esp)
+ mov %esi, 16(%esp)
+ mov %edi, 20(%esp)
+ mov 52(%esp), %eax C n
+ xor %edi, %edi
+ mov 48(%esp), %esi C up
+ lea -12(%esi,%eax,4), %esi
+ and $3, %eax
+ je L(b0)
+ cmp $2, %eax
+ jc L(b1)
+ je L(b2)
+
+L(b3): mov 4(%esi), %eax
+ mull 4(%esp)
+ mov (%esi), %ebp
+ add %eax, %ebp
+ adc %edx, %edi
+ mov 8(%esi), %eax
+ mull 8(%esp)
+ lea -12(%esi), %esi
+ jmp L(m0)
+
+L(b0): mov (%esi), %eax
+ mull 4(%esp)
+ mov -4(%esi), %ebp
+ add %eax, %ebp
+ adc %edx, %edi
+ mov 4(%esi), %eax
+ mull 8(%esp)
+ add %eax, %ebp
+ adc %edx, %edi
+ mov 8(%esi), %eax
+ mull 12(%esp)
+ lea -16(%esi), %esi
+ jmp L(m0)
+
+L(b1): mov 8(%esi), %ebp
+ lea -4(%esi), %esi
+ jmp L(m1)
+
+L(b2): mov 8(%esi), %edi
+ mov 4(%esi), %ebp
+ lea -8(%esi), %esi
+ jmp L(m1)
+
+ ALIGN(16)
+L(top): mov (%esi), %eax
+ mull 4(%esp)
+ mov -4(%esi), %ebx
+ xor %ecx, %ecx
+ add %eax, %ebx
+ adc %edx, %ecx
+ mov 4(%esi), %eax
+ mull 8(%esp)
+ add %eax, %ebx
+ adc %edx, %ecx
+ mov 8(%esi), %eax
+ mull 12(%esp)
+ add %eax, %ebx
+ adc %edx, %ecx
+ lea -16(%esi), %esi
+ mov 16(%esp), %eax
+ mul %ebp
+ add %eax, %ebx
+ adc %edx, %ecx
+ mov 20(%esp), %eax
+ mul %edi
+ mov %ebx, %ebp
+ mov %ecx, %edi
+L(m0): add %eax, %ebp
+ adc %edx, %edi
+L(m1): subl $4, 52(%esp)
+ ja L(top)
+
+L(end): mov 4(%esp), %eax
+ mul %edi
+ mov 60(%esp), %edi
+ add %eax, %ebp
+ adc $0, %edx
+ mov 4(%edi), %ecx
+ mov %edx, %esi
+ mov %ebp, %eax
+ sal %cl, %esi
+ mov %ecx, %ebx
+ neg %ecx
+ shr %cl, %eax
+ or %esi, %eax
+ lea 1(%eax), %esi
+ mull (%edi)
+ mov %ebx, %ecx
+ mov %eax, %ebx
+ mov %ebp, %eax
+ mov 56(%esp), %ebp
+ sal %cl, %eax
+ add %eax, %ebx
+ adc %esi, %edx
+ imul %ebp, %edx
+ sub %edx, %eax
+ lea (%eax,%ebp), %edx
+ cmp %eax, %ebx
+ cmovc( %edx, %eax)
+ mov %eax, %edx
+ sub %ebp, %eax
+ cmovc( %edx, %eax)
+ add $28, %esp
+ pop %ebx
+ pop %esi
+ pop %edi
+ pop %ebp
+ shr %cl, %eax
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p_cps)
+C CAUTION: This is the same code as in pentium4/sse2/mod_1_4.asm
+ push %ebp
+ push %edi
+ push %esi
+ push %ebx
+ mov 20(%esp), %ebp C FIXME: avoid bp for 0-idx
+ mov 24(%esp), %ebx
+ bsr %ebx, %ecx
+ xor $31, %ecx
+ sal %cl, %ebx C b << cnt
+ mov %ebx, %edx
+ not %edx
+ mov $-1, %eax
+ div %ebx
+ xor %edi, %edi
+ sub %ebx, %edi
+ mov $1, %esi
+ mov %eax, (%ebp) C store bi
+ mov %ecx, 4(%ebp) C store cnt
+ shld %cl, %eax, %esi
+ imul %edi, %esi
+ mov %eax, %edi
+ mul %esi
+
+ add %esi, %edx
+ shr %cl, %esi
+ mov %esi, 8(%ebp) C store B1modb
+
+ not %edx
+ imul %ebx, %edx
+ lea (%edx,%ebx), %esi
+ cmp %edx, %eax
+ cmovnc( %edx, %esi)
+ mov %edi, %eax
+ mul %esi
+
+ add %esi, %edx
+ shr %cl, %esi
+ mov %esi, 12(%ebp) C store B2modb
+
+ not %edx
+ imul %ebx, %edx
+ lea (%edx,%ebx), %esi
+ cmp %edx, %eax
+ cmovnc( %edx, %esi)
+ mov %edi, %eax
+ mul %esi
+
+ add %esi, %edx
+ shr %cl, %esi
+ mov %esi, 16(%ebp) C store B3modb
+
+ not %edx
+ imul %ebx, %edx
+ lea (%edx,%ebx), %esi
+ cmp %edx, %eax
+ cmovnc( %edx, %esi)
+ mov %edi, %eax
+ mul %esi
+
+ add %esi, %edx
+ shr %cl, %esi
+ mov %esi, 20(%ebp) C store B4modb
+
+ not %edx
+ imul %ebx, %edx
+ add %edx, %ebx
+ cmp %edx, %eax
+ cmovnc( %edx, %ebx)
+
+ shr %cl, %ebx
+ mov %ebx, 24(%ebp) C store B5modb
+
+ pop %ebx
+ pop %esi
+ pop %edi
+ pop %ebp
+ ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mod_34lsub1.asm
new file mode 100644
index 0000000..ee3ad04
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/mod_34lsub1.asm
@@ -0,0 +1,188 @@
+dnl AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl Copyright 2000-2002, 2004, 2005, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C Athlon: 1
+C Hammer: 1
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C The loop form below and the 64 byte code alignment seem necessary for the
+C claimed speed. This is a bit strange, since normally k7 isn't very
+C sensitive to such things. Perhaps there has to be 6 instructions in the
+C first 16 bytes for the BTB entry or something.
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+dnl re-use parameter space
+define(SAVE_EDI, `PARAM_SIZE')
+
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %edx
+
+ subl $2, %ecx
+ ja L(three_or_more)
+
+ movl (%edx), %eax
+ jb L(one)
+
+ movl 4(%edx), %ecx
+ movl %eax, %edx
+ shrl $24, %eax C src[0] low
+
+ andl $0xFFFFFF, %edx C src[0] high
+ addl %edx, %eax
+ movl %ecx, %edx
+
+ andl $0xFFFF, %ecx
+ shrl $16, %edx C src[1] high
+ addl %edx, %eax
+
+ shll $8, %ecx C src[1] low
+ addl %ecx, %eax
+
+L(one):
+ ret
+
+
+L(three_or_more):
+ C eax
+ C ebx
+ C ecx size-2
+ C edx src
+ C esi
+ C edi
+
+ pushl %ebx FRAME_pushl()
+ xorl %eax, %eax
+ xorl %ebx, %ebx
+
+ movl %edi, SAVE_EDI
+ pushl %esi FRAME_pushl()
+ xorl %esi, %esi C and clear carry flag
+
+
+ C code offset 0x40 at this point
+L(top):
+ C eax acc 0mod3
+ C ebx acc 1mod3
+ C ecx counter, limbs
+ C edx src
+ C esi acc 2mod3
+ C edi
+
+ leal 24(%edx), %edx
+ leal -2(%ecx), %ecx
+ adcl -24(%edx), %eax
+ adcl -20(%edx), %ebx
+ adcl -16(%edx), %esi
+
+ decl %ecx
+ jng L(done_loop)
+
+ leal -2(%ecx), %ecx
+ adcl -12(%edx), %eax
+ adcl -8(%edx), %ebx
+ adcl -4(%edx), %esi
+
+ decl %ecx
+ jg L(top)
+
+
+ leal 12(%edx), %edx
+
+
+L(done_loop):
+ C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
+
+ incl %ecx
+ movl $0xFFFFFFFF, %edi
+ js L(combine)
+
+ adcl -12(%edx), %eax
+ decl %ecx
+ movl $0xFFFFFF00, %edi
+ js L(combine)
+
+ adcl -8(%edx), %ebx
+ movl $0xFFFF0000, %edi
+
+
+L(combine):
+ C eax acc 0mod3
+ C ebx acc 1mod3
+ C ecx
+ C edx
+ C esi acc 2mod3
+ C edi mask
+
+ sbbl %ecx, %ecx C carry
+ movl %eax, %edx C 0mod3
+ shrl $24, %eax C 0mod3 high
+
+ andl %edi, %ecx C carry masked
+ andl $0x00FFFFFF, %edx C 0mod3 low
+ movl %ebx, %edi C 1mod3
+
+ subl %ecx, %eax C apply carry
+ shrl $16, %ebx C 1mod3 high
+ andl $0xFFFF, %edi
+
+ addl %edx, %eax C apply 0mod3 low
+ movl %esi, %edx C 2mod3
+ shll $8, %edi C 1mod3 low
+
+ addl %ebx, %eax C apply 1mod3 high
+ shrl $8, %esi C 2mod3 high
+ movzbl %dl, %edx C 2mod3 low
+
+ addl %edi, %eax C apply 1mod3 low
+ shll $16, %edx C 2mod3 low
+
+ addl %esi, %eax C apply 2mod3 high
+ popl %esi FRAME_popl()
+
+ movl SAVE_EDI, %edi
+ addl %edx, %eax C apply 2mod3 low
+ popl %ebx FRAME_popl()
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mode1o.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mode1o.asm
new file mode 100644
index 0000000..2394033
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/mode1o.asm
@@ -0,0 +1,181 @@
+dnl AMD K7 mpn_modexact_1_odd -- exact division style remainder.
+
+dnl Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C Athlon: 11.0
+C Hammer: 7.0
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C
+C With the loop running at just 11 cycles it doesn't seem worth bothering to
+C check for high<divisor to save one step.
+C
+C Using a divl for size==1 measures slower than the modexact method, which
+C is not too surprising since for the latter it's only about 24 cycles to
+C calculate the modular inverse.
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+deflit(STACK_SPACE, 16)
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+ movl PARAM_CARRY, %ecx
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+ xorl %ecx, %ecx
+L(start_1c):
+ movl PARAM_DIVISOR, %eax
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_DIVISOR, %esi
+
+ movl %edi, SAVE_EDI
+
+ shrl %eax C d/2
+
+ andl $127, %eax
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %edi)
+ movzbl (%eax,%edi), %edi C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %edi C inv 8 bits
+')
+
+ xorl %edx, %edx C initial extra carry
+ leal (%edi,%edi), %eax C 2*inv
+
+ imull %edi, %edi C inv*inv
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_SIZE, %ebp
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SRC, %ebx
+
+ imull %esi, %edi C inv*inv*d
+
+ subl %edi, %eax C inv = 2*inv - inv*inv*d
+ leal (%eax,%eax), %edi C 2*inv
+
+ imull %eax, %eax C inv*inv
+
+ imull %esi, %eax C inv*inv*d
+
+ leal (%ebx,%ebp,4), %ebx C src end
+ negl %ebp C -size
+
+ subl %eax, %edi C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
+ movl %esi, %eax
+ imull %edi, %eax
+ cmpl $1, %eax')
+
+
+C The dependent chain here is
+C
+C cycles
+C subl %edx, %eax 1
+C imull %edi, %eax 4
+C mull %esi 6 (high limb)
+C ----
+C total 11
+C
+C Out of order execution hides the load latency for the source data, so no
+C special scheduling is required.
+
+L(top):
+ C eax src limb
+ C ebx src end ptr
+ C ecx next carry bit, 0 or 1 (or initial carry param)
+ C edx carry limb, high of last product
+ C esi divisor
+ C edi inverse
+ C ebp counter, limbs, negative
+
+ movl (%ebx,%ebp,4), %eax
+
+ subl %ecx, %eax C apply carry bit
+ movl $0, %ecx
+
+ setc %cl C new carry bit
+
+ subl %edx, %eax C apply carry limb
+ adcl $0, %ecx
+
+ imull %edi, %eax
+
+ mull %esi
+
+ incl %ebp
+ jnz L(top)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EDI, %edi
+ leal (%ecx,%edx), %eax
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mul_1.asm
new file mode 100644
index 0000000..755cd2e
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/mul_1.asm
@@ -0,0 +1,237 @@
+dnl AMD K7 mpn_mul_1.
+
+dnl Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12)
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C AMD K6
+C AMD K7 3.25
+C AMD K8
+
+C TODO
+C * Improve feed-in and wind-down code. We beat the old code for all n != 1,
+C but we might be able to do even better.
+C * The feed-in code for mul_1c is crude.
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+ add $-16, %esp
+ mov %ebp, (%esp)
+ mov %ebx, 4(%esp)
+ mov %esi, 8(%esp)
+ mov %edi, 12(%esp)
+
+ mov 20(%esp), %edi
+ mov 24(%esp), %esi
+ mov 28(%esp), %ebp
+ mov 32(%esp), %ecx
+ mov %ebp, %ebx
+ shr $2, %ebp
+ mov %ebp, 28(%esp)
+ mov (%esi), %eax
+ and $3, %ebx
+ jz L(c0)
+ cmp $2, %ebx
+ mov 36(%esp), %ebx
+ jz L(c2)
+ jg L(c3)
+
+L(c1): lea -4(%edi), %edi
+ mul %ecx
+ test %ebp, %ebp
+ jnz 1f
+ add %ebx, %eax
+ mov %eax, 4(%edi)
+ mov %edx, %eax
+ adc %ebp, %eax
+ jmp L(rt)
+1: add %eax, %ebx
+ mov $0, %ebp
+ adc %edx, %ebp
+ mov 4(%esi), %eax
+ jmp L(1)
+
+L(c2): lea 4(%esi), %esi
+ mul %ecx
+ test %ebp, %ebp
+ mov %ebx, %ebp
+ jnz 2f
+ add %eax, %ebp
+ mov $0, %ebx
+ adc %edx, %ebx
+ mov (%esi), %eax
+ jmp L(cj2)
+2: add %eax, %ebp
+ mov $0, %ebx
+ adc %edx, %ebx
+ mov (%esi), %eax
+ jmp L(2)
+
+L(c3): lea 8(%esi), %esi
+ lea -12(%edi), %edi
+ mul %ecx
+ add %eax, %ebx
+ mov $0, %ebp
+ adc %edx, %ebp
+ mov -4(%esi), %eax
+ incl 28(%esp)
+ jmp L(3)
+
+L(c0): mov 36(%esp), %ebx
+ lea -4(%esi), %esi
+ lea -8(%edi), %edi
+ mul %ecx
+ mov %ebx, %ebp
+ add %eax, %ebp
+ mov $0, %ebx
+ adc %edx, %ebx
+ mov 8(%esi), %eax
+ jmp L(0)
+
+EPILOGUE()
+ ALIGN(16)
+PROLOGUE(mpn_mul_1)
+ add $-16, %esp
+ mov %ebp, (%esp)
+ mov %ebx, 4(%esp)
+ mov %esi, 8(%esp)
+ mov %edi, 12(%esp)
+
+ mov 20(%esp), %edi
+ mov 24(%esp), %esi
+ mov 28(%esp), %ebp
+ mov 32(%esp), %ecx
+ mov %ebp, %ebx
+ shr $2, %ebp
+ mov %ebp, 28(%esp)
+ mov (%esi), %eax
+ and $3, %ebx
+ jz L(b0)
+ cmp $2, %ebx
+ jz L(b2)
+ jg L(b3)
+
+L(b1): lea -4(%edi), %edi
+ mul %ecx
+ test %ebp, %ebp
+ jnz L(gt1)
+ mov %eax, 4(%edi)
+ mov %edx, %eax
+ jmp L(rt)
+L(gt1): mov %eax, %ebx
+ mov %edx, %ebp
+ mov 4(%esi), %eax
+ jmp L(1)
+
+L(b2): lea 4(%esi), %esi
+ mul %ecx
+ test %ebp, %ebp
+ mov %eax, %ebp
+ mov %edx, %ebx
+ mov (%esi), %eax
+ jnz L(2)
+ jmp L(cj2)
+
+L(b3): lea 8(%esi), %esi
+ lea -12(%edi), %edi
+ mul %ecx
+ mov %eax, %ebx
+ mov %edx, %ebp
+ mov -4(%esi), %eax
+ incl 28(%esp)
+ jmp L(3)
+
+L(b0): lea -4(%esi), %esi
+ lea -8(%edi), %edi
+ mul %ecx
+ mov %eax, %ebp
+ mov %edx, %ebx
+ mov 8(%esi), %eax
+ jmp L(0)
+
+ ALIGN(16)
+L(top): mov $0, %ebx
+ adc %edx, %ebx
+L(2): mul %ecx
+ add %eax, %ebx
+ mov %ebp, 0(%edi)
+ mov 4(%esi), %eax
+ mov $0, %ebp
+ adc %edx, %ebp
+L(1): mul %ecx
+ add %eax, %ebp
+ mov 8(%esi), %eax
+ mov %ebx, 4(%edi)
+ mov $0, %ebx
+ adc %edx, %ebx
+L(0): mov %ebp, 8(%edi)
+ mul %ecx
+ add %eax, %ebx
+ mov 12(%esi), %eax
+ lea 16(%esi), %esi
+ mov $0, %ebp
+ adc %edx, %ebp
+L(3): mov %ebx, 12(%edi)
+ mul %ecx
+ lea 16(%edi), %edi
+ add %eax, %ebp
+ decl 28(%esp)
+ mov 0(%esi), %eax
+ jnz L(top)
+
+L(end): mov $0, %ebx
+ adc %edx, %ebx
+L(cj2): mul %ecx
+ add %eax, %ebx
+ mov %ebp, (%edi)
+L(cj1): mov %ebx, 4(%edi)
+ adc $0, %edx
+ mov %edx, %eax
+
+L(rt): mov (%esp), %ebp
+ mov 4(%esp), %ebx
+ mov 8(%esp), %esi
+ mov 12(%esp), %edi
+ add $16, %esp
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mul_basecase.asm
new file mode 100644
index 0000000..4dfb500
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/mul_basecase.asm
@@ -0,0 +1,602 @@
+dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: approx 4.42 cycles per cross product at around 20x20 limbs (16
+C limbs/loop unrolling).
+
+
+
+dnl K7 UNROLL_COUNT cycles/product (at around 20x20)
+dnl 8 4.67
+dnl 16 4.59
+dnl 32 4.42
+dnl Maximum possible with the current code is 32.
+dnl
+dnl At 32 the typical 13-26 limb sizes from the karatsuba code will get
+dnl done with a straight run through a block of code, no inner loop. Using
+dnl 32 gives 1k of code, but the k7 has a 64k L1 code cache.
+
+deflit(UNROLL_COUNT, 32)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+C
+C Calculate xp,xsize multiplied by yp,ysize, storing the result in
+C wp,xsize+ysize.
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() startup
+C calculations only once. The saving is 15-25% on typical sizes coming from
+C the Karatsuba multiply code.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_XSIZE, %ecx
+ movl PARAM_YP, %eax
+
+ movl PARAM_XP, %edx
+ movl (%eax), %eax C yp low limb
+
+ cmpl $2, %ecx
+ ja L(xsize_more_than_two)
+ je L(two_by_something)
+
+
+ C one limb by one limb
+
+ mull (%edx)
+
+ movl PARAM_WP, %ecx
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+deflit(`FRAME',0)
+ decl PARAM_YSIZE
+ pushl %ebx defframe_pushl(`SAVE_EBX')
+ movl %eax, %ecx C yp low limb
+
+ movl PARAM_WP, %ebx
+ pushl %esi defframe_pushl(`SAVE_ESI')
+ movl %edx, %esi C xp
+
+ movl (%edx), %eax C xp low limb
+ jnz L(two_by_two)
+
+
+ C two limbs by one limb
+
+ mull %ecx
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+ movl %edx, %esi C carry
+
+ mull %ecx
+
+ addl %eax, %esi
+
+ movl %esi, 4(%ebx)
+ movl SAVE_ESI, %esi
+
+ adcl $0, %edx
+
+ movl %edx, 8(%ebx)
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C Could load yp earlier into another register.
+
+ ALIGN(16)
+L(two_by_two):
+ C eax xp low limb
+ C ebx wp
+ C ecx yp low limb
+ C edx
+ C esi xp
+ C edi
+ C ebp
+
+dnl FRAME carries on from previous
+
+ mull %ecx C xp[0] * yp[0]
+
+ push %edi defframe_pushl(`SAVE_EDI')
+ movl %edx, %edi C carry, for wp[1]
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+
+ mull %ecx C xp[1] * yp[0]
+
+ addl %eax, %edi
+ movl PARAM_YP, %ecx
+
+ adcl $0, %edx
+ movl 4(%ecx), %ecx C yp[1]
+ movl %edi, 4(%ebx)
+
+ movl 4(%esi), %eax C xp[1]
+ movl %edx, %edi C carry, for wp[2]
+
+ mull %ecx C xp[1] * yp[1]
+
+ addl %eax, %edi
+
+ adcl $0, %edx
+ movl (%esi), %eax C xp[0]
+
+ movl %edx, %esi C carry, for wp[3]
+
+ mull %ecx C xp[0] * yp[1]
+
+ addl %eax, 4(%ebx)
+ adcl %edx, %edi
+ movl %edi, 8(%ebx)
+
+ adcl $0, %esi
+ movl SAVE_EDI, %edi
+ movl %esi, 12(%ebx)
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(xsize_more_than_two):
+
+C The first limb of yp is processed with a simple mpn_mul_1 style loop
+C inline. Unrolling this doesn't seem worthwhile since it's only run once
+C (whereas the addmul below is run ysize-1 many times). A call to the
+C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
+C popping, and doesn't seem likely to be worthwhile on the typical 13-26
+C limb operations the Karatsuba code calls here with.
+
+ C eax yp[0]
+ C ebx
+ C ecx xsize
+ C edx xp
+ C esi
+ C edi
+ C ebp
+
+dnl FRAME doesn't carry on from previous, no pushes yet here
+defframe(`SAVE_EBX',-4)
+defframe(`SAVE_ESI',-8)
+defframe(`SAVE_EDI',-12)
+defframe(`SAVE_EBP',-16)
+deflit(`FRAME',0)
+
+ subl $16, %esp
+deflit(`FRAME',16)
+
+ movl %edi, SAVE_EDI
+ movl PARAM_WP, %edi
+
+ movl %ebx, SAVE_EBX
+ movl %ebp, SAVE_EBP
+ movl %eax, %ebp
+
+ movl %esi, SAVE_ESI
+ xorl %ebx, %ebx
+ leal (%edx,%ecx,4), %esi C xp end
+
+ leal (%edi,%ecx,4), %edi C wp end of mul1
+ negl %ecx
+
+
+L(mul1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, negative
+ C edx scratch
+ C esi xp end
+ C edi wp end of mul1
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl %eax, (%edi,%ecx,4)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ incl %ecx
+ jnz L(mul1)
+
+
+ movl PARAM_YSIZE, %edx
+ movl PARAM_XSIZE, %ecx
+
+ movl %ebx, (%edi) C final carry
+ decl %edx
+
+ jnz L(ysize_more_than_one)
+
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+
+ movl SAVE_EBP, %ebp
+ movl SAVE_ESI, %esi
+ addl $FRAME, %esp
+
+ ret
+
+
+L(ysize_more_than_one):
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_YP, %eax
+
+ jae L(unroll)
+
+
+C -----------------------------------------------------------------------------
+ C simple addmul looping
+ C
+ C eax yp
+ C ebx
+ C ecx xsize
+ C edx ysize-1
+ C esi xp end
+ C edi wp end of mul1
+ C ebp
+
+ leal 4(%eax,%edx,4), %ebp C yp end
+ negl %ecx
+ negl %edx
+
+ movl (%esi,%ecx,4), %eax C xp low limb
+ movl %edx, PARAM_YSIZE C -(ysize-1)
+ incl %ecx
+
+ xorl %ebx, %ebx C initial carry
+ movl %ecx, PARAM_XSIZE C -(xsize-1)
+ movl %ebp, PARAM_YP
+
+ movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier
+ jmp L(simple_outer_entry)
+
+
+ C this is offset 0x121 so close enough to aligned
+L(simple_outer_top):
+ C ebp ysize counter, negative
+
+ movl PARAM_YP, %edx
+ movl PARAM_XSIZE, %ecx C -(xsize-1)
+ xorl %ebx, %ebx C carry
+
+ movl %ebp, PARAM_YSIZE
+ addl $4, %edi C next position in wp
+
+ movl (%edx,%ebp,4), %ebp C yp limb - multiplier
+ movl -4(%esi,%ecx,4), %eax C xp low limb
+
+
+L(simple_outer_entry):
+
+L(simple_inner):
+ C eax xp limb
+ C ebx carry limb
+ C ecx loop counter (negative)
+ C edx scratch
+ C esi xp end
+ C edi wp end
+ C ebp multiplier
+
+ mull %ebp
+
+ addl %eax, %ebx
+ adcl $0, %edx
+
+ addl %ebx, (%edi,%ecx,4)
+ movl (%esi,%ecx,4), %eax
+ adcl $0, %edx
+
+ incl %ecx
+ movl %edx, %ebx
+ jnz L(simple_inner)
+
+
+ mull %ebp
+
+ movl PARAM_YSIZE, %ebp
+ addl %eax, %ebx
+
+ adcl $0, %edx
+ addl %ebx, (%edi)
+
+ adcl $0, %edx
+ incl %ebp
+
+ movl %edx, 4(%edi)
+ jnz L(simple_outer_top)
+
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
+C comments.
+C
+C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
+C increment xp and wp. This is used to adjust back xp and wp, and rshifted
+C to given an initial VAR_COUNTER at the top of the outer loop.
+C
+C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
+C up to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
+C outer loop to take care of xp, wp and the inner loop counter.
+
+defframe(VAR_COUNTER, -20)
+defframe(VAR_ADJUST, -24)
+defframe(VAR_JMP, -28)
+defframe(VAR_XP_LOW, -32)
+deflit(VAR_EXTRA_SPACE, 16)
+
+
+L(unroll):
+ C eax yp
+ C ebx
+ C ecx xsize
+ C edx ysize-1
+ C esi xp end
+ C edi wp end of mul1
+ C ebp
+
+ movl PARAM_XP, %esi
+ movl 4(%eax), %ebp C multiplier (yp second limb)
+ leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing
+
+ movl PARAM_WP, %edi
+ movl %eax, PARAM_YP
+ negl %edx
+
+ movl %edx, PARAM_YSIZE
+ leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1
+ decl %ecx C xsize-1
+
+ movl (%esi), %eax C xp low limb
+ andl $-UNROLL_MASK-1, %ebx
+ negl %ecx
+
+ subl $VAR_EXTRA_SPACE, %esp
+deflit(`FRAME',16+VAR_EXTRA_SPACE)
+ negl %ebx
+ andl $UNROLL_MASK, %ecx
+
+ movl %ebx, VAR_ADJUST
+ movl %ecx, %edx
+ shll $4, %ecx
+
+ sarl $UNROLL_LOG2, %ebx
+
+ C 17 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(unroll_here):
+',`
+ leal L(unroll_entry) (%ecx,%edx,1), %ecx
+')
+ negl %edx
+
+ movl %eax, VAR_XP_LOW
+ movl %ecx, VAR_JMP
+ leal 4(%edi,%edx,4), %edi C wp and xp, adjust for unrolling,
+ leal 4(%esi,%edx,4), %esi C and start at second limb
+ jmp L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ leal (%ecx,%edx,1), %ecx
+ addl $L(unroll_entry)-L(unroll_here), %ecx
+ addl (%esp), %ecx
+ ret_internal
+')
+
+
+C --------------------------------------------------------------------------
+ ALIGN(32)
+L(unroll_outer_top):
+ C ebp ysize counter, negative
+
+ movl VAR_ADJUST, %ebx
+ movl PARAM_YP, %edx
+
+ movl VAR_XP_LOW, %eax
+ movl %ebp, PARAM_YSIZE C store incremented ysize counter
+
+ leal 4(%edi,%ebx,4), %edi
+ leal (%esi,%ebx,4), %esi
+ sarl $UNROLL_LOG2, %ebx
+
+ movl (%edx,%ebp,4), %ebp C yp next multiplier
+ movl VAR_JMP, %ecx
+
+L(unroll_outer_entry):
+ mull %ebp
+
+ testb $1, %cl C and clear carry bit
+ movl %ebx, VAR_COUNTER
+ movl $0, %ebx
+
+ movl $0, %ecx
+ cmovz( %eax, %ecx) C eax into low carry, zero into high carry limb
+ cmovnz( %eax, %ebx)
+
+ C Extra fetch of VAR_JMP is bad, but registers are tight
+ jmp *VAR_JMP
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(unroll_top):
+ C eax xp limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi xp+8
+ C edi wp
+ C ebp yp multiplier limb
+ C
+ C VAR_COUNTER loop counter, negative
+ C
+ C 17 bytes each limb
+
+L(unroll_entry):
+
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%esi), %eax)
+ adcl %edx, %ebx
+
+ mull %ebp
+
+Zdisp( addl, %ecx, disp0,(%edi))
+ movl $0, %ecx
+
+ adcl %eax, %ebx
+
+
+ movl disp1(%esi), %eax
+ adcl %edx, %ecx
+
+ mull %ebp
+
+ addl %ebx, disp1(%edi)
+ movl $0, %ebx
+
+ adcl %eax, %ecx
+')
+
+
+ incl VAR_COUNTER
+ leal UNROLL_BYTES(%esi), %esi
+ leal UNROLL_BYTES(%edi), %edi
+
+ jnz L(unroll_top)
+
+
+ C eax
+ C ebx zero
+ C ecx low
+ C edx high
+ C esi
+ C edi wp, pointing at second last limb)
+ C ebp
+ C
+ C carry flag to be added to high
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 4))
+
+ movl PARAM_YSIZE, %ebp
+ adcl $0, %edx
+ addl %ecx, disp0(%edi)
+
+ adcl $0, %edx
+ incl %ebp
+
+ movl %edx, disp1(%edi)
+ jnz L(unroll_outer_top)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/k7/sqr_basecase.asm
new file mode 100644
index 0000000..7b6a97e
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/sqr_basecase.asm
@@ -0,0 +1,635 @@
+dnl AMD K7 mpn_sqr_basecase -- square an mpn number.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
+C (measured on the speed difference between 25 and 50 limbs, which is
+C roughly the Karatsuba recursing range).
+
+
+dnl These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
+dnl some comments.
+
+deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
+
+ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
+`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
+
+m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
+deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes,
+C which is quite a bit, but is considered good value since squares big
+C enough to use most of the code will be spending quite a few cycles in it.
+
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %eax
+ cmpl $2, %ecx
+
+ movl PARAM_DST, %edx
+ je L(two_limbs)
+ ja L(three_or_more)
+
+
+C------------------------------------------------------------------------------
+C one limb only
+ C eax src
+ C ecx size
+ C edx dst
+
+ movl (%eax), %eax
+ movl %edx, %ecx
+
+ mull %eax
+
+ movl %edx, 4(%ecx)
+ movl %eax, (%ecx)
+ ret
+
+
+C------------------------------------------------------------------------------
+C
+C Using the read/modify/write "add"s seems to be faster than saving and
+C restoring registers. Perhaps the loads for the first set hide under the
+C mul latency and the second gets store to load forwarding.
+
+ ALIGN(16)
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+deflit(`FRAME',0)
+
+ pushl %ebx FRAME_pushl()
+ movl %eax, %ebx C src
+ movl (%eax), %eax
+
+ movl %edx, %ecx C dst
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx) C dst[0]
+ movl 4(%ebx), %eax
+
+ movl %edx, 4(%ecx) C dst[1]
+
+ mull %eax C src[1]^2
+
+ movl %eax, 8(%ecx) C dst[2]
+ movl (%ebx), %eax
+
+ movl %edx, 12(%ecx) C dst[3]
+
+ mull 4(%ebx) C src[0]*src[1]
+
+ popl %ebx
+
+ addl %eax, 4(%ecx)
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+ ASSERT(nc)
+
+ addl %eax, 4(%ecx)
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+ ASSERT(nc)
+
+ ret
+
+
+C------------------------------------------------------------------------------
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(STACK_SPACE, 16)
+
+L(three_or_more):
+ subl $STACK_SPACE, %esp
+ cmpl $4, %ecx
+ jae L(four_or_more)
+deflit(`FRAME',STACK_SPACE)
+
+
+C------------------------------------------------------------------------------
+C Three limbs
+C
+C Writing out the loads and stores separately at the end of this code comes
+C out about 10 cycles faster than using adcls to memory.
+
+ C eax src
+ C ecx size
+ C edx dst
+
+ movl %ebx, SAVE_EBX
+ movl %eax, %ebx C src
+ movl (%eax), %eax
+
+ movl %edx, %ecx C dst
+ movl %esi, SAVE_ESI
+ movl %edi, SAVE_EDI
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl 4(%ebx), %eax
+ movl %edx, 4(%ecx)
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl 8(%ebx), %eax
+ movl %edx, 12(%ecx)
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl (%ebx), %eax
+ movl %edx, 20(%ecx)
+
+ mull 4(%ebx) C src[0] * src[1]
+
+ movl %eax, %esi
+ movl (%ebx), %eax
+ movl %edx, %edi
+
+ mull 8(%ebx) C src[0] * src[2]
+
+ addl %eax, %edi
+ movl %ebp, SAVE_EBP
+ movl $0, %ebp
+
+ movl 4(%ebx), %eax
+ adcl %edx, %ebp
+
+ mull 8(%ebx) C src[1] * src[2]
+
+ xorl %ebx, %ebx
+ addl %eax, %ebp
+
+ adcl $0, %edx
+
+ C eax
+ C ebx zero, will be dst[5]
+ C ecx dst
+ C edx dst[4]
+ C esi dst[1]
+ C edi dst[2]
+ C ebp dst[3]
+
+ adcl $0, %edx
+ addl %esi, %esi
+
+ adcl %edi, %edi
+ movl 4(%ecx), %eax
+
+ adcl %ebp, %ebp
+
+ adcl %edx, %edx
+
+ adcl $0, %ebx
+ addl %eax, %esi
+ movl 8(%ecx), %eax
+
+ adcl %eax, %edi
+ movl 12(%ecx), %eax
+ movl %esi, 4(%ecx)
+
+ adcl %eax, %ebp
+ movl 16(%ecx), %eax
+ movl %edi, 8(%ecx)
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EDI, %edi
+
+ adcl %eax, %edx
+ movl 20(%ecx), %eax
+ movl %ebp, 12(%ecx)
+
+ adcl %ebx, %eax
+ ASSERT(nc)
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+
+ movl %edx, 16(%ecx)
+ movl %eax, 20(%ecx)
+ addl $FRAME, %esp
+
+ ret
+
+
+C------------------------------------------------------------------------------
+L(four_or_more):
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+C Further products are added in rather than stored.
+
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+ C esi
+ C edi
+ C ebp
+
+defframe(`VAR_COUNTER',-20)
+defframe(`VAR_JMP', -24)
+deflit(EXTRA_STACK_SPACE, 8)
+
+ movl %ebx, SAVE_EBX
+ movl %edi, SAVE_EDI
+ leal (%edx,%ecx,4), %edi C &dst[size]
+
+ movl %esi, SAVE_ESI
+ movl %ebp, SAVE_EBP
+ leal (%eax,%ecx,4), %esi C &src[size]
+
+ movl (%eax), %ebp C multiplier
+ movl $0, %ebx
+ decl %ecx
+
+ negl %ecx
+ subl $EXTRA_STACK_SPACE, %esp
+FRAME_subl_esp(EXTRA_STACK_SPACE)
+
+L(mul_1):
+ C eax scratch
+ C ebx carry
+ C ecx counter
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl %eax, (%edi,%ecx,4)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ incl %ecx
+ jnz L(mul_1)
+
+
+C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two products, which are the bottom right corner of the product
+C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as in mpn_addmul_1, see that routine for
+C some comments.
+C
+C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+C
+C K7 does branch prediction on indirect jumps, which is bad since it's a
+C different target each time. There seems no way to avoid this.
+
+dnl This value also hard coded in some shifts and adds
+deflit(CODE_BYTES_PER_LIMB, 17)
+
+dnl With the unmodified &src[size] and &dst[size] pointers, the
+dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
+dnl values up to 31, but above that an offset must be added to them.
+
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>31),1,
+eval((UNROLL_COUNT-31)*4),
+0))
+
+dnl Because the last chunk of code is generated differently, a label placed
+dnl at the end doesn't work. Instead calculate the implied end using the
+dnl start and how many chunks of code there are.
+
+deflit(UNROLL_INNER_END,
+`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')
+
+ C eax
+ C ebx carry
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+ movl %ebx, (%edi)
+
+ subl $4, %ecx
+ jz L(corner)
+
+ negl %ecx
+ifelse(OFFSET,0,,`subl $OFFSET, %edi')
+ifelse(OFFSET,0,,`subl $OFFSET, %esi')
+
+ movl %ecx, %edx
+ shll $4, %ecx
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+
+
+ C The calculated jump mustn't come out to before the start of the
+ C code available. This is the limit UNROLL_COUNT puts on the src
+ C operand size, but checked here directly using the jump address.
+ ASSERT(ae,
+ `movl_text_address(L(unroll_inner_start), %eax)
+ cmpl %eax, %ecx')
+
+
+C------------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll_outer_top):
+ C eax
+ C ebx high limb to store
+ C ecx VAR_JMP
+ C edx VAR_COUNTER, limbs, negative
+ C esi &src[size], constant
+ C edi dst ptr, high of last addmul
+ C ebp
+
+ movl -12+OFFSET(%esi,%edx,4), %ebp C next multiplier
+ movl -8+OFFSET(%esi,%edx,4), %eax C first of multiplicand
+
+ movl %edx, VAR_COUNTER
+
+ mull %ebp
+
+define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')
+
+ testb $1, %cl
+ movl %edx, %ebx C high carry
+ movl %ecx, %edx C jump
+
+ movl %eax, %ecx C low carry
+ cmovX( %ebx, %ecx) C high carry reverse
+ cmovX( %eax, %ebx) C low carry reverse
+
+ leal CODE_BYTES_PER_LIMB(%edx), %eax
+ xorl %edx, %edx
+ leal 4(%edi), %edi
+
+ movl %eax, VAR_JMP
+
+ jmp *%eax
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ addl (%esp), %ecx
+ addl $UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
+ addl %edx, %ecx
+ ret_internal
+')
+
+
+ C Must be an even address to preserve the significance of the low
+ C bit of the jump address indicating which way around ecx/ebx should
+ C start.
+ ALIGN(2)
+
+L(unroll_inner_start):
+ C eax next limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+forloop(`i', UNROLL_COUNT, 1, `
+ deflit(`disp_src', eval(-i*4 + OFFSET))
+ deflit(`disp_dst', eval(disp_src - 4))
+
+ m4_assert(`disp_src>=-128 && disp_src<128')
+ m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp( movl, disp_src,(%esi), %eax)
+ adcl %edx, %ebx
+
+ mull %ebp
+
+Zdisp( addl, %ecx, disp_dst,(%edi))
+ movl $0, %ecx
+
+ adcl %eax, %ebx
+
+',`
+ dnl this bit comes out last
+Zdisp( movl, disp_src,(%esi), %eax)
+ adcl %edx, %ecx
+
+ mull %ebp
+
+Zdisp( addl, %ebx, disp_dst,(%edi))
+
+ifelse(forloop_last,0,
+` movl $0, %ebx')
+
+ adcl %eax, %ecx
+')
+')
+
+ C eax next limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+ adcl $0, %edx
+ addl %ecx, -4+OFFSET(%edi)
+ movl VAR_JMP, %ecx
+
+ adcl $0, %edx
+
+ movl %edx, m4_empty_if_zero(OFFSET) (%edi)
+ movl VAR_COUNTER, %edx
+
+ incl %edx
+ jnz L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+ addl $OFFSET, %esi
+ addl $OFFSET, %edi
+')
+
+
+C------------------------------------------------------------------------------
+L(corner):
+ C esi &src[size]
+ C edi &dst[2*size-5]
+
+ movl -12(%esi), %ebp
+ movl -8(%esi), %eax
+ movl %eax, %ecx
+
+ mull %ebp
+
+ addl %eax, -4(%edi)
+ movl -4(%esi), %eax
+
+ adcl $0, %edx
+ movl %edx, %ebx
+ movl %eax, %esi
+
+ mull %ebp
+
+ addl %ebx, %eax
+
+ adcl $0, %edx
+ addl %eax, (%edi)
+ movl %esi, %eax
+
+ adcl $0, %edx
+ movl %edx, %ebx
+
+ mull %ecx
+
+ addl %ebx, %eax
+ movl %eax, 4(%edi)
+
+ adcl $0, %edx
+ movl %edx, 8(%edi)
+
+
+
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+L(lshift_start):
+ movl PARAM_SIZE, %eax
+ movl PARAM_DST, %edi
+ xorl %ecx, %ecx C clear carry
+
+ leal (%edi,%eax,8), %edi
+ notl %eax C -size-1, preserve carry
+
+ leal 2(%eax), %eax C -(size-1)
+
+L(lshift):
+ C eax counter, negative
+ C ebx
+ C ecx
+ C edx
+ C esi
+ C edi dst, pointing just after last limb
+ C ebp
+
+ rcll -4(%edi,%eax,8)
+ rcll (%edi,%eax,8)
+ incl %eax
+ jnz L(lshift)
+
+ setc %al
+
+ movl PARAM_SRC, %esi
+ movl %eax, -4(%edi) C dst most significant limb
+
+ movl PARAM_SIZE, %ecx
+
+
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+ movl (%esi), %eax C src[0]
+
+ mull %eax
+
+ leal (%esi,%ecx,4), %esi C src point just after last limb
+ negl %ecx
+
+ movl %eax, (%edi,%ecx,8) C dst[0]
+ incl %ecx
+
+L(diag):
+ C eax scratch
+ C ebx scratch
+ C ecx counter, negative
+ C edx carry
+ C esi src just after last limb
+ C edi dst just after last limb
+ C ebp
+
+ movl (%esi,%ecx,4), %eax
+ movl %edx, %ebx
+
+ mull %eax
+
+ addl %ebx, -4(%edi,%ecx,8)
+ adcl %eax, (%edi,%ecx,8)
+ adcl $0, %edx
+
+ incl %ecx
+ jnz L(diag)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+
+ addl %edx, -4(%edi) C dst most significant limb
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/sublsh1_n.asm b/vendor/gmp-6.3.0/mpn/x86/k7/sublsh1_n.asm
new file mode 100644
index 0000000..8851683
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k7/sublsh1_n.asm
@@ -0,0 +1,173 @@
+dnl AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1)
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C This is an attempt at a sublsh1_n for x86-32, not relying on sse2 insns. The
+C innerloop is 2*3-way unrolled, which is best we can do with the available
+C registers. It seems tricky to use the same structure for rsblsh1_n, since we
+C cannot feed carry between operations there.
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 6.75
+C AMD K6
+C AMD K7
+C AMD K8
+
+C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
+C processors. It uses 2*4-way unrolling, for good reasons.
+C
+C Breaking carry recurrency might be a good idea. We would then need separate
+C registers for the shift carry and add/subtract carry, which in turn would
+C force us to 2*2-way unrolling.
+
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_EBP,`PARAM_DST')
+
+ASM_START()
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_sublsh1_n_ip1)
+deflit(`FRAME',0)
+
+define(`rp', `%edi')
+define(`up', `%esi')
+
+ mov PARAM_SIZE, %eax C size
+ push up FRAME_pushl()
+ push rp FRAME_pushl()
+ xor %edx, %edx
+ mov PARAM_SRC, up
+ mov PARAM_DST, rp
+ mov %ebx, SAVE_EBX
+ mov %eax, %ebx
+ shr $3, %eax
+
+ not %eax C count = -(size\8)-i
+ and $7, %ebx C size % 8
+ jz L(exact)
+
+L(oop):
+ifdef(`CPU_P6',`
+ shr %edx ') C restore 2nd saved carry bit
+ mov (up), %ecx
+ adc %ecx, %ecx
+ rcr %edx C restore 1st saved carry bit
+ lea 4(up), up
+ sbb %ecx, (rp)
+ lea 4(rp), rp
+ adc %edx, %edx C save a carry bit in edx
+ifdef(`CPU_P6',`
+ adc %edx, %edx ') C save another carry bit in edx
+ dec %ebx
+ jnz L(oop)
+L(exact):
+ inc %eax
+ jz L(end)
+ mov %eax, VAR_COUNT
+ mov %ebp, SAVE_EBP
+
+ ALIGN(16)
+L(top):
+ifdef(`CPU_P6',`
+ shr %edx ') C restore 2nd saved carry bit
+ mov (up), %eax
+ adc %eax, %eax
+ mov 4(up), %ebx
+ adc %ebx, %ebx
+ mov 8(up), %ecx
+ adc %ecx, %ecx
+ mov 12(up), %ebp
+ adc %ebp, %ebp
+
+ rcr %edx C restore 1st saved carry bit
+
+ sbb %eax, (rp)
+ sbb %ebx, 4(rp)
+ sbb %ecx, 8(rp)
+ sbb %ebp, 12(rp)
+
+ mov 16(up), %eax
+ adc %eax, %eax
+ mov 20(up), %ebx
+ adc %ebx, %ebx
+ mov 24(up), %ecx
+ adc %ecx, %ecx
+ mov 28(up), %ebp
+ adc %ebp, %ebp
+
+ lea 32(up), up
+ adc %edx, %edx C save a carry bit in edx
+
+ sbb %eax, 16(rp)
+ sbb %ebx, 20(rp)
+ sbb %ecx, 24(rp)
+ sbb %ebp, 28(rp)
+
+ifdef(`CPU_P6',`
+ adc %edx, %edx ') C save another carry bit in edx
+ incl VAR_COUNT
+ lea 32(rp), rp
+ jne L(top)
+
+ mov SAVE_EBP, %ebp
+L(end):
+ mov SAVE_EBX, %ebx
+
+ifdef(`CPU_P6',`
+ xor %eax, %eax
+ shr $1, %edx
+ adc %edx, %eax
+',`
+ adc $0, %edx
+ mov %edx, %eax
+')
+ pop rp FRAME_popl()
+ pop up FRAME_popl()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/k8/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/k8/gmp-mparam.h
new file mode 100644
index 0000000..fa71292
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/k8/gmp-mparam.h
@@ -0,0 +1,215 @@
+/* x86/k8 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2011, 2014 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2500 MHz K8 Brisbane */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 11
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 21
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1 /* 36.85% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD 3
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 44
+
+#define DIV_1_VS_MUL_1_PERCENT 251
+
+#define MUL_TOOM22_THRESHOLD 26
+#define MUL_TOOM33_THRESHOLD 78
+#define MUL_TOOM44_THRESHOLD 136
+#define MUL_TOOM6H_THRESHOLD 270
+#define MUL_TOOM8H_THRESHOLD 430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 85
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 96
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 121
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 46
+#define SQR_TOOM3_THRESHOLD 81
+#define SQR_TOOM4_THRESHOLD 202
+#define SQR_TOOM6_THRESHOLD 300
+#define SQR_TOOM8_THRESHOLD 430
+
+#define MULMID_TOOM42_THRESHOLD 50
+
+#define MULMOD_BNM1_THRESHOLD 18
+#define SQRMOD_BNM1_THRESHOLD 22
+
+#define MUL_FFT_MODF_THRESHOLD 606 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 606, 5}, { 27, 6}, { 15, 5}, { 31, 6}, \
+ { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 23, 6}, { 47, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \
+ { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
+ { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 167,10}, { 95, 9}, { 191,10}, { 111,11}, \
+ { 63,10}, { 159,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
+ { 271, 9}, { 543,10}, { 287,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335, 9}, { 671,11}, \
+ { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
+ { 799,11}, { 223,12}, { 127,11}, { 255,10}, \
+ { 511, 9}, { 1023,10}, { 543,11}, { 287,10}, \
+ { 607,11}, { 319,10}, { 671,11}, { 351,12}, \
+ { 191,11}, { 383,10}, { 799,11}, { 415,13}, \
+ { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \
+ { 543,10}, { 1087,11}, { 607,12}, { 319,11}, \
+ { 671,10}, { 1343,11}, { 735,10}, { 1471,12}, \
+ { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \
+ { 447,11}, { 927,13}, { 255,12}, { 511,11}, \
+ { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \
+ { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \
+ { 895,11}, { 1791,12}, { 959,14}, { 255,13}, \
+ { 511,12}, { 1087,11}, { 2239,12}, { 1215,13}, \
+ { 639,12}, { 1471,13}, { 767,12}, { 1727,13}, \
+ { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \
+ { 2239,13}, { 1151,12}, { 2431,13}, { 1279,12}, \
+ { 2623,13}, { 1407,12}, { 2943,14}, { 767,13}, \
+ { 1663,12}, { 3455,13}, { 1919,15}, { 511,14}, \
+ { 1023,13}, { 2175,12}, { 4351,13}, { 2431,14}, \
+ { 1279,13}, { 2943,14}, { 1535,13}, { 3455,14}, \
+ { 1791,13}, { 3967,15}, { 1023,14}, { 2047,13}, \
+ { 4351,14}, { 2303,13}, { 4991,14}, { 2815,15}, \
+ { 1535,14}, { 3839,16} }
+#define MUL_FFT_TABLE3_SIZE 158
+#define MUL_FFT_THRESHOLD 7296
+
+#define SQR_FFT_MODF_THRESHOLD 500 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 500, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \
+ { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95, 9}, { 191,10}, \
+ { 111,11}, { 63,10}, { 127, 9}, { 255,10}, \
+ { 143, 9}, { 287, 8}, { 575,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \
+ { 287,11}, { 159,10}, { 319, 9}, { 639,10}, \
+ { 335, 9}, { 671,10}, { 351,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 399, 9}, { 799,10}, \
+ { 415,11}, { 223,12}, { 127,11}, { 255,10}, \
+ { 511, 9}, { 1023,10}, { 543,11}, { 287,10}, \
+ { 607, 9}, { 1215,11}, { 319,10}, { 671,11}, \
+ { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
+ { 799,11}, { 415,10}, { 831,13}, { 127,12}, \
+ { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \
+ { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \
+ { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \
+ { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \
+ { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \
+ { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \
+ { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1087,11}, \
+ { 2239,12}, { 1215,11}, { 2431,13}, { 639,12}, \
+ { 1471,13}, { 767,12}, { 1727,13}, { 895,12}, \
+ { 1919,14}, { 511,13}, { 1023,12}, { 2239,13}, \
+ { 1151,12}, { 2431,13}, { 1279,12}, { 2623,13}, \
+ { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \
+ { 3455,13}, { 1919,15}, { 511,14}, { 1023,13}, \
+ { 2431,14}, { 1279,13}, { 2943,14}, { 1535,13}, \
+ { 3455,14}, { 1791,13}, { 3839,15}, { 1023,14}, \
+ { 2047,13}, { 4223,14}, { 2303,13}, { 4863,14}, \
+ { 2815,15}, { 1535,14}, { 3839,16} }
+#define SQR_FFT_TABLE3_SIZE 167
+#define SQR_FFT_THRESHOLD 5504
+
+#define MULLO_BASECASE_THRESHOLD 4
+#define MULLO_DC_THRESHOLD 29
+#define MULLO_MUL_N_THRESHOLD 14281
+#define SQRLO_BASECASE_THRESHOLD 6
+#define SQRLO_DC_THRESHOLD 193
+#define SQRLO_SQR_THRESHOLD 10704
+
+#define DC_DIV_QR_THRESHOLD 84
+#define DC_DIVAPPR_Q_THRESHOLD 278
+#define DC_BDIV_QR_THRESHOLD 87
+#define DC_BDIV_Q_THRESHOLD 216
+
+#define INV_MULMOD_BNM1_THRESHOLD 50
+#define INV_NEWTON_THRESHOLD 268
+#define INV_APPR_THRESHOLD 268
+
+#define BINV_NEWTON_THRESHOLD 276
+#define REDC_1_TO_REDC_N_THRESHOLD 78
+
+#define MU_DIV_QR_THRESHOLD 1652
+#define MU_DIVAPPR_Q_THRESHOLD 1528
+#define MUPI_DIV_QR_THRESHOLD 114
+#define MU_BDIV_QR_THRESHOLD 1442
+#define MU_BDIV_Q_THRESHOLD 1466
+
+#define POWM_SEC_TABLE 1,22,102,452,1357
+
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 24
+#define SET_STR_DC_THRESHOLD 270
+#define SET_STR_PRECOMPUTE_THRESHOLD 1149
+
+#define FAC_DSC_THRESHOLD 208
+#define FAC_ODD_THRESHOLD 48
+
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD2_DIV1_METHOD 3 /* 4.69% faster than 1 */
+#define HGCD_THRESHOLD 139
+#define HGCD_APPR_THRESHOLD 174
+#define HGCD_REDUCE_THRESHOLD 3389
+#define GCD_DC_THRESHOLD 599
+#define GCDEXT_DC_THRESHOLD 419
+#define JACOBI_BASE_METHOD 1 /* 1.57% faster than 4 */
+
+/* Tuneup completed successfully, took 83851 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/lshift.asm
new file mode 100644
index 0000000..6ee6153
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/lshift.asm
@@ -0,0 +1,106 @@
+dnl x86 mpn_lshift -- mpn left shift.
+
+dnl Copyright 1992, 1994, 1996, 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P54 7.5
+C P55 7.0
+C P6 2.5
+C K6 4.5
+C K7 5.0
+C P4 14.5
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_lshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+deflit(`FRAME',12)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%edx
+ movl PARAM_SHIFT,%ecx
+
+ subl $4,%esi C adjust src
+
+ movl (%esi,%edx,4),%ebx C read most significant limb
+ xorl %eax,%eax
+ shldl( %cl, %ebx, %eax) C compute carry limb
+ decl %edx
+ jz L(end)
+ pushl %eax C push carry limb onto stack
+ testb $1,%dl
+ jnz L(1) C enter loop in the middle
+ movl %ebx,%eax
+
+ ALIGN(8)
+L(oop): movl (%esi,%edx,4),%ebx C load next lower limb
+ shldl( %cl, %ebx, %eax) C compute result limb
+ movl %eax,(%edi,%edx,4) C store it
+ decl %edx
+L(1): movl (%esi,%edx,4),%eax
+ shldl( %cl, %eax, %ebx)
+ movl %ebx,(%edi,%edx,4)
+ decl %edx
+ jnz L(oop)
+
+ shll %cl,%eax C compute least significant limb
+ movl %eax,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+L(end): shll %cl,%ebx C compute least significant limb
+ movl %ebx,(%edi) C store it
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/mmx/sec_tabselect.asm b/vendor/gmp-6.3.0/mpn/x86/mmx/sec_tabselect.asm
new file mode 100644
index 0000000..aae158a
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/mmx/sec_tabselect.asm
@@ -0,0 +1,163 @@
+dnl X86 MMX mpn_sec_tabselect.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb
+C ali,evn n unal,evn n
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan) 1.33 1.87
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood) 2.1 2.63
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona) 1.7 2.57
+C Intel Atom 1.85 2.7
+C AMD K6
+C AMD K7 1.33 1.33
+C AMD K8
+C AMD K10
+
+define(`rp', `%edi')
+define(`tp', `%esi')
+define(`n', `%edx')
+define(`nents', `%ecx')
+define(`which', `')
+
+define(`i', `%ebp')
+define(`j', `%ebx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sec_tabselect)
+ push %ebx
+ push %esi
+ push %edi
+ push %ebp
+
+ mov 20(%esp), rp
+ mov 24(%esp), tp
+ mov 28(%esp), n
+ mov 32(%esp), nents
+
+ movd 36(%esp), %mm6
+ punpckldq %mm6, %mm6 C 2 copies of `which'
+
+ mov $1, %ebx
+ movd %ebx, %mm7
+ punpckldq %mm7, %mm7 C 2 copies of 1
+
+ mov n, j
+ add $-4, j
+ js L(outer_end)
+
+L(outer_top):
+ mov nents, i
+ mov tp, %eax
+ pxor %mm1, %mm1
+ pxor %mm4, %mm4
+ pxor %mm5, %mm5
+ ALIGN(16)
+L(top): movq %mm6, %mm0
+ pcmpeqd %mm1, %mm0
+ paddd %mm7, %mm1
+ movq (tp), %mm2
+ movq 8(tp), %mm3
+ pand %mm0, %mm2
+ pand %mm0, %mm3
+ por %mm2, %mm4
+ por %mm3, %mm5
+ lea (tp,n,4), tp
+ add $-1, i
+ jne L(top)
+
+ movq %mm4, (rp)
+ movq %mm5, 8(rp)
+
+ lea 16(%eax), tp
+ lea 16(rp), rp
+ add $-4, j
+ jns L(outer_top)
+L(outer_end):
+
+ test $2, %dl
+ jz L(b0x)
+
+L(b1x): mov nents, i
+ mov tp, %eax
+ pxor %mm1, %mm1
+ pxor %mm4, %mm4
+ ALIGN(16)
+L(tp2): movq %mm6, %mm0
+ pcmpeqd %mm1, %mm0
+ paddd %mm7, %mm1
+ movq (tp), %mm2
+ pand %mm0, %mm2
+ por %mm2, %mm4
+ lea (tp,n,4), tp
+ add $-1, i
+ jne L(tp2)
+
+ movq %mm4, (rp)
+
+ lea 8(%eax), tp
+ lea 8(rp), rp
+
+L(b0x): test $1, %dl
+ jz L(b00)
+
+L(b01): mov nents, i
+ pxor %mm1, %mm1
+ pxor %mm4, %mm4
+ ALIGN(16)
+L(tp1): movq %mm6, %mm0
+ pcmpeqd %mm1, %mm0
+ paddd %mm7, %mm1
+ movd (tp), %mm2
+ pand %mm0, %mm2
+ por %mm2, %mm4
+ lea (tp,n,4), tp
+ add $-1, i
+ jne L(tp1)
+
+ movd %mm4, (rp)
+
+L(b00): pop %ebp
+ pop %edi
+ pop %esi
+ pop %ebx
+ emms
+ ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/x86/mod_34lsub1.asm
new file mode 100644
index 0000000..e09e702
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/mod_34lsub1.asm
@@ -0,0 +1,183 @@
+dnl Generic x86 mpn_mod_34lsub1 -- mpn remainder modulo 2^24-1.
+
+dnl Copyright 2000-2002, 2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5 3.0
+C P6 3.66
+C K6 3.0
+C K7 1.3
+C P4 9
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX, `PARAM_SRC')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %edx
+
+ subl $2, %ecx
+ ja L(three_or_more)
+
+ movl (%edx), %eax
+ jb L(one)
+
+ movl 4(%edx), %ecx
+ movl %eax, %edx
+ shrl $24, %eax C src[0] low
+
+ andl $0xFFFFFF, %edx C src[0] high
+ addl %edx, %eax
+ movl %ecx, %edx
+
+ andl $0xFFFF, %ecx
+ shrl $16, %edx C src[1] high
+ addl %edx, %eax
+
+ shll $8, %ecx C src[1] low
+ addl %ecx, %eax
+
+L(one):
+ ret
+
+
+L(three_or_more):
+ C eax
+ C ebx
+ C ecx size-2
+ C edx src
+ C esi
+ C edi
+ C ebp
+
+ movl %ebx, SAVE_EBX C and arrange 16-byte loop alignment
+ xorl %ebx, %ebx
+
+ pushl %esi FRAME_pushl()
+ xorl %esi, %esi
+
+ pushl %edi FRAME_pushl()
+ xorl %eax, %eax C and clear carry flag
+
+
+ C offset 0x40 here
+L(top):
+ C eax acc 0mod3
+ C ebx acc 1mod3
+ C ecx counter, limbs
+ C edx src
+ C esi acc 2mod3
+ C edi
+ C ebp
+
+ leal 12(%edx), %edx
+ leal -2(%ecx), %ecx
+
+ adcl -12(%edx), %eax
+ adcl -8(%edx), %ebx
+ adcl -4(%edx), %esi
+
+ decl %ecx
+ jg L(top)
+
+
+ C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
+
+ movl $0xFFFFFFFF, %edi
+ incl %ecx
+ js L(combine)
+
+ adcl (%edx), %eax
+ movl $0xFFFFFF00, %edi
+ decl %ecx
+ js L(combine)
+
+ adcl 4(%edx), %ebx
+ movl $0xFFFF0000, %edi
+
+
+L(combine):
+ C eax acc 0mod3
+ C ebx acc 1mod3
+ C ecx
+ C edx
+ C esi acc 2mod3
+ C edi mask
+ C ebp
+
+ sbbl %ecx, %ecx C carry
+ movl %eax, %edx C 0mod3
+
+ shrl $24, %eax C 0mod3 high
+ andl %edi, %ecx C carry masked
+
+ subl %ecx, %eax C apply carry
+ movl %ebx, %edi C 1mod3
+
+ shrl $16, %ebx C 1mod3 high
+ andl $0x00FFFFFF, %edx C 0mod3 low
+
+ addl %edx, %eax C apply 0mod3 low
+ andl $0xFFFF, %edi
+
+ shll $8, %edi C 1mod3 low
+ addl %ebx, %eax C apply 1mod3 high
+
+ addl %edi, %eax C apply 1mod3 low
+ movl %esi, %edx C 2mod3
+
+ shrl $8, %esi C 2mod3 high
+ andl $0xFF, %edx C 2mod3 low
+
+ shll $16, %edx C 2mod3 low
+ addl %esi, %eax C apply 2mod3 high
+
+ addl %edx, %eax C apply 2mod3 low
+ popl %edi FRAME_popl()
+
+ movl SAVE_EBX, %ebx
+ popl %esi FRAME_popl()
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/mul_1.asm
new file mode 100644
index 0000000..421de62
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/mul_1.asm
@@ -0,0 +1,140 @@
+dnl x86 mpn_mul_1 (for 386, 486, and Pentium Pro) -- Multiply a limb vector
+dnl with a limb and store the result in a second limb vector.
+
+dnl Copyright 1992, 1994, 1997-2002, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5 12.5
+C P6 model 0-8,10-12 5.5
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan) 5.25
+C P4 model 0 (Willamette) 19.0
+C P4 model 1 (?) 19.0
+C P4 model 2 (Northwood) 19.0
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C AMD K6 10.5
+C AMD K7 4.5
+C AMD K8
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%ecx
+
+ xorl %ebx,%ebx
+ andl $3,%ecx
+ jz L(end0)
+
+L(oop0):
+ movl (%esi),%eax
+ mull PARAM_MULTIPLIER
+ leal 4(%esi),%esi
+ addl %ebx,%eax
+ movl $0,%ebx
+ adcl %ebx,%edx
+ movl %eax,(%edi)
+ movl %edx,%ebx C propagate carry into cylimb
+
+ leal 4(%edi),%edi
+ decl %ecx
+ jnz L(oop0)
+
+L(end0):
+ movl PARAM_SIZE,%ecx
+ shrl $2,%ecx
+ jz L(end)
+
+
+ ALIGN(8)
+L(oop): movl (%esi),%eax
+ mull PARAM_MULTIPLIER
+ addl %eax,%ebx
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 4(%esi),%eax
+ mull PARAM_MULTIPLIER
+ movl %ebx,(%edi)
+ addl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ movl 8(%esi),%eax
+ mull PARAM_MULTIPLIER
+ movl %ebp,4(%edi)
+ addl %eax,%ebx C new lo + cylimb
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 12(%esi),%eax
+ mull PARAM_MULTIPLIER
+ movl %ebx,8(%edi)
+ addl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ movl %ebp,12(%edi)
+
+ leal 16(%esi),%esi
+ leal 16(%edi),%edi
+ decl %ecx
+ jnz L(oop)
+
+L(end): movl %ebx,%eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/mul_basecase.asm
new file mode 100644
index 0000000..8339732
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/mul_basecase.asm
@@ -0,0 +1,223 @@
+dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
+dnl in a third limb vector.
+
+dnl Copyright 1996-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/crossproduct
+C P5 15
+C P6 7.5
+C K6 12.5
+C K7 5.5
+C P4 24
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+C
+C This was written in a haste since the Pentium optimized code that was used
+C for all x86 machines was slow for the Pentium II. This code would benefit
+C from some cleanup.
+C
+C To shave off some percentage of the run-time, one should make 4 variants
+C of the Louter loop, for the four different outcomes of un mod 4. That
+C would avoid Loop0 altogether. Code expansion would be > 4-fold for that
+C part of the function, but since it is not very large, that would be
+C acceptable.
+C
+C The mul loop (at L(oopM)) might need some tweaking. It's current speed is
+C unknown.
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+defframe(VAR_MULTIPLIER, -4)
+defframe(VAR_COUNTER, -8)
+deflit(VAR_STACK_SPACE, 8)
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+ subl $VAR_STACK_SPACE,%esp
+ pushl %esi
+ pushl %ebp
+ pushl %edi
+deflit(`FRAME',eval(VAR_STACK_SPACE+12))
+
+ movl PARAM_XP,%esi
+ movl PARAM_WP,%edi
+ movl PARAM_YP,%ebp
+
+ movl (%esi),%eax C load xp[0]
+ mull (%ebp) C multiply by yp[0]
+ movl %eax,(%edi) C store to wp[0]
+ movl PARAM_XSIZE,%ecx C xsize
+ decl %ecx C If xsize = 1, ysize = 1 too
+ jz L(done)
+
+ pushl %ebx
+FRAME_pushl()
+ movl %edx,%ebx
+
+ leal 4(%esi),%esi
+ leal 4(%edi),%edi
+
+L(oopM):
+ movl (%esi),%eax C load next limb at xp[j]
+ leal 4(%esi),%esi
+ mull (%ebp)
+ addl %ebx,%eax
+ movl %edx,%ebx
+ adcl $0,%ebx
+ movl %eax,(%edi)
+ leal 4(%edi),%edi
+ decl %ecx
+ jnz L(oopM)
+
+ movl %ebx,(%edi) C most significant limb of product
+ addl $4,%edi C increment wp
+ movl PARAM_XSIZE,%eax
+ shll $2,%eax
+ subl %eax,%edi
+ subl %eax,%esi
+
+ movl PARAM_YSIZE,%eax C ysize
+ decl %eax
+ jz L(skip)
+ movl %eax,VAR_COUNTER C set index i to ysize
+
+L(outer):
+ movl PARAM_YP,%ebp C yp
+ addl $4,%ebp C make ebp point to next v limb
+ movl %ebp,PARAM_YP
+ movl (%ebp),%eax C copy y limb ...
+ movl %eax,VAR_MULTIPLIER C ... to stack slot
+ movl PARAM_XSIZE,%ecx
+
+ xorl %ebx,%ebx
+ andl $3,%ecx
+ jz L(end0)
+
+L(oop0):
+ movl (%esi),%eax
+ mull VAR_MULTIPLIER
+ leal 4(%esi),%esi
+ addl %ebx,%eax
+ movl $0,%ebx
+ adcl %ebx,%edx
+ addl %eax,(%edi)
+ adcl %edx,%ebx C propagate carry into cylimb
+
+ leal 4(%edi),%edi
+ decl %ecx
+ jnz L(oop0)
+
+L(end0):
+ movl PARAM_XSIZE,%ecx
+ shrl $2,%ecx
+ jz L(endX)
+
+ ALIGN(8)
+L(oopX):
+ movl (%esi),%eax
+ mull VAR_MULTIPLIER
+ addl %eax,%ebx
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 4(%esi),%eax
+ mull VAR_MULTIPLIER
+ addl %ebx,(%edi)
+ adcl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ movl 8(%esi),%eax
+ mull VAR_MULTIPLIER
+ addl %ebp,4(%edi)
+ adcl %eax,%ebx C new lo + cylimb
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 12(%esi),%eax
+ mull VAR_MULTIPLIER
+ addl %ebx,8(%edi)
+ adcl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ addl %ebp,12(%edi)
+ adcl $0,%ebx C propagate carry into cylimb
+
+ leal 16(%esi),%esi
+ leal 16(%edi),%edi
+ decl %ecx
+ jnz L(oopX)
+
+L(endX):
+ movl %ebx,(%edi)
+ addl $4,%edi
+
+ C we incremented wp and xp in the loop above; compensate
+ movl PARAM_XSIZE,%eax
+ shll $2,%eax
+ subl %eax,%edi
+ subl %eax,%esi
+
+ movl VAR_COUNTER,%eax
+ decl %eax
+ movl %eax,VAR_COUNTER
+ jnz L(outer)
+
+L(skip):
+ popl %ebx
+ popl %edi
+ popl %ebp
+ popl %esi
+ addl $8,%esp
+ ret
+
+L(done):
+ movl %edx,4(%edi) C store to wp[1]
+ popl %edi
+ popl %ebp
+ popl %esi
+ addl $8,%esp
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/nano/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/nano/gmp-mparam.h
new file mode 100644
index 0000000..cd8ac4e
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/nano/gmp-mparam.h
@@ -0,0 +1,162 @@
+/* x86/nano gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_1P_METHOD 1
+#define MOD_1_NORM_THRESHOLD 3
+#define MOD_1_UNNORM_THRESHOLD 3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 9
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 53
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12
+#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 32
+
+#define MUL_TOOM22_THRESHOLD 16
+#define MUL_TOOM33_THRESHOLD 132
+#define MUL_TOOM44_THRESHOLD 195
+#define MUL_TOOM6H_THRESHOLD 270
+#define MUL_TOOM8H_THRESHOLD 478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 129
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 130
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 135
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 28
+#define SQR_TOOM3_THRESHOLD 194
+#define SQR_TOOM4_THRESHOLD 502
+#define SQR_TOOM6_THRESHOLD 746
+#define SQR_TOOM8_THRESHOLD 1005
+
+#define MULMID_TOOM42_THRESHOLD 40
+
+#define MULMOD_BNM1_THRESHOLD 14
+#define SQRMOD_BNM1_THRESHOLD 19
+
+#define POWM_SEC_TABLE 4,23,258,828,2246
+
+#define MUL_FFT_MODF_THRESHOLD 308 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 308, 5}, { 13, 6}, { 7, 5}, { 17, 6}, \
+ { 9, 5}, { 19, 6}, { 11, 5}, { 23, 6}, \
+ { 13, 7}, { 7, 6}, { 17, 7}, { 9, 6}, \
+ { 19, 7}, { 11, 6}, { 24, 7}, { 15, 6}, \
+ { 31, 7}, { 19, 8}, { 11, 7}, { 25, 8}, \
+ { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \
+ { 23, 7}, { 47, 9}, { 15, 8}, { 31, 7}, \
+ { 63, 8}, { 39, 9}, { 23, 8}, { 47,10}, \
+ { 15, 9}, { 31, 8}, { 63, 9}, { 47,10}, \
+ { 31, 9}, { 71,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \
+ { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \
+ { 63,10}, { 127, 9}, { 255, 8}, { 543, 9}, \
+ { 287, 8}, { 575, 7}, { 1215,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 543, 8}, { 1087,10}, { 287, 9}, \
+ { 607, 8}, { 1215,11}, { 159,10}, { 319, 9}, \
+ { 639,10}, { 351, 9}, { 703, 8}, { 1407, 9}, \
+ { 735, 8}, { 1471,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \
+ { 447, 9}, { 895,10}, { 479, 9}, { 959, 8}, \
+ { 1919,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 89
+#define MUL_FFT_THRESHOLD 1856
+
+#define SQR_FFT_MODF_THRESHOLD 396 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 396, 5}, { 13, 6}, { 7, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
+ { 25, 7}, { 15, 6}, { 31, 7}, { 19, 6}, \
+ { 39, 7}, { 21, 8}, { 11, 7}, { 23, 6}, \
+ { 47, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
+ { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \
+ { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \
+ { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 127,10}, \
+ { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 543,10}, { 143, 9}, \
+ { 287, 8}, { 607, 7}, { 1215, 6}, { 2431,10}, \
+ { 159, 8}, { 639,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 543, 8}, \
+ { 1087,10}, { 287, 9}, { 607, 8}, { 1215,11}, \
+ { 159,10}, { 319, 9}, { 671,10}, { 351, 9}, \
+ { 703, 8}, { 1407, 9}, { 735, 8}, { 1471, 7}, \
+ { 2943,11}, { 191,10}, { 383, 9}, { 799,10}, \
+ { 415, 9}, { 895,10}, { 479,12}, { 4096,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 87
+#define SQR_FFT_THRESHOLD 2368
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 51
+#define MULLO_MUL_N_THRESHOLD 3369
+
+#define DC_DIV_QR_THRESHOLD 56
+#define DC_DIVAPPR_Q_THRESHOLD 183
+#define DC_BDIV_QR_THRESHOLD 55
+#define DC_BDIV_Q_THRESHOLD 118
+
+#define INV_MULMOD_BNM1_THRESHOLD 30
+#define INV_NEWTON_THRESHOLD 266
+#define INV_APPR_THRESHOLD 218
+
+#define BINV_NEWTON_THRESHOLD 268
+#define REDC_1_TO_REDC_N_THRESHOLD 56
+
+#define MU_DIV_QR_THRESHOLD 1308
+#define MU_DIVAPPR_Q_THRESHOLD 1528
+#define MUPI_DIV_QR_THRESHOLD 124
+#define MU_BDIV_QR_THRESHOLD 855
+#define MU_BDIV_Q_THRESHOLD 1334
+
+#define MATRIX22_STRASSEN_THRESHOLD 14
+#define HGCD_THRESHOLD 104
+#define HGCD_APPR_THRESHOLD 139
+#define HGCD_REDUCE_THRESHOLD 2121
+#define GCD_DC_THRESHOLD 456
+#define GCDEXT_DC_THRESHOLD 321
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 25
+#define SET_STR_DC_THRESHOLD 542
+#define SET_STR_PRECOMPUTE_THRESHOLD 840
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/README b/vendor/gmp-6.3.0/mpn/x86/p6/README
new file mode 100644
index 0000000..f19d47b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/README
@@ -0,0 +1,125 @@
+Copyright 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+ INTEL P6 MPN SUBROUTINES
+
+
+
+This directory contains code optimized for Intel P6 class CPUs, meaning
+PentiumPro, Pentium II and Pentium III. The mmx and p3mmx subdirectories
+have routines using MMX instructions.
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache, are as follows.
+Some of these might be able to be improved.
+
+ cycles/limb
+
+ mpn_add_n/sub_n 3.7
+
+ mpn_copyi 0.75
+ mpn_copyd 1.75 (or 0.75 if no overlap)
+
+ mpn_divrem_1 39.0
+ mpn_mod_1 21.5
+ mpn_divexact_by3 8.5
+
+ mpn_mul_1 5.5
+ mpn_addmul/submul_1 6.35
+
+ mpn_l/rshift 2.5
+
+ mpn_mul_basecase 8.2 cycles/crossproduct (approx)
+ mpn_sqr_basecase 4.0 cycles/crossproduct (approx)
+ or 7.75 cycles/triangleproduct (approx)
+
+Pentium II and III have MMX and get the following improvements.
+
+ mpn_divrem_1 25.0 integer part, 17.5 fractional part
+
+ mpn_l/rshift 1.75
+
+
+
+
+NOTES
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+
+Mispredicted branches have a penalty of between 9 and 15 cycles, and even up
+to 26 cycles depending how far speculative execution has gone. The 9 cycle
+minimum penalty comes from the issue pipeline being 9 stages.
+
+A copy with rep movs seems to copy 16 bytes at a time, since speeds for 4,
+5, 6 or 7 limb operations are all the same. The 0.75 cycles/limb would be 3
+cycles per 16 byte block.
+
+
+
+
+CODING
+
+Instructions in general code have been shown grouped if they can execute
+together, which means up to three instructions with no successive
+dependencies, and with only the first being a multiple micro-op.
+
+P6 has out-of-order execution, so the groupings are really only showing
+dependent paths where some shuffling might allow some latencies to be
+hidden.
+
+
+
+
+REFERENCES
+
+"Intel Architecture Optimization Reference Manual", 1999, revision 001 dated
+02/99, order number 245127 (order number 730795-001 is in the document too).
+Available on-line:
+
+ http://download.intel.com/design/PentiumII/manuals/245127.htm
+
+"Intel Architecture Optimization Manual", 1997, order number 242816. This
+is an older document mostly about P5 and not as good as the above.
+Available on-line:
+
+ http://download.intel.com/design/PentiumII/manuals/242816.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/aors_n.asm b/vendor/gmp-6.3.0/mpn/x86/p6/aors_n.asm
new file mode 100644
index 0000000..df51c2e
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/aors_n.asm
@@ -0,0 +1,156 @@
+dnl Intel P6 mpn_add_n/mpn_sub_n -- mpn add or subtract.
+
+dnl Copyright 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C * Avoid indexed addressing, it makes us stall on the two-ported register
+C file.
+
+C cycles/limb
+C P6 model 0-8,10-12 3.17
+C P6 model 9 (Banias) 2.15
+C P6 model 13 (Dothan) 2.25
+
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebx')
+define(`n', `%ecx')
+
+ifdef(`OPERATION_add_n', `
+ define(ADCSBB, adc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(func)
+ xor %edx, %edx
+L(start):
+ push %edi
+ push %esi
+ push %ebx
+
+ mov 16(%esp), rp
+ mov 20(%esp), up
+ mov 24(%esp), vp
+ mov 28(%esp), n
+
+ lea (up,n,4), up
+ lea (vp,n,4), vp
+ lea (rp,n,4), rp
+
+ neg n
+ mov n, %eax
+ and $-8, n
+ and $7, %eax
+ shl $2, %eax C 4x
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ lea L(ent) (%eax,%eax,2), %eax C 12x
+')
+
+ shr %edx C set cy flag
+ jmp *%eax
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ lea (%eax,%eax,2), %eax
+ add $L(ent)-L(here), %eax
+ add (%esp), %eax
+ ret_internal
+')
+
+L(end):
+ sbb %eax, %eax
+ neg %eax
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+
+ ALIGN(16)
+L(top):
+ jecxz L(end)
+L(ent):
+Zdisp( mov, 0,(up,n,4), %eax)
+Zdisp( ADCSBB, 0,(vp,n,4), %eax)
+Zdisp( mov, %eax, 0,(rp,n,4))
+
+ mov 4(up,n,4), %edx
+ ADCSBB 4(vp,n,4), %edx
+ mov %edx, 4(rp,n,4)
+
+ mov 8(up,n,4), %eax
+ ADCSBB 8(vp,n,4), %eax
+ mov %eax, 8(rp,n,4)
+
+ mov 12(up,n,4), %edx
+ ADCSBB 12(vp,n,4), %edx
+ mov %edx, 12(rp,n,4)
+
+ mov 16(up,n,4), %eax
+ ADCSBB 16(vp,n,4), %eax
+ mov %eax, 16(rp,n,4)
+
+ mov 20(up,n,4), %edx
+ ADCSBB 20(vp,n,4), %edx
+ mov %edx, 20(rp,n,4)
+
+ mov 24(up,n,4), %eax
+ ADCSBB 24(vp,n,4), %eax
+ mov %eax, 24(rp,n,4)
+
+ mov 28(up,n,4), %edx
+ ADCSBB 28(vp,n,4), %edx
+ mov %edx, 28(rp,n,4)
+
+ lea 8(n), n
+ jmp L(top)
+
+EPILOGUE()
+
+PROLOGUE(func_nc)
+ movl 20(%esp), %edx
+ jmp L(start)
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/aorsmul_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/aorsmul_1.asm
new file mode 100644
index 0000000..bc8c49c
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/aorsmul_1.asm
@@ -0,0 +1,320 @@
+dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+
+dnl Copyright 1999-2002, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12 6.44
+C P6 model 9 (Banias) 6.15
+C P6 model 13 (Dothan) 6.11
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C AMD K6
+C AMD K7
+C AMD K8
+
+
+dnl P6 UNROLL_COUNT cycles/limb
+dnl 8 6.7
+dnl 16 6.35
+dnl 32 6.3
+dnl 64 6.3
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1', `
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+ define(M4_function_1c, mpn_addmul_1c)
+ define(M4_description, add it to)
+ define(M4_desc_retval, carry)
+',`ifdef(`OPERATION_submul_1', `
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+ define(M4_function_1c, mpn_submul_1c)
+ define(M4_description, subtract it from)
+ define(M4_desc_retval, borrow)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C Calculate src,size multiplied by mult and M4_description dst,size.
+C Return the M4_desc_retval limb from the top of the result.
+C
+C This code is pretty much the same as the K6 code. The unrolled loop is
+C the same, but there's just a few scheduling tweaks in the setups and the
+C simple loop.
+C
+C A number of variations have been tried for the unrolled loop, with one or
+C two carries, and with loads scheduled earlier, but nothing faster than 6
+C cycles/limb has been found.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(M4_function_1c)
+ pushl %ebx
+deflit(`FRAME',4)
+ movl PARAM_CARRY, %ebx
+ jmp L(start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function_1)
+ push %ebx
+deflit(`FRAME',4)
+ xorl %ebx, %ebx C initial carry
+
+L(start_nc):
+ movl PARAM_SIZE, %ecx
+ pushl %esi
+deflit(`FRAME',8)
+
+ movl PARAM_SRC, %esi
+ pushl %edi
+deflit(`FRAME',12)
+
+ movl PARAM_DST, %edi
+ pushl %ebp
+deflit(`FRAME',16)
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_MULTIPLIER, %ebp
+ jae L(unroll)
+
+
+ C simple loop
+ C this is offset 0x22, so close enough to aligned
+L(simple):
+ C eax scratch
+ C ebx carry
+ C ecx counter
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+ movl (%esi), %eax
+ addl $4, %edi
+
+ mull %ebp
+
+ addl %ebx, %eax
+ adcl $0, %edx
+
+ M4_inst %eax, -4(%edi)
+ movl %edx, %ebx
+
+ adcl $0, %ebx
+ decl %ecx
+
+ leal 4(%esi), %esi
+ jnz L(simple)
+
+
+ popl %ebp
+ popl %edi
+
+ popl %esi
+ movl %ebx, %eax
+
+ popl %ebx
+ ret
+
+
+
+C------------------------------------------------------------------------------
+C VAR_JUMP holds the computed jump temporarily because there's not enough
+C registers when doing the mul for the initial two carry limbs.
+C
+C The add/adc for the initial carry in %ebx is necessary only for the
+C mpn_add/submul_1c entry points. Duplicating the startup code to
+C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
+C idea.
+
+dnl overlapping with parameters already fetched
+define(VAR_COUNTER,`PARAM_SIZE')
+define(VAR_JUMP, `PARAM_DST')
+
+ C this is offset 0x43, so close enough to aligned
+L(unroll):
+ C eax
+ C ebx initial carry
+ C ecx size
+ C edx
+ C esi src
+ C edi dst
+ C ebp
+
+ movl %ecx, %edx
+ decl %ecx
+
+ subl $2, %edx
+ negl %ecx
+
+ shrl $UNROLL_LOG2, %edx
+ andl $UNROLL_MASK, %ecx
+
+ movl %edx, VAR_COUNTER
+ movl %ecx, %edx
+
+ C 15 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ shll $4, %edx
+ negl %ecx
+
+ leal L(entry) (%edx,%ecx,1), %edx
+')
+ movl (%esi), %eax C src low limb
+
+ movl %edx, VAR_JUMP
+ leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
+
+ mull %ebp
+
+ addl %ebx, %eax C initial carry (from _1c)
+ adcl $0, %edx
+
+ movl %edx, %ebx C high carry
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
+
+ movl VAR_JUMP, %edx
+ testl $1, %ecx
+ movl %eax, %ecx C low carry
+
+ cmovnz( %ebx, %ecx) C high,low carry other way around
+ cmovnz( %eax, %ebx)
+
+ jmp *%edx
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ shll $4, %edx
+ negl %ecx
+
+ C See mpn/x86/README about old gas bugs
+ leal (%edx,%ecx,1), %edx
+ addl $L(entry)-L(here), %edx
+
+ addl (%esp), %edx
+
+ ret_internal
+')
+
+
+C -----------------------------------------------------------
+ ALIGN(32)
+L(top):
+deflit(`FRAME',16)
+ C eax scratch
+ C ebx carry hi
+ C ecx carry lo
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+ C
+ C VAR_COUNTER loop counter
+ C
+ C 15 code bytes per limb
+
+ addl $UNROLL_BYTES, %edi
+
+L(entry):
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%esi), %eax)
+ mull %ebp
+Zdisp( M4_inst,%ecx, disp0,(%edi))
+ adcl %eax, %ebx
+ movl %edx, %ecx
+ adcl $0, %ecx
+
+ movl disp1(%esi), %eax
+ mull %ebp
+ M4_inst %ebx, disp1(%edi)
+ adcl %eax, %ecx
+ movl %edx, %ebx
+ adcl $0, %ebx
+')
+
+ decl VAR_COUNTER
+ leal UNROLL_BYTES(%esi), %esi
+
+ jns L(top)
+
+
+deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
+
+ M4_inst %ecx, disp0(%edi)
+ movl %ebx, %eax
+
+ popl %ebp
+ popl %edi
+
+ popl %esi
+ popl %ebx
+ adcl $0, %eax
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm
new file mode 100644
index 0000000..a0a9d90
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm
@@ -0,0 +1,287 @@
+dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder.
+
+dnl Rearranged from mpn/x86/p6/dive_1.asm by Marco Bodrato.
+
+dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C odd even divisor
+C P6: 10.0 12.0 cycles/limb
+
+C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+
+C The odd case is basically the same as mpn_modexact_1_odd, just with an
+C extra store, and it runs at the same 10 cycles which is the dependent
+C chain.
+C
+C The shifts for the even case aren't on the dependent chain so in principle
+C it could run the same too, but nothing running at 10 has been found.
+C Perhaps there's too many uops (an extra 4 over the odd case).
+
+defframe(PARAM_SHIFT, 24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(STACK_SPACE, 16)
+
+dnl re-use parameter space
+define(VAR_INVERSE,`PARAM_SRC')
+
+ TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse, int shift)
+
+ ALIGN(16)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SIZE, %ebx
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_INVERSE, %ebp
+
+ movl PARAM_SHIFT, %ecx C trailing twos
+
+L(common):
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ leal (%esi,%ebx,4), %esi C src end
+
+ leal (%edi,%ebx,4), %edi C dst end
+ negl %ebx C -size
+
+ movl (%esi,%ebx,4), %eax C src[0]
+
+ orl %ecx, %ecx
+ jz L(odd_entry)
+
+ movl %edi, PARAM_DST
+ movl %ebp, VAR_INVERSE
+
+L(even):
+ C eax src[0]
+ C ebx counter, limbs, negative
+ C ecx shift
+ C edx
+ C esi
+ C edi
+ C ebp
+
+ xorl %ebp, %ebp C initial carry bit
+ xorl %edx, %edx C initial carry limb (for size==1)
+
+ incl %ebx
+ jz L(even_one)
+
+ movl (%esi,%ebx,4), %edi C src[1]
+
+ shrdl( %cl, %edi, %eax)
+
+ jmp L(even_entry)
+
+
+L(even_top):
+ C eax scratch
+ C ebx counter, limbs, negative
+ C ecx shift
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size] and scratch
+ C ebp carry bit
+
+ movl (%esi,%ebx,4), %edi
+
+ mull PARAM_DIVISOR
+
+ movl -4(%esi,%ebx,4), %eax
+ shrdl( %cl, %edi, %eax)
+
+ subl %ebp, %eax
+
+ sbbl %ebp, %ebp
+ subl %edx, %eax
+
+ sbbl $0, %ebp
+
+L(even_entry):
+ imull VAR_INVERSE, %eax
+
+ movl PARAM_DST, %edi
+ negl %ebp
+
+ movl %eax, -4(%edi,%ebx,4)
+ incl %ebx
+ jnz L(even_top)
+
+ mull PARAM_DIVISOR
+
+ movl -4(%esi), %eax
+
+L(even_one):
+ shrl %cl, %eax
+ movl SAVE_ESI, %esi
+
+ subl %ebp, %eax
+ movl SAVE_EBP, %ebp
+
+ subl %edx, %eax
+ movl SAVE_EBX, %ebx
+
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi)
+ movl SAVE_EDI, %edi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+C The dependent chain here is
+C
+C subl %edx, %eax 1
+C imull %ebp, %eax 4
+C mull PARAM_DIVISOR 5
+C ----
+C total 10
+C
+C and this is the measured speed. No special scheduling is necessary, out
+C of order execution hides the load latency.
+
+L(odd_top):
+ C eax scratch (src limb)
+ C ebx counter, limbs, negative
+ C ecx carry bit
+ C edx carry limb, high of last product
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp inverse
+
+ mull PARAM_DIVISOR
+
+ movl (%esi,%ebx,4), %eax
+ subl %ecx, %eax
+
+ sbbl %ecx, %ecx
+ subl %edx, %eax
+
+ sbbl $0, %ecx
+
+L(odd_entry):
+ imull %ebp, %eax
+
+ movl %eax, (%edi,%ebx,4)
+ negl %ecx
+
+ incl %ebx
+ jnz L(odd_top)
+
+
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EBX, %ebx
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+
+ ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SIZE, %ebx
+
+ bsfl %eax, %ecx C trailing twos
+
+ movl %ebp, SAVE_EBP
+
+ shrl %cl, %eax C d without twos
+
+ movl %eax, %edx
+ shrl %eax C d/2 without twos
+
+ movl %edx, PARAM_DIVISOR
+ andl $127, %eax
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %ebp)
+ movzbl (%eax,%ebp), %ebp C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %ebp C inv 8 bits
+')
+
+ leal (%ebp,%ebp), %eax C 2*inv
+
+ imull %ebp, %ebp C inv*inv
+ imull %edx, %ebp C inv*inv*d
+
+ subl %ebp, %eax C inv = 2*inv - inv*inv*d
+ leal (%eax,%eax), %ebp C 2*inv
+
+ imull %eax, %eax C inv*inv
+ imull %edx, %eax C inv*inv*d
+
+ subl %eax, %ebp C inv = 2*inv - inv*inv*d
+
+ jmp L(common)
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/copyd.asm b/vendor/gmp-6.3.0/mpn/x86/p6/copyd.asm
new file mode 100644
index 0000000..1be7636
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/copyd.asm
@@ -0,0 +1,178 @@
+dnl Intel P6 mpn_copyd -- copy limb vector backwards.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: 1.75 cycles/limb, or 0.75 if no overlap
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C An explicit loop is used because a decrementing rep movsl is a bit slow at
+C 2.4 c/l. That rep movsl also has about a 40 cycle startup time, and the
+C code here stands a chance of being faster if the branches predict well.
+C
+C The slightly strange loop form seems necessary for the claimed speed.
+C Maybe load/store ordering affects it.
+C
+C The source and destination are checked to see if they're actually
+C overlapping, since it might be possible to use an incrementing rep movsl
+C at 0.75 c/l. (It doesn't suffer the bad startup time of the decrementing
+C version.)
+C
+C Enhancements:
+C
+C Top speed for an all-integer copy is probably 1.0 c/l, being one load and
+C one store each cycle. Unrolling the loop below would approach 1.0, but
+C it'd be good to know why something like store/load/subl + store/load/jnz
+C doesn't already run at 1.0 c/l. It looks like it should decode in 2
+C cycles, but doesn't run that way.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-using parameter space
+define(SAVE_ESI,`PARAM_SIZE')
+define(SAVE_EDI,`PARAM_SRC')
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ subl $1, %ecx
+ jb L(zero)
+
+ movl (%esi,%ecx,4), %eax C src[size-1]
+ jz L(one)
+
+ movl -4(%esi,%ecx,4), %edx C src[size-2]
+ subl $2, %ecx
+ jbe L(done_loop) C 2 or 3 limbs only
+
+
+ C The usual overlap is
+ C
+ C high low
+ C +------------------+
+ C | dst|
+ C +------------------+
+ C +------------------+
+ C | src|
+ C +------------------+
+ C
+ C We can use an incrementing copy in the following circumstances.
+ C
+ C src+4*size<=dst, since then the regions are disjoint
+ C
+ C src==dst, clearly (though this shouldn't occur normally)
+ C
+ C src>dst, since in that case it's a requirement of the
+ C parameters that src>=dst+size*4, and hence the
+ C regions are disjoint
+ C
+
+ leal (%edi,%ecx,4), %edx
+ cmpl %edi, %esi
+ jae L(use_movsl) C src >= dst
+
+ cmpl %edi, %edx
+ movl 4(%esi,%ecx,4), %edx C src[size-2] again
+ jbe L(use_movsl) C src+4*size <= dst
+
+
+L(top):
+ C eax prev high limb
+ C ebx
+ C ecx counter, size-3 down to 0 or -1, inclusive, by 2s
+ C edx prev low limb
+ C esi src
+ C edi dst
+ C ebp
+
+ movl %eax, 8(%edi,%ecx,4)
+ movl (%esi,%ecx,4), %eax
+
+ movl %edx, 4(%edi,%ecx,4)
+ movl -4(%esi,%ecx,4), %edx
+
+ subl $2, %ecx
+ jnbe L(top)
+
+
+L(done_loop):
+ movl %eax, 8(%edi,%ecx,4)
+ movl %edx, 4(%edi,%ecx,4)
+
+ C copy low limb (needed if size was odd, but will already have been
+ C done in the loop if size was even)
+ movl (%esi), %eax
+L(one):
+ movl %eax, (%edi)
+ movl SAVE_EDI, %edi
+ movl SAVE_ESI, %esi
+
+ ret
+
+
+L(use_movsl):
+ C eax
+ C ebx
+ C ecx size-3
+ C edx
+ C esi src
+ C edi dst
+ C ebp
+
+ addl $3, %ecx
+
+ cld C better safe than sorry, see mpn/x86/README
+
+ rep
+ movsl
+
+L(zero):
+ movl SAVE_ESI, %esi
+ movl SAVE_EDI, %edi
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/dive_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/dive_1.asm
new file mode 100644
index 0000000..7d61a18
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/dive_1.asm
@@ -0,0 +1,267 @@
+dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder.
+
+dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C odd even divisor
+C P6: 10.0 12.0 cycles/limb
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+C The odd case is basically the same as mpn_modexact_1_odd, just with an
+C extra store, and it runs at the same 10 cycles which is the dependent
+C chain.
+C
+C The shifts for the even case aren't on the dependent chain so in principle
+C it could run the same too, but nothing running at 10 has been found.
+C Perhaps there's too many uops (an extra 4 over the odd case).
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+defframe(VAR_INVERSE, -20)
+deflit(STACK_SPACE, 20)
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SIZE, %ebx
+
+ bsfl %eax, %ecx C trailing twos
+
+ movl %ebp, SAVE_EBP
+
+ shrl %cl, %eax C d without twos
+
+ movl %eax, %edx
+ shrl %eax C d/2 without twos
+
+ movl %edx, PARAM_DIVISOR
+ andl $127, %eax
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %ebp)
+ movzbl (%eax,%ebp), %ebp C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %ebp C inv 8 bits
+')
+
+ leal (%ebp,%ebp), %eax C 2*inv
+
+ imull %ebp, %ebp C inv*inv
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ leal (%esi,%ebx,4), %esi C src end
+
+ imull PARAM_DIVISOR, %ebp C inv*inv*d
+
+ subl %ebp, %eax C inv = 2*inv - inv*inv*d
+ leal (%eax,%eax), %ebp C 2*inv
+
+ imull %eax, %eax C inv*inv
+
+ leal (%edi,%ebx,4), %edi C dst end
+ negl %ebx C -size
+
+ movl %edi, PARAM_DST
+
+ imull PARAM_DIVISOR, %eax C inv*inv*d
+
+ subl %eax, %ebp C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
+ movl PARAM_DIVISOR, %eax
+ imull %ebp, %eax
+ cmpl $1, %eax')
+
+ movl %ebp, VAR_INVERSE
+ movl (%esi,%ebx,4), %eax C src[0]
+
+ orl %ecx, %ecx
+ jnz L(even)
+
+ C ecx initial carry is zero
+ jmp L(odd_entry)
+
+
+C The dependent chain here is
+C
+C subl %edx, %eax 1
+C imull %ebp, %eax 4
+C mull PARAM_DIVISOR 5
+C ----
+C total 10
+C
+C and this is the measured speed. No special scheduling is necessary, out
+C of order execution hides the load latency.
+
+L(odd_top):
+ C eax scratch (src limb)
+ C ebx counter, limbs, negative
+ C ecx carry bit
+ C edx carry limb, high of last product
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+
+ mull PARAM_DIVISOR
+
+ movl (%esi,%ebx,4), %eax
+ subl %ecx, %eax
+
+ sbbl %ecx, %ecx
+ subl %edx, %eax
+
+ sbbl $0, %ecx
+
+L(odd_entry):
+ imull VAR_INVERSE, %eax
+
+ movl %eax, (%edi,%ebx,4)
+ negl %ecx
+
+ incl %ebx
+ jnz L(odd_top)
+
+
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EBX, %ebx
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+L(even):
+ C eax src[0]
+ C ebx counter, limbs, negative
+ C ecx shift
+ C edx
+ C esi
+ C edi
+ C ebp
+
+ xorl %ebp, %ebp C initial carry bit
+ xorl %edx, %edx C initial carry limb (for size==1)
+
+ incl %ebx
+ jz L(even_one)
+
+ movl (%esi,%ebx,4), %edi C src[1]
+
+ shrdl( %cl, %edi, %eax)
+
+ jmp L(even_entry)
+
+
+L(even_top):
+ C eax scratch
+ C ebx counter, limbs, negative
+ C ecx shift
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size] and scratch
+ C ebp carry bit
+
+ movl (%esi,%ebx,4), %edi
+
+ mull PARAM_DIVISOR
+
+ movl -4(%esi,%ebx,4), %eax
+ shrdl( %cl, %edi, %eax)
+
+ subl %ebp, %eax
+
+ sbbl %ebp, %ebp
+ subl %edx, %eax
+
+ sbbl $0, %ebp
+
+L(even_entry):
+ imull VAR_INVERSE, %eax
+
+ movl PARAM_DST, %edi
+ negl %ebp
+
+ movl %eax, -4(%edi,%ebx,4)
+ incl %ebx
+ jnz L(even_top)
+
+
+
+ mull PARAM_DIVISOR
+
+ movl -4(%esi), %eax
+
+L(even_one):
+ shrl %cl, %eax
+ movl SAVE_ESI, %esi
+
+ subl %ebp, %eax
+ movl SAVE_EBP, %ebp
+
+ subl %edx, %eax
+ movl SAVE_EBX, %ebx
+
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi)
+ movl SAVE_EDI, %edi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/gcd_11.asm b/vendor/gmp-6.3.0/mpn/x86/p6/gcd_11.asm
new file mode 100644
index 0000000..80e055e
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/gcd_11.asm
@@ -0,0 +1,83 @@
+dnl x86 mpn_gcd_11 optimised for processors with fast BSF.
+
+dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked by Torbjorn Granlund.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2015 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit (approx)
+C AMD K7 7.80
+C AMD K8,K9 7.79
+C AMD K10 4.08
+C AMD bd1 ?
+C AMD bobcat 7.82
+C Intel P4-2 14.9
+C Intel P4-3/4 14.0
+C Intel P6/13 5.09
+C Intel core2 4.22
+C Intel NHM 5.00
+C Intel SBR 5.00
+C Intel atom 17.1
+C VIA nano ?
+C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
+
+
+define(`u0', `%eax')
+define(`v0', `%edx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_gcd_11)
+ push %edi
+ push %esi
+
+ mov 12(%esp), %eax
+ mov 16(%esp), %edx
+ jmp L(odd)
+
+ ALIGN(16) C K10 BD C2 NHM SBR
+L(top): cmovc( %esi, %eax) C u = |v - u| 0,3 0,3 0,6 0,5 0,5
+ cmovc( %edi, %edx) C v = min(u,v) 0,3 0,3 2,8 1,7 1,7
+ shr %cl, %eax C 1,7 1,6 2,8 2,8 2,8
+L(odd): mov %edx, %esi C 1 1 4 3 3
+ sub %eax, %esi C 2 2 5 4 4
+ bsf %esi, %ecx C 3 3 6 5 5
+ mov %eax, %edi C 2 2 3 3 4
+ sub %edx, %eax C 2 2 4 3 4
+ jnz L(top) C
+
+L(end): mov %edx, %eax
+ pop %esi
+ pop %edi
+ ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/p6/gmp-mparam.h
new file mode 100644
index 0000000..96c96fd
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/gmp-mparam.h
@@ -0,0 +1,194 @@
+/* Intel P6 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2008-2010, 2012 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the
+ value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard limit in
+ mpn/x86/p6/sqr_basecase.asm. */
+
+
+/* 1867 MHz P6 model 13 */
+
+#define MOD_1_NORM_THRESHOLD 4
+#define MOD_1_UNNORM_THRESHOLD 4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 21
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 74
+#define MUL_TOOM44_THRESHOLD 181
+#define MUL_TOOM6H_THRESHOLD 252
+#define MUL_TOOM8H_THRESHOLD 363
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 115
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 30
+#define SQR_TOOM3_THRESHOLD 101
+#define SQR_TOOM4_THRESHOLD 154
+#define SQR_TOOM6_THRESHOLD 222
+#define SQR_TOOM8_THRESHOLD 527
+
+#define MULMID_TOOM42_THRESHOLD 58
+
+#define MULMOD_BNM1_THRESHOLD 13
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define POWM_SEC_TABLE 4,23,258,768,2388
+
+#define MUL_FFT_MODF_THRESHOLD 565 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 565, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 25, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \
+ { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
+ { 31, 7}, { 63, 8}, { 39, 9}, { 23, 5}, \
+ { 383, 4}, { 991, 5}, { 511, 6}, { 267, 7}, \
+ { 157, 8}, { 91, 9}, { 47, 8}, { 111, 9}, \
+ { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
+ { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \
+ { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \
+ { 159,10}, { 335, 9}, { 671,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 399, 9}, { 799,10}, \
+ { 415,11}, { 223,12}, { 127,11}, { 255,10}, \
+ { 543, 9}, { 1087,11}, { 287,10}, { 607,11}, \
+ { 319,10}, { 671,12}, { 191,11}, { 383,10}, \
+ { 799,11}, { 415,10}, { 831,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \
+ { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \
+ { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \
+ { 1599,11}, { 863,12}, { 447,11}, { 959,13}, \
+ { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
+ { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \
+ { 1471,13}, { 383,12}, { 831,11}, { 1727,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \
+ { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \
+ { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \
+ { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \
+ { 1407,12}, { 2815,14}, { 767,13}, { 1663,12}, \
+ { 3455,13}, { 8192,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 132
+#define MUL_FFT_THRESHOLD 6784
+
+#define SQR_FFT_MODF_THRESHOLD 472 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 472, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \
+ { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
+ { 31, 8}, { 63, 4}, { 1023, 8}, { 67, 9}, \
+ { 39, 5}, { 639, 4}, { 1471, 6}, { 383, 7}, \
+ { 209, 8}, { 119, 9}, { 63, 7}, { 255, 8}, \
+ { 139, 9}, { 71, 8}, { 143, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159, 8}, { 319, 9}, \
+ { 167,10}, { 95,11}, { 63,10}, { 143, 9}, \
+ { 287,10}, { 159,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 543, 8}, \
+ { 1087,10}, { 287, 9}, { 575,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \
+ { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 399, 9}, { 799,10}, { 415, 9}, \
+ { 831,11}, { 223,12}, { 127,11}, { 255,10}, \
+ { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \
+ { 1215,11}, { 319,10}, { 671, 9}, { 1343,11}, \
+ { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
+ { 799,11}, { 415,10}, { 831,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \
+ { 319,11}, { 671,10}, { 1343,11}, { 735,12}, \
+ { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \
+ { 447,11}, { 959,13}, { 255,12}, { 511,11}, \
+ { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \
+ { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \
+ { 639,12}, { 1471,13}, { 767,12}, { 1727,13}, \
+ { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \
+ { 2111,13}, { 1151,12}, { 2431,13}, { 1407,14}, \
+ { 767,13}, { 1663,12}, { 3455,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 146
+#define SQR_FFT_THRESHOLD 5760
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 33
+#define MULLO_MUL_N_THRESHOLD 13463
+
+#define DC_DIV_QR_THRESHOLD 20
+#define DC_DIVAPPR_Q_THRESHOLD 56
+#define DC_BDIV_QR_THRESHOLD 60
+#define DC_BDIV_Q_THRESHOLD 134
+
+#define INV_MULMOD_BNM1_THRESHOLD 38
+#define INV_NEWTON_THRESHOLD 66
+#define INV_APPR_THRESHOLD 63
+
+#define BINV_NEWTON_THRESHOLD 250
+#define REDC_1_TO_REDC_N_THRESHOLD 63
+
+#define MU_DIV_QR_THRESHOLD 1164
+#define MU_DIVAPPR_Q_THRESHOLD 979
+#define MUPI_DIV_QR_THRESHOLD 38
+#define MU_BDIV_QR_THRESHOLD 1442
+#define MU_BDIV_Q_THRESHOLD 1470
+
+#define MATRIX22_STRASSEN_THRESHOLD 17
+#define HGCD_THRESHOLD 64
+#define HGCD_APPR_THRESHOLD 105
+#define HGCD_REDUCE_THRESHOLD 3524
+#define GCD_DC_THRESHOLD 386
+#define GCDEXT_DC_THRESHOLD 309
+#define JACOBI_BASE_METHOD 1
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 26
+#define SET_STR_DC_THRESHOLD 587
+#define SET_STR_PRECOMPUTE_THRESHOLD 1104
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/lshsub_n.asm b/vendor/gmp-6.3.0/mpn/x86/p6/lshsub_n.asm
new file mode 100644
index 0000000..7ada213
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/lshsub_n.asm
@@ -0,0 +1,169 @@
+dnl Intel P6 mpn_lshsub_n -- mpn papillion support.
+
+dnl Copyright 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C P6/13: 3.35 cycles/limb (separate mpn_sub_n + mpn_lshift needs 4.12)
+
+C (1) The loop is not scheduled in any way, and scheduling attempts have not
+C improved speed on P6/13. Presumably, the K7 will want scheduling, if it
+C at all wants to use MMX.
+C (2) We could save a register by not alternatingly using eax and edx in the
+C loop.
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebx')
+define(`n', `%ecx')
+define(`cnt', `%mm7')
+
+ASM_START()
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_lshsub_n)
+ push %edi
+ push %esi
+ push %ebx
+
+ mov 16(%esp), rp
+ mov 20(%esp), up
+ mov 24(%esp), vp
+ mov 28(%esp), n
+ mov $32, %eax
+ sub 32(%esp), %eax
+ movd %eax, cnt
+
+ lea (up,n,4), up
+ lea (vp,n,4), vp
+ lea (rp,n,4), rp
+
+ neg n
+ mov n, %eax
+ and $-8, n
+ and $7, %eax
+ shl %eax C eax = 2x
+ lea (%eax,%eax,4), %edx C edx = 10x
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ lea L(ent)(%eax,%edx,2), %eax C eax = 22x
+')
+
+ pxor %mm1, %mm1
+ pxor %mm0, %mm0
+
+ jmp *%eax
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ lea (%eax,%edx,2), %eax
+ add $L(ent)-L(here), %eax
+ add (%esp), %eax
+ ret_internal
+')
+
+L(end): C compute (cy<<cnt) | (edx>>(32-cnt))
+ sbb %eax, %eax
+ neg %eax
+ mov 32(%esp), %ecx
+ shld %cl, %edx, %eax
+
+ emms
+
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+ ALIGN(16)
+L(top): jecxz L(end)
+L(ent): mov 0(up,n,4), %eax
+ sbb 0(vp,n,4), %eax
+ movd %eax, %mm0
+ punpckldq %mm0, %mm1
+ psrlq %mm7, %mm1
+ movd %mm1, 0(rp,n,4)
+
+ mov 4(up,n,4), %edx
+ sbb 4(vp,n,4), %edx
+ movd %edx, %mm1
+ punpckldq %mm1, %mm0
+ psrlq %mm7, %mm0
+ movd %mm0, 4(rp,n,4)
+
+ mov 8(up,n,4), %eax
+ sbb 8(vp,n,4), %eax
+ movd %eax, %mm0
+ punpckldq %mm0, %mm1
+ psrlq %mm7, %mm1
+ movd %mm1, 8(rp,n,4)
+
+ mov 12(up,n,4), %edx
+ sbb 12(vp,n,4), %edx
+ movd %edx, %mm1
+ punpckldq %mm1, %mm0
+ psrlq %mm7, %mm0
+ movd %mm0, 12(rp,n,4)
+
+ mov 16(up,n,4), %eax
+ sbb 16(vp,n,4), %eax
+ movd %eax, %mm0
+ punpckldq %mm0, %mm1
+ psrlq %mm7, %mm1
+ movd %mm1, 16(rp,n,4)
+
+ mov 20(up,n,4), %edx
+ sbb 20(vp,n,4), %edx
+ movd %edx, %mm1
+ punpckldq %mm1, %mm0
+ psrlq %mm7, %mm0
+ movd %mm0, 20(rp,n,4)
+
+ mov 24(up,n,4), %eax
+ sbb 24(vp,n,4), %eax
+ movd %eax, %mm0
+ punpckldq %mm0, %mm1
+ psrlq %mm7, %mm1
+ movd %mm1, 24(rp,n,4)
+
+ mov 28(up,n,4), %edx
+ sbb 28(vp,n,4), %edx
+ movd %edx, %mm1
+ punpckldq %mm1, %mm0
+ psrlq %mm7, %mm0
+ movd %mm0, 28(rp,n,4)
+
+ lea 8(n), n
+ jmp L(top)
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm
new file mode 100644
index 0000000..5300616
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm
@@ -0,0 +1,767 @@
+dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t inverse,
+C unsigned shift);
+C
+C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
+C see that file for some comments. It's possible what's here can be improved.
+
+
+dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
+dnl
+dnl The different speeds of the integer and fraction parts means that using
+dnl xsize+size isn't quite right. The threshold wants to be a bit higher
+dnl for the integer part and a bit lower for the fraction part. (Or what's
+dnl really wanted is to speed up the integer part!)
+dnl
+dnl The threshold is set to make the integer part right. At 4 limbs the
+dnl div and mul are about the same there, but on the fractional part the
+dnl mul is much faster.
+
+deflit(MUL_THRESHOLD, 4)
+
+
+defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1
+defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
+defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC, 12)
+defframe(PARAM_XSIZE, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC, -28)
+defframe(VAR_DST, -32)
+defframe(VAR_DST_STOP,-36)
+
+deflit(STACK_SPACE, 36)
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_preinv_divrem_1)
+deflit(`FRAME',0)
+ movl PARAM_XSIZE, %ecx
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SIZE, %ebx
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edx
+
+ movl -4(%esi,%ebx,4), %eax C src high limb
+ xorl %edi, %edi C initial carry (if can't skip a div)
+
+ C
+
+ leal 8(%edx,%ecx,4), %edx C &dst[xsize+2]
+ xor %ecx, %ecx
+
+ movl %edx, VAR_DST_STOP C &dst[xsize+2]
+ cmpl %ebp, %eax C high cmp divisor
+
+ cmovc( %eax, %edi) C high is carry if high<divisor
+
+ cmovnc( %eax, %ecx) C 0 if skip div, src high if not
+ C (the latter in case src==dst)
+
+ movl %ecx, -12(%edx,%ebx,4) C dst high limb
+
+ sbbl $0, %ebx C skip one division if high<divisor
+ movl PARAM_PREINV_SHIFT, %ecx
+
+ leal -8(%edx,%ebx,4), %edx C &dst[xsize+size]
+ movl $32, %eax
+
+ movl %edx, VAR_DST C &dst[xsize+size]
+
+ shll %cl, %ebp C d normalized
+ subl %ecx, %eax
+ movl %ecx, VAR_NORM
+
+ movd %eax, %mm7 C rshift
+ movl PARAM_PREINV_INVERSE, %eax
+ jmp L(start_preinv)
+
+EPILOGUE()
+
+
+
+ ALIGN(16)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ leal -4(%edi,%ebx,4), %edi
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ C offset 0x31, close enough to aligned
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl $0, %edx C initial carry (if can't skip a div)
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ orl %ecx, %ecx C size
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+ jz L(no_skip_div) C if size==0
+
+ movl -4(%esi,%ecx,4), %eax C src high limb
+ xorl %esi, %esi
+ cmpl %ebp, %eax C high cmp divisor
+
+ cmovc( %eax, %edx) C high is carry if high<divisor
+
+ cmovnc( %eax, %esi) C 0 if skip div, src high if not
+ C (the latter in case src==dst)
+
+ movl %esi, (%edi,%ecx,4) C dst high limb
+
+ sbbl $0, %ecx C size-1 if high<divisor
+ movl PARAM_SRC, %esi C reload
+L(no_skip_div):
+
+
+L(start_1c):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ leal (%ebx,%ecx), %eax C size+xsize
+ cmpl $MUL_THRESHOLD, %eax
+ jae L(mul_by_inverse)
+
+ orl %ecx, %ecx
+ jz L(divide_no_integer)
+
+L(divide_integer):
+ C eax scratch (quotient)
+ C ebx xsize
+ C ecx counter
+ C edx scratch (remainder)
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl %ebp
+
+ movl %eax, (%edi,%ecx,4)
+ decl %ecx
+ jnz L(divide_integer)
+
+
+L(divide_no_integer):
+ movl PARAM_DST, %edi
+ orl %ebx, %ebx
+ jnz L(divide_fraction)
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBX, %ebx
+ movl %edx, %eax
+
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+L(divide_fraction):
+ C eax scratch (quotient)
+ C ebx counter
+ C ecx
+ C edx scratch (remainder)
+ C esi
+ C edi dst
+ C ebp divisor
+
+ movl $0, %eax
+
+ divl %ebp
+
+ movl %eax, -4(%edi,%ebx,4)
+ decl %ebx
+ jnz L(divide_fraction)
+
+ jmp L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ leal 12(%edi), %ebx C &dst[xsize+2], loop dst stop
+
+ movl %ebx, VAR_DST_STOP
+ leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
+
+ movl %edi, VAR_DST
+ movl %ecx, %ebx C size
+
+ bsrl %ebp, %ecx C 31-l
+ movl %edx, %edi C carry
+
+ leal 1(%ecx), %eax C 32-l
+ xorl $31, %ecx C l
+
+ movl %ecx, VAR_NORM
+ movl $-1, %edx
+
+ shll %cl, %ebp C d normalized
+ movd %eax, %mm7
+
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+L(start_preinv):
+ C eax inverse
+ C ebx size
+ C ecx shift
+ C edx
+ C esi src
+ C edi carry
+ C ebp divisor
+ C
+ C mm7 rshift
+
+ movl %eax, VAR_INVERSE
+ orl %ebx, %ebx C size
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ movl %eax, VAR_SRC
+ jz L(start_zero)
+
+ movl 8(%eax), %esi C src high limb
+ cmpl $1, %ebx
+ jz L(start_one)
+
+L(start_two_or_more):
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ cmpl $2, %ebx
+ je L(integer_two_left)
+ jmp L(integer_top)
+
+
+L(start_one):
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shll %cl, %esi C n10 = high << l
+ jmp L(integer_one_left)
+
+
+L(start_zero):
+ C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
+ C skipped a division.
+
+ shll %cl, %edi C n2 = carry << l
+ movl %edi, %eax C return value for zero_done
+ cmpl $0, PARAM_XSIZE
+
+ je L(zero_done)
+ jmp L(fraction_some)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C This loop runs at about 25 cycles, which is probably sub-optimal, and
+C certainly more than the dependent chain would suggest. A better loop, or
+C a better rough analysis of what's possible, would be welcomed.
+C
+C In the current implementation, the following successively dependent
+C micro-ops seem to exist.
+C
+C uops
+C n2+n1 1 (addl)
+C mul 5
+C q1+1 3 (addl/adcl)
+C mul 5
+C sub 3 (subl/sbbl)
+C addback 2 (cmov)
+C ---
+C 19
+C
+C Lack of registers hinders explicit scheduling and it might be that the
+C normal out of order execution isn't able to hide enough under the mul
+C latencies.
+C
+C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
+C cmov (and takes one uop off the dependent chain). A sarl/andl/addl
+C combination was tried for the addback (despite the fact it would lengthen
+C the dependent chain) but found to be no faster.
+
+
+ ALIGN(16)
+L(integer_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp d
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+ movl VAR_SRC, %ecx
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+ movq (%ecx), %mm0 C next src limb and the one below it
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ subl $4, %ecx
+
+ movl %ecx, VAR_SRC
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ movl %ebp, %eax C d
+ leal 1(%edi), %ebx C n2+1
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff)
+
+ mull %ebx C (q1+1)*d
+
+ movl VAR_DST, %ecx
+ psrlq %mm7, %mm0
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+ movl VAR_DST_STOP, %eax
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ movd %mm0, %esi
+
+ sbbl $0, %ebx C q
+ subl $4, %ecx
+
+ movl %ebx, (%ecx)
+ cmpl %eax, %ecx
+
+ movl %ecx, VAR_DST
+ jne L(integer_top)
+
+
+L(integer_loop_done):
+
+
+C -----------------------------------------------------------------------------
+C
+C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
+C q1_ff special case. This make the code a bit smaller and simpler, and
+C costs only 2 cycles (each).
+
+L(integer_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm7 rshift
+
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+ movl PARAM_SRC, %ecx
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd (%ecx), %mm0 C src low limb
+
+ movl VAR_DST_STOP, %ecx
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2+1
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ movd %mm0, %esi
+
+ sbbl $0, %ebx C q
+
+ movl %ebx, -4(%ecx)
+
+
+C -----------------------------------------------------------------------------
+L(integer_one_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm7 rshift
+
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+ movl VAR_DST_STOP, %ecx
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2+1
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx C q1 if q1+1 overflowed
+
+ mull %ebx
+
+ C
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+ movl PARAM_XSIZE, %eax
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+
+ sbbl $0, %ebx C q
+
+ movl %ebx, -8(%ecx)
+ subl $8, %ecx
+
+
+
+ orl %eax, %eax C xsize
+ jnz L(fraction_some)
+
+ movl %edi, %eax
+L(fraction_done):
+ movl VAR_NORM, %ecx
+L(zero_done):
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EBX, %ebx
+ addl $STACK_SPACE, %esp
+
+ shrl %cl, %eax
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx
+ C edx
+ C esi n10
+ C edi n2
+ C ebp divisor
+
+ movl VAR_DST, %ecx
+ movl VAR_DST_STOP, %edx
+ subl $4, %ecx
+
+ movl %ecx, VAR_DST
+ psrlq %mm7, %mm0
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+
+ movl $-1, (%ecx)
+ movd %mm0, %esi C next n10
+
+ cmpl %ecx, %edx
+ jne L(integer_top)
+
+ jmp L(integer_loop_done)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C In the current implementation, the following successively dependent
+C micro-ops seem to exist.
+C
+C uops
+C mul 5
+C q1+1 1 (addl)
+C mul 5
+C sub 3 (negl/sbbl)
+C addback 2 (cmov)
+C ---
+C 16
+C
+C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for
+C the addback was found to be a touch slower.
+
+
+ ALIGN(16)
+L(fraction_some):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi
+ C edi carry
+ C ebp divisor
+
+ movl PARAM_DST, %esi
+ movl VAR_DST_STOP, %ecx C &dst[xsize+2]
+ movl %edi, %eax
+
+ subl $8, %ecx C &dst[xsize]
+
+
+ ALIGN(16)
+L(fraction_top):
+ C eax n2, then scratch
+ C ebx scratch (nadj, q1)
+ C ecx dst, decrementing
+ C edx scratch
+ C esi dst stop point
+ C edi n2
+ C ebp divisor
+
+ mull VAR_INVERSE C m*n2
+
+ movl %ebp, %eax C d
+ subl $4, %ecx C dst
+ leal 1(%edi), %ebx
+
+ C
+
+ C
+
+ C
+
+ addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
+
+ mull %ebx C (q1+1)*d
+
+ C
+
+ C
+
+ C
+
+ C
+
+ negl %eax C low of n - (q1+1)*d
+
+ sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
+ leal (%ebp,%eax), %edx
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+
+ sbbl $0, %ebx C q
+ movl %eax, %edi C remainder->n2
+ cmpl %esi, %ecx
+
+ movl %ebx, (%ecx) C previous q
+ jne L(fraction_top)
+
+
+ jmp L(fraction_done)
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h
new file mode 100644
index 0000000..ef29061
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h
@@ -0,0 +1,218 @@
+/* Intel P6/mmx gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991-2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the
+ value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard limit in
+ mpn/x86/p6/sqr_basecase.asm. */
+
+
+/* 800 MHz P6 model 8 */
+/* Generated by tuneup.c, 2017-02-03, gcc 4.8 */
+
+#define MOD_1_1P_METHOD 2
+#define MOD_1_NORM_THRESHOLD 3
+#define MOD_1_UNNORM_THRESHOLD 4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 30
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 14
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1
+#define DIV_QR_1_NORM_THRESHOLD 4
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 62
+
+#define DIV_1_VS_MUL_1_PERCENT 168
+
+#define MUL_TOOM22_THRESHOLD 22
+#define MUL_TOOM33_THRESHOLD 73
+#define MUL_TOOM44_THRESHOLD 195
+#define MUL_TOOM6H_THRESHOLD 254
+#define MUL_TOOM8H_THRESHOLD 381
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 100
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 30 /* WRONG value, see comment above */
+#define SQR_TOOM3_THRESHOLD 83
+#define SQR_TOOM4_THRESHOLD 196
+#define SQR_TOOM6_THRESHOLD 214
+#define SQR_TOOM8_THRESHOLD 381
+
+#define MULMID_TOOM42_THRESHOLD 56
+
+#define MULMOD_BNM1_THRESHOLD 16
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 476, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
+ { 11, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
+ { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
+ { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \
+ { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 167,10}, { 95, 9}, { 199,10}, \
+ { 111,11}, { 63,10}, { 127, 9}, { 255, 8}, \
+ { 511,10}, { 143, 9}, { 287, 8}, { 575,10}, \
+ { 159,11}, { 95,10}, { 191, 9}, { 383,10}, \
+ { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543, 8}, { 1087,10}, \
+ { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \
+ { 639,10}, { 351, 9}, { 703,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 415, 9}, { 831,11}, \
+ { 223,10}, { 447,12}, { 127,11}, { 255,10}, \
+ { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \
+ { 1215,11}, { 319,10}, { 671,11}, { 351,10}, \
+ { 703,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,10}, { 831,11}, { 447,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \
+ { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \
+ { 703,10}, { 1407,11}, { 735,12}, { 383,11}, \
+ { 831,12}, { 447,11}, { 959,10}, { 1919,13}, \
+ { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
+ { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \
+ { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \
+ { 1535,12}, { 831,11}, { 1727,12}, { 959,11}, \
+ { 1919,14}, { 255,13}, { 511,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \
+ { 767,12}, { 1727,13}, { 895,12}, { 1919,11}, \
+ { 3839,14}, { 511,13}, { 1023,12}, { 2111,13}, \
+ { 1151,12}, { 2431,13}, { 1279,12}, { 2559,13}, \
+ { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \
+ { 3327,13}, { 1919,12}, { 3839,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 160
+#define MUL_FFT_THRESHOLD 7040
+
+#define SQR_FFT_MODF_THRESHOLD 376 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 376, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
+ { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
+ { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
+ { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
+ { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
+ { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \
+ { 255, 9}, { 135,10}, { 79, 9}, { 167,10}, \
+ { 95, 9}, { 191, 8}, { 383,10}, { 111,11}, \
+ { 63,10}, { 127, 9}, { 255, 8}, { 511, 9}, \
+ { 271,10}, { 143, 9}, { 287, 8}, { 575, 9}, \
+ { 303, 8}, { 607,10}, { 159, 9}, { 319,11}, \
+ { 95,10}, { 191, 9}, { 383,10}, { 207,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
+ { 271, 9}, { 543,10}, { 287, 9}, { 575,10}, \
+ { 303,11}, { 159,10}, { 319, 9}, { 639,10}, \
+ { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \
+ { 479,12}, { 127,11}, { 255,10}, { 543, 9}, \
+ { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \
+ { 319,10}, { 671,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,11}, { 479,13}, { 127,12}, { 255,11}, \
+ { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \
+ { 319,11}, { 671,10}, { 1343,11}, { 703,10}, \
+ { 1407,11}, { 735,12}, { 383,11}, { 831,12}, \
+ { 447,11}, { 959,10}, { 1919,13}, { 255,12}, \
+ { 511,11}, { 1087,12}, { 575,11}, { 1215,10}, \
+ { 2431,12}, { 639,11}, { 1343,12}, { 703,11}, \
+ { 1407,13}, { 383,12}, { 831,11}, { 1727,12}, \
+ { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \
+ { 1215,11}, { 2431,13}, { 639,12}, { 1471,11}, \
+ { 2943,13}, { 767,12}, { 1727,13}, { 895,12}, \
+ { 1919,11}, { 3839,14}, { 511,13}, { 1023,12}, \
+ { 2111,13}, { 1151,12}, { 2431,13}, { 1407,12}, \
+ { 2943,14}, { 767,13}, { 1535,12}, { 3071,13}, \
+ { 1663,12}, { 3455,13}, { 1919,12}, { 3839,15}, \
+ { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 161
+#define SQR_FFT_THRESHOLD 3712
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 62
+#define MULLO_MUL_N_THRESHOLD 13463
+#define SQRLO_BASECASE_THRESHOLD 0 /* always */
+#define SQRLO_DC_THRESHOLD 177
+#define SQRLO_SQR_THRESHOLD 8937
+
+#define DC_DIV_QR_THRESHOLD 80
+#define DC_DIVAPPR_Q_THRESHOLD 240
+#define DC_BDIV_QR_THRESHOLD 76
+#define DC_BDIV_Q_THRESHOLD 166
+
+#define INV_MULMOD_BNM1_THRESHOLD 42
+#define INV_NEWTON_THRESHOLD 262
+#define INV_APPR_THRESHOLD 250
+
+#define BINV_NEWTON_THRESHOLD 272
+#define REDC_1_TO_REDC_N_THRESHOLD 72
+
+#define MU_DIV_QR_THRESHOLD 1499
+#define MU_DIVAPPR_Q_THRESHOLD 1470
+#define MUPI_DIV_QR_THRESHOLD 124
+#define MU_BDIV_QR_THRESHOLD 1142
+#define MU_BDIV_Q_THRESHOLD 1341
+
+#define POWM_SEC_TABLE 1,16,96,416,1259
+
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 27
+#define SET_STR_DC_THRESHOLD 270
+#define SET_STR_PRECOMPUTE_THRESHOLD 1084
+
+#define FAC_DSC_THRESHOLD 194
+#define FAC_ODD_THRESHOLD 25
+
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD_THRESHOLD 124
+#define HGCD_APPR_THRESHOLD 152
+#define HGCD_REDUCE_THRESHOLD 3014
+#define GCD_DC_THRESHOLD 474
+#define GCDEXT_DC_THRESHOLD 321
+#define JACOBI_BASE_METHOD 1
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm
new file mode 100644
index 0000000..febd1c0
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm
@@ -0,0 +1,38 @@
+dnl Intel Pentium-II mpn_lshift -- mpn left shift.
+
+dnl Copyright 2001 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl The P55 code runs well on P-II/III, but could stand some minor tweaks
+dnl at some stage probably.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86/pentium/mmx/lshift.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mmx/popham.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/popham.asm
new file mode 100644
index 0000000..fd340e4
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/popham.asm
@@ -0,0 +1,39 @@
+dnl Intel Pentium-II mpn_popcount, mpn_hamdist -- population count and
+dnl hamming distance.
+
+dnl Copyright 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6MMX: popcount 11 cycles/limb (approx), hamdist 11.5 cycles/limb (approx)
+
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm
new file mode 100644
index 0000000..77aa190
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm
@@ -0,0 +1,38 @@
+dnl Intel Pentium-II mpn_rshift -- mpn left shift.
+
+dnl Copyright 2001 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl The P55 code runs well on P-II/III, but could stand some minor tweaks
+dnl at some stage probably.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86/pentium/mmx/rshift.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mod_34lsub1.asm
new file mode 100644
index 0000000..b88ab5d
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/mod_34lsub1.asm
@@ -0,0 +1,190 @@
+dnl Intel P6 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl Copyright 2000-2002, 2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: 2.0 cycles/limb
+
+C TODO
+C Experiments with more unrolling indicate that 1.5 c/l is possible on P6-13
+C with the current carry handling scheme.
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C Groups of three limbs are handled, with carry bits from 0mod3 into 1mod3
+C into 2mod3, but at that point going into a separate carries total so we
+C don't keep the carry flag live across the loop control. Avoiding decl
+C lets us get to 2.0 c/l, as compared to the generic x86 code at 3.66.
+C
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX, `PARAM_SIZE')
+define(SAVE_ESI, `PARAM_SRC')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %edx
+
+ subl $2, %ecx C size-2
+ movl (%edx), %eax C src[0]
+ ja L(three_or_more)
+ jb L(one)
+
+ C size==2
+
+ movl 4(%edx), %ecx C src[1]
+
+ movl %eax, %edx C src[0]
+ shrl $24, %eax C src[0] high
+
+ andl $0xFFFFFF, %edx C src[0] low
+
+ addl %edx, %eax
+ movl %ecx, %edx C src[1]
+ shrl $16, %ecx C src[1] high
+
+ andl $0xFFFF, %edx
+ addl %ecx, %eax
+
+ shll $8, %edx C src[1] low
+
+ addl %edx, %eax
+L(one):
+ ret
+
+
+L(three_or_more):
+ C eax src[0], initial acc 0mod3
+ C ebx
+ C ecx size-2
+ C edx src
+ C esi
+ C edi
+ C ebp
+
+ movl %ebx, SAVE_EBX
+ movl 4(%edx), %ebx C src[1], initial 1mod3
+ subl $3, %ecx C size-5
+
+ movl %esi, SAVE_ESI
+ movl 8(%edx), %esi C src[2], initial 2mod3
+
+ pushl %edi FRAME_pushl()
+ movl $0, %edi C initial carries 0mod3
+ jng L(done) C if size < 6
+
+
+L(top):
+ C eax acc 0mod3
+ C ebx acc 1mod3
+ C ecx counter, limbs
+ C edx src
+ C esi acc 2mod3
+ C edi carrys into 0mod3
+ C ebp
+
+ addl 12(%edx), %eax
+ adcl 16(%edx), %ebx
+ adcl 20(%edx), %esi
+ leal 12(%edx), %edx
+ adcl $0, %edi
+
+ subl $3, %ecx
+ jg L(top) C at least 3 more to process
+
+
+L(done):
+ C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs respectively
+ cmpl $-1, %ecx
+ jl L(done_0) C if -2, meaning 0 more limbs
+
+ C 1 or 2 more limbs
+ movl $0, %ecx
+ je L(done_1) C if -1, meaning 1 more limb only
+ movl 16(%edx), %ecx
+L(done_1):
+ addl 12(%edx), %eax C 0mod3
+ adcl %ecx, %ebx C 1mod3
+ adcl $0, %esi C 2mod3
+ adcl $0, %edi C carries 0mod3
+
+L(done_0):
+ C eax acc 0mod3
+ C ebx acc 1mod3
+ C ecx
+ C edx
+ C esi acc 2mod3
+ C edi carries 0mod3
+ C ebp
+
+ movl %eax, %ecx C 0mod3
+ shrl $24, %eax C 0mod3 high initial total
+
+ andl $0xFFFFFF, %ecx C 0mod3 low
+ movl %edi, %edx C carries
+ shrl $24, %edi C carries high
+
+ addl %ecx, %eax C add 0mod3 low
+ andl $0xFFFFFF, %edx C carries 0mod3 low
+ movl %ebx, %ecx C 1mod3
+
+ shrl $16, %ebx C 1mod3 high
+ addl %edi, %eax C add carries high
+ addl %edx, %eax C add carries 0mod3 low
+
+ andl $0xFFFF, %ecx C 1mod3 low mask
+ addl %ebx, %eax C add 1mod3 high
+ movl SAVE_EBX, %ebx
+
+ shll $8, %ecx C 1mod3 low
+ movl %esi, %edx C 2mod3
+ popl %edi FRAME_popl()
+
+ shrl $8, %esi C 2mod3 high
+ andl $0xFF, %edx C 2mod3 low mask
+ addl %ecx, %eax C add 1mod3 low
+
+ shll $16, %edx C 2mod3 low
+ addl %esi, %eax C add 2mod3 high
+ movl SAVE_ESI, %esi
+
+ addl %edx, %eax C add 2mod3 low
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mode1o.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mode1o.asm
new file mode 100644
index 0000000..7083195
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/mode1o.asm
@@ -0,0 +1,170 @@
+dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder.
+
+dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: 10.0 cycles/limb
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C
+C It's not worth skipping a step at the end when high<divisor since the main
+C loop is only 10 cycles.
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+dnl Not enough room under modexact_1 to make these re-use the parameter
+dnl space, unfortunately.
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+deflit(STACK_SPACE, 12)
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+ movl PARAM_CARRY, %ecx
+ jmp L(start_1c)
+
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+ xorl %ecx, %ecx
+L(start_1c):
+ movl PARAM_DIVISOR, %eax
+
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ shrl %eax C d/2
+ movl %edi, SAVE_EDI
+
+ andl $127, %eax
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %edi)
+ movzbl (%eax,%edi), %edi C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %edi C inv 8 bits
+')
+
+ xorl %edx, %edx C initial extra carry
+ leal (%edi,%edi), %eax C 2*inv
+
+ imull %edi, %edi C inv*inv
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SIZE, %ebx
+
+ imull PARAM_DIVISOR, %edi C inv*inv*d
+
+ subl %edi, %eax C inv = 2*inv - inv*inv*d
+ leal (%eax,%eax), %edi C 2*inv
+
+ imull %eax, %eax C inv*inv
+
+ imull PARAM_DIVISOR, %eax C inv*inv*d
+
+ leal (%esi,%ebx,4), %esi C src end
+ negl %ebx C -size
+
+ subl %eax, %edi C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
+ movl PARAM_DIVISOR, %eax
+ imull %edi, %eax
+ cmpl $1, %eax')
+
+
+C The dependent chain here is
+C
+C subl %edx, %eax 1
+C imull %edi, %eax 4
+C mull PARAM_DIVISOR 5
+C ----
+C total 10
+C
+C and this is the measured speed. No special scheduling is necessary, out
+C of order execution hides the load latency.
+
+L(top):
+ C eax scratch (src limb)
+ C ebx counter, limbs, negative
+ C ecx carry bit, 0 or 1
+ C edx carry limb, high of last product
+ C esi &src[size]
+ C edi inverse
+ C ebp
+
+ movl (%esi,%ebx,4), %eax
+ subl %ecx, %eax
+
+ sbbl %ecx, %ecx
+ subl %edx, %eax
+
+ sbbl $0, %ecx
+
+ imull %edi, %eax
+
+ negl %ecx
+
+ mull PARAM_DIVISOR
+
+ incl %ebx
+ jnz L(top)
+
+
+ movl SAVE_ESI, %esi
+ leal (%ecx,%edx), %eax
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBX, %ebx
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mul_basecase.asm
new file mode 100644
index 0000000..d87bc12
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/mul_basecase.asm
@@ -0,0 +1,607 @@
+dnl Intel P6 mpn_mul_basecase -- multiply two mpn numbers.
+
+dnl Copyright 1999-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: approx 6.5 cycles per cross product (16 limbs/loop unrolling).
+
+
+dnl P6 UNROLL_COUNT cycles/product (approx)
+dnl 8 7
+dnl 16 6.5
+dnl 32 6.4
+dnl Maximum possible with the current code is 32.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() startup
+C calculations only once.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_XSIZE, %ecx
+
+ movl PARAM_YP, %eax
+
+ movl PARAM_XP, %edx
+
+ movl (%eax), %eax C yp[0]
+ cmpl $2, %ecx
+ ja L(xsize_more_than_two)
+ je L(two_by_something)
+
+
+ C one limb by one limb
+
+ mull (%edx)
+
+ movl PARAM_WP, %ecx
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+deflit(`FRAME',0)
+
+dnl re-use parameter space
+define(SAVE_EBX, `PARAM_XSIZE')
+define(SAVE_ESI, `PARAM_YSIZE')
+
+ movl %ebx, SAVE_EBX
+ cmpl $1, PARAM_YSIZE
+ movl %eax, %ecx C yp[0]
+
+ movl %esi, SAVE_ESI C save esi
+ movl PARAM_WP, %ebx
+ movl %edx, %esi C xp
+
+ movl (%edx), %eax C xp[0]
+ jne L(two_by_two)
+
+
+ C two limbs by one limb
+ C
+ C eax xp[0]
+ C ebx wp
+ C ecx yp[0]
+ C edx
+ C esi xp
+
+ mull %ecx
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+ movl %edx, %esi C carry
+
+ mull %ecx
+
+ addl %eax, %esi
+
+ movl %esi, 4(%ebx)
+ movl SAVE_ESI, %esi
+
+ adcl $0, %edx
+
+ movl %edx, 8(%ebx)
+ movl SAVE_EBX, %ebx
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+
+ ALIGN(16)
+L(two_by_two):
+ C eax xp[0]
+ C ebx wp
+ C ecx yp[0]
+ C edx
+ C esi xp
+ C edi
+ C ebp
+
+dnl more parameter space re-use
+define(SAVE_EDI, `PARAM_WP')
+
+ mull %ecx C xp[0] * yp[0]
+
+ movl %edi, SAVE_EDI
+ movl %edx, %edi C carry, for wp[1]
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+
+ mull %ecx C xp[1] * yp[0]
+
+ addl %eax, %edi
+ movl PARAM_YP, %ecx
+
+ adcl $0, %edx
+ movl 4(%ecx), %ecx C yp[1]
+
+ movl %edi, 4(%ebx)
+ movl 4(%esi), %eax C xp[1]
+ movl %edx, %edi C carry, for wp[2]
+
+ mull %ecx C xp[1] * yp[1]
+
+ addl %eax, %edi
+ movl (%esi), %eax C xp[0]
+
+ adcl $0, %edx
+ movl %edx, %esi C carry, for wp[3]
+
+ mull %ecx C xp[0] * yp[1]
+
+ addl %eax, 4(%ebx)
+ movl %esi, %eax
+
+ adcl %edx, %edi
+ movl SAVE_ESI, %esi
+
+ movl %edi, 8(%ebx)
+
+ adcl $0, %eax
+ movl SAVE_EDI, %edi
+
+ movl %eax, 12(%ebx)
+ movl SAVE_EBX, %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(xsize_more_than_two):
+
+C The first limb of yp is processed with a simple mpn_mul_1 loop running at
+C about 6.2 c/l. Unrolling this doesn't seem worthwhile since it's only run
+C once (whereas the addmul_1 below is run ysize-1 many times). A call to
+C mpn_mul_1 would be slowed down by the parameter pushing and popping etc,
+C and doesn't seem likely to be worthwhile on the typical sizes reaching
+C here from the Karatsuba code.
+
+ C eax yp[0]
+ C ebx
+ C ecx xsize
+ C edx xp
+ C esi
+ C edi
+ C ebp
+
+defframe(`SAVE_EBX', -4)
+defframe(`SAVE_ESI', -8)
+defframe(`SAVE_EDI', -12)
+defframe(`SAVE_EBP', -16)
+defframe(VAR_COUNTER, -20) dnl for use in the unroll case
+defframe(VAR_ADJUST, -24)
+defframe(VAR_JMP, -28)
+defframe(VAR_SWAP, -32)
+defframe(VAR_XP_LOW, -36)
+deflit(STACK_SPACE, 36)
+
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %edi, SAVE_EDI
+ movl PARAM_WP, %edi
+
+ movl %ebx, SAVE_EBX
+
+ movl %ebp, SAVE_EBP
+ movl %eax, %ebp
+
+ movl %esi, SAVE_ESI
+ xorl %ebx, %ebx
+ leal (%edx,%ecx,4), %esi C xp end
+
+ leal (%edi,%ecx,4), %edi C wp end of mul1
+ negl %ecx
+
+
+L(mul1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, negative
+ C edx scratch
+ C esi xp end
+ C edi wp end of mul1
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl %eax, (%edi,%ecx,4)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ incl %ecx
+ jnz L(mul1)
+
+
+ movl PARAM_YSIZE, %edx
+
+ movl %ebx, (%edi) C final carry
+ movl PARAM_XSIZE, %ecx
+ decl %edx
+
+ jz L(done) C if ysize==1
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_YP, %eax
+ jae L(unroll)
+
+
+C -----------------------------------------------------------------------------
+ C simple addmul looping
+ C
+ C eax yp
+ C ebx
+ C ecx xsize
+ C edx ysize-1
+ C esi xp end
+ C edi wp end of mul1
+ C ebp
+
+ leal 4(%eax,%edx,4), %ebp C yp end
+ negl %ecx
+ negl %edx
+
+ movl %edx, PARAM_YSIZE C -(ysize-1)
+ movl (%esi,%ecx,4), %eax C xp low limb
+ incl %ecx
+
+ movl %ecx, PARAM_XSIZE C -(xsize-1)
+ xorl %ebx, %ebx C initial carry
+
+ movl %ebp, PARAM_YP
+ movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier
+ jmp L(simple_outer_entry)
+
+
+L(simple_outer_top):
+ C ebp ysize counter, negative
+
+ movl PARAM_YP, %edx
+
+ movl PARAM_XSIZE, %ecx C -(xsize-1)
+ xorl %ebx, %ebx C carry
+
+ movl %ebp, PARAM_YSIZE
+ addl $4, %edi C next position in wp
+
+ movl (%edx,%ebp,4), %ebp C yp limb - multiplier
+
+ movl -4(%esi,%ecx,4), %eax C xp low limb
+
+
+L(simple_outer_entry):
+
+L(simple_inner_top):
+ C eax xp limb
+ C ebx carry limb
+ C ecx loop counter (negative)
+ C edx scratch
+ C esi xp end
+ C edi wp end
+ C ebp multiplier
+
+ mull %ebp
+
+ addl %eax, %ebx
+ adcl $0, %edx
+
+ addl %ebx, (%edi,%ecx,4)
+ movl (%esi,%ecx,4), %eax
+ adcl $0, %edx
+
+ incl %ecx
+ movl %edx, %ebx
+ jnz L(simple_inner_top)
+
+
+ C separate code for last limb so outer loop counter handling can be
+ C interleaved
+
+ mull %ebp
+
+ movl PARAM_YSIZE, %ebp
+ addl %eax, %ebx
+
+ adcl $0, %edx
+
+ addl %ebx, (%edi)
+
+ adcl $0, %edx
+ incl %ebp
+
+ movl %edx, 4(%edi)
+ jnz L(simple_outer_top)
+
+
+L(done):
+ movl SAVE_EBX, %ebx
+
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The unrolled loop is the same as in mpn_addmul_1, see that code for some
+C comments.
+C
+C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
+C increment xp and wp. This is used to adjust xp and wp, and is rshifted to
+C given an initial VAR_COUNTER at the top of the outer loop.
+C
+C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
+C up to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C VAR_SWAP is 0 if xsize odd or 0xFFFFFFFF if xsize even, used to swap the
+C initial ebx and ecx on entry to the unrolling.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C The trick with the VAR_ADJUST value means it's only necessary to do one
+C fetch in the outer loop to take care of xp, wp and the inner loop counter.
+
+
+L(unroll):
+ C eax yp
+ C ebx
+ C ecx xsize
+ C edx ysize-1
+ C esi xp end
+ C edi wp end of mul1
+ C ebp
+
+ movl PARAM_XP, %esi
+
+ movl 4(%eax), %ebp C multiplier (yp second limb)
+ leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing
+
+ movl %eax, PARAM_YP
+ movl PARAM_WP, %edi
+ negl %edx
+
+ movl %edx, PARAM_YSIZE
+ leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1
+ decl %ecx C xsize-1
+
+ movl (%esi), %eax C xp low limb
+ andl $-UNROLL_MASK-1, %ebx
+ negl %ecx C -(xsize-1)
+
+ negl %ebx
+ andl $UNROLL_MASK, %ecx
+
+ movl %ebx, VAR_ADJUST
+ movl %ecx, %edx
+ shll $4, %ecx
+
+ movl %eax, VAR_XP_LOW
+ sarl $UNROLL_LOG2, %ebx
+ negl %edx
+
+ C 15 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(unroll_here):
+',`
+ leal L(unroll_inner_entry) (%ecx,%edx,1), %ecx
+')
+
+ movl %ecx, VAR_JMP
+ movl %edx, %ecx
+ shll $31, %edx
+
+ sarl $31, %edx C 0 or -1 as xsize odd or even
+ leal 4(%edi,%ecx,4), %edi C wp and xp, adjust for unrolling,
+ leal 4(%esi,%ecx,4), %esi C and start at second limb
+
+ movl %edx, VAR_SWAP
+ jmp L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ leal (%ecx,%edx,1), %ecx
+ addl $L(unroll_inner_entry)-L(unroll_here), %ecx
+ addl (%esp), %ecx
+ ret_internal
+')
+
+
+C --------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll_outer_top):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi xp + offset
+ C edi wp + offset
+ C ebp ysize counter, negative
+
+ movl VAR_ADJUST, %ebx
+ movl PARAM_YP, %edx
+
+ movl VAR_XP_LOW, %eax
+ movl %ebp, PARAM_YSIZE C store incremented ysize counter
+
+ leal eval(UNROLL_BYTES + 4) (%edi,%ebx,4), %edi
+ leal (%esi,%ebx,4), %esi
+ sarl $UNROLL_LOG2, %ebx
+
+ movl (%edx,%ebp,4), %ebp C yp next multiplier
+
+L(unroll_outer_entry):
+ mull %ebp
+
+ movl %ebx, VAR_COUNTER
+ movl %edx, %ebx C carry high
+ movl %eax, %ecx C carry low
+
+ xorl %edx, %eax
+ movl VAR_JMP, %edx
+
+ andl VAR_SWAP, %eax
+
+ xorl %eax, %ebx C carries other way for odd index
+ xorl %eax, %ecx
+
+ jmp *%edx
+
+
+C -----------------------------------------------------------------------------
+
+L(unroll_inner_top):
+ C eax xp limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi xp+8
+ C edi wp
+ C ebp yp multiplier limb
+ C
+ C VAR_COUNTER loop counter, negative
+ C
+ C 15 bytes each limb
+
+ addl $UNROLL_BYTES, %edi
+
+L(unroll_inner_entry):
+
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%esi), %eax)
+ mull %ebp
+Zdisp( addl, %ecx, disp0,(%edi))
+ adcl %eax, %ebx C new carry low
+ movl %edx, %ecx
+ adcl $0, %ecx C new carry high
+
+ movl disp1(%esi), %eax
+ mull %ebp
+ addl %ebx, disp1(%edi)
+ adcl %eax, %ecx C new carry low
+ movl %edx, %ebx
+ adcl $0, %ebx C new carry high
+')
+
+
+ incl VAR_COUNTER
+ leal UNROLL_BYTES(%esi), %esi
+ jnz L(unroll_inner_top)
+
+
+ C eax
+ C ebx carry high
+ C ecx carry low
+ C edx
+ C esi
+ C edi wp, pointing at second last limb)
+ C ebp
+
+deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
+deflit(`disp1', eval(disp0 + 4))
+
+ movl PARAM_YSIZE, %ebp
+ addl %ecx, disp0(%edi) C carry low
+
+ adcl $0, %ebx
+ incl %ebp
+
+ movl %ebx, disp1(%edi) C carry high
+ jnz L(unroll_outer_top)
+
+
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/p3mmx/popham.asm b/vendor/gmp-6.3.0/mpn/x86/p6/p3mmx/popham.asm
new file mode 100644
index 0000000..db2f260
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/p3mmx/popham.asm
@@ -0,0 +1,42 @@
+dnl Intel Pentium-III mpn_popcount, mpn_hamdist -- population count and
+dnl hamming distance.
+
+dnl Copyright 2000, 2002, 2004, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C popcount hamdist
+C P3 generic 6.5 7
+C P3 model 9 (Banias) ? ?
+C P3 model 13 (Dothan) 5.75 6
+
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k7/mmx/popham.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sqr_basecase.asm
new file mode 100644
index 0000000..8fc7fdf
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/sqr_basecase.asm
@@ -0,0 +1,649 @@
+dnl Intel P6 mpn_sqr_basecase -- square an mpn number.
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
+C product (measured on the speed difference between 20 and 40 limbs,
+C which is the Karatsuba recursing range).
+
+
+dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
+dnl a description. The only difference here is that UNROLL_COUNT can go up
+dnl to 64 (not 63) making SQR_TOOM2_THRESHOLD_MAX 67.
+
+deflit(SQR_TOOM2_THRESHOLD_MAX, 67)
+
+ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
+`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
+
+m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
+deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the given size
+C is small.
+C
+C The code size might look a bit excessive, but not all of it is executed so
+C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases
+C clearly apply only to those sizes; mid sizes like 10x10 only need part of
+C the unrolled addmul; and big sizes like 40x40 that do use the full
+C unrolling will least be making good use of it, because 40x40 will take
+C something like 7000 cycles.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+
+ movl PARAM_SRC, %eax
+
+ cmpl $2, %edx
+ movl PARAM_DST, %ecx
+ je L(two_limbs)
+
+ movl (%eax), %eax
+ ja L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+ C eax src limb
+ C ebx
+ C ecx dst
+ C edx
+
+ mull %eax
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx dst
+ C edx
+
+defframe(SAVE_ESI, -4)
+defframe(SAVE_EBX, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(`STACK_SPACE',16)
+
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl %eax, %esi
+ movl (%eax), %eax
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx) C dst[0]
+ movl 4(%esi), %eax
+
+ movl %ebx, SAVE_EBX
+ movl %edx, %ebx C dst[1]
+
+ mull %eax C src[1]^2
+
+ movl %edi, SAVE_EDI
+ movl %eax, %edi C dst[2]
+ movl (%esi), %eax
+
+ movl %ebp, SAVE_EBP
+ movl %edx, %ebp C dst[3]
+
+ mull 4(%esi) C src[0]*src[1]
+
+ addl %eax, %ebx
+ movl SAVE_ESI, %esi
+
+ adcl %edx, %edi
+
+ adcl $0, %ebp
+ addl %ebx, %eax
+ movl SAVE_EBX, %ebx
+
+ adcl %edi, %edx
+ movl SAVE_EDI, %edi
+
+ adcl $0, %ebp
+
+ movl %eax, 4(%ecx)
+
+ movl %ebp, 12(%ecx)
+ movl SAVE_EBP, %ebp
+
+ movl %edx, 8(%ecx)
+ addl $FRAME, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(three_or_more):
+ C eax src low limb
+ C ebx
+ C ecx dst
+ C edx size
+deflit(`FRAME',0)
+
+ pushl %esi defframe_pushl(`SAVE_ESI')
+ cmpl $4, %edx
+
+ movl PARAM_SRC, %esi
+ jae L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+
+ C eax src low limb
+ C ebx
+ C ecx dst
+ C edx
+ C esi src
+ C edi
+ C ebp
+
+ pushl %ebp defframe_pushl(`SAVE_EBP')
+ pushl %edi defframe_pushl(`SAVE_EDI')
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ movl 4(%esi), %eax
+ xorl %ebp, %ebp
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl %edx, 12(%ecx)
+ movl 8(%esi), %eax
+
+ pushl %ebx defframe_pushl(`SAVE_EBX')
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl %edx, 20(%ecx)
+
+ movl (%esi), %eax
+
+ mull 4(%esi) C src[0] * src[1]
+
+ movl %eax, %ebx
+ movl %edx, %edi
+
+ movl (%esi), %eax
+
+ mull 8(%esi) C src[0] * src[2]
+
+ addl %eax, %edi
+ movl %edx, %ebp
+
+ adcl $0, %ebp
+ movl 4(%esi), %eax
+
+ mull 8(%esi) C src[1] * src[2]
+
+ xorl %esi, %esi
+ addl %eax, %ebp
+
+ C eax
+ C ebx dst[1]
+ C ecx dst
+ C edx dst[4]
+ C esi zero, will be dst[5]
+ C edi dst[2]
+ C ebp dst[3]
+
+ adcl $0, %edx
+ addl %ebx, %ebx
+
+ adcl %edi, %edi
+
+ adcl %ebp, %ebp
+
+ adcl %edx, %edx
+ movl 4(%ecx), %eax
+
+ adcl $0, %esi
+ addl %ebx, %eax
+
+ movl %eax, 4(%ecx)
+ movl 8(%ecx), %eax
+
+ adcl %edi, %eax
+ movl 12(%ecx), %ebx
+
+ adcl %ebp, %ebx
+ movl 16(%ecx), %edi
+
+ movl %eax, 8(%ecx)
+ movl SAVE_EBP, %ebp
+
+ movl %ebx, 12(%ecx)
+ movl SAVE_EBX, %ebx
+
+ adcl %edx, %edi
+ movl 20(%ecx), %eax
+
+ movl %edi, 16(%ecx)
+ movl SAVE_EDI, %edi
+
+ adcl %esi, %eax C no carry out of this
+ movl SAVE_ESI, %esi
+
+ movl %eax, 20(%ecx)
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+defframe(VAR_COUNTER,-20)
+defframe(VAR_JMP, -24)
+deflit(`STACK_SPACE',24)
+
+L(four_or_more):
+ C eax src low limb
+ C ebx
+ C ecx
+ C edx size
+ C esi src
+ C edi
+ C ebp
+deflit(`FRAME',4) dnl %esi already pushed
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+ subl $STACK_SPACE-FRAME, %esp
+deflit(`FRAME',STACK_SPACE)
+ movl $1, %ecx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebx, SAVE_EBX
+ subl %edx, %ecx C -(size-1)
+
+ movl %ebp, SAVE_EBP
+ movl $0, %ebx C initial carry
+
+ leal (%esi,%edx,4), %esi C &src[size]
+ movl %eax, %ebp C multiplier
+
+ leal -4(%edi,%edx,4), %edi C &dst[size-1]
+
+
+C This loop runs at just over 6 c/l.
+
+L(mul_1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, limbs, negative, -(size-1) to -1
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size-1]
+ C ebp multiplier
+
+ movl %ebp, %eax
+
+ mull (%esi,%ecx,4)
+
+ addl %ebx, %eax
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ movl %eax, 4(%edi,%ecx,4)
+
+ incl %ecx
+ jnz L(mul_1)
+
+
+ movl %ebx, 4(%edi)
+
+
+C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two addmuls, which are the bottom right corner of the product
+C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as mpn_addmul_1(), see that routine for some
+C comments.
+C
+C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+
+dnl This is also hard-coded in the address calculation below.
+deflit(CODE_BYTES_PER_LIMB, 15)
+
+dnl With &src[size] and &dst[size-1] pointers, the displacements in the
+dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
+dnl that an offset must be added to them.
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>32),1,
+eval((UNROLL_COUNT-32)*4),
+0))
+
+ C eax
+ C ebx carry
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[size-1]
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+
+ subl $4, %ecx
+ jz L(corner)
+
+ movl %ecx, %edx
+ negl %ecx
+
+ shll $4, %ecx
+ifelse(OFFSET,0,,`subl $OFFSET, %esi')
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+ negl %edx
+
+ifelse(OFFSET,0,,`subl $OFFSET, %edi')
+
+ C The calculated jump mustn't be before the start of the available
+ C code. This is the limit that UNROLL_COUNT puts on the src operand
+ C size, but checked here using the jump address directly.
+
+ ASSERT(ae,
+ `movl_text_address( L(unroll_inner_start), %eax)
+ cmpl %eax, %ecx')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll_outer_top):
+ C eax
+ C ebx high limb to store
+ C ecx VAR_JMP
+ C edx VAR_COUNTER, limbs, negative
+ C esi &src[size], constant
+ C edi dst ptr, second highest limb of last addmul
+ C ebp
+
+ movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier
+ movl %edx, VAR_COUNTER
+
+ movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand
+
+ mull %ebp
+
+define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')
+
+ testb $1, %cl
+
+ movl %edx, %ebx C high carry
+ leal 4(%edi), %edi
+
+ movl %ecx, %edx C jump
+
+ movl %eax, %ecx C low carry
+ leal CODE_BYTES_PER_LIMB(%edx), %edx
+
+ cmovX( %ebx, %ecx) C high carry reverse
+ cmovX( %eax, %ebx) C low carry reverse
+ movl %edx, VAR_JMP
+ jmp *%edx
+
+
+ C Must be on an even address here so the low bit of the jump address
+ C will indicate which way around ecx/ebx should start.
+
+ ALIGN(2)
+
+L(unroll_inner_start):
+ C eax scratch
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src pointer
+ C edi dst pointer
+ C ebp multiplier
+ C
+ C 15 code bytes each limb
+ C ecx/ebx reversed on each chunk
+
+forloop(`i', UNROLL_COUNT, 1, `
+ deflit(`disp_src', eval(-i*4 + OFFSET))
+ deflit(`disp_dst', eval(disp_src))
+
+ m4_assert(`disp_src>=-128 && disp_src<128')
+ m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp( movl, disp_src,(%esi), %eax)
+ mull %ebp
+Zdisp( addl, %ebx, disp_dst,(%edi))
+ adcl %eax, %ecx
+ movl %edx, %ebx
+ adcl $0, %ebx
+',`
+ dnl this one comes out last
+Zdisp( movl, disp_src,(%esi), %eax)
+ mull %ebp
+Zdisp( addl, %ecx, disp_dst,(%edi))
+ adcl %eax, %ebx
+ movl %edx, %ecx
+ adcl $0, %ecx
+')
+')
+L(unroll_inner_end):
+
+ addl %ebx, m4_empty_if_zero(OFFSET)(%edi)
+
+ movl VAR_COUNTER, %edx
+ adcl $0, %ecx
+
+ movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi)
+ movl VAR_JMP, %ecx
+
+ incl %edx
+ jnz L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+ addl $OFFSET, %esi
+ addl $OFFSET, %edi
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(corner):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[2*size-5]
+ C ebp
+
+ movl -12(%esi), %eax
+
+ mull -8(%esi)
+
+ addl %eax, (%edi)
+ movl -12(%esi), %eax
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+
+ mull -4(%esi)
+
+ addl %eax, %ebx
+ movl -8(%esi), %eax
+
+ adcl $0, %edx
+
+ addl %ebx, 4(%edi)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+
+ mull -4(%esi)
+
+ movl PARAM_SIZE, %ecx
+ addl %ebx, %eax
+
+ adcl $0, %edx
+
+ movl %eax, 8(%edi)
+
+ movl %edx, 12(%edi)
+ movl PARAM_DST, %edi
+
+
+C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
+
+ subl $1, %ecx C size-1
+ xorl %eax, %eax C ready for final adcl, and clear carry
+
+ movl %ecx, %edx
+ movl PARAM_SRC, %esi
+
+
+L(lshift):
+ C eax
+ C ebx
+ C ecx counter, size-1 to 1
+ C edx size-1 (for later use)
+ C esi src (for later use)
+ C edi dst, incrementing
+ C ebp
+
+ rcll 4(%edi)
+ rcll 8(%edi)
+
+ leal 8(%edi), %edi
+ decl %ecx
+ jnz L(lshift)
+
+
+ adcl %eax, %eax
+
+ movl %eax, 4(%edi) C dst most significant limb
+ movl (%esi), %eax C src[0]
+
+ leal 4(%esi,%edx,4), %esi C &src[size]
+ subl %edx, %ecx C -(size-1)
+
+
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+
+ mull %eax
+
+ movl %eax, (%edi,%ecx,8) C dst[0]
+
+
+L(diag):
+ C eax scratch
+ C ebx scratch
+ C ecx counter, negative
+ C edx carry
+ C esi &src[size]
+ C edi dst[2*size-2]
+ C ebp
+
+ movl (%esi,%ecx,4), %eax
+ movl %edx, %ebx
+
+ mull %eax
+
+ addl %ebx, 4(%edi,%ecx,8)
+ adcl %eax, 8(%edi,%ecx,8)
+ adcl $0, %edx
+
+ incl %ecx
+ jnz L(diag)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+
+ addl %edx, 4(%edi) C dst most significant limb
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ifdef(`PIC',`
+L(pic_calc):
+ addl (%esp), %ecx
+ addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
+ addl %edx, %ecx
+ ret_internal
+')
+
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/addmul_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/addmul_1.asm
new file mode 100644
index 0000000..144b627
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/addmul_1.asm
@@ -0,0 +1,37 @@
+dnl Intel P6/SSE2 mpn_addmul_1.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Write P6 specific SSE2 code.
+
+MULFUNC_PROLOGUE(mpn_addmul_1)
+include_mpn(`x86/pentium4/sse2/addmul_1.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/gmp-mparam.h
new file mode 100644
index 0000000..a1e261b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/gmp-mparam.h
@@ -0,0 +1,200 @@
+/* Intel P6/sse2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2008-2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the
+ value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard limit in
+ mpn/x86/p6/sqr_basecase.asm. */
+
+
+/* 1867 MHz P6 model 13 */
+
+#define MOD_1_NORM_THRESHOLD 4
+#define MOD_1_UNNORM_THRESHOLD 4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 21
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 77
+#define MUL_TOOM44_THRESHOLD 169
+#define MUL_TOOM6H_THRESHOLD 246
+#define MUL_TOOM8H_THRESHOLD 381
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 30
+#define SQR_TOOM3_THRESHOLD 101
+#define SQR_TOOM4_THRESHOLD 154
+#define SQR_TOOM6_THRESHOLD 222
+#define SQR_TOOM8_THRESHOLD 527
+
+#define MULMID_TOOM42_THRESHOLD 58
+
+#define MULMOD_BNM1_THRESHOLD 13
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define MUL_FFT_MODF_THRESHOLD 690 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 565, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 25, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \
+ { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
+ { 31, 7}, { 63, 8}, { 39, 9}, { 23, 5}, \
+ { 383, 4}, { 991, 5}, { 511, 6}, { 267, 7}, \
+ { 157, 8}, { 91, 9}, { 47, 8}, { 111, 9}, \
+ { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
+ { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \
+ { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \
+ { 159,10}, { 335, 9}, { 671,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 399, 9}, { 799,10}, \
+ { 415,11}, { 223,12}, { 127,11}, { 255,10}, \
+ { 543, 9}, { 1087,11}, { 287,10}, { 607,11}, \
+ { 319,10}, { 671,12}, { 191,11}, { 383,10}, \
+ { 799,11}, { 415,10}, { 831,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \
+ { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \
+ { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \
+ { 1599,11}, { 863,12}, { 447,11}, { 959,13}, \
+ { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
+ { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \
+ { 1471,13}, { 383,12}, { 831,11}, { 1727,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \
+ { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \
+ { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \
+ { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \
+ { 1407,12}, { 2815,14}, { 767,13}, { 1663,12}, \
+ { 3455,13}, { 8192,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 132
+#define MUL_FFT_THRESHOLD 7424
+
+#define SQR_FFT_MODF_THRESHOLD 565 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 472, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \
+ { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
+ { 31, 8}, { 63, 4}, { 1023, 8}, { 67, 9}, \
+ { 39, 5}, { 639, 4}, { 1471, 6}, { 383, 7}, \
+ { 209, 8}, { 119, 9}, { 63, 7}, { 255, 8}, \
+ { 139, 9}, { 71, 8}, { 143, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159, 8}, { 319, 9}, \
+ { 167,10}, { 95,11}, { 63,10}, { 143, 9}, \
+ { 287,10}, { 159,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 543, 8}, \
+ { 1087,10}, { 287, 9}, { 575,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \
+ { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 399, 9}, { 799,10}, { 415, 9}, \
+ { 831,11}, { 223,12}, { 127,11}, { 255,10}, \
+ { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \
+ { 1215,11}, { 319,10}, { 671, 9}, { 1343,11}, \
+ { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
+ { 799,11}, { 415,10}, { 831,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \
+ { 319,11}, { 671,10}, { 1343,11}, { 735,12}, \
+ { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \
+ { 447,11}, { 959,13}, { 255,12}, { 511,11}, \
+ { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \
+ { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \
+ { 639,12}, { 1471,13}, { 767,12}, { 1727,13}, \
+ { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \
+ { 2111,13}, { 1151,12}, { 2431,13}, { 1407,14}, \
+ { 767,13}, { 1663,12}, { 3455,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 146
+#define SQR_FFT_THRESHOLD 5760
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 31
+#define MULLO_MUL_N_THRESHOLD 13463
+#define SQRLO_BASECASE_THRESHOLD 0 /* always */
+#define SQRLO_DC_THRESHOLD 100
+#define SQRLO_SQR_THRESHOLD 9236
+
+#define DC_DIV_QR_THRESHOLD 25
+#define DC_DIVAPPR_Q_THRESHOLD 55
+#define DC_BDIV_QR_THRESHOLD 60
+#define DC_BDIV_Q_THRESHOLD 132
+
+#define INV_MULMOD_BNM1_THRESHOLD 38
+#define INV_NEWTON_THRESHOLD 65
+#define INV_APPR_THRESHOLD 65
+
+#define BINV_NEWTON_THRESHOLD 252
+#define REDC_1_TO_REDC_N_THRESHOLD 62
+
+#define MU_DIV_QR_THRESHOLD 1164
+#define MU_DIVAPPR_Q_THRESHOLD 748
+#define MUPI_DIV_QR_THRESHOLD 38
+#define MU_BDIV_QR_THRESHOLD 1360
+#define MU_BDIV_Q_THRESHOLD 1470
+
+#define POWM_SEC_TABLE 2,23,258,879,2246
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 25
+#define SET_STR_DC_THRESHOLD 582
+#define SET_STR_PRECOMPUTE_THRESHOLD 1118
+
+#define FAC_DSC_THRESHOLD 178
+#define FAC_ODD_THRESHOLD 34
+
+#define MATRIX22_STRASSEN_THRESHOLD 17
+#define HGCD_THRESHOLD 69
+#define HGCD_APPR_THRESHOLD 112
+#define HGCD_REDUCE_THRESHOLD 3389
+#define GCD_DC_THRESHOLD 386
+#define GCDEXT_DC_THRESHOLD 303
+#define JACOBI_BASE_METHOD 1
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_1.asm
new file mode 100644
index 0000000..8b7b7ad
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_1.asm
@@ -0,0 +1,34 @@
+dnl Intel P6/SSE2 mpn_mod_1_1.
+
+dnl Copyright 2009, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_1_1p)
+include_mpn(`x86/pentium4/sse2/mod_1_1.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_4.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_4.asm
new file mode 100644
index 0000000..49c96c6
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_4.asm
@@ -0,0 +1,34 @@
+dnl Intel P6/SSE2 mpn_mod_1_4.
+
+dnl Copyright 2009, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_1s_4p)
+include_mpn(`x86/pentium4/sse2/mod_1_4.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm
new file mode 100644
index 0000000..50e5b69
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm
@@ -0,0 +1,38 @@
+dnl Intel P6/SSE2 mpn_mul_1.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Write P6 specific SSE2 code. It should reach 3 c/l.
+C The Pentium4 code runs at 4.2 c/l.
+
+MULFUNC_PROLOGUE(mpn_mul_1)
+include_mpn(`x86/pentium4/sse2/mul_1.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mul_basecase.asm
new file mode 100644
index 0000000..4687625
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mul_basecase.asm
@@ -0,0 +1,35 @@
+dnl Intel P6/SSE2 mpn_mul_basecase.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_mul_basecase)
+include_mpn(`x86/pentium4/sse2/mul_basecase.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/popcount.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/popcount.asm
new file mode 100644
index 0000000..4c02b93
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/popcount.asm
@@ -0,0 +1,35 @@
+dnl Intel P6/SSE2 mpn_popcount -- population count.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/sqr_basecase.asm
new file mode 100644
index 0000000..76b574b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/sqr_basecase.asm
@@ -0,0 +1,35 @@
+dnl Intel P6/SSE2 mpn_sqr_basecase.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_sqr_basecase)
+include_mpn(`x86/pentium4/sse2/sqr_basecase.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/submul_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/submul_1.asm
new file mode 100644
index 0000000..69d940d
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/submul_1.asm
@@ -0,0 +1,35 @@
+dnl Intel P6/SSE2 mpn_submul_1.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_submul_1)
+include_mpn(`x86/pentium4/sse2/submul_1.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/README b/vendor/gmp-6.3.0/mpn/x86/pentium/README
new file mode 100644
index 0000000..305936b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/README
@@ -0,0 +1,181 @@
+Copyright 1996, 1999-2001, 2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+ INTEL PENTIUM P5 MPN SUBROUTINES
+
+
+This directory contains mpn functions optimized for Intel Pentium (P5,P54)
+processors. The mmx subdirectory has additional code for Pentium with MMX
+(P55).
+
+
+STATUS
+
+ cycles/limb
+
+ mpn_add_n/sub_n 2.375
+
+ mpn_mul_1 12.0
+ mpn_add/submul_1 14.0
+
+ mpn_mul_basecase 14.2 cycles/crossproduct (approx)
+
+ mpn_sqr_basecase 8 cycles/crossproduct (approx)
+ or 15.5 cycles/triangleproduct (approx)
+
+ mpn_l/rshift 5.375 normal (6.0 on P54)
+ 1.875 special shift by 1 bit
+
+ mpn_divrem_1 44.0
+ mpn_mod_1 28.0
+ mpn_divexact_by3 15.0
+
+ mpn_copyi/copyd 1.0
+
+Pentium MMX gets the following improvements
+
+ mpn_l/rshift 1.75
+
+ mpn_mul_1 12.0 normal, 7.0 for 16-bit multiplier
+
+
+mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb. Due to loop
+overhead and other delays (cache refill?), they run at or near 2.5
+cycles/limb.
+
+mpn_mul_1, mpn_addmul_1, mpn_submul_1 all run 1 cycle faster than they
+should. Intel documentation says a mul instruction is 10 cycles, but it
+measures 9 and the routines using it run as 9.
+
+
+
+P55 MMX AND X87
+
+The cost of switching between MMX and x87 floating point on P55 is about 100
+cycles (fld1/por/emms for instance). In order to avoid that the two aren't
+mixed and currently that means using MMX and not x87.
+
+MMX offers a big speedup for lshift and rshift, and a nice speedup for
+16-bit multipliers in mpn_mul_1. If fast code using x87 is found then
+perhaps the preference for MMX will be reversed.
+
+
+
+
+P54 SHLDL
+
+mpn_lshift and mpn_rshift run at about 6 cycles/limb on P5 and P54, but the
+documentation indicates that they should take only 43/8 = 5.375 cycles/limb,
+or 5 cycles/limb asymptotically. The P55 runs them at the expected speed.
+
+It seems that on P54 a shldl or shrdl allows pairing in one following cycle,
+but not two. For example, back to back repetitions of the following
+
+ shldl( %cl, %eax, %ebx)
+ xorl %edx, %edx
+ xorl %esi, %esi
+
+run at 5 cycles, as expected, but repetitions of the following run at 7
+cycles, whereas 6 would be expected (and is achieved on P55),
+
+ shldl( %cl, %eax, %ebx)
+ xorl %edx, %edx
+ xorl %esi, %esi
+ xorl %edi, %edi
+ xorl %ebp, %ebp
+
+Three xorls run at 7 cycles too, so it doesn't seem to be just that pairing
+inhibited is only in the second following cycle (or something like that).
+
+Avoiding this problem would bring P54 shifts down from 6.0 c/l to 5.5 with a
+pattern of shift, 2 loads, shift, 2 stores, shift, etc. A start has been
+made on something like that, but it's not yet complete.
+
+
+
+
+OTHER NOTES
+
+Prefetching Destinations
+
+ Pentium doesn't allocate cache lines on writes, unlike most other modern
+ processors. Since the functions in the mpn class do array writes, we
+ have to handle allocating the destination cache lines by reading a word
+ from it in the loops, to achieve the best performance.
+
+Prefetching Sources
+
+ Prefetching of sources is pointless since there's no out-of-order loads.
+ Any load instruction blocks until the line is brought to L1, so it may
+ as well be the load that wants the data which blocks.
+
+Data Cache Bank Clashes
+
+ Pairing of memory operations requires that the two issued operations
+ refer to different cache banks (ie. different addresses modulo 32
+ bytes). The simplest way to ensure this is to read/write two words from
+ the same object. If we make operations on different objects, they might
+ or might not be to the same cache bank.
+
+PIC %eip Fetching
+
+ A simple call $+5 and popl can be used to get %eip, there's no need to
+ balance calls and returns since P5 doesn't have any return stack branch
+ prediction.
+
+Float Multiplies
+
+ fmul is pairable and can be issued every 2 cycles (with a 4 cycle
+ latency for data ready to use). This is a lot better than integer mull
+ or imull at 9 cycles non-pairing. Unfortunately the advantage is
+ quickly eaten away by needing to throw data through memory back to the
+ integer registers to adjust for fild and fist being signed, and to do
+ things like propagating carry bits.
+
+
+
+
+
+REFERENCES
+
+"Intel Architecture Optimization Manual", 1997, order number 242816. This
+is mostly about P5, the parts about P6 aren't relevant. Available on-line:
+
+ http://download.intel.com/design/PentiumII/manuals/242816.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/aors_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/aors_n.asm
new file mode 100644
index 0000000..01ebfb9
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/aors_n.asm
@@ -0,0 +1,203 @@
+dnl Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 2.375 cycles/limb
+
+
+ifdef(`OPERATION_add_n',`
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+
+',`ifdef(`OPERATION_sub_n',`
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(M4_function_nc)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC1,%esi
+ movl PARAM_SRC2,%ebp
+ movl PARAM_SIZE,%ecx
+
+ movl (%ebp),%ebx
+
+ decl %ecx
+ movl %ecx,%edx
+ shrl $3,%ecx
+ andl $7,%edx
+ testl %ecx,%ecx C zero carry flag
+ jz L(endgo)
+
+ pushl %edx
+FRAME_pushl()
+ movl PARAM_CARRY,%eax
+ shrl %eax C shift bit 0 into carry
+ jmp L(oop)
+
+L(endgo):
+deflit(`FRAME',16)
+ movl PARAM_CARRY,%eax
+ shrl %eax C shift bit 0 into carry
+ jmp L(end)
+
+EPILOGUE()
+
+
+ ALIGN(8)
+PROLOGUE(M4_function_n)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC1,%esi
+ movl PARAM_SRC2,%ebp
+ movl PARAM_SIZE,%ecx
+
+ movl (%ebp),%ebx
+
+ decl %ecx
+ movl %ecx,%edx
+ shrl $3,%ecx
+ andl $7,%edx
+ testl %ecx,%ecx C zero carry flag
+ jz L(end)
+ pushl %edx
+FRAME_pushl()
+
+ ALIGN(8)
+L(oop): movl 28(%edi),%eax C fetch destination cache line
+ leal 32(%edi),%edi
+
+L(1): movl (%esi),%eax
+ movl 4(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 4(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 8(%ebp),%ebx
+ movl %eax,-32(%edi)
+ movl %edx,-28(%edi)
+
+L(2): movl 8(%esi),%eax
+ movl 12(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 12(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 16(%ebp),%ebx
+ movl %eax,-24(%edi)
+ movl %edx,-20(%edi)
+
+L(3): movl 16(%esi),%eax
+ movl 20(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 20(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 24(%ebp),%ebx
+ movl %eax,-16(%edi)
+ movl %edx,-12(%edi)
+
+L(4): movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 28(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 32(%ebp),%ebx
+ movl %eax,-8(%edi)
+ movl %edx,-4(%edi)
+
+ leal 32(%esi),%esi
+ leal 32(%ebp),%ebp
+ decl %ecx
+ jnz L(oop)
+
+ popl %edx
+FRAME_popl()
+L(end):
+ decl %edx C test %edx w/o clobbering carry
+ js L(end2)
+ incl %edx
+L(oop2):
+ leal 4(%edi),%edi
+ movl (%esi),%eax
+ M4_inst %ebx,%eax
+ movl 4(%ebp),%ebx
+ movl %eax,-4(%edi)
+ leal 4(%esi),%esi
+ leal 4(%ebp),%ebp
+ decl %edx
+ jnz L(oop2)
+L(end2):
+ movl (%esi),%eax
+ M4_inst %ebx,%eax
+ movl %eax,(%edi)
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm
new file mode 100644
index 0000000..d83cc45
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm
@@ -0,0 +1,144 @@
+dnl Intel Pentium mpn_addmul_1 -- mpn by limb multiplication.
+
+dnl Copyright 1992, 1994, 1996, 1999, 2000, 2002 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 14.0 cycles/limb
+
+
+ifdef(`OPERATION_addmul_1', `
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+ define(M4_function_1c, mpn_addmul_1c)
+
+',`ifdef(`OPERATION_submul_1', `
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+ define(M4_function_1c, mpn_submul_1c)
+
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+
+ ALIGN(8)
+PROLOGUE(M4_function_1c)
+deflit(`FRAME',0)
+
+ movl PARAM_CARRY, %ecx
+ pushl %esi FRAME_pushl()
+
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(8)
+PROLOGUE(M4_function_1)
+deflit(`FRAME',0)
+
+ xorl %ecx, %ecx
+ pushl %esi FRAME_pushl()
+
+L(start_1c):
+ movl PARAM_SRC, %esi
+ movl PARAM_SIZE, %eax
+
+ pushl %edi FRAME_pushl()
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_DST, %edi
+ leal -1(%eax), %ebx C size-1
+
+ leal (%esi,%eax,4), %esi
+ xorl $-1, %ebx C -size, and clear carry
+
+ leal (%edi,%eax,4), %edi
+
+L(top):
+ C eax
+ C ebx counter, negative
+ C ecx carry
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp
+
+ adcl $0, %ecx
+ movl (%esi,%ebx,4), %eax
+
+ mull PARAM_MULTIPLIER
+
+ addl %ecx, %eax
+ movl (%edi,%ebx,4), %ecx
+
+ adcl $0, %edx
+ M4_inst %eax, %ecx
+
+ movl %ecx, (%edi,%ebx,4)
+ incl %ebx
+
+ movl %edx, %ecx
+ jnz L(top)
+
+
+ adcl $0, %ecx
+ popl %ebx
+
+ movl %ecx, %eax
+ popl %edi
+
+ popl %esi
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm
new file mode 100644
index 0000000..c2c4f58
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm
@@ -0,0 +1,266 @@
+dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Rearranged from mpn/x86/pentium/dive_1.asm by Marco Bodrato.
+
+dnl Copyright 2001, 2002, 2011, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C divisor
+C odd even
+C P54: 24.5 30.5 cycles/limb
+C P55: 23.0 28.0
+
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+
+C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
+C expected. On P54 in the even case the shrdl pairing nonsense (see
+C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
+C further 1.5 slowdown for both odd and even.
+
+defframe(PARAM_SHIFT, 24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(VAR_INVERSE,`PARAM_DST')
+
+ TEXT
+
+ ALIGN(32)
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl $-1, %ecx
+ movl PARAM_DIVISOR, %eax
+
+L(strip_twos):
+ ASSERT(nz, `orl %eax, %eax')
+ shrl %eax
+ incl %ecx C shift count
+
+ jnc L(strip_twos)
+
+ leal 1(%eax,%eax), %edx C d
+ andl $127, %eax C d/2, 7 bits
+
+ pushl %ebx FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+
+ifdef(`PIC',`
+ifdef(`DARWIN',`
+ LEA( binvert_limb_table, %ebp)
+ movzbl (%eax,%ebp), %eax
+',`
+ call L(here)
+L(here):
+ popl %ebp C eip
+
+ addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
+ C AGI
+ movl binvert_limb_table@GOT(%ebp), %ebp
+ C AGI
+ movzbl (%eax,%ebp), %eax
+')
+',`
+
+dnl non-PIC
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ movl %eax, %ebp C inv
+ addl %eax, %eax C 2*inv
+
+ imull %ebp, %ebp C inv*inv
+
+ imull %edx, %ebp C inv*inv*d
+
+ subl %ebp, %eax C inv = 2*inv - inv*inv*d
+ movl PARAM_SIZE, %ebx
+
+ movl %eax, %ebp
+ addl %eax, %eax C 2*inv
+
+ imull %ebp, %ebp C inv*inv
+
+ imull %edx, %ebp C inv*inv*d
+
+ subl %ebp, %eax C inv = 2*inv - inv*inv*d
+ movl %edx, PARAM_DIVISOR C d without twos
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ imull PARAM_DIVISOR, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ jmp L(common)
+EPILOGUE()
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse, int shift)
+ ALIGN(32)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SHIFT, %ecx
+
+ pushl %ebx FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_SIZE, %ebx
+ movl PARAM_INVERSE, %eax
+
+L(common):
+ pushl %esi FRAME_pushl()
+ push %edi FRAME_pushl()
+
+ movl PARAM_SRC, %esi
+ movl PARAM_DST, %edi
+ movl %eax, VAR_INVERSE
+
+ leal (%esi,%ebx,4), %esi C src end
+ leal (%edi,%ebx,4), %edi C dst end
+
+ negl %ebx C -size
+
+ xorl %ebp, %ebp C initial carry bit
+
+ orl %ecx, %ecx C shift
+ movl (%esi,%ebx,4), %eax C src low limb
+ jz L(odd_entry)
+
+ xorl %edx, %edx C initial carry limb (for even, if one)
+ incl %ebx
+ jz L(one)
+
+ movl (%esi,%ebx,4), %edx C src second limb (for even)
+ shrdl( %cl, %edx, %eax)
+
+ jmp L(even_entry)
+
+
+ ALIGN(8)
+L(odd_top):
+ C eax scratch
+ C ebx counter, limbs, negative
+ C ecx
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp carry bit, 0 or -1
+
+ mull PARAM_DIVISOR
+
+ movl (%esi,%ebx,4), %eax
+ subl %ebp, %edx
+
+ subl %edx, %eax
+
+ sbbl %ebp, %ebp
+
+L(odd_entry):
+ imull VAR_INVERSE, %eax
+
+ movl %eax, (%edi,%ebx,4)
+
+ incl %ebx
+ jnz L(odd_top)
+
+ popl %edi
+ popl %esi
+
+ popl %ebp
+ popl %ebx
+
+ ret
+
+L(even_top):
+ C eax scratch
+ C ebx counter, limbs, negative
+ C ecx twos
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp carry bit, 0 or -1
+
+ mull PARAM_DIVISOR
+
+ subl %ebp, %edx C carry bit
+ movl -4(%esi,%ebx,4), %eax C src limb
+
+ movl (%esi,%ebx,4), %ebp C and one above it
+
+ shrdl( %cl, %ebp, %eax)
+
+ subl %edx, %eax C carry limb
+
+ sbbl %ebp, %ebp
+
+L(even_entry):
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi,%ebx,4)
+ incl %ebx
+
+ jnz L(even_top)
+
+ mull PARAM_DIVISOR
+
+ movl -4(%esi), %eax C src high limb
+ subl %ebp, %edx
+
+L(one):
+ shrl %cl, %eax
+
+ subl %edx, %eax C no carry if division is exact
+
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi) C dst high limb
+ nop C protect against cache bank clash
+
+ popl %edi
+ popl %esi
+
+ popl %ebp
+ popl %ebx
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/com.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/com.asm
new file mode 100644
index 0000000..b080545
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/com.asm
@@ -0,0 +1,181 @@
+dnl Intel Pentium mpn_com -- mpn ones complement.
+
+dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.75 cycles/limb
+
+
+NAILS_SUPPORT(0-31)
+
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C This code is similar to mpn_copyi, basically there's just some "xorl
+C $GMP_NUMB_MASK"s inserted.
+C
+C Alternatives:
+C
+C On P55 some MMX code could be 1.25 c/l (8 limb unrolled) if src and dst
+C are the same alignment mod 8, but it doesn't seem worth the trouble for
+C just that case (there'd need to be some plain integer available too for
+C the unaligned case).
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_com)
+deflit(`FRAME',0)
+
+ movl PARAM_SRC, %eax
+ movl PARAM_SIZE, %ecx
+
+ pushl %esi FRAME_pushl()
+ pushl %edi FRAME_pushl()
+
+ leal (%eax,%ecx,4), %eax
+ xorl $-1, %ecx C -size-1
+
+ movl PARAM_DST, %edx
+ addl $8, %ecx C -size+7
+
+ jns L(end)
+
+ movl (%edx), %esi C fetch destination cache line
+ nop
+
+L(top):
+ C eax &src[size]
+ C ebx
+ C ecx counter, limbs, negative
+ C edx dst, incrementing
+ C esi scratch
+ C edi scratch
+ C ebp
+
+ movl 28(%edx), %esi C destination prefetch
+ addl $32, %edx
+
+ movl -28(%eax,%ecx,4), %esi
+ movl -24(%eax,%ecx,4), %edi
+ xorl $GMP_NUMB_MASK, %esi
+ xorl $GMP_NUMB_MASK, %edi
+ movl %esi, -32(%edx)
+ movl %edi, -28(%edx)
+
+ movl -20(%eax,%ecx,4), %esi
+ movl -16(%eax,%ecx,4), %edi
+ xorl $GMP_NUMB_MASK, %esi
+ xorl $GMP_NUMB_MASK, %edi
+ movl %esi, -24(%edx)
+ movl %edi, -20(%edx)
+
+ movl -12(%eax,%ecx,4), %esi
+ movl -8(%eax,%ecx,4), %edi
+ xorl $GMP_NUMB_MASK, %esi
+ xorl $GMP_NUMB_MASK, %edi
+ movl %esi, -16(%edx)
+ movl %edi, -12(%edx)
+
+ movl -4(%eax,%ecx,4), %esi
+ movl (%eax,%ecx,4), %edi
+ xorl $GMP_NUMB_MASK, %esi
+ xorl $GMP_NUMB_MASK, %edi
+ movl %esi, -8(%edx)
+ movl %edi, -4(%edx)
+
+ addl $8, %ecx
+ js L(top)
+
+
+L(end):
+ C eax &src[size]
+ C ecx 0 to 7, representing respectively 7 to 0 limbs remaining
+ C edx dst, next location to store
+
+ subl $4, %ecx
+ nop
+
+ jns L(no4)
+
+ movl -12(%eax,%ecx,4), %esi
+ movl -8(%eax,%ecx,4), %edi
+ xorl $GMP_NUMB_MASK, %esi
+ xorl $GMP_NUMB_MASK, %edi
+ movl %esi, (%edx)
+ movl %edi, 4(%edx)
+
+ movl -4(%eax,%ecx,4), %esi
+ movl (%eax,%ecx,4), %edi
+ xorl $GMP_NUMB_MASK, %esi
+ xorl $GMP_NUMB_MASK, %edi
+ movl %esi, 8(%edx)
+ movl %edi, 12(%edx)
+
+ addl $16, %edx
+ addl $4, %ecx
+L(no4):
+
+ subl $2, %ecx
+ nop
+
+ jns L(no2)
+
+ movl -4(%eax,%ecx,4), %esi
+ movl (%eax,%ecx,4), %edi
+ xorl $GMP_NUMB_MASK, %esi
+ xorl $GMP_NUMB_MASK, %edi
+ movl %esi, (%edx)
+ movl %edi, 4(%edx)
+
+ addl $8, %edx
+ addl $2, %ecx
+L(no2):
+
+ popl %edi
+ jnz L(done)
+
+ movl -4(%eax), %ecx
+
+ xorl $GMP_NUMB_MASK, %ecx
+ popl %esi
+
+ movl %ecx, (%edx)
+ ret
+
+L(done):
+ popl %esi
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/copyd.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/copyd.asm
new file mode 100644
index 0000000..72a543b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/copyd.asm
@@ -0,0 +1,146 @@
+dnl Intel Pentium mpn_copyd -- copy limb vector, decrementing.
+
+dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.25 cycles/limb
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C See comments in copyi.asm.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+ movl PARAM_SRC, %eax
+ movl PARAM_SIZE, %ecx
+
+ pushl %esi FRAME_pushl()
+ pushl %edi FRAME_pushl()
+
+ leal -4(%eax,%ecx,4), %eax C &src[size-1]
+ movl PARAM_DST, %edx
+
+ subl $7, %ecx C size-7
+ jle L(end)
+
+ movl 28-4(%edx,%ecx,4), %esi C prefetch cache, dst[size-1]
+ nop
+
+L(top):
+ C eax src, decrementing
+ C ebx
+ C ecx counter, limbs
+ C edx dst
+ C esi scratch
+ C edi scratch
+ C ebp
+
+ movl 28-32(%edx,%ecx,4), %esi C prefetch dst cache line
+ subl $8, %ecx
+
+ movl (%eax), %esi C read words pairwise
+ movl -4(%eax), %edi
+ movl %esi, 56(%edx,%ecx,4) C store words pairwise
+ movl %edi, 52(%edx,%ecx,4)
+
+ movl -8(%eax), %esi
+ movl -12(%eax), %edi
+ movl %esi, 48(%edx,%ecx,4)
+ movl %edi, 44(%edx,%ecx,4)
+
+ movl -16(%eax), %esi
+ movl -20(%eax), %edi
+ movl %esi, 40(%edx,%ecx,4)
+ movl %edi, 36(%edx,%ecx,4)
+
+ movl -24(%eax), %esi
+ movl -28(%eax), %edi
+ movl %esi, 32(%edx,%ecx,4)
+ movl %edi, 28(%edx,%ecx,4)
+
+ leal -32(%eax), %eax
+ jg L(top)
+
+
+L(end):
+ C ecx -7 to 0, representing respectively 0 to 7 limbs remaining
+ C eax src end
+ C edx dst, next location to store
+
+ addl $4, %ecx
+ jle L(no4)
+
+ movl (%eax), %esi
+ movl -4(%eax), %edi
+ movl %esi, 8(%edx,%ecx,4)
+ movl %edi, 4(%edx,%ecx,4)
+
+ movl -8(%eax), %esi
+ movl -12(%eax), %edi
+ movl %esi, (%edx,%ecx,4)
+ movl %edi, -4(%edx,%ecx,4)
+
+ subl $16, %eax
+ subl $4, %ecx
+L(no4):
+
+ addl $2, %ecx
+ jle L(no2)
+
+ movl (%eax), %esi
+ movl -4(%eax), %edi
+ movl %esi, (%edx,%ecx,4)
+ movl %edi, -4(%edx,%ecx,4)
+
+ subl $8, %eax
+ subl $2, %ecx
+L(no2):
+
+ jnz L(done)
+
+ movl (%eax), %ecx
+ movl %ecx, (%edx) C risk of cache bank clash here
+
+L(done):
+ popl %edi
+ popl %esi
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/copyi.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/copyi.asm
new file mode 100644
index 0000000..d983d6b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/copyi.asm
@@ -0,0 +1,164 @@
+dnl Intel Pentium mpn_copyi -- copy limb vector, incrementing.
+
+dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.25 cycles/limb
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Destination prefetching is done to avoid repeated write-throughs on lines
+C not already in L1.
+C
+C At least one of the src or dst pointer needs to be incremented rather than
+C using indexing, so that there's somewhere to put the loop control without
+C an AGI. Incrementing one and not two lets us keep loop overhead to 2
+C cycles. Making it the src pointer incremented avoids an AGI on the %ecx
+C subtracts in the finishup code.
+C
+C The block of finishup code is almost as big as the main loop itself, which
+C is unfortunate, but it's faster that way than with say rep movsl, by about
+C 10 cycles for instance on P55.
+C
+C There's nothing to be gained from MMX on P55, since it can do only one
+C movq load (or store) per cycle, so the throughput would be the same as the
+C code here (and even then only if src and dst have the same alignment mod
+C 8).
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_copyi)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_DST, %edx
+
+ pushl %ebx FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+ leal (%edx,%ecx,4), %edx C &dst[size-1]
+ xorl $-1, %ecx C -size-1
+
+ movl PARAM_SRC, %esi
+ addl $8, %ecx C -size+7
+
+ jns L(end)
+
+ movl -28(%edx,%ecx,4), %eax C fetch destination cache line, dst[0]
+ nop
+
+L(top):
+ C eax scratch
+ C ebx scratch
+ C ecx counter, limbs, negative
+ C edx &dst[size-1]
+ C esi src, incrementing
+ C edi
+ C ebp
+
+ movl (%edx,%ecx,4), %eax C fetch destination cache line
+ addl $8, %ecx
+
+ movl (%esi), %eax C read words pairwise
+ movl 4(%esi), %ebx
+ movl %eax, -60(%edx,%ecx,4) C store words pairwise
+ movl %ebx, -56(%edx,%ecx,4)
+
+ movl 8(%esi), %eax
+ movl 12(%esi), %ebx
+ movl %eax, -52(%edx,%ecx,4)
+ movl %ebx, -48(%edx,%ecx,4)
+
+ movl 16(%esi), %eax
+ movl 20(%esi), %ebx
+ movl %eax, -44(%edx,%ecx,4)
+ movl %ebx, -40(%edx,%ecx,4)
+
+ movl 24(%esi), %eax
+ movl 28(%esi), %ebx
+ movl %eax, -36(%edx,%ecx,4)
+ movl %ebx, -32(%edx,%ecx,4)
+
+ leal 32(%esi), %esi
+ js L(top)
+
+
+L(end):
+ C ecx 0 to 7, representing respectively 7 to 0 limbs remaining
+ C esi src end
+ C edx dst, next location to store
+
+ subl $4, %ecx
+ jns L(no4)
+
+ movl (%esi), %eax
+ movl 4(%esi), %ebx
+ movl %eax, -12(%edx,%ecx,4)
+ movl %ebx, -8(%edx,%ecx,4)
+
+ movl 8(%esi), %eax
+ movl 12(%esi), %ebx
+ movl %eax, -4(%edx,%ecx,4)
+ movl %ebx, (%edx,%ecx,4)
+
+ addl $16, %esi
+ addl $4, %ecx
+L(no4):
+
+ subl $2, %ecx
+ jns L(no2)
+
+ movl (%esi), %eax
+ movl 4(%esi), %ebx
+ movl %eax, -4(%edx,%ecx,4)
+ movl %ebx, (%edx,%ecx,4)
+
+ addl $8, %esi
+ addl $2, %ecx
+L(no2):
+
+ jnz L(done)
+
+ movl (%esi), %eax
+ movl %eax, -4(%edx,%ecx,4) C risk of cache bank clash here
+
+L(done):
+ popl %esi
+ popl %ebx
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/dive_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/dive_1.asm
new file mode 100644
index 0000000..21b5287
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/dive_1.asm
@@ -0,0 +1,264 @@
+dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2001, 2002, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C divisor
+C odd even
+C P54: 24.5 30.5 cycles/limb
+C P55: 23.0 28.0
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+C Plain divl is used for small sizes, since the inverse takes a while to
+C setup. Multiplying works out faster for size>=3 when the divisor is odd,
+C or size>=4 when the divisor is even. Actually on P55 size==2 for odd or
+C size==3 for even are about the same speed for both divl or mul, but the
+C former is used since it will use up less code cache.
+C
+C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
+C expected. On P54 in the even case the shrdl pairing nonsense (see
+C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
+C further 1.5 slowdown for both odd and even.
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(VAR_INVERSE,`PARAM_DST')
+
+ TEXT
+
+ ALIGN(32)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ movl PARAM_SIZE, %ecx
+
+ pushl %esi FRAME_pushl()
+ push %edi FRAME_pushl()
+
+ movl PARAM_SRC, %esi
+ andl $1, %eax
+
+ movl PARAM_DST, %edi
+ addl %ecx, %eax C size if even, size+1 if odd
+
+ cmpl $4, %eax
+ jae L(mul_by_inverse)
+
+
+ xorl %edx, %edx
+L(div_top):
+ movl -4(%esi,%ecx,4), %eax
+
+ divl PARAM_DIVISOR
+
+ movl %eax, -4(%edi,%ecx,4)
+ decl %ecx
+
+ jnz L(div_top)
+
+ popl %edi
+ popl %esi
+
+ ret
+
+
+
+L(mul_by_inverse):
+ movl PARAM_DIVISOR, %eax
+ movl $-1, %ecx
+
+L(strip_twos):
+ ASSERT(nz, `orl %eax, %eax')
+ shrl %eax
+ incl %ecx C shift count
+
+ jnc L(strip_twos)
+
+ leal 1(%eax,%eax), %edx C d
+ andl $127, %eax C d/2, 7 bits
+
+ pushl %ebx FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+
+ifdef(`PIC',`dnl
+ LEA( binvert_limb_table, %ebp)
+ movzbl (%eax,%ebp), %eax C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ movl %eax, %ebp C inv
+ addl %eax, %eax C 2*inv
+
+ imull %ebp, %ebp C inv*inv
+
+ imull %edx, %ebp C inv*inv*d
+
+ subl %ebp, %eax C inv = 2*inv - inv*inv*d
+ movl PARAM_SIZE, %ebx
+
+ movl %eax, %ebp
+ addl %eax, %eax C 2*inv
+
+ imull %ebp, %ebp C inv*inv
+
+ imull %edx, %ebp C inv*inv*d
+
+ subl %ebp, %eax C inv = 2*inv - inv*inv*d
+ movl %edx, PARAM_DIVISOR C d without twos
+
+ leal (%esi,%ebx,4), %esi C src end
+ leal (%edi,%ebx,4), %edi C dst end
+
+ negl %ebx C -size
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ imull PARAM_DIVISOR, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ movl %eax, VAR_INVERSE
+ xorl %ebp, %ebp C initial carry bit
+
+ movl (%esi,%ebx,4), %eax C src low limb
+ orl %ecx, %ecx C shift
+
+ movl 4(%esi,%ebx,4), %edx C src second limb (for even)
+ jz L(odd_entry)
+
+ shrdl( %cl, %edx, %eax)
+
+ incl %ebx
+ jmp L(even_entry)
+
+
+ ALIGN(8)
+L(odd_top):
+ C eax scratch
+ C ebx counter, limbs, negative
+ C ecx
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp carry bit, 0 or -1
+
+ mull PARAM_DIVISOR
+
+ movl (%esi,%ebx,4), %eax
+ subl %ebp, %edx
+
+ subl %edx, %eax
+
+ sbbl %ebp, %ebp
+
+L(odd_entry):
+ imull VAR_INVERSE, %eax
+
+ movl %eax, (%edi,%ebx,4)
+
+ incl %ebx
+ jnz L(odd_top)
+
+
+ popl %ebp
+ popl %ebx
+
+ popl %edi
+ popl %esi
+
+ ret
+
+
+L(even_top):
+ C eax scratch
+ C ebx counter, limbs, negative
+ C ecx twos
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp carry bit, 0 or -1
+
+ mull PARAM_DIVISOR
+
+ subl %ebp, %edx C carry bit
+ movl -4(%esi,%ebx,4), %eax C src limb
+
+ movl (%esi,%ebx,4), %ebp C and one above it
+
+ shrdl( %cl, %ebp, %eax)
+
+ subl %edx, %eax C carry limb
+
+ sbbl %ebp, %ebp
+
+L(even_entry):
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi,%ebx,4)
+ incl %ebx
+
+ jnz L(even_top)
+
+
+
+ mull PARAM_DIVISOR
+
+ movl -4(%esi), %eax C src high limb
+ subl %ebp, %edx
+
+ shrl %cl, %eax
+
+ subl %edx, %eax C no carry if division is exact
+
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi) C dst high limb
+ nop C protect against cache bank clash
+
+ popl %ebp
+ popl %ebx
+
+ popl %edi
+ popl %esi
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h
new file mode 100644
index 0000000..befa6e2
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h
@@ -0,0 +1,76 @@
+/* Intel P54 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* For mpn/x86/pentium/mod_1.asm */
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+
+
+/* 166MHz P54 */
+
+/* Generated by tuneup.c, 2004-02-10, gcc 2.95 */
+
+#define MUL_TOOM22_THRESHOLD 16
+#define MUL_TOOM33_THRESHOLD 90
+
+#define SQR_BASECASE_THRESHOLD 0 /* always */
+#define SQR_TOOM2_THRESHOLD 22
+#define SQR_TOOM3_THRESHOLD 122
+
+#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_DC_THRESHOLD 52
+#define POWM_THRESHOLD 77
+
+#define HGCD_THRESHOLD 121
+#define GCD_ACCEL_THRESHOLD 3
+#define GCD_DC_THRESHOLD 615
+#define JACOBI_BASE_METHOD 2
+
+#define USE_PREINV_DIVREM_1 0
+#define USE_PREINV_MOD_1 1 /* native */
+#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 23
+#define GET_STR_PRECOMPUTE_THRESHOLD 33
+#define SET_STR_THRESHOLD 2788
+
+#define MUL_FFT_TABLE { 432, 928, 1664, 3584, 10240, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD 448
+#define MUL_FFT_THRESHOLD 3328
+
+#define SQR_FFT_TABLE { 496, 928, 1920, 4608, 10240, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD 512
+#define SQR_FFT_THRESHOLD 3328
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/hamdist.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/hamdist.asm
new file mode 100644
index 0000000..6c6c1a1
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/hamdist.asm
@@ -0,0 +1,154 @@
+dnl Intel P5 mpn_hamdist -- mpn hamming distance.
+
+dnl Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 14.0 cycles/limb
+
+
+C unsigned long mpn_hamdist (mp_srcptr src1, mp_srcptr src2, mp_size_t size);
+C
+C It might be possible to shave 1 cycle from the loop, and hence 2
+C cycles/limb. The xorb is taking 2 cycles, but a separate load and xor
+C would be 1, if the right schedule could be found (not found so far).
+C Wanting to avoid potential cache bank clashes makes it tricky.
+
+C The slightly strange quoting here helps the renaming done by tune/many.pl.
+deflit(TABLE_NAME,
+m4_assert_defined(`GSYM_PREFIX')
+GSYM_PREFIX`'mpn_popcount``'_table')
+
+C FIXME: referencing popcount.asm's table is incorrect as it hurt incremental
+C linking.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_SRC1, 4)
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_hamdist)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %esi FRAME_pushl()
+
+ shll %ecx C size in byte pairs
+ pushl %edi FRAME_pushl()
+
+ifdef(`PIC',`
+ pushl %ebx FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+ifdef(`DARWIN',`
+ movl PARAM_SRC1, %esi
+ movl PARAM_SRC2, %edi
+ LEA( TABLE_NAME, %ebp)
+ xorl %ebx, %ebx C byte
+ xorl %edx, %edx C byte
+ xorl %eax, %eax C total
+',`
+ call L(here) FRAME_pushl()
+L(here):
+ movl PARAM_SRC1, %esi
+ popl %ebp FRAME_popl()
+
+ movl PARAM_SRC2, %edi
+ addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
+
+ xorl %ebx, %ebx C byte
+ xorl %edx, %edx C byte
+
+ movl TABLE_NAME@GOT(%ebp), %ebp
+ xorl %eax, %eax C total
+')
+define(TABLE,`(%ebp,$1)')
+',`
+dnl non-PIC
+ movl PARAM_SRC1, %esi
+ movl PARAM_SRC2, %edi
+
+ xorl %eax, %eax C total
+ pushl %ebx FRAME_pushl()
+
+ xorl %edx, %edx C byte
+ xorl %ebx, %ebx C byte
+
+define(TABLE,`TABLE_NAME($1)')
+')
+
+
+ C The nop after the xorb seems necessary. Although a movb might be
+ C expected to go down the V pipe in the second cycle of the xorb, it
+ C doesn't and costs an extra 2 cycles.
+L(top):
+ C eax total
+ C ebx byte
+ C ecx counter, 2*size to 2
+ C edx byte
+ C esi src1
+ C edi src2
+ C ebp [PIC] table
+
+ addl %ebx, %eax
+ movb -1(%esi,%ecx,2), %bl
+
+ addl %edx, %eax
+ movb -1(%edi,%ecx,2), %dl
+
+ xorb %dl, %bl
+ movb -2(%esi,%ecx,2), %dl
+
+ xorb -2(%edi,%ecx,2), %dl
+ nop
+
+ movb TABLE(%ebx), %bl
+ decl %ecx
+
+ movb TABLE(%edx), %dl
+ jnz L(top)
+
+
+ifdef(`PIC',`
+ popl %ebp
+')
+ addl %ebx, %eax
+ popl %ebx
+
+ addl %edx, %eax
+ popl %edi
+
+ popl %esi
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/logops_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/logops_n.asm
new file mode 100644
index 0000000..1877317
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/logops_n.asm
@@ -0,0 +1,176 @@
+dnl Intel Pentium mpn_and_n,...,mpn_xnor_n -- bitwise logical operations.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 3.0 c/l and, ior, xor
+C 3.5 c/l andn, iorn, nand, nior, xnor
+
+
+define(M4_choose_op,
+`ifdef(`OPERATION_$1',`
+define(`M4_function', `mpn_$1')
+define(`M4_want_pre', `$4')
+define(`M4op', `$3')
+define(`M4_want_post',`$2')
+')')
+define(M4pre, `ifelse(M4_want_pre, yes,`$1')')
+define(M4post,`ifelse(M4_want_post,yes,`$1')')
+
+M4_choose_op( and_n, , andl, )
+M4_choose_op( andn_n, , andl, yes)
+M4_choose_op( nand_n, yes, andl, )
+M4_choose_op( ior_n, , orl, )
+M4_choose_op( iorn_n, , orl, yes)
+M4_choose_op( nior_n, yes, orl, )
+M4_choose_op( xor_n, , xorl, )
+M4_choose_op( xnor_n, yes, xorl, )
+
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+NAILS_SUPPORT(0-31)
+
+
+C void M4_function (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size);
+C
+C Nothing complicated here, just some care to avoid data cache bank clashes
+C and AGIs.
+C
+C We're one register short of being able to do a simple 4 loads, 2 ops, 2
+C stores. Instead %ebp is juggled a bit and nops are introduced to keep the
+C pairings as intended. An in-place operation would free up a register, for
+C an 0.5 c/l speedup, if that's worth bothering with.
+C
+C This code seems best for P55 too. Data alignment is a big problem for MMX
+C and the pairing restrictions on movq and integer instructions make life
+C difficult.
+
+defframe(PARAM_SIZE,16)
+defframe(PARAM_YP, 12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ pushl %ebx FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+ pushl %edi FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_XP, %ebx
+
+ movl PARAM_YP, %esi
+ movl PARAM_WP, %edi
+
+ shrl %ecx
+ jnc L(entry)
+
+ movl (%ebx,%ecx,8), %eax C risk of data cache bank clash here
+ movl (%esi,%ecx,8), %edx
+
+M4pre(` notl_or_xorl_GMP_NUMB_MASK(%edx)')
+
+ M4op %edx, %eax
+
+M4post(`xorl $GMP_NUMB_MASK, %eax')
+ orl %ecx, %ecx
+
+ movl %eax, (%edi,%ecx,8)
+ jz L(done)
+
+ jmp L(entry)
+
+
+L(top):
+ C eax
+ C ebx xp
+ C ecx counter, limb pairs, decrementing
+ C edx
+ C esi yp
+ C edi wp
+ C ebp
+
+ M4op %ebp, %edx
+ nop
+
+M4post(`xorl $GMP_NUMB_MASK, %eax')
+M4post(`xorl $GMP_NUMB_MASK, %edx')
+
+ movl %eax, 4(%edi,%ecx,8)
+ movl %edx, (%edi,%ecx,8)
+
+L(entry):
+ movl -4(%ebx,%ecx,8), %ebp
+ nop
+
+ movl -4(%esi,%ecx,8), %eax
+ movl -8(%esi,%ecx,8), %edx
+
+M4pre(` xorl $GMP_NUMB_MASK, %eax')
+M4pre(` xorl $GMP_NUMB_MASK, %edx')
+
+ M4op %ebp, %eax
+ movl -8(%ebx,%ecx,8), %ebp
+
+ decl %ecx
+ jnz L(top)
+
+
+ M4op %ebp, %edx
+ nop
+
+M4post(`xorl $GMP_NUMB_MASK, %eax')
+M4post(`xorl $GMP_NUMB_MASK, %edx')
+
+ movl %eax, 4(%edi,%ecx,8)
+ movl %edx, (%edi,%ecx,8)
+
+
+L(done):
+ popl %ebp
+ popl %edi
+
+ popl %esi
+ popl %ebx
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/lshift.asm
new file mode 100644
index 0000000..2a31f36
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/lshift.asm
@@ -0,0 +1,243 @@
+dnl Intel Pentium mpn_lshift -- mpn left shift.
+
+dnl Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5,P54: 6.0
+C P55: 5.375
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_lshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%ebp
+ movl PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+ cmp $1,%ecx
+ jne L(normal)
+ leal 4(%esi),%eax
+ cmpl %edi,%eax
+ jnc L(special) C jump if s_ptr + 1 >= res_ptr
+ leal (%esi,%ebp,4),%eax
+ cmpl %eax,%edi
+ jnc L(special) C jump if res_ptr >= s_ptr + size
+
+L(normal):
+ leal -4(%edi,%ebp,4),%edi
+ leal -4(%esi,%ebp,4),%esi
+
+ movl (%esi),%edx
+ subl $4,%esi
+ xorl %eax,%eax
+ shldl( %cl, %edx, %eax) C compute carry limb
+ pushl %eax C push carry limb onto stack
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+ jz L(end)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(oop): movl -28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl -4(%esi),%edx
+ shldl( %cl, %eax, %ebx)
+ shldl( %cl, %edx, %eax)
+ movl %ebx,(%edi)
+ movl %eax,-4(%edi)
+
+ movl -8(%esi),%ebx
+ movl -12(%esi),%eax
+ shldl( %cl, %ebx, %edx)
+ shldl( %cl, %eax, %ebx)
+ movl %edx,-8(%edi)
+ movl %ebx,-12(%edi)
+
+ movl -16(%esi),%edx
+ movl -20(%esi),%ebx
+ shldl( %cl, %edx, %eax)
+ shldl( %cl, %ebx, %edx)
+ movl %eax,-16(%edi)
+ movl %edx,-20(%edi)
+
+ movl -24(%esi),%eax
+ movl -28(%esi),%edx
+ shldl( %cl, %eax, %ebx)
+ shldl( %cl, %edx, %eax)
+ movl %ebx,-24(%edi)
+ movl %eax,-28(%edi)
+
+ subl $32,%esi
+ subl $32,%edi
+ decl %ebp
+ jnz L(oop)
+
+L(end): popl %ebp
+ andl $7,%ebp
+ jz L(end2)
+L(oop2):
+ movl (%esi),%eax
+ shldl( %cl,%eax,%edx)
+ movl %edx,(%edi)
+ movl %eax,%edx
+ subl $4,%esi
+ subl $4,%edi
+ decl %ebp
+ jnz L(oop2)
+
+L(end2):
+ shll %cl,%edx C compute least significant limb
+ movl %edx,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+
+C We loop from least significant end of the arrays, which is only
+C permissable if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
+
+L(special):
+ movl (%esi),%edx
+ addl $4,%esi
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+
+ addl %edx,%edx
+ incl %ebp
+ decl %ebp
+ jz L(Lend)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(Loop):
+ movl 28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ adcl %eax,%eax
+ movl %ebx,(%edi)
+ adcl %edx,%edx
+ movl %eax,4(%edi)
+
+ movl 8(%esi),%ebx
+ movl 12(%esi),%eax
+ adcl %ebx,%ebx
+ movl %edx,8(%edi)
+ adcl %eax,%eax
+ movl %ebx,12(%edi)
+
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ adcl %edx,%edx
+ movl %eax,16(%edi)
+ adcl %ebx,%ebx
+ movl %edx,20(%edi)
+
+ movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ adcl %eax,%eax
+ movl %ebx,24(%edi)
+ adcl %edx,%edx
+ movl %eax,28(%edi)
+
+ leal 32(%esi),%esi C use leal not to clobber carry
+ leal 32(%edi),%edi
+ decl %ebp
+ jnz L(Loop)
+
+L(Lend):
+ popl %ebp
+ sbbl %eax,%eax C save carry in %eax
+ andl $7,%ebp
+ jz L(Lend2)
+ addl %eax,%eax C restore carry from eax
+L(Loop2):
+ movl %edx,%ebx
+ movl (%esi),%edx
+ adcl %edx,%edx
+ movl %ebx,(%edi)
+
+ leal 4(%esi),%esi C use leal not to clobber carry
+ leal 4(%edi),%edi
+ decl %ebp
+ jnz L(Loop2)
+
+ jmp L(L1)
+L(Lend2):
+ addl %eax,%eax C restore carry from eax
+L(L1): movl %edx,(%edi) C store last limb
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h
new file mode 100644
index 0000000..02a0def
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h
@@ -0,0 +1,163 @@
+/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2009, 2010 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* For mpn/x86/pentium/mod_1.asm */
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+
+
+/* 233MHz P55 */
+
+#define MOD_1_NORM_THRESHOLD 5
+#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 12
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 11
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 63
+#define USE_PREINV_DIVREM_1 0
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 51
+
+#define MUL_TOOM22_THRESHOLD 16
+#define MUL_TOOM33_THRESHOLD 53
+#define MUL_TOOM44_THRESHOLD 128
+#define MUL_TOOM6H_THRESHOLD 189
+#define MUL_TOOM8H_THRESHOLD 260
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 90
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 20
+#define SQR_TOOM3_THRESHOLD 73
+#define SQR_TOOM4_THRESHOLD 178
+#define SQR_TOOM6_THRESHOLD 210
+#define SQR_TOOM8_THRESHOLD 375
+
+#define MULMOD_BNM1_THRESHOLD 11
+#define SQRMOD_BNM1_THRESHOLD 12
+
+#define MUL_FFT_MODF_THRESHOLD 364 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 364, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \
+ { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 15, 6}, \
+ { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
+ { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
+ { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \
+ { 47,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
+ { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
+ { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159, 8}, { 319, 9}, { 167,10}, \
+ { 95, 9}, { 191, 8}, { 383,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \
+ { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 271, 9}, \
+ { 543,10}, { 287,11}, { 159,10}, { 351,11}, \
+ { 191,10}, { 415,11}, { 223,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 287,10}, { 575,11}, \
+ { 351,12}, { 191,11}, { 415,13}, { 127,12}, \
+ { 255,11}, { 575,12}, { 319,11}, { 703,12}, \
+ { 383,11}, { 831,12}, { 447,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 90
+#define MUL_FFT_THRESHOLD 3520
+
+#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 340, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 17, 7}, { 9, 6}, { 21, 7}, { 11, 6}, \
+ { 23, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
+ { 11, 7}, { 29, 8}, { 15, 7}, { 33, 8}, \
+ { 19, 7}, { 39, 8}, { 27, 7}, { 55, 9}, \
+ { 15, 8}, { 31, 7}, { 65, 8}, { 43, 9}, \
+ { 23, 8}, { 47,10}, { 15, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \
+ { 95,10}, { 31, 9}, { 63, 8}, { 127, 9}, \
+ { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
+ { 63, 9}, { 127, 8}, { 255, 9}, { 135,10}, \
+ { 79, 9}, { 159, 8}, { 319,10}, { 95, 9}, \
+ { 191,11}, { 63,10}, { 127, 9}, { 255, 8}, \
+ { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \
+ { 575, 9}, { 303,10}, { 159, 9}, { 319,11}, \
+ { 95,10}, { 191, 9}, { 383,10}, { 207,12}, \
+ { 63,11}, { 127,10}, { 271, 9}, { 543,10}, \
+ { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \
+ { 351,11}, { 191,10}, { 415,11}, { 223,10}, \
+ { 447,12}, { 127,11}, { 255,10}, { 543,11}, \
+ { 287,10}, { 607,11}, { 351,12}, { 191,11}, \
+ { 479,13}, { 127,12}, { 255,11}, { 575,12}, \
+ { 319,11}, { 703,12}, { 383,11}, { 767,12}, \
+ { 447,13}, { 8192,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 96
+#define SQR_FFT_THRESHOLD 5504
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 48
+#define MULLO_MUL_N_THRESHOLD 6633
+
+#define DC_DIV_QR_THRESHOLD 43
+#define DC_DIVAPPR_Q_THRESHOLD 170
+#define DC_BDIV_QR_THRESHOLD 43
+#define DC_BDIV_Q_THRESHOLD 110
+
+#define INV_MULMOD_BNM1_THRESHOLD 30
+#define INV_NEWTON_THRESHOLD 177
+#define INV_APPR_THRESHOLD 171
+
+#define BINV_NEWTON_THRESHOLD 194
+#define REDC_1_TO_REDC_N_THRESHOLD 50
+
+#define MU_DIV_QR_THRESHOLD 1142
+#define MU_DIVAPPR_Q_THRESHOLD 1142
+#define MUPI_DIV_QR_THRESHOLD 90
+#define MU_BDIV_QR_THRESHOLD 942
+#define MU_BDIV_Q_THRESHOLD 1017
+
+#define MATRIX22_STRASSEN_THRESHOLD 13
+#define HGCD_THRESHOLD 92
+#define GCD_DC_THRESHOLD 283
+#define GCDEXT_DC_THRESHOLD 221
+#define JACOBI_BASE_METHOD 2
+
+#define GET_STR_DC_THRESHOLD 18
+#define GET_STR_PRECOMPUTE_THRESHOLD 31
+#define SET_STR_DC_THRESHOLD 490
+#define SET_STR_PRECOMPUTE_THRESHOLD 994
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm
new file mode 100644
index 0000000..72e3196
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm
@@ -0,0 +1,40 @@
+dnl Intel P55 mpn_hamdist -- mpn hamming distance.
+
+dnl Copyright 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P55: hamdist 12.0 cycles/limb
+
+C For reference, this code runs at 11.5 cycles/limb for popcount, which is
+C slower than the plain integer mpn/x86/pentium/popcount.asm.
+
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm
new file mode 100644
index 0000000..04b0ddc
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm
@@ -0,0 +1,463 @@
+dnl Intel P5 mpn_lshift -- mpn left shift.
+
+dnl Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.75 cycles/limb.
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size left by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the right. Return the bits shifted out at the
+C left.
+C
+C The comments in mpn_rshift apply here too.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl minimum 5, because the unrolled loop can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_lshift)
+
+ pushl %ebx
+ pushl %edi
+deflit(`FRAME',8)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_DST, %edx
+
+ movl PARAM_SRC, %ebx
+ movl PARAM_SHIFT, %ecx
+
+ cmp $UNROLL_THRESHOLD, %eax
+ jae L(unroll)
+
+ movl -4(%ebx,%eax,4), %edi C src high limb
+ decl %eax
+
+ jnz L(simple)
+
+ shldl( %cl, %edi, %eax) C eax was decremented to zero
+
+ shll %cl, %edi
+
+ movl %edi, (%edx) C dst low limb
+ popl %edi C risk of data cache bank clash
+
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(simple):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx,%eax,4), %mm5 C src high limb
+
+ movd %ecx, %mm6 C lshift
+ negl %ecx
+
+ psllq %mm6, %mm5
+ addl $32, %ecx
+
+ movd %ecx, %mm7
+ psrlq $32, %mm5 C retval
+
+
+L(simple_top):
+ C eax counter, limbs, negative
+ C ebx src
+ C ecx
+ C edx dst
+ C esi
+ C edi
+ C
+ C mm0 scratch
+ C mm5 return value
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%ebx,%eax,4), %mm0
+ decl %eax
+
+ psrlq %mm7, %mm0
+
+ C
+
+ movd %mm0, 4(%edx,%eax,4)
+ jnz L(simple_top)
+
+
+ movd (%ebx), %mm0
+
+ movd %mm5, %eax
+ psllq %mm6, %mm0
+
+ popl %edi
+ popl %ebx
+
+ movd %mm0, (%edx)
+
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(unroll):
+ C eax size
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd -4(%ebx,%eax,4), %mm5 C src high limb
+ leal (%ebx,%eax,4), %edi
+
+ movd %ecx, %mm6 C lshift
+ andl $4, %edi
+
+ psllq %mm6, %mm5
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process high limb separately (marked xxx) to
+ C make it so.
+ C
+ C source -8(ebx,%eax,4)
+ C |
+ C +-------+-------+-------+--
+ C | |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+ C
+ C dest
+ C -4(edx,%eax,4)
+ C |
+ C +-------+-------+--
+ C | xxx | |
+ C +-------+-------+--
+
+ movq -8(%ebx,%eax,4), %mm0 C unaligned load
+
+ psllq %mm6, %mm0
+ decl %eax
+
+ psrlq $32, %mm0
+
+ C
+
+ movd %mm0, (%edx,%eax,4)
+L(start_src_aligned):
+
+ movq -8(%ebx,%eax,4), %mm1 C src high qword
+ leal (%edx,%eax,4), %edi
+
+ andl $4, %edi
+ psrlq $32, %mm5 C return value
+
+ movq -16(%ebx,%eax,4), %mm3 C src second highest qword
+ jz L(start_dst_aligned)
+
+ C dst isn't aligned, subtract 4 to make it so, and pretend the shift
+ C is 32 bits extra. High limb of dst (marked xxx) handled here
+ C separately.
+ C
+ C source -8(ebx,%eax,4)
+ C |
+ C +-------+-------+--
+ C | mm1 |
+ C +-------+-------+--
+ C 0mod8 4mod8
+ C
+ C dest
+ C -4(edx,%eax,4)
+ C |
+ C +-------+-------+-------+--
+ C | xxx | |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+
+ movq %mm1, %mm0
+ addl $32, %ecx C new shift
+
+ psllq %mm6, %mm0
+
+ movd %ecx, %mm6
+ psrlq $32, %mm0
+
+ C wasted cycle here waiting for %mm0
+
+ movd %mm0, -4(%edx,%eax,4)
+ subl $4, %edx
+L(start_dst_aligned):
+
+
+ psllq %mm6, %mm1
+ negl %ecx C -shift
+
+ addl $64, %ecx C 64-shift
+ movq %mm3, %mm2
+
+ movd %ecx, %mm7
+ subl $8, %eax C size-8
+
+ psrlq %mm7, %mm3
+
+ por %mm1, %mm3 C mm3 ready to store
+ jc L(finish)
+
+
+ C The comments in mpn_rshift apply here too.
+
+ ALIGN(8)
+L(unroll_loop):
+ C eax counter, limbs
+ C ebx src
+ C ecx
+ C edx dst
+ C esi
+ C edi
+ C
+ C mm0
+ C mm1
+ C mm2 src qword from 16(%ebx,%eax,4)
+ C mm3 dst qword ready to store to 24(%edx,%eax,4)
+ C
+ C mm5 return value
+ C mm6 lshift
+ C mm7 rshift
+
+ movq 8(%ebx,%eax,4), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ movq %mm3, 24(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq (%ebx,%eax,4), %mm3 C
+ psllq %mm6, %mm1 C
+
+ movq %mm0, 16(%edx,%eax,4)
+ movq %mm3, %mm2 C
+
+ psrlq %mm7, %mm3 C
+ subl $4, %eax
+
+ por %mm1, %mm3 C
+ jnc L(unroll_loop)
+
+
+
+L(finish):
+ C eax -4 to -1 representing respectively 0 to 3 limbs remaining
+
+ testb $2, %al
+
+ jz L(finish_no_two)
+
+ movq 8(%ebx,%eax,4), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ movq %mm3, 24(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq %mm1, %mm2
+ movq %mm0, %mm3
+
+ subl $2, %eax
+L(finish_no_two):
+
+
+ C eax -4 or -3 representing respectively 0 or 1 limbs remaining
+ C
+ C mm2 src prev qword, from 16(%ebx,%eax,4)
+ C mm3 dst qword, for 24(%edx,%eax,4)
+
+ testb $1, %al
+ movd %mm5, %eax C retval
+
+ popl %edi
+ jz L(finish_zero)
+
+
+ C One extra src limb, destination was aligned.
+ C
+ C source ebx
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edx+12 edx+4 edx
+ C --+---------------+---------------+-------+
+ C | mm3 | | |
+ C --+---------------+---------------+-------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C One extra src limb, destination was unaligned.
+ C
+ C source ebx
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edx+12 edx+4
+ C --+---------------+---------------+
+ C | mm3 | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword at 4(%edx), and in the aligned case
+ C there's an extra limb of dst to be formed from that extra src limb
+ C left shifted.
+
+
+ movd (%ebx), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm3, 12(%edx)
+ psllq $32, %mm0
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm2, %mm0
+ psllq %mm6, %mm1
+
+ movq %mm0, 4(%edx)
+ psrlq $32, %mm1
+
+ andl $32, %ecx
+ popl %ebx
+
+ jz L(finish_one_unaligned)
+
+ movd %mm1, (%edx)
+L(finish_one_unaligned):
+
+ emms
+
+ ret
+
+
+L(finish_zero):
+
+ C No extra src limbs, destination was aligned.
+ C
+ C source ebx
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edx+8 edx
+ C --+---------------+---------------+
+ C | mm3 | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C No extra src limbs, destination was unaligned.
+ C
+ C source ebx
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edx+8 edx+4
+ C --+---------------+-------+
+ C | mm3 | |
+ C --+---------------+-------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C The movd for the unaligned case writes the same data to 4(%edx)
+ C that the movq does for the aligned case.
+
+
+ movq %mm3, 8(%edx)
+ andl $32, %ecx
+
+ psllq %mm6, %mm2
+ jz L(finish_zero_unaligned)
+
+ movq %mm2, (%edx)
+L(finish_zero_unaligned):
+
+ psrlq $32, %mm2
+ popl %ebx
+
+ movd %mm5, %eax C retval
+
+ movd %mm2, 4(%edx)
+
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm
new file mode 100644
index 0000000..4ced577
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm
@@ -0,0 +1,371 @@
+dnl Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
+
+dnl Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5: 12.0 for 32-bit multiplier
+C 7.0 for 16-bit multiplier
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+C
+C When the multiplier is 16 bits some special case MMX code is used. Small
+C multipliers might arise reasonably often from mpz_mul_ui etc. If the size
+C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
+C size==8 end up being quite close. If src isn't aligned to an 8 byte
+C boundary then one limb is processed separately with roughly a 5 cycle
+C penalty, so in that case it's say size==8 and size==9 which are close.
+C
+C Alternatives:
+C
+C MMX is not believed to be of any use for 32-bit multipliers, since for
+C instance the current method would just have to be more or less duplicated
+C for the high and low halves of the multiplier, and would probably
+C therefore run at about 14 cycles, which is slower than the plain integer
+C at 12.
+C
+C Adding the high and low MMX products using integer code seems best. An
+C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
+C any joy. Perhaps something could be done keeping the values signed and
+C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
+C perhaps not.
+C
+C Future:
+C
+C An mpn_mul_1c entrypoint would need a double carry out of the low result
+C limb in the 16-bit code, unless it could be assumed the carry fits in 16
+C bits, possibly as carry<multiplier, this being true of a big calculation
+C done piece by piece. But let's worry about that if/when mul_1c is
+C actually used.
+
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+
+ ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %edx
+
+ cmpl $1, %ecx
+ jne L(two_or_more)
+
+ C one limb only
+
+ movl PARAM_MULTIPLIER, %eax
+ movl PARAM_DST, %ecx
+
+ mull (%edx)
+
+ movl %eax, (%ecx)
+ movl %edx, %eax
+
+ ret
+
+
+L(two_or_more):
+ C eax size
+ C ebx
+ C ecx carry
+ C edx
+ C esi src
+ C edi
+ C ebp
+
+ pushl %esi FRAME_pushl()
+ pushl %edi FRAME_pushl()
+
+ movl %edx, %esi C src
+ movl PARAM_DST, %edi
+
+ movl PARAM_MULTIPLIER, %eax
+ pushl %ebx FRAME_pushl()
+
+ leal (%esi,%ecx,4), %esi C src end
+ leal (%edi,%ecx,4), %edi C dst end
+
+ negl %ecx C -size
+
+ pushl %ebp FRAME_pushl()
+ cmpl $65536, %eax
+
+ jb L(small)
+
+
+L(big):
+ xorl %ebx, %ebx C carry limb
+ sarl %ecx C -size/2
+
+ jnc L(top) C with carry flag clear
+
+
+ C size was odd, process one limb separately
+
+ mull 4(%esi,%ecx,8) C m * src[0]
+
+ movl %eax, 4(%edi,%ecx,8)
+ incl %ecx
+
+ orl %edx, %ebx C carry limb, and clear carry flag
+
+
+L(top):
+ C eax
+ C ebx carry
+ C ecx counter, negative
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp (scratch carry)
+
+ adcl $0, %ebx
+ movl (%esi,%ecx,8), %eax
+
+ mull PARAM_MULTIPLIER
+
+ movl %edx, %ebp
+ addl %eax, %ebx
+
+ adcl $0, %ebp
+ movl 4(%esi,%ecx,8), %eax
+
+ mull PARAM_MULTIPLIER
+
+ movl %ebx, (%edi,%ecx,8)
+ addl %ebp, %eax
+
+ movl %eax, 4(%edi,%ecx,8)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(top)
+
+
+ adcl $0, %ebx
+ popl %ebp
+
+ movl %ebx, %eax
+ popl %ebx
+
+ popl %edi
+ popl %esi
+
+ ret
+
+
+L(small):
+ C Special case for 16-bit multiplier.
+ C
+ C eax multiplier
+ C ebx
+ C ecx -size
+ C edx src
+ C esi src end
+ C edi dst end
+ C ebp multiplier
+
+ C size<3 not supported here. At size==3 we're already a couple of
+ C cycles faster, so there's no threshold as such, just use the MMX
+ C as soon as possible.
+
+ cmpl $-3, %ecx
+ ja L(big)
+
+ movd %eax, %mm7 C m
+ pxor %mm6, %mm6 C initial carry word
+
+ punpcklwd %mm7, %mm7 C m replicated 2 times
+ addl $2, %ecx C -size+2
+
+ punpckldq %mm7, %mm7 C m replicated 4 times
+ andl $4, %edx C test alignment, clear carry flag
+
+ movq %mm7, %mm0 C m
+ jz L(small_entry)
+
+
+ C Source is unaligned, process one limb separately.
+ C
+ C Plain integer code is used here, since it's smaller and is about
+ C the same 13 cycles as an mmx block would be.
+ C
+ C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
+ C the use of separate incl and orl.
+
+ mull -8(%esi,%ecx,4) C m * src[0]
+
+ movl %eax, -8(%edi,%ecx,4) C dst[0]
+ incl %ecx C one limb processed
+
+ movd %edx, %mm6 C initial carry
+
+ orl %eax, %eax C clear carry flag
+ jmp L(small_entry)
+
+
+C The scheduling here is quite tricky, since so many instructions have
+C pairing restrictions. In particular the js won't pair with a movd, and
+C can't be paired with an adc since it wants flags from the inc, so
+C instructions are rotated to the top of the loop to find somewhere useful
+C for it.
+C
+C Trouble has been taken to avoid overlapping successive loop iterations,
+C since that would greatly increase the size of the startup and finishup
+C code. Actually there's probably not much advantage to be had from
+C overlapping anyway, since the difficulties are mostly with pairing, not
+C with latencies as such.
+C
+C In the comments x represents the src data and m the multiplier (16
+C bits, but replicated 4 times).
+C
+C The m signs calculated in %mm3 are a loop invariant and could be held in
+C say %mm5, but that would save only one instruction and hence be no faster.
+
+L(small_top):
+ C eax l.low, then l.high
+ C ebx (h.low)
+ C ecx counter, -size+2 to 0 or 1
+ C edx (h.high)
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+ C
+ C %mm0 (high products)
+ C %mm1 (low products)
+ C %mm2 (adjust for m using x signs)
+ C %mm3 (adjust for x using m signs)
+ C %mm4
+ C %mm5
+ C %mm6 h.low, then carry
+ C %mm7 m replicated 4 times
+
+ movd %mm6, %ebx C h.low
+ psrlq $32, %mm1 C l.high
+
+ movd %mm0, %edx C h.high
+ movq %mm0, %mm6 C new c
+
+ adcl %eax, %ebx
+ incl %ecx
+
+ movd %mm1, %eax C l.high
+ movq %mm7, %mm0
+
+ adcl %eax, %edx
+ movl %ebx, -16(%edi,%ecx,4)
+
+ movl %edx, -12(%edi,%ecx,4)
+ psrlq $32, %mm6 C c
+
+L(small_entry):
+ pmulhw -8(%esi,%ecx,4), %mm0 C h = (x*m).high
+ movq %mm7, %mm1
+
+ pmullw -8(%esi,%ecx,4), %mm1 C l = (x*m).low
+ movq %mm7, %mm3
+
+ movq -8(%esi,%ecx,4), %mm2 C x
+ psraw $15, %mm3 C m signs
+
+ pand -8(%esi,%ecx,4), %mm3 C x selected by m signs
+ psraw $15, %mm2 C x signs
+
+ paddw %mm3, %mm0 C add x to h if m neg
+ pand %mm7, %mm2 C m selected by x signs
+
+ paddw %mm2, %mm0 C add m to h if x neg
+ incl %ecx
+
+ movd %mm1, %eax C l.low
+ punpcklwd %mm0, %mm6 C c + h.low << 16
+
+ psrlq $16, %mm0 C h.high
+ js L(small_top)
+
+
+
+
+ movd %mm6, %ebx C h.low
+ psrlq $32, %mm1 C l.high
+
+ adcl %eax, %ebx
+ popl %ebp FRAME_popl()
+
+ movd %mm0, %edx C h.high
+ psrlq $32, %mm0 C l.high
+
+ movd %mm1, %eax C l.high
+
+ adcl %eax, %edx
+ movl %ebx, -12(%edi,%ecx,4)
+
+ movd %mm0, %eax C c
+
+ adcl $0, %eax
+ movl %edx, -8(%edi,%ecx,4)
+
+ orl %ecx, %ecx
+ jnz L(small_done) C final %ecx==1 means even, ==0 odd
+
+
+ C Size odd, one extra limb to process.
+ C Plain integer code is used here, since it's smaller and is about
+ C the same speed as another mmx block would be.
+
+ movl %eax, %ecx
+ movl PARAM_MULTIPLIER, %eax
+
+ mull -4(%esi)
+
+ addl %ecx, %eax
+
+ adcl $0, %edx
+ movl %eax, -4(%edi)
+
+ movl %edx, %eax
+L(small_done):
+ popl %ebx
+
+ popl %edi
+ popl %esi
+
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm
new file mode 100644
index 0000000..e3b274b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm
@@ -0,0 +1,468 @@
+dnl Intel P5 mpn_rshift -- mpn right shift.
+
+dnl Copyright 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.75 cycles/limb.
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size right by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the left. Return the bits shifted out at the
+C right.
+C
+C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
+C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
+C
+C Full speed depends on source and destination being aligned. Unaligned mmx
+C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy
+C setups and finish-ups are done to ensure alignment for the loop.
+C
+C MMX shifts work out a bit faster even for the simple loop.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl Minimum 5, because the unrolled loop can't handle less.
+deflit(UNROLL_THRESHOLD, 5)
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_rshift)
+
+ pushl %ebx
+ pushl %edi
+deflit(`FRAME',8)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_DST, %edx
+
+ movl PARAM_SRC, %ebx
+ movl PARAM_SHIFT, %ecx
+
+ cmp $UNROLL_THRESHOLD, %eax
+ jae L(unroll)
+
+ decl %eax
+ movl (%ebx), %edi C src low limb
+
+ jnz L(simple)
+
+ shrdl( %cl, %edi, %eax) C eax was decremented to zero
+
+ shrl %cl, %edi
+
+ movl %edi, (%edx) C dst low limb
+ popl %edi C risk of data cache bank clash
+
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(simple):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx), %mm5 C src[0]
+ leal (%ebx,%eax,4), %ebx C &src[size-1]
+
+ movd %ecx, %mm6 C rshift
+ leal -4(%edx,%eax,4), %edx C &dst[size-2]
+
+ psllq $32, %mm5
+ negl %eax
+
+
+C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
+C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4
+C cycles and would be 8 in a simple loop. Using mmx helps the return value
+C and last limb calculations too.
+
+L(simple_top):
+ C eax counter, limbs, negative
+ C ebx &src[size-1]
+ C ecx return value
+ C edx &dst[size-2]
+ C
+ C mm0 scratch
+ C mm5 return value
+ C mm6 shift
+
+ movq (%ebx,%eax,4), %mm0
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+ movd %mm0, (%edx,%eax,4)
+ jnz L(simple_top)
+
+
+ movd (%ebx), %mm0
+ psrlq %mm6, %mm5 C return value
+
+ psrlq %mm6, %mm0
+ popl %edi
+
+ movd %mm5, %eax
+ popl %ebx
+
+ movd %mm0, 4(%edx)
+
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(unroll):
+ C eax size
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx), %mm5 C src[0]
+ movl $4, %edi
+
+ movd %ecx, %mm6 C rshift
+ testl %edi, %ebx
+
+ psllq $32, %mm5
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process low limb separately (marked xxx) and
+ C step src and dst by one limb, making src aligned.
+ C
+ C source ebx
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+ C
+ C dest edx
+ C --+-------+-------+
+ C | | xxx |
+ C --+-------+-------+
+
+ movq (%ebx), %mm0 C unaligned load
+
+ psrlq %mm6, %mm0
+ addl $4, %ebx
+
+ decl %eax
+
+ movd %mm0, (%edx)
+ addl $4, %edx
+L(start_src_aligned):
+
+
+ movq (%ebx), %mm1
+ testl %edi, %edx
+
+ psrlq %mm6, %mm5 C retval
+ jz L(start_dst_aligned)
+
+ C dst isn't aligned, add 4 to make it so, and pretend the shift is
+ C 32 bits extra. Low limb of dst (marked xxx) handled here
+ C separately.
+ C
+ C source ebx
+ C --+-------+-------+
+ C | mm1 |
+ C --+-------+-------+
+ C 4mod8 0mod8
+ C
+ C dest edx
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+
+ movq %mm1, %mm0
+ addl $32, %ecx C new shift
+
+ psrlq %mm6, %mm0
+
+ movd %ecx, %mm6
+
+ movd %mm0, (%edx)
+ addl $4, %edx
+L(start_dst_aligned):
+
+
+ movq 8(%ebx), %mm3
+ negl %ecx
+
+ movq %mm3, %mm2 C mm2 src qword
+ addl $64, %ecx
+
+ movd %ecx, %mm7
+ psrlq %mm6, %mm1
+
+ leal -12(%ebx,%eax,4), %ebx
+ leal -20(%edx,%eax,4), %edx
+
+ psllq %mm7, %mm3
+ subl $7, %eax C size-7
+
+ por %mm1, %mm3 C mm3 ready to store
+ negl %eax C -(size-7)
+
+ jns L(finish)
+
+
+ C This loop is the important bit, the rest is just support. Careful
+ C instruction scheduling achieves the claimed 1.75 c/l. The
+ C relevant parts of the pairing rules are:
+ C
+ C - mmx loads and stores execute only in the U pipe
+ C - only one mmx shift in a pair
+ C - wait one cycle before storing an mmx register result
+ C - the usual address generation interlock
+ C
+ C Two qword calculations are slightly interleaved. The instructions
+ C marked "C" belong to the second qword, and the "C prev" one is for
+ C the second qword from the previous iteration.
+
+ ALIGN(8)
+L(unroll_loop):
+ C eax counter, limbs, negative
+ C ebx &src[size-12]
+ C ecx
+ C edx &dst[size-12]
+ C esi
+ C edi
+ C
+ C mm0
+ C mm1
+ C mm2 src qword from -8(%ebx,%eax,4)
+ C mm3 dst qword ready to store to -8(%edx,%eax,4)
+ C
+ C mm5 return value
+ C mm6 rshift
+ C mm7 lshift
+
+ movq (%ebx,%eax,4), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, -8(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq 8(%ebx,%eax,4), %mm3 C
+ psrlq %mm6, %mm1 C
+
+ movq %mm0, (%edx,%eax,4)
+ movq %mm3, %mm2 C
+
+ psllq %mm7, %mm3 C
+ addl $4, %eax
+
+ por %mm1, %mm3 C
+ js L(unroll_loop)
+
+
+L(finish):
+ C eax 0 to 3 representing respectively 3 to 0 limbs remaining
+
+ testb $2, %al
+
+ jnz L(finish_no_two)
+
+ movq (%ebx,%eax,4), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, -8(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq %mm1, %mm2
+ movq %mm0, %mm3
+
+ addl $2, %eax
+L(finish_no_two):
+
+
+ C eax 2 or 3 representing respectively 1 or 0 limbs remaining
+ C
+ C mm2 src prev qword, from -8(%ebx,%eax,4)
+ C mm3 dst qword, for -8(%edx,%eax,4)
+
+ testb $1, %al
+ popl %edi
+
+ movd %mm5, %eax C retval
+ jnz L(finish_zero)
+
+
+ C One extra limb, destination was aligned.
+ C
+ C source ebx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edx
+ C +-------+---------------+---------------+--
+ C | | | mm3 |
+ C +-------+---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C One extra limb, destination was unaligned.
+ C
+ C source ebx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edx
+ C +---------------+---------------+--
+ C | | mm3 |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword at 8(%edx), and in the aligned case
+ C there's a further extra limb of dst to be formed.
+
+
+ movd 8(%ebx), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, (%edx)
+ por %mm2, %mm0
+
+ psrlq %mm6, %mm1
+ andl $32, %ecx
+
+ popl %ebx
+ jz L(finish_one_unaligned)
+
+ C dst was aligned, must store one extra limb
+ movd %mm1, 16(%edx)
+L(finish_one_unaligned):
+
+ movq %mm0, 8(%edx)
+
+ emms
+
+ ret
+
+
+L(finish_zero):
+
+ C No extra limbs, destination was aligned.
+ C
+ C source ebx
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edx+4
+ C +---------------+---------------+--
+ C | | mm3 |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C No extra limbs, destination was unaligned.
+ C
+ C source ebx
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edx+4
+ C +-------+---------------+--
+ C | | mm3 |
+ C +-------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = 64-(shift+32)
+
+
+ C The movd for the unaligned case is clearly the same data as the
+ C movq for the aligned case, it's just a choice between whether one
+ C or two limbs should be written.
+
+
+ movq %mm3, 4(%edx)
+ psrlq %mm6, %mm2
+
+ movd %mm2, 12(%edx)
+ andl $32, %ecx
+
+ popl %ebx
+ jz L(finish_zero_unaligned)
+
+ movq %mm2, 12(%edx)
+L(finish_zero_unaligned):
+
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mod_34lsub1.asm
new file mode 100644
index 0000000..2d88223
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mod_34lsub1.asm
@@ -0,0 +1,192 @@
+dnl Intel P5 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
+
+dnl Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.66 cycles/limb
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %edx
+
+ subl $2, %ecx
+ ja L(three_or_more)
+
+ movl (%edx), %eax
+ jne L(one)
+
+
+ movl 4(%edx), %ecx
+ movl %eax, %edx
+
+ shrl $24, %edx
+ andl $0xFFFFFF, %eax
+
+ addl %edx, %eax
+ movl %ecx, %edx
+
+ shrl $16, %ecx
+ andl $0xFFFF, %edx
+
+ shll $8, %edx
+ addl %ecx, %eax
+
+ addl %edx, %eax
+
+L(one):
+ ret
+
+
+L(three_or_more):
+ C eax
+ C ebx
+ C ecx size-2
+ C edx src
+ C esi
+ C edi
+ C ebp
+
+ pushl %ebx FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+ pushl %edi FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+
+ xorl %esi, %esi C 0mod3
+ xorl %edi, %edi C 1mod3
+
+ xorl %ebp, %ebp C 2mod3, and clear carry
+
+L(top):
+ C eax scratch
+ C ebx scratch
+ C ecx counter, limbs
+ C edx src
+ C esi 0mod3
+ C edi 1mod3
+ C ebp 2mod3
+
+ movl (%edx), %eax
+ movl 4(%edx), %ebx
+
+ adcl %eax, %esi
+ movl 8(%edx), %eax
+
+ adcl %ebx, %edi
+ leal 12(%edx), %edx
+
+ adcl %eax, %ebp
+ leal -2(%ecx), %ecx
+
+ decl %ecx
+ jg L(top)
+
+
+ C ecx is -2, -1 or 0, representing 0, 1 or 2 more limbs, respectively
+
+ movl $0xFFFFFFFF, %ebx C mask
+ incl %ecx
+
+ js L(combine) C 0 more
+
+ movl (%edx), %eax
+ movl $0xFFFFFF00, %ebx
+
+ adcl %eax, %esi
+ decl %ecx
+
+ js L(combine) C 1 more
+
+ movl 4(%edx), %eax
+ movl $0xFFFF0000, %ebx
+
+ adcl %eax, %edi
+
+
+
+L(combine):
+ C eax
+ C ebx mask
+ C ecx
+ C edx
+ C esi 0mod3
+ C edi 1mod3
+ C ebp 2mod3
+
+ sbbl %ecx, %ecx C carry
+ movl %esi, %eax C 0mod3
+
+ andl %ebx, %ecx C masked for position
+ andl $0xFFFFFF, %eax C 0mod3 low
+
+ shrl $24, %esi C 0mod3 high
+ subl %ecx, %eax C apply carry
+
+ addl %esi, %eax C apply 0mod3
+ movl %edi, %ebx C 1mod3
+
+ shrl $16, %edi C 1mod3 high
+ andl $0x0000FFFF, %ebx
+
+ shll $8, %ebx C 1mod3 low
+ addl %edi, %eax C apply 1mod3 high
+
+ addl %ebx, %eax C apply 1mod3 low
+ movl %ebp, %ebx C 2mod3
+
+ shrl $8, %ebp C 2mod3 high
+ andl $0xFF, %ebx
+
+ shll $16, %ebx C 2mod3 low
+ addl %ebp, %eax C apply 2mod3 high
+
+ addl %ebx, %eax C apply 2mod3 low
+
+ popl %ebp
+ popl %edi
+
+ popl %esi
+ popl %ebx
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mode1o.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mode1o.asm
new file mode 100644
index 0000000..a90abca
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mode1o.asm
@@ -0,0 +1,279 @@
+dnl Intel Pentium mpn_modexact_1_odd -- exact division style remainder.
+
+dnl Copyright 2000-2002, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 23.0 cycles/limb
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C
+C There seems no way to pair up the two lone instructions in the main loop.
+C
+C The special case for size==1 saves about 20 cycles (non-PIC), making it
+C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at
+C all sizes.
+C
+C Alternatives:
+C
+C Using mmx for the multiplies might be possible, with pmullw and pmulhw
+C having just 3 cycle latencies, but carry bit handling would probably be
+C complicated.
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+dnl re-using parameter space
+define(VAR_INVERSE,`PARAM_SIZE')
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ movl PARAM_CARRY, %edx
+
+ jmp L(start_1c)
+
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ xorl %edx, %edx C carry
+
+L(start_1c):
+
+ifdef(`PIC',`
+ifdef(`DARWIN',`
+ shrl %eax C d/2
+ LEA( binvert_limb_table, %ecx)
+ pushl %ebx FRAME_pushl()
+ movl PARAM_SIZE, %ebx
+
+ andl $127, %eax
+ subl $2, %ebx
+
+ movb (%eax,%ecx), %cl
+ jc L(one_limb)
+',`
+ call L(here) FRAME_pushl()
+L(here):
+
+ shrl %eax C d/2
+ movl (%esp), %ecx C eip
+
+ addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx
+ movl %ebx, (%esp) C push ebx
+
+ andl $127, %eax
+ movl PARAM_SIZE, %ebx
+
+ movl binvert_limb_table@GOT(%ecx), %ecx
+ subl $2, %ebx
+
+ movb (%eax,%ecx), %cl C inv 8 bits
+ jc L(one_limb)
+')
+',`
+dnl non-PIC
+ shrl %eax C d/2
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SIZE, %ebx
+ andl $127, %eax
+
+ subl $2, %ebx
+ jc L(one_limb)
+
+ movb binvert_limb_table(%eax), %cl C inv 8 bits
+')
+
+ movl %ecx, %eax
+ addl %ecx, %ecx C 2*inv
+
+ imull %eax, %eax C inv*inv
+
+ imull PARAM_DIVISOR, %eax C inv*inv*d
+
+ subl %eax, %ecx C inv = 2*inv - inv*inv*d
+
+ movl %ecx, %eax
+ addl %ecx, %ecx C 2*inv
+
+ imull %eax, %eax C inv*inv
+
+ imull PARAM_DIVISOR, %eax C inv*inv*d
+
+ subl %eax, %ecx C inv = 2*inv - inv*inv*d
+ pushl %esi FRAME_pushl()
+
+ ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
+ movl %ecx, %eax
+ imull PARAM_DIVISOR, %eax
+ cmpl $1, %eax')
+
+ movl PARAM_SRC, %esi
+ movl %ecx, VAR_INVERSE
+
+ movl (%esi), %eax C src[0]
+ leal 4(%esi,%ebx,4), %esi C &src[size-1]
+
+ xorl $-1, %ebx C -(size-1)
+ ASSERT(nz)
+ jmp L(entry)
+
+
+C The use of VAR_INVERSE means only a store is needed for that value, rather
+C than a push and pop of say %edi.
+
+ ALIGN(16)
+L(top):
+ C eax scratch, low product
+ C ebx counter, limbs, negative
+ C ecx carry bit
+ C edx scratch, high product
+ C esi &src[size-1]
+ C edi
+ C ebp
+
+ mull PARAM_DIVISOR C h:dummy = q*d
+
+ movl (%esi,%ebx,4), %eax C src[i]
+ subl %ecx, %edx C h -= -c
+
+L(entry):
+ subl %edx, %eax C s = src[i] - h
+
+ sbbl %ecx, %ecx C new -c (0 or -1)
+
+ imull VAR_INVERSE, %eax C q = s*i
+
+ incl %ebx
+ jnz L(top)
+
+
+ mull PARAM_DIVISOR
+
+ movl (%esi), %eax C src high
+ subl %ecx, %edx C h -= -c
+
+ cmpl PARAM_DIVISOR, %eax
+
+ jbe L(skip_last)
+deflit(FRAME_LAST,FRAME)
+
+
+ subl %edx, %eax C s = src[i] - h
+ popl %esi FRAME_popl()
+
+ sbbl %ecx, %ecx C c (0 or -1)
+ popl %ebx FRAME_popl()
+
+ imull VAR_INVERSE, %eax C q = s*i
+
+ mull PARAM_DIVISOR C h:dummy = q*d
+
+ movl %edx, %eax
+
+ subl %ecx, %eax
+
+ ret
+
+
+C When high<divisor can skip last step.
+
+L(skip_last):
+deflit(`FRAME',FRAME_LAST)
+ C eax src high
+ C ebx
+ C ecx
+ C edx r
+ C esi
+
+ subl %eax, %edx C r-s
+ popl %esi FRAME_popl()
+
+ sbbl %eax, %eax C -1 if underflow
+ movl PARAM_DIVISOR, %ebx
+
+ andl %ebx, %eax C divisor if underflow
+ popl %ebx FRAME_popl()
+
+ addl %edx, %eax C addback if underflow
+
+ ret
+
+
+C Special case for size==1 using a division for r = c-a mod d.
+C Could look for a-c<d and save a division sometimes, but that doesn't seem
+C worth bothering about.
+
+L(one_limb):
+deflit(`FRAME',4)
+ C eax
+ C ebx size-2 (==-1)
+ C ecx
+ C edx carry
+ C esi src end
+ C edi
+ C ebp
+
+ movl %edx, %eax
+ movl PARAM_SRC, %edx
+
+ movl PARAM_DIVISOR, %ecx
+ popl %ebx FRAME_popl()
+
+ subl (%edx), %eax C c-a
+
+ sbbl %edx, %edx
+ decl %ecx C d-1
+
+ andl %ecx, %edx C b*d+c-a if c<a, or c-a if c>=a
+
+ divl PARAM_DIVISOR
+
+ movl %edx, %eax
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_1.asm
new file mode 100644
index 0000000..a0858af
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_1.asm
@@ -0,0 +1,177 @@
+dnl Intel Pentium mpn_mul_1 -- mpn by limb multiplication.
+
+dnl Copyright 1992, 1994, 1996, 1999, 2000, 2002 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 12.0 cycles/limb
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier, mp_limb_t carry);
+C
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_mul_1c)
+deflit(`FRAME',0)
+
+ movl PARAM_CARRY, %ecx
+ pushl %esi FRAME_pushl()
+
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+ xorl %ecx, %ecx
+ pushl %esi FRAME_pushl()
+
+L(start_1c):
+ movl PARAM_SRC, %esi
+ movl PARAM_SIZE, %eax
+
+ shrl %eax
+ jnz L(two_or_more)
+
+
+ C one limb only
+
+ movl (%esi), %eax
+
+ mull PARAM_MULTIPLIER
+
+ addl %eax, %ecx
+ movl PARAM_DST, %eax
+
+ adcl $0, %edx
+ popl %esi
+
+ movl %ecx, (%eax)
+ movl %edx, %eax
+
+ ret
+
+
+L(two_or_more):
+ C eax size/2
+ C ebx
+ C ecx carry
+ C edx
+ C esi src
+ C edi
+ C ebp
+
+ pushl %edi FRAME_pushl()
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_DST, %edi
+ leal -1(%eax), %ebx C size/2-1
+
+ notl %ebx C -size, preserve carry
+
+ leal (%esi,%eax,8), %esi C src end
+ leal (%edi,%eax,8), %edi C dst end
+
+ pushl %ebp FRAME_pushl()
+ jnc L(top)
+
+
+ C size was odd, process one limb separately
+
+ movl (%esi,%ebx,8), %eax
+ addl $4, %esi
+
+ mull PARAM_MULTIPLIER
+
+ addl %ecx, %eax
+ movl %edx, %ecx
+
+ movl %eax, (%edi,%ebx,8)
+ leal 4(%edi), %edi
+
+
+L(top):
+ C eax
+ C ebx counter, negative
+ C ecx carry
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp
+
+ adcl $0, %ecx
+ movl (%esi,%ebx,8), %eax
+
+ mull PARAM_MULTIPLIER
+
+ movl %edx, %ebp
+ addl %eax, %ecx
+
+ adcl $0, %ebp
+ movl 4(%esi,%ebx,8), %eax
+
+ mull PARAM_MULTIPLIER
+
+ movl %ecx, (%edi,%ebx,8)
+ addl %ebp, %eax
+
+ movl %eax, 4(%edi,%ebx,8)
+ incl %ebx
+
+ movl %edx, %ecx
+ jnz L(top)
+
+
+ adcl $0, %ecx
+ popl %ebp
+
+ movl %ecx, %eax
+ popl %ebx
+
+ popl %edi
+ popl %esi
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mul_2.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_2.asm
new file mode 100644
index 0000000..4c7beb5
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_2.asm
@@ -0,0 +1,150 @@
+dnl Intel Pentium mpn_mul_2 -- mpn by 2-limb multiplication.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 24.0 cycles/limb
+
+
+C mp_limb_t mpn_mul_2 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_srcptr mult);
+C
+C At 24 c/l this is only 2 cycles faster than a separate mul_1 and addmul_1,
+C but has the advantage of making just one pass over the operands.
+C
+C There's not enough registers to use PARAM_MULT directly, so the multiplier
+C limbs are transferred to local variables on the stack.
+
+defframe(PARAM_MULT, 16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(VAR_MULT_LOW, `PARAM_SRC')
+define(VAR_MULT_HIGH,`PARAM_DST')
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_mul_2)
+deflit(`FRAME',0)
+
+ pushl %esi FRAME_pushl()
+ pushl %edi FRAME_pushl()
+
+ movl PARAM_SRC, %esi
+ movl PARAM_DST, %edi
+
+ movl PARAM_MULT, %eax
+ movl PARAM_SIZE, %ecx
+
+ movl 4(%eax), %edx C mult high
+ movl (%eax), %eax C mult low
+
+ movl %eax, VAR_MULT_LOW
+ movl %edx, VAR_MULT_HIGH
+
+ pushl %ebx FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+
+ mull (%esi) C src[0] * mult[0]
+
+ movl %eax, %ebp C in case src==dst
+ movl (%esi), %eax C src[0]
+
+ movl %ebp, (%edi) C dst[0]
+ movl %edx, %ebx C initial low carry
+
+ xorl %ebp, %ebp C initial high carry
+ leal (%edi,%ecx,4), %edi C dst end
+
+ mull VAR_MULT_HIGH C src[0] * mult[1]
+
+ subl $2, %ecx C size-2
+ js L(done)
+
+ leal 8(%esi,%ecx,4), %esi C &src[size]
+ xorl $-1, %ecx C -(size-1)
+
+
+
+L(top):
+ C eax low prod
+ C ebx low carry
+ C ecx counter, negative
+ C edx high prod
+ C esi src end
+ C edi dst end
+ C ebp high carry (0 or -1)
+
+ andl $1, %ebp C 1 or 0
+ addl %eax, %ebx
+
+ adcl %edx, %ebp
+ ASSERT(nc)
+ movl (%esi,%ecx,4), %eax
+
+ mull VAR_MULT_LOW
+
+ addl %eax, %ebx C low carry
+ movl (%esi,%ecx,4), %eax
+
+ adcl %ebp, %edx C high carry
+ movl %ebx, (%edi,%ecx,4)
+
+ sbbl %ebp, %ebp C new high carry, -1 or 0
+ movl %edx, %ebx C new low carry
+
+ mull VAR_MULT_HIGH
+
+ incl %ecx
+ jnz L(top)
+
+
+L(done):
+ andl $1, %ebp C 1 or 0
+ addl %ebx, %eax
+
+ adcl %ebp, %edx
+ ASSERT(nc)
+ movl %eax, (%edi) C store carry low
+
+ movl %edx, %eax C return carry high
+
+ popl %ebp
+ popl %ebx
+
+ popl %edi
+ popl %esi
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm
new file mode 100644
index 0000000..e1d0f05
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm
@@ -0,0 +1,142 @@
+dnl Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication.
+
+dnl Copyright 1996, 1998-2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 14.2 cycles/crossproduct (approx)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+
+defframe(PARAM_YSIZE, 20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE, 12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+defframe(VAR_COUNTER, -4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_mul_basecase)
+
+ pushl %eax C dummy push for allocating stack slot
+ pushl %esi
+ pushl %ebp
+ pushl %edi
+deflit(`FRAME',16)
+
+ movl PARAM_XP,%esi
+ movl PARAM_WP,%edi
+ movl PARAM_YP,%ebp
+
+ movl (%esi),%eax C load xp[0]
+ mull (%ebp) C multiply by yp[0]
+ movl %eax,(%edi) C store to wp[0]
+ movl PARAM_XSIZE,%ecx C xsize
+ decl %ecx C If xsize = 1, ysize = 1 too
+ jz L(done)
+
+ movl PARAM_XSIZE,%eax
+ pushl %ebx
+FRAME_pushl()
+ movl %edx,%ebx
+ leal (%esi,%eax,4),%esi C make xp point at end
+ leal (%edi,%eax,4),%edi C offset wp by xsize
+ negl %ecx C negate j size/index for inner loop
+ xorl %eax,%eax C clear carry
+
+ ALIGN(8)
+L(oop1): adcl $0,%ebx
+ movl (%esi,%ecx,4),%eax C load next limb at xp[j]
+ mull (%ebp)
+ addl %ebx,%eax
+ movl %eax,(%edi,%ecx,4)
+ incl %ecx
+ movl %edx,%ebx
+ jnz L(oop1)
+
+ adcl $0,%ebx
+ movl PARAM_YSIZE,%eax
+ movl %ebx,(%edi) C most significant limb of product
+ addl $4,%edi C increment wp
+ decl %eax
+ jz L(skip)
+ movl %eax,VAR_COUNTER C set index i to ysize
+
+L(outer):
+ addl $4,%ebp C make ebp point to next y limb
+ movl PARAM_XSIZE,%ecx
+ negl %ecx
+ xorl %ebx,%ebx
+
+ C code at 0x61 here, close enough to aligned
+L(oop2):
+ adcl $0,%ebx
+ movl (%esi,%ecx,4),%eax
+ mull (%ebp)
+ addl %ebx,%eax
+ movl (%edi,%ecx,4),%ebx
+ adcl $0,%edx
+ addl %eax,%ebx
+ movl %ebx,(%edi,%ecx,4)
+ incl %ecx
+ movl %edx,%ebx
+ jnz L(oop2)
+
+ adcl $0,%ebx
+
+ movl %ebx,(%edi)
+ addl $4,%edi
+ movl VAR_COUNTER,%eax
+ decl %eax
+ movl %eax,VAR_COUNTER
+ jnz L(outer)
+
+L(skip):
+ popl %ebx
+ popl %edi
+ popl %ebp
+ popl %esi
+ addl $4,%esp
+ ret
+
+L(done):
+ movl %edx,4(%edi) C store to wp[1]
+ popl %edi
+ popl %ebp
+ popl %esi
+ popl %eax C dummy pop for deallocating stack slot
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/popcount.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/popcount.asm
new file mode 100644
index 0000000..0e82144
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/popcount.asm
@@ -0,0 +1,146 @@
+dnl Intel P5 mpn_popcount -- mpn bit population count.
+
+dnl Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 8.0 cycles/limb
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C
+C An arithmetic approach has been found to be slower than the table lookup,
+C due to needing too many instructions.
+
+C The slightly strange quoting here helps the renaming done by tune/many.pl.
+deflit(TABLE_NAME,
+m4_assert_defined(`GSYM_PREFIX')
+GSYM_PREFIX`'mpn_popcount``'_table')
+
+C FIXME: exporting the table to hamdist is incorrect as it hurt incremental
+C linking.
+
+ RODATA
+ ALIGN(8)
+ GLOBL TABLE_NAME
+TABLE_NAME:
+forloop(i,0,255,
+` .byte m4_popcount(i)
+')
+
+defframe(PARAM_SIZE,8)
+defframe(PARAM_SRC, 4)
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_popcount)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %esi FRAME_pushl()
+
+ifdef(`PIC',`
+ pushl %ebx FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+ifdef(`DARWIN',`
+ shll %ecx C size in byte pairs
+ LEA( TABLE_NAME, %ebp)
+ movl PARAM_SRC, %esi
+ xorl %eax, %eax C total
+ xorl %ebx, %ebx C byte
+ xorl %edx, %edx C byte
+',`
+ call L(here)
+L(here):
+ popl %ebp
+ shll %ecx C size in byte pairs
+
+ addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
+ movl PARAM_SRC, %esi
+
+ xorl %eax, %eax C total
+ xorl %ebx, %ebx C byte
+
+ movl TABLE_NAME@GOT(%ebp), %ebp
+ xorl %edx, %edx C byte
+')
+define(TABLE,`(%ebp,$1)')
+',`
+dnl non-PIC
+ shll %ecx C size in byte pairs
+ movl PARAM_SRC, %esi
+
+ pushl %ebx FRAME_pushl()
+ xorl %eax, %eax C total
+
+ xorl %ebx, %ebx C byte
+ xorl %edx, %edx C byte
+
+define(TABLE,`TABLE_NAME`'($1)')
+')
+
+
+ ALIGN(8) C necessary on P55 for claimed speed
+L(top):
+ C eax total
+ C ebx byte
+ C ecx counter, 2*size to 2
+ C edx byte
+ C esi src
+ C edi
+ C ebp [PIC] table
+
+ addl %ebx, %eax
+ movb -1(%esi,%ecx,2), %bl
+
+ addl %edx, %eax
+ movb -2(%esi,%ecx,2), %dl
+
+ movb TABLE(%ebx), %bl
+ decl %ecx
+
+ movb TABLE(%edx), %dl
+ jnz L(top)
+
+
+ifdef(`PIC',`
+ popl %ebp
+')
+ addl %ebx, %eax
+ popl %ebx
+
+ addl %edx, %eax
+ popl %esi
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/rshift.asm
new file mode 100644
index 0000000..2105c4c
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/rshift.asm
@@ -0,0 +1,243 @@
+dnl Intel Pentium mpn_rshift -- mpn right shift.
+
+dnl Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5,P54: 6.0
+C P55: 5.375
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_rshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%ebp
+ movl PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+ cmp $1,%ecx
+ jne L(normal)
+ leal 4(%edi),%eax
+ cmpl %esi,%eax
+ jnc L(special) C jump if res_ptr + 1 >= s_ptr
+ leal (%edi,%ebp,4),%eax
+ cmpl %eax,%esi
+ jnc L(special) C jump if s_ptr >= res_ptr + size
+
+L(normal):
+ movl (%esi),%edx
+ addl $4,%esi
+ xorl %eax,%eax
+ shrdl( %cl, %edx, %eax) C compute carry limb
+ pushl %eax C push carry limb onto stack
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+ jz L(end)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(oop): movl 28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ shrdl( %cl, %eax, %ebx)
+ shrdl( %cl, %edx, %eax)
+ movl %ebx,(%edi)
+ movl %eax,4(%edi)
+
+ movl 8(%esi),%ebx
+ movl 12(%esi),%eax
+ shrdl( %cl, %ebx, %edx)
+ shrdl( %cl, %eax, %ebx)
+ movl %edx,8(%edi)
+ movl %ebx,12(%edi)
+
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ shrdl( %cl, %edx, %eax)
+ shrdl( %cl, %ebx, %edx)
+ movl %eax,16(%edi)
+ movl %edx,20(%edi)
+
+ movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ shrdl( %cl, %eax, %ebx)
+ shrdl( %cl, %edx, %eax)
+ movl %ebx,24(%edi)
+ movl %eax,28(%edi)
+
+ addl $32,%esi
+ addl $32,%edi
+ decl %ebp
+ jnz L(oop)
+
+L(end): popl %ebp
+ andl $7,%ebp
+ jz L(end2)
+L(oop2):
+ movl (%esi),%eax
+ shrdl( %cl,%eax,%edx) C compute result limb
+ movl %edx,(%edi)
+ movl %eax,%edx
+ addl $4,%esi
+ addl $4,%edi
+ decl %ebp
+ jnz L(oop2)
+
+L(end2):
+ shrl %cl,%edx C compute most significant limb
+ movl %edx,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+
+C We loop from least significant end of the arrays, which is only
+C permissable if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
+
+L(special):
+ leal -4(%edi,%ebp,4),%edi
+ leal -4(%esi,%ebp,4),%esi
+
+ movl (%esi),%edx
+ subl $4,%esi
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+
+ shrl %edx
+ incl %ebp
+ decl %ebp
+ jz L(Lend)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(Loop):
+ movl -28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl -4(%esi),%edx
+ rcrl %eax
+ movl %ebx,(%edi)
+ rcrl %edx
+ movl %eax,-4(%edi)
+
+ movl -8(%esi),%ebx
+ movl -12(%esi),%eax
+ rcrl %ebx
+ movl %edx,-8(%edi)
+ rcrl %eax
+ movl %ebx,-12(%edi)
+
+ movl -16(%esi),%edx
+ movl -20(%esi),%ebx
+ rcrl %edx
+ movl %eax,-16(%edi)
+ rcrl %ebx
+ movl %edx,-20(%edi)
+
+ movl -24(%esi),%eax
+ movl -28(%esi),%edx
+ rcrl %eax
+ movl %ebx,-24(%edi)
+ rcrl %edx
+ movl %eax,-28(%edi)
+
+ leal -32(%esi),%esi C use leal not to clobber carry
+ leal -32(%edi),%edi
+ decl %ebp
+ jnz L(Loop)
+
+L(Lend):
+ popl %ebp
+ sbbl %eax,%eax C save carry in %eax
+ andl $7,%ebp
+ jz L(Lend2)
+ addl %eax,%eax C restore carry from eax
+L(Loop2):
+ movl %edx,%ebx
+ movl (%esi),%edx
+ rcrl %edx
+ movl %ebx,(%edi)
+
+ leal -4(%esi),%esi C use leal not to clobber carry
+ leal -4(%edi),%edi
+ decl %ebp
+ jnz L(Loop2)
+
+ jmp L(L1)
+L(Lend2):
+ addl %eax,%eax C restore carry from eax
+L(L1): movl %edx,(%edi) C store last limb
+
+ movl $0,%eax
+ rcrl %eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm
new file mode 100644
index 0000000..b11d767
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm
@@ -0,0 +1,528 @@
+dnl Intel P5 mpn_sqr_basecase -- square an mpn number.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
+C product at around 20x20 limbs.
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Calculate src,size squared, storing the result in dst,2*size.
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the size is
+C small.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+ movl PARAM_SRC, %eax
+
+ cmpl $2, %edx
+ movl PARAM_DST, %ecx
+
+ je L(two_limbs)
+
+ movl (%eax), %eax
+ ja L(three_or_more)
+
+C -----------------------------------------------------------------------------
+C one limb only
+ C eax src
+ C ebx
+ C ecx dst
+ C edx
+
+ mull %eax
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ ret
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx dst
+ C edx size
+
+ pushl %ebp
+ pushl %edi
+
+ pushl %esi
+ pushl %ebx
+
+ movl %eax, %ebx
+ movl (%eax), %eax
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx) C dst[0]
+ movl %edx, %esi C dst[1]
+
+ movl 4(%ebx), %eax
+
+ mull %eax C src[1]^2
+
+ movl %eax, %edi C dst[2]
+ movl %edx, %ebp C dst[3]
+
+ movl (%ebx), %eax
+
+ mull 4(%ebx) C src[0]*src[1]
+
+ addl %eax, %esi
+ popl %ebx
+
+ adcl %edx, %edi
+
+ adcl $0, %ebp
+ addl %esi, %eax
+
+ adcl %edi, %edx
+ movl %eax, 4(%ecx)
+
+ adcl $0, %ebp
+ popl %esi
+
+ movl %edx, 8(%ecx)
+ movl %ebp, 12(%ecx)
+
+ popl %edi
+ popl %ebp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(three_or_more):
+ C eax src low limb
+ C ebx
+ C ecx dst
+ C edx size
+
+ cmpl $4, %edx
+ pushl %ebx
+deflit(`FRAME',4)
+
+ movl PARAM_SRC, %ebx
+ jae L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+ C eax src low limb
+ C ebx src
+ C ecx dst
+ C edx size
+
+ pushl %ebp
+ pushl %edi
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ movl 4(%ebx), %eax
+ xorl %ebp, %ebp
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl %edx, 12(%ecx)
+
+ movl 8(%ebx), %eax
+ pushl %esi C risk of cache bank clash
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl %edx, 20(%ecx)
+
+ movl (%ebx), %eax
+
+ mull 4(%ebx) C src[0] * src[1]
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ movl (%ebx), %eax
+
+ mull 8(%ebx) C src[0] * src[2]
+
+ addl %eax, %edi
+ movl %edx, %ebp
+
+ adcl $0, %ebp
+ movl 4(%ebx), %eax
+
+ mull 8(%ebx) C src[1] * src[2]
+
+ xorl %ebx, %ebx
+ addl %eax, %ebp
+
+ C eax
+ C ebx zero, will be dst[5]
+ C ecx dst
+ C edx dst[4]
+ C esi dst[1]
+ C edi dst[2]
+ C ebp dst[3]
+
+ adcl $0, %edx
+ addl %esi, %esi
+
+ adcl %edi, %edi
+
+ adcl %ebp, %ebp
+
+ adcl %edx, %edx
+ movl 4(%ecx), %eax
+
+ adcl $0, %ebx
+ addl %esi, %eax
+
+ movl %eax, 4(%ecx)
+ movl 8(%ecx), %eax
+
+ adcl %edi, %eax
+ movl 12(%ecx), %esi
+
+ adcl %ebp, %esi
+ movl 16(%ecx), %edi
+
+ movl %eax, 8(%ecx)
+ movl %esi, 12(%ecx)
+
+ adcl %edx, %edi
+ popl %esi
+
+ movl 20(%ecx), %eax
+ movl %edi, 16(%ecx)
+
+ popl %edi
+ popl %ebp
+
+ adcl %ebx, %eax C no carry out of this
+ popl %ebx
+
+ movl %eax, 20(%ecx)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(four_or_more):
+ C eax src low limb
+ C ebx src
+ C ecx dst
+ C edx size
+ C esi
+ C edi
+ C ebp
+ C
+ C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+deflit(`FRAME',4)
+
+ pushl %edi
+FRAME_pushl()
+ pushl %esi
+FRAME_pushl()
+
+ pushl %ebp
+FRAME_pushl()
+ leal (%ecx,%edx,4), %edi C dst end of this mul1
+
+ leal (%ebx,%edx,4), %esi C src end
+ movl %ebx, %ebp C src
+
+ negl %edx C -size
+ xorl %ebx, %ebx C clear carry limb and carry flag
+
+ leal 1(%edx), %ecx C -(size-1)
+
+L(mul1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, negative
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp src
+
+ adcl $0, %ebx
+ movl (%esi,%ecx,4), %eax
+
+ mull (%ebp)
+
+ addl %eax, %ebx
+
+ movl %ebx, (%edi,%ecx,4)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(mul1)
+
+
+ C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
+ C n=1..size-2.
+ C
+ C The last two products, which are the end corner of the product
+ C triangle, are handled separately to save looping overhead. These
+ C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
+ C If size is 4 then it's only these that need to be done.
+ C
+ C In the outer loop %esi is a constant, and %edi just advances by 1
+ C limb each time. The size of the operation decreases by 1 limb
+ C each time.
+
+ C eax
+ C ebx carry (needing carry flag added)
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+
+ adcl $0, %ebx
+ movl PARAM_SIZE, %edx
+
+ movl %ebx, (%edi)
+ subl $4, %edx
+
+ negl %edx
+ jz L(corner)
+
+
+L(outer):
+ C ebx previous carry limb to store
+ C edx outer loop counter (negative)
+ C esi &src[size]
+ C edi dst, pointing at stored carry limb of previous loop
+
+ pushl %edx C new outer loop counter
+ leal -2(%edx), %ecx
+
+ movl %ebx, (%edi)
+ addl $4, %edi
+
+ addl $4, %ebp
+ xorl %ebx, %ebx C initial carry limb, clear carry flag
+
+L(inner):
+ C eax scratch
+ C ebx carry (needing carry flag added)
+ C ecx counter, negative
+ C edx scratch
+ C esi &src[size]
+ C edi dst end of this addmul
+ C ebp &src[j]
+
+ adcl $0, %ebx
+ movl (%esi,%ecx,4), %eax
+
+ mull (%ebp)
+
+ addl %ebx, %eax
+ movl (%edi,%ecx,4), %ebx
+
+ adcl $0, %edx
+ addl %eax, %ebx
+
+ movl %ebx, (%edi,%ecx,4)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(inner)
+
+
+ adcl $0, %ebx
+ popl %edx C outer loop counter
+
+ incl %edx
+ jnz L(outer)
+
+
+ movl %ebx, (%edi)
+
+L(corner):
+ C esi &src[size]
+ C edi &dst[2*size-4]
+
+ movl -8(%esi), %eax
+ movl -4(%edi), %ebx C risk of data cache bank clash here
+
+ mull -12(%esi) C src[size-2]*src[size-3]
+
+ addl %eax, %ebx
+ movl %edx, %ecx
+
+ adcl $0, %ecx
+ movl -4(%esi), %eax
+
+ mull -12(%esi) C src[size-1]*src[size-3]
+
+ addl %ecx, %eax
+ movl (%edi), %ecx
+
+ adcl $0, %edx
+ movl %ebx, -4(%edi)
+
+ addl %eax, %ecx
+ movl %edx, %ebx
+
+ adcl $0, %ebx
+ movl -4(%esi), %eax
+
+ mull -8(%esi) C src[size-1]*src[size-2]
+
+ movl %ecx, (%edi)
+ addl %eax, %ebx
+
+ adcl $0, %edx
+ movl PARAM_SIZE, %eax
+
+ negl %eax
+ movl %ebx, 4(%edi)
+
+ addl $1, %eax C -(size-1) and clear carry
+ movl %edx, 8(%edi)
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+L(lshift):
+ C eax counter, negative
+ C ebx next limb
+ C ecx
+ C edx
+ C esi
+ C edi &dst[2*size-4]
+ C ebp
+
+ movl 12(%edi,%eax,8), %ebx
+
+ rcll %ebx
+ movl 16(%edi,%eax,8), %ecx
+
+ rcll %ecx
+ movl %ebx, 12(%edi,%eax,8)
+
+ movl %ecx, 16(%edi,%eax,8)
+ incl %eax
+
+ jnz L(lshift)
+
+
+ adcl %eax, %eax C high bit out
+ movl PARAM_SRC, %esi
+
+ movl PARAM_SIZE, %ecx C risk of cache bank clash
+ movl %eax, 12(%edi) C dst most significant limb
+
+
+C -----------------------------------------------------------------------------
+C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+ movl (%esi), %eax C src[0]
+ leal (%esi,%ecx,4), %esi C src end
+
+ negl %ecx
+
+ mull %eax
+
+ movl %eax, 16(%edi,%ecx,8) C dst[0]
+ movl %edx, %ebx
+
+ addl $1, %ecx C size-1 and clear carry
+
+L(diag):
+ C eax scratch (low product)
+ C ebx carry limb
+ C ecx counter, negative
+ C edx scratch (high product)
+ C esi &src[size]
+ C edi &dst[2*size-4]
+ C ebp scratch (fetched dst limbs)
+
+ movl (%esi,%ecx,4), %eax
+ adcl $0, %ebx
+
+ mull %eax
+
+ movl 16-4(%edi,%ecx,8), %ebp
+
+ addl %ebp, %ebx
+ movl 16(%edi,%ecx,8), %ebp
+
+ adcl %eax, %ebp
+ movl %ebx, 16-4(%edi,%ecx,8)
+
+ movl %ebp, 16(%edi,%ecx,8)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(diag)
+
+
+ adcl $0, %edx
+ movl 16-4(%edi), %eax C dst most significant limb
+
+ addl %eax, %edx
+ popl %ebp
+
+ movl %edx, 16-4(%edi)
+ popl %esi C risk of cache bank clash
+
+ popl %edi
+ popl %ebx
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/README b/vendor/gmp-6.3.0/mpn/x86/pentium4/README
new file mode 100644
index 0000000..90f752e
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/README
@@ -0,0 +1,124 @@
+Copyright 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+ INTEL PENTIUM-4 MPN SUBROUTINES
+
+
+This directory contains mpn functions optimized for Intel Pentium-4.
+
+The mmx subdirectory has routines using MMX instructions, the sse2
+subdirectory has routines using SSE2 instructions. All P4s have these, the
+separate directories are just so configure can omit that code if the
+assembler doesn't support it.
+
+
+STATUS
+
+ cycles/limb
+
+ mpn_add_n/sub_n 4 normal, 6 in-place
+
+ mpn_mul_1 4 normal, 6 in-place
+ mpn_addmul_1 6
+ mpn_submul_1 7
+
+ mpn_mul_basecase 6 cycles/crossproduct (approx)
+
+ mpn_sqr_basecase 3.5 cycles/crossproduct (approx)
+ or 7.0 cycles/triangleproduct (approx)
+
+ mpn_l/rshift 1.75
+
+
+
+The shifts ought to be able to go at 1.5 c/l, but not much effort has been
+applied to them yet.
+
+In-place operations, and all addmul, submul, mul_basecase and sqr_basecase
+calls, suffer from pipeline anomalies associated with write combining and
+movd reads and writes to the same or nearby locations. The movq
+instructions do not trigger the same hardware problems. Unfortunately,
+using movq and splitting/combining seems to require too many extra
+instructions to help. Perhaps future chip steppings will be better.
+
+
+
+NOTES
+
+The Pentium-4 pipeline "Netburst", provides for quite a number of surprises.
+Many traditional x86 instructions run very slowly, requiring use of
+alterative instructions for acceptable performance.
+
+adcl and sbbl are quite slow at 8 cycles for reg->reg. paddq of 32-bits
+within a 64-bit mmx register seems better, though the combination
+paddq/psrlq when propagating a carry is still a 4 cycle latency.
+
+incl and decl should be avoided, instead use add $1 and sub $1. Apparently
+the carry flag is not separately renamed, so incl and decl depend on all
+previous flags-setting instructions.
+
+shll and shrl have a 4 cycle latency, or 8 times the latency of the fastest
+integer instructions (addl, subl, orl, andl, and some more). shldl and
+shrdl seem to have 13 and 15 cycles latency, respectively. Bizarre.
+
+movq mmx -> mmx does have 6 cycle latency, as noted in the documentation.
+pxor/por or similar combination at 2 cycles latency can be used instead.
+The movq however executes in the float unit, thereby saving MMX execution
+resources. With the right juggling, data moves shouldn't be on a dependent
+chain.
+
+L1 is write-through, but the write-combining sounds like it does enough to
+not require explicit destination prefetching.
+
+xmm registers so far haven't found a use, but not much effort has been
+expended. A configure test for whether the operating system knows
+fxsave/fxrestor will be needed if they're used.
+
+
+
+REFERENCES
+
+Intel Pentium-4 processor manuals,
+
+ http://developer.intel.com/design/pentium4/manuals
+
+"Intel Pentium 4 Processor Optimization Reference Manual", Intel, 2001,
+order number 248966. Available on-line:
+
+ http://developer.intel.com/design/pentium4/manuals/248966.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/copyd.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/copyd.asm
new file mode 100644
index 0000000..82af81c
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/copyd.asm
@@ -0,0 +1,71 @@
+dnl Pentium-4 mpn_copyd -- copy limb vector, decrementing.
+
+dnl Copyright 1999-2001 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl The std/rep/movsl/cld is very slow for small blocks on pentium4. Its
+dnl startup time seems to be about 165 cycles. It then needs 2.6 c/l.
+dnl We therefore use an open-coded 2 c/l copying loop.
+
+dnl Ultimately, we may want to use 64-bit movq or 128-bit movdqu in some
+dnl nifty unrolled arrangement. Clearly, that could reach much higher
+dnl speeds, at least for large blocks.
+
+include(`../config.m4')
+
+
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+ movl %ebx, PARAM_SIZE
+ addl $-1, %ecx
+ js L(end)
+
+L(loop):
+ movl (%eax,%ecx,4), %ebx
+ movl %ebx, (%edx,%ecx,4)
+ addl $-1, %ecx
+
+ jns L(loop)
+L(end):
+ movl PARAM_SIZE, %ebx
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/copyi.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/copyi.asm
new file mode 100644
index 0000000..b614887
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/copyi.asm
@@ -0,0 +1,93 @@
+dnl Pentium-4 mpn_copyi -- copy limb vector, incrementing.
+
+dnl Copyright 1999-2001 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl The rep/movsl is very slow for small blocks on pentium4. Its startup
+dnl time seems to be about 110 cycles. It then copies at a rate of one
+dnl limb per cycle. We therefore fall back to an open-coded 2 c/l copying
+dnl loop for smaller sizes.
+
+dnl Ultimately, we may want to use 64-bit movd or 128-bit movdqu in some
+dnl nifty unrolled arrangement. Clearly, that could reach much higher
+dnl speeds, at least for large blocks.
+
+include(`../config.m4')
+
+
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_copyi)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ cmpl $150, %ecx
+ jg L(replmovs)
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+ movl %ebx, PARAM_SIZE
+ testl %ecx, %ecx
+ jz L(end)
+
+L(loop):
+ movl (%eax), %ebx
+ leal 4(%eax), %eax
+ addl $-1, %ecx
+ movl %ebx, (%edx)
+ leal 4(%edx), %edx
+
+ jnz L(loop)
+
+L(end):
+ movl PARAM_SIZE, %ebx
+ ret
+
+L(replmovs):
+ cld C better safe than sorry, see mpn/x86/README
+
+ movl %esi, %eax
+ movl PARAM_SRC, %esi
+ movl %edi, %edx
+ movl PARAM_DST, %edi
+
+ rep
+ movsl
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/lshift.asm
new file mode 100644
index 0000000..b5eca66
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/lshift.asm
@@ -0,0 +1,39 @@
+dnl Intel Pentium-4 mpn_lshift -- left shift.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4 Willamette, Northwood: 1.75 cycles/limb
+C P4 Prescott: 2.0 cycles/limb
+
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86/pentium/mmx/lshift.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/popham.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/popham.asm
new file mode 100644
index 0000000..9563cb5
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/popham.asm
@@ -0,0 +1,203 @@
+dnl Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
+dnl hamming distance.
+
+dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C popcount hamdist
+C P3 model 9 (Banias) ? ?
+C P3 model 13 (Dothan) 6 6
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood) 8 9
+C P4 model 3 (Prescott) 8 9
+C P4 model 4 (Nocona)
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
+C Two movd's and a punpckldq seems to be the same speed as an aligned movq,
+C and using them saves fiddling about with alignment testing on entry.
+C
+C For popcount there's 13 mmx instructions in the loop, so perhaps 6.5 c/l
+C might be possible, but 8 c/l relying on out-of-order execution is already
+C quite reasonable.
+
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
+')')')
+
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+
+HAM(`
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+ dnl non-PIC
+ RODATA
+ ALIGN(8)
+L(rodata_AAAAAAAAAAAAAAAA):
+ .long 0xAAAAAAAA
+ .long 0xAAAAAAAA
+L(rodata_3333333333333333):
+ .long 0x33333333
+ .long 0x33333333
+L(rodata_0F0F0F0F0F0F0F0F):
+ .long 0x0F0F0F0F
+ .long 0x0F0F0F0F
+')
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %eax
+
+ifdef(`PIC',`
+ movl $0xAAAAAAAA, %edx
+ movd %edx, %mm7
+ punpckldq %mm7, %mm7
+
+ movl $0x33333333, %edx
+ movd %edx, %mm6
+ punpckldq %mm6, %mm6
+
+ movl $0x0F0F0F0F, %edx
+ movd %edx, %mm5
+ punpckldq %mm5, %mm5
+
+HAM(` movl PARAM_SRC2, %edx')
+
+',`
+ dnl non-PIC
+HAM(` movl PARAM_SRC2, %edx')
+ movq L(rodata_AAAAAAAAAAAAAAAA), %mm7
+ movq L(rodata_3333333333333333), %mm6
+ movq L(rodata_0F0F0F0F0F0F0F0F), %mm5
+')
+
+ pxor %mm4, %mm4 C zero
+ pxor %mm0, %mm0 C total
+
+ subl $1, %ecx
+ ja L(top)
+
+L(last):
+ movd (%eax,%ecx,4), %mm1 C src high limb
+HAM(` movd (%edx,%ecx,4), %mm2
+ pxor %mm2, %mm1
+')
+ jmp L(loaded)
+
+
+L(top):
+ C eax src
+ C ebx
+ C ecx counter, size-1 to 2 or 1, inclusive
+ C edx [hamdist] src2
+ C
+ C mm0 total (low dword)
+ C mm1 (scratch)
+ C mm2 (scratch)
+ C mm3
+ C mm4 0x0000000000000000
+ C mm5 0x0F0F0F0F0F0F0F0F
+ C mm6 0x3333333333333333
+ C mm7 0xAAAAAAAAAAAAAAAA
+
+ movd (%eax), %mm1
+ movd 4(%eax), %mm2
+ punpckldq %mm2, %mm1
+ addl $8, %eax
+
+HAM(` movd (%edx), %mm2
+ movd 4(%edx), %mm3
+ punpckldq %mm3, %mm2
+ pxor %mm2, %mm1
+ addl $8, %edx
+')
+
+L(loaded):
+ movq %mm7, %mm2
+ pand %mm1, %mm2
+ psrlq $1, %mm2
+ psubd %mm2, %mm1 C bit pairs
+
+ movq %mm6, %mm2
+ pand %mm1, %mm2
+ psrlq $2, %mm1
+ pand %mm6, %mm1
+ paddd %mm2, %mm1 C nibbles
+
+ movq %mm5, %mm2
+ pand %mm1, %mm2
+ psrlq $4, %mm1
+ pand %mm5, %mm1
+ paddd %mm2, %mm1 C bytes
+
+ psadbw( %mm4, %mm1)
+ paddd %mm1, %mm0 C to total
+
+ subl $2, %ecx
+ jg L(top)
+
+ C ecx is 0 or -1 representing respectively 1 or 0 further limbs
+ jz L(last)
+
+
+ movd %mm0, %eax
+ emms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/rshift.asm
new file mode 100644
index 0000000..3ac0094
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/mmx/rshift.asm
@@ -0,0 +1,39 @@
+dnl Intel Pentium-4 mpn_rshift -- right shift.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4 Willamette, Northwood: 1.75 cycles/limb
+C P4 Prescott: 2.0 cycles/limb
+
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86/pentium/mmx/rshift.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm
new file mode 100644
index 0000000..8e2380e
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm
@@ -0,0 +1,101 @@
+dnl Intel Pentium-4 mpn_add_n -- mpn addition.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C dst!=src1,2 dst==src1 dst==src2
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) ?
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) 4 6 6
+C P4 model 3-4 (Prescott) 4.25 7.5 7.5
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_add_nc)
+deflit(`FRAME',0)
+ movd PARAM_CARRY, %mm0
+ jmp L(start_nc)
+EPILOGUE()
+
+ ALIGN(8)
+PROLOGUE(mpn_add_n)
+deflit(`FRAME',0)
+ pxor %mm0, %mm0
+L(start_nc):
+ mov PARAM_SRC1, %eax
+ mov %ebx, SAVE_EBX
+ mov PARAM_SRC2, %ebx
+ mov PARAM_DST, %edx
+ mov PARAM_SIZE, %ecx
+
+ lea (%eax,%ecx,4), %eax C src1 end
+ lea (%ebx,%ecx,4), %ebx C src2 end
+ lea (%edx,%ecx,4), %edx C dst end
+ neg %ecx C -size
+
+L(top):
+ C eax src1 end
+ C ebx src2 end
+ C ecx counter, limbs, negative
+ C edx dst end
+ C mm0 carry bit
+
+ movd (%eax,%ecx,4), %mm1
+ movd (%ebx,%ecx,4), %mm2
+ paddq %mm2, %mm1
+
+ paddq %mm1, %mm0
+ movd %mm0, (%edx,%ecx,4)
+
+ psrlq $32, %mm0
+
+ add $1, %ecx
+ jnz L(top)
+
+ movd %mm0, %eax
+ mov SAVE_EBX, %ebx
+ emms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm
new file mode 100644
index 0000000..93b63b2
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm
@@ -0,0 +1,108 @@
+dnl Intel Pentium-4 mpn_addlsh1_n -- mpn x+2*y.
+
+dnl Copyright 2001-2004, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C dst!=src1,2 dst==src1 dst==src2
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) ?
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) 4.25 6 6
+C P4 model 3-4 (Prescott) 5 8.5 8.5
+
+C The slightly strange combination of indexing and pointer incrementing
+C that's used seems to work best. Not sure why, but %ecx,4 with src1 and/or
+C src2 is a slowdown.
+C
+C The dependent chain is simply the paddq of x+2*y to the previous carry,
+C then psrlq to get the new carry. That makes 4 c/l the target speed, which
+C is almost achieved for separate src/dst but when src==dst the write
+C combining anomalies slow it down.
+
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_addlsh1_n)
+deflit(`FRAME',0)
+
+ mov PARAM_SRC1, %eax
+ mov %ebx, SAVE_EBX
+
+ mov PARAM_SRC2, %ebx
+ pxor %mm0, %mm0 C initial carry
+
+ mov PARAM_DST, %edx
+
+ mov PARAM_SIZE, %ecx
+
+ lea (%edx,%ecx,4), %edx C dst end
+ neg %ecx C -size
+
+L(top):
+ C eax src1 end
+ C ebx src2 end
+ C ecx counter, limbs, negative
+ C edx dst end
+ C mm0 carry
+
+ movd (%ebx), %mm2
+ movd (%eax), %mm1
+ psrlq $32, %mm0
+ lea 4(%eax), %eax
+ lea 4(%ebx), %ebx
+
+ psllq $1, %mm2
+ paddq %mm2, %mm1
+
+ paddq %mm1, %mm0
+
+ movd %mm0, (%edx,%ecx,4)
+ add $1, %ecx
+ jnz L(top)
+
+
+ psrlq $32, %mm0
+ mov SAVE_EBX, %ebx
+ movd %mm0, %eax
+ emms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm
new file mode 100644
index 0000000..7810207
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm
@@ -0,0 +1,189 @@
+dnl mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) 5.24
+C P6 model 13 (Dothan) 5.24
+C P4 model 0-1 (Willamette) 5
+C P4 model 2 (Northwood) 5
+C P4 model 3-4 (Prescott) 5
+
+C TODO:
+C * Tweak eax/edx offsets in loop as to save some lea's
+C * Perhaps software pipeline small-case code
+
+C INPUT PARAMETERS
+C rp sp + 4
+C up sp + 8
+C n sp + 12
+C v0 sp + 16
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_addmul_1)
+ pxor %mm6, %mm6
+L(ent): mov 4(%esp), %edx
+ mov 8(%esp), %eax
+ mov 12(%esp), %ecx
+ movd 16(%esp), %mm7
+ cmp $4, %ecx
+ jnc L(big)
+
+L(lp0): movd (%eax), %mm0
+ lea 4(%eax), %eax
+ movd (%edx), %mm4
+ lea 4(%edx), %edx
+ pmuludq %mm7, %mm0
+ paddq %mm0, %mm4
+ paddq %mm4, %mm6
+ movd %mm6, -4(%edx)
+ psrlq $32, %mm6
+ dec %ecx
+ jnz L(lp0)
+ movd %mm6, %eax
+ emms
+ ret
+
+L(big): and $3, %ecx
+ je L(0)
+ cmp $2, %ecx
+ jc L(1)
+ je L(2)
+ jmp L(3) C FIXME: one case should fall through
+
+L(0): movd (%eax), %mm3
+ sub 12(%esp), %ecx C loop count
+ lea -16(%eax), %eax
+ lea -12(%edx), %edx
+ pmuludq %mm7, %mm3
+ movd 20(%eax), %mm0
+ movd 12(%edx), %mm5
+ pmuludq %mm7, %mm0
+ movd 24(%eax), %mm1
+ paddq %mm3, %mm5
+ movd 16(%edx), %mm4
+ jmp L(00)
+
+L(1): movd (%eax), %mm2
+ sub 12(%esp), %ecx
+ lea -12(%eax), %eax
+ lea -8(%edx), %edx
+ movd 8(%edx), %mm4
+ pmuludq %mm7, %mm2
+ movd 16(%eax), %mm3
+ pmuludq %mm7, %mm3
+ movd 20(%eax), %mm0
+ paddq %mm2, %mm4
+ movd 12(%edx), %mm5
+ jmp L(01)
+
+L(2): movd (%eax), %mm1
+ sub 12(%esp), %ecx
+ lea -8(%eax), %eax
+ lea -4(%edx), %edx
+ pmuludq %mm7, %mm1
+ movd 12(%eax), %mm2
+ movd 4(%edx), %mm5
+ pmuludq %mm7, %mm2
+ movd 16(%eax), %mm3
+ paddq %mm1, %mm5
+ movd 8(%edx), %mm4
+ jmp L(10)
+
+L(3): movd (%eax), %mm0
+ sub 12(%esp), %ecx
+ lea -4(%eax), %eax
+ pmuludq %mm7, %mm0
+ movd 8(%eax), %mm1
+ movd (%edx), %mm4
+ pmuludq %mm7, %mm1
+ movd 12(%eax), %mm2
+ paddq %mm0, %mm4
+ movd 4(%edx), %mm5
+
+ ALIGN(16)
+L(top): pmuludq %mm7, %mm2
+ paddq %mm4, %mm6
+ movd 16(%eax), %mm3
+ paddq %mm1, %mm5
+ movd 8(%edx), %mm4
+ movd %mm6, 0(%edx)
+ psrlq $32, %mm6
+L(10): pmuludq %mm7, %mm3
+ paddq %mm5, %mm6
+ movd 20(%eax), %mm0
+ paddq %mm2, %mm4
+ movd 12(%edx), %mm5
+ movd %mm6, 4(%edx)
+ psrlq $32, %mm6
+L(01): pmuludq %mm7, %mm0
+ paddq %mm4, %mm6
+ movd 24(%eax), %mm1
+ paddq %mm3, %mm5
+ movd 16(%edx), %mm4
+ movd %mm6, 8(%edx)
+ psrlq $32, %mm6
+L(00): pmuludq %mm7, %mm1
+ paddq %mm5, %mm6
+ movd 28(%eax), %mm2
+ paddq %mm0, %mm4
+ movd 20(%edx), %mm5
+ movd %mm6, 12(%edx)
+ psrlq $32, %mm6
+ lea 16(%eax), %eax
+ lea 16(%edx), %edx
+ add $4, %ecx
+ jnz L(top)
+
+L(end): pmuludq %mm7, %mm2
+ paddq %mm4, %mm6
+ paddq %mm1, %mm5
+ movd 8(%edx), %mm4
+ movd %mm6, 0(%edx)
+ psrlq $32, %mm6
+ paddq %mm5, %mm6
+ paddq %mm2, %mm4
+ movd %mm6, 4(%edx)
+ psrlq $32, %mm6
+ paddq %mm4, %mm6
+ movd %mm6, 8(%edx)
+ psrlq $32, %mm6
+ movd %mm6, %eax
+ emms
+ ret
+EPILOGUE()
+PROLOGUE(mpn_addmul_1c)
+ movd 20(%esp), %mm6
+ jmp L(ent)
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm
new file mode 100644
index 0000000..354300e
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm
@@ -0,0 +1,141 @@
+dnl Intel Atom mpn_bdiv_dbm1.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C cycles/limb
+C P5 -
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) 9.75
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood) 8.25
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 8
+C AMD K6 -
+C AMD K7 -
+C AMD K8
+C AMD K10
+
+C TODO: This code was optimised for atom-32, consider moving it back to atom
+C dir(atom currently grabs this code), and write a 4-way version(7c/l).
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_MUL, 16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_RP,`PARAM_MUL')
+define(SAVE_UP,`PARAM_SIZE')
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`n', `%ecx')
+define(`reg', `%edx')
+define(`cy', `%eax') C contains the return value
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+deflit(`FRAME',0)
+
+PROLOGUE(mpn_bdiv_dbm1c)
+ mov PARAM_SIZE, n C size
+ mov up, SAVE_UP
+ mov PARAM_SRC, up
+ movd PARAM_MUL, %mm7
+ mov rp, SAVE_RP
+ mov PARAM_DST, rp
+
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ shr n
+ mov PARAM_CARRY, cy
+ jz L(eq1)
+
+ movd 4(up), %mm1
+ jc L(odd)
+
+ lea 4(up), up
+ pmuludq %mm7, %mm1
+ movd %mm0, reg
+ psrlq $32, %mm0
+ sub reg, cy
+ movd %mm0, reg
+ movq %mm1, %mm0
+ dec n
+ mov cy, (rp)
+ lea 4(rp), rp
+ jz L(end)
+
+C ALIGN(16)
+L(top): movd 4(up), %mm1
+ sbb reg, cy
+L(odd): movd %mm0, reg
+ psrlq $32, %mm0
+ pmuludq %mm7, %mm1
+ sub reg, cy
+ lea 8(up), up
+ movd %mm0, reg
+ movd (up), %mm0
+ mov cy, (rp)
+ sbb reg, cy
+ movd %mm1, reg
+ psrlq $32, %mm1
+ sub reg, cy
+ movd %mm1, reg
+ pmuludq %mm7, %mm0
+ dec n
+ mov cy, 4(rp)
+ lea 8(rp), rp
+ jnz L(top)
+
+L(end): sbb reg, cy
+
+L(eq1): movd %mm0, reg
+ psrlq $32, %mm0
+ mov SAVE_UP, up
+ sub reg, cy
+ movd %mm0, reg
+ emms
+ mov cy, (rp)
+ sbb reg, cy
+
+ mov SAVE_RP, rp
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm
new file mode 100644
index 0000000..d5008f4
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm
@@ -0,0 +1,234 @@
+dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato.
+
+dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 19.0 cycles/limb
+
+C Pairs of movd's are used to avoid unaligned loads. Despite the loads not
+C being on the dependent chain and there being plenty of cycles available,
+C using an unaligned movq on every second iteration measured about 23 c/l.
+C
+
+defframe(PARAM_SHIFT, 24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse, int shift)
+ ALIGN(32)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+
+ movl PARAM_SRC, %eax
+
+ movl PARAM_DIVISOR, %ecx
+
+ movd %ecx, %mm6
+ movl PARAM_SHIFT, %ecx
+
+ movd %ecx, %mm7 C shift
+
+ C
+
+ movl PARAM_INVERSE, %ecx
+ movd %ecx, %mm5 C inv
+
+ movl PARAM_DST, %ecx
+ pxor %mm1, %mm1 C initial carry limb
+ pxor %mm0, %mm0 C initial carry bit
+
+ subl $1, %edx
+ jz L(done)
+
+ pcmpeqd %mm4, %mm4
+ psrlq $32, %mm4 C 0x00000000FFFFFFFF
+
+C The dependent chain here is as follows.
+C
+C latency
+C psubq s = (src-cbit) - climb 2
+C pmuludq q = s*inverse 8
+C pmuludq prod = q*divisor 8
+C psrlq climb = high(prod) 2
+C --
+C 20
+C
+C Yet the loop measures 19.0 c/l, so obviously there's something gained
+C there over a straight reading of the chip documentation.
+
+L(top):
+ C eax src, incrementing
+ C ebx
+ C ecx dst, incrementing
+ C edx counter, size-1 iterations
+ C
+ C mm0 carry bit
+ C mm1 carry limb
+ C mm4 0x00000000FFFFFFFF
+ C mm5 inverse
+ C mm6 divisor
+ C mm7 shift
+
+ movd (%eax), %mm2
+ movd 4(%eax), %mm3
+ addl $4, %eax
+ punpckldq %mm3, %mm2
+
+ psrlq %mm7, %mm2
+ pand %mm4, %mm2 C src
+ psubq %mm0, %mm2 C src - cbit
+
+ psubq %mm1, %mm2 C src - cbit - climb
+ movq %mm2, %mm0
+ psrlq $63, %mm0 C new cbit
+
+ pmuludq %mm5, %mm2 C s*inverse
+ movd %mm2, (%ecx) C q
+ addl $4, %ecx
+
+ movq %mm6, %mm1
+ pmuludq %mm2, %mm1 C q*divisor
+ psrlq $32, %mm1 C new climb
+
+L(entry):
+ subl $1, %edx
+ jnz L(top)
+
+L(done):
+ movd (%eax), %mm2
+ psrlq %mm7, %mm2 C src
+ psubq %mm0, %mm2 C src - cbit
+
+ psubq %mm1, %mm2 C src - cbit - climb
+
+ pmuludq %mm5, %mm2 C s*inverse
+ movd %mm2, (%ecx) C q
+
+ emms
+ ret
+
+EPILOGUE()
+
+ ALIGN(16)
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+
+ movl PARAM_DIVISOR, %ecx
+
+ C eax src
+ C ebx
+ C ecx divisor
+ C edx size-1
+
+ movl %ecx, %eax
+ bsfl %ecx, %ecx C trailing twos
+
+ shrl %cl, %eax C d = divisor without twos
+ movd %eax, %mm6
+ movd %ecx, %mm7 C shift
+
+ shrl %eax C d/2
+
+ andl $127, %eax C d/2, 7 bits
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %ecx)
+ movzbl (%eax,%ecx), %eax C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ C
+
+ movd %eax, %mm5 C inv
+
+ movd %eax, %mm0 C inv
+
+ pmuludq %mm5, %mm5 C inv*inv
+
+ C
+
+ pmuludq %mm6, %mm5 C inv*inv*d
+ paddd %mm0, %mm0 C 2*inv
+
+ C
+
+ psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d
+ pxor %mm5, %mm5
+
+ paddd %mm0, %mm5
+ pmuludq %mm0, %mm0 C inv*inv
+
+ pcmpeqd %mm4, %mm4
+ psrlq $32, %mm4 C 0x00000000FFFFFFFF
+
+ C
+
+ pmuludq %mm6, %mm0 C inv*inv*d
+ paddd %mm5, %mm5 C 2*inv
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %ecx
+ pxor %mm1, %mm1 C initial carry limb
+
+ C
+
+ psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ movq %mm6, %mm0
+ pmuludq %mm5, %mm0
+ movd %mm0, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ pxor %mm0, %mm0 C initial carry bit
+ jmp L(entry)
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm
new file mode 100644
index 0000000..b3f3474
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm
@@ -0,0 +1,95 @@
+dnl Intel Pentium-4 mpn_cnd_add_n -- mpn addition.
+
+dnl Copyright 2001, 2002, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 4.67
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) 5
+C P4 model 3-4 (Prescott) 5.25
+
+defframe(PARAM_SIZE, 20)
+defframe(PARAM_SRC2, 16)
+defframe(PARAM_SRC1, 12)
+defframe(PARAM_DST, 8)
+defframe(PARAM_CND, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+define(`cnd', `%mm3')
+
+ TEXT
+ ALIGN(8)
+
+ ALIGN(8)
+PROLOGUE(mpn_cnd_add_n)
+deflit(`FRAME',0)
+ pxor %mm0, %mm0
+
+ mov PARAM_CND, %eax
+ neg %eax
+ sbb %eax, %eax
+ movd %eax, cnd
+
+ mov PARAM_SRC1, %eax
+ mov %ebx, SAVE_EBX
+ mov PARAM_SRC2, %ebx
+ mov PARAM_DST, %edx
+ mov PARAM_SIZE, %ecx
+
+ lea (%eax,%ecx,4), %eax C src1 end
+ lea (%ebx,%ecx,4), %ebx C src2 end
+ lea (%edx,%ecx,4), %edx C dst end
+ neg %ecx C -size
+
+L(top): movd (%ebx,%ecx,4), %mm2
+ movd (%eax,%ecx,4), %mm1
+ pand cnd, %mm2
+ paddq %mm2, %mm1
+
+ paddq %mm1, %mm0
+ movd %mm0, (%edx,%ecx,4)
+
+ psrlq $32, %mm0
+
+ add $1, %ecx
+ jnz L(top)
+
+ movd %mm0, %eax
+ mov SAVE_EBX, %ebx
+ emms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm
new file mode 100644
index 0000000..339a23e
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm
@@ -0,0 +1,114 @@
+dnl Intel Pentium-4 mpn_cnd_sub_n -- mpn subtraction.
+
+dnl Copyright 2001, 2002, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 4.67
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) 5
+C P4 model 3-4 (Prescott) 5.25
+
+defframe(PARAM_SIZE, 20)
+defframe(PARAM_SRC2, 16)
+defframe(PARAM_SRC1, 12)
+defframe(PARAM_DST, 8)
+defframe(PARAM_CND, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+define(`cnd', `%mm3')
+
+ TEXT
+ ALIGN(8)
+
+ ALIGN(8)
+PROLOGUE(mpn_cnd_sub_n)
+deflit(`FRAME',0)
+ pxor %mm0, %mm0
+
+ mov PARAM_CND, %eax
+ neg %eax
+ sbb %eax, %eax
+ movd %eax, cnd
+
+ mov PARAM_SRC1, %eax
+ mov %ebx, SAVE_EBX
+ mov PARAM_SRC2, %ebx
+ mov PARAM_DST, %edx
+ mov PARAM_SIZE, %ecx
+
+ lea (%eax,%ecx,4), %eax C src1 end
+ lea (%ebx,%ecx,4), %ebx C src2 end
+ lea (%edx,%ecx,4), %edx C dst end
+ neg %ecx C -size
+
+L(top): movd (%ebx,%ecx,4), %mm2
+ movd (%eax,%ecx,4), %mm1
+ pand cnd, %mm2
+ psubq %mm2, %mm1
+
+ psubq %mm0, %mm1
+ movd %mm1, (%edx,%ecx,4)
+
+ psrlq $63, %mm1
+
+ add $1, %ecx
+ jz L(done_mm1)
+
+ movd (%ebx,%ecx,4), %mm2
+ movd (%eax,%ecx,4), %mm0
+ pand cnd, %mm2
+ psubq %mm2, %mm0
+
+ psubq %mm1, %mm0
+ movd %mm0, (%edx,%ecx,4)
+
+ psrlq $63, %mm0
+
+ add $1, %ecx
+ jnz L(top)
+
+ movd %mm0, %eax
+ mov SAVE_EBX, %ebx
+ emms
+ ret
+
+L(done_mm1):
+ movd %mm1, %eax
+ mov SAVE_EBX, %ebx
+ emms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm
new file mode 100644
index 0000000..0ceef5b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm
@@ -0,0 +1,216 @@
+dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 19.0 cycles/limb
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+C Pairs of movd's are used to avoid unaligned loads. Despite the loads not
+C being on the dependent chain and there being plenty of cycles available,
+C using an unaligned movq on every second iteration measured about 23 c/l.
+C
+C Using divl for size==1 seems a touch quicker than mul-by-inverse. The mul
+C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of
+C that might be hidden by out-of-order execution, whereas divl is around 60.
+C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul
+C faster.
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+
+ movl PARAM_SRC, %eax
+
+ movl PARAM_DIVISOR, %ecx
+ subl $1, %edx
+ jnz L(two_or_more)
+
+ movl (%eax), %eax
+ xorl %edx, %edx
+
+ divl %ecx
+ movl PARAM_DST, %ecx
+
+ movl %eax, (%ecx)
+ ret
+
+
+L(two_or_more):
+ C eax src
+ C ebx
+ C ecx divisor
+ C edx size-1
+
+ movl %ecx, %eax
+ bsfl %ecx, %ecx C trailing twos
+
+ shrl %cl, %eax C d = divisor without twos
+ movd %eax, %mm6
+ movd %ecx, %mm7 C shift
+
+ shrl %eax C d/2
+
+ andl $127, %eax C d/2, 7 bits
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %ecx)
+ movzbl (%eax,%ecx), %eax C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ C
+
+ movd %eax, %mm5 C inv
+
+ movd %eax, %mm0 C inv
+
+ pmuludq %mm5, %mm5 C inv*inv
+
+ C
+
+ pmuludq %mm6, %mm5 C inv*inv*d
+ paddd %mm0, %mm0 C 2*inv
+
+ C
+
+ psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d
+ pxor %mm5, %mm5
+
+ paddd %mm0, %mm5
+ pmuludq %mm0, %mm0 C inv*inv
+
+ pcmpeqd %mm4, %mm4
+ psrlq $32, %mm4 C 0x00000000FFFFFFFF
+
+ C
+
+ pmuludq %mm6, %mm0 C inv*inv*d
+ paddd %mm5, %mm5 C 2*inv
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %ecx
+ pxor %mm1, %mm1 C initial carry limb
+
+ C
+
+ psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ movq %mm6, %mm0
+ pmuludq %mm5, %mm0
+ movd %mm0, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ pxor %mm0, %mm0 C initial carry bit
+
+
+C The dependent chain here is as follows.
+C
+C latency
+C psubq s = (src-cbit) - climb 2
+C pmuludq q = s*inverse 8
+C pmuludq prod = q*divisor 8
+C psrlq climb = high(prod) 2
+C --
+C 20
+C
+C Yet the loop measures 19.0 c/l, so obviously there's something gained
+C there over a straight reading of the chip documentation.
+
+L(top):
+ C eax src, incrementing
+ C ebx
+ C ecx dst, incrementing
+ C edx counter, size-1 iterations
+ C
+ C mm0 carry bit
+ C mm1 carry limb
+ C mm4 0x00000000FFFFFFFF
+ C mm5 inverse
+ C mm6 divisor
+ C mm7 shift
+
+ movd (%eax), %mm2
+ movd 4(%eax), %mm3
+ addl $4, %eax
+ punpckldq %mm3, %mm2
+
+ psrlq %mm7, %mm2
+ pand %mm4, %mm2 C src
+ psubq %mm0, %mm2 C src - cbit
+
+ psubq %mm1, %mm2 C src - cbit - climb
+ movq %mm2, %mm0
+ psrlq $63, %mm0 C new cbit
+
+ pmuludq %mm5, %mm2 C s*inverse
+ movd %mm2, (%ecx) C q
+ addl $4, %ecx
+
+ movq %mm6, %mm1
+ pmuludq %mm2, %mm1 C q*divisor
+ psrlq $32, %mm1 C new climb
+
+ subl $1, %edx
+ jnz L(top)
+
+
+L(done):
+ movd (%eax), %mm2
+ psrlq %mm7, %mm2 C src
+ psubq %mm0, %mm2 C src - cbit
+
+ psubq %mm1, %mm2 C src - cbit - climb
+
+ pmuludq %mm5, %mm2 C s*inverse
+ movd %mm2, (%ecx) C q
+
+ emms
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm
new file mode 100644
index 0000000..0146fab
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm
@@ -0,0 +1,645 @@
+dnl Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.
+
+dnl Copyright 1999-2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t inverse,
+C unsigned shift);
+C
+C Algorithm:
+C
+C The method and nomenclature follow part 8 of "Division by Invariant
+C Integers using Multiplication" by Granlund and Montgomery, reference in
+C gmp.texi.
+C
+C "m" is written for what is m' in the paper, and "d" for d_norm, which
+C won't cause any confusion since it's only the normalized divisor that's of
+C any use in the code. "b" is written for 2^N, the size of a limb, N being
+C 32 here.
+C
+C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
+C "n-d - q1*d". This rearrangement gives the same two-limb answer but lets
+C us have just a psubq on the dependent chain.
+C
+C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,
+C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.
+C
+C Notes:
+C
+C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
+C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero
+C carry, since in normal circumstances that will be a very rare event.
+C
+C The test for skipping a division is branch free (once size>=1 is tested).
+C The store to the destination high limb is 0 when a divide is skipped, or
+C if it's not skipped then a copy of the src high limb is stored. The
+C latter is in case src==dst.
+C
+C There's a small bias towards expecting xsize==0, by having code for
+C xsize==0 in a straight line and xsize!=0 under forward jumps.
+C
+C Enhancements:
+C
+C The loop measures 32 cycles, but the dependent chain would suggest it
+C could be done with 30. Not sure where to start looking for the extras.
+C
+C Alternatives:
+C
+C If the divisor is normalized (high bit set) then a division step can
+C always be skipped, since the high destination limb is always 0 or 1 in
+C that case. It doesn't seem worth checking for this though, since it
+C probably occurs infrequently.
+
+
+dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
+dnl
+dnl The inverse takes about 80-90 cycles to calculate, but after that the
+dnl multiply is 32 c/l versus division at about 58 c/l.
+dnl
+dnl At 4 limbs the div is a touch faster than the mul (and of course
+dnl simpler), so start the mul from 5 limbs.
+
+deflit(MUL_THRESHOLD, 5)
+
+
+defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1
+defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
+defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC, 12)
+defframe(PARAM_XSIZE, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_ESI,`PARAM_SIZE')
+define(SAVE_EBP,`PARAM_SRC')
+define(SAVE_EDI,`PARAM_DIVISOR')
+define(SAVE_EBX,`PARAM_DST')
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_preinv_divrem_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ xorl %edx, %edx C carry if can't skip a div
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movd PARAM_PREINV_INVERSE, %mm4
+
+ movd PARAM_PREINV_SHIFT, %mm7 C l
+ cmpl %ebp, %eax C high cmp divisor
+
+ cmovc( %eax, %edx) C high is carry if high<divisor
+ movd %edx, %mm0 C carry
+
+ movd %edx, %mm1 C carry
+ movl $0, %edx
+
+ movd %ebp, %mm5 C d
+ cmovnc( %eax, %edx) C 0 if skip div, src high if not
+ C (the latter in case src==dst)
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+
+ movl %edx, (%edi,%ecx,4) C dst high limb
+ sbbl $0, %ecx C skip one division if high<divisor
+ movl $32, %eax
+
+ subl PARAM_PREINV_SHIFT, %eax
+ psllq %mm7, %mm5 C d normalized
+ leal (%edi,%ecx,4), %edi C &dst[xsize+size-1]
+ leal -4(%esi,%ecx,4), %esi C &src[size-1]
+
+ movd %eax, %mm6 C 32-l
+ jmp L(start_preinv)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+
+ movl PARAM_CARRY, %edx
+
+ movl PARAM_SIZE, %ecx
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ xorl %edx, %edx C initial carry (if can't skip a div)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+
+ orl %ecx, %ecx C size
+ jz L(no_skip_div) C if size==0
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+ cmpl %ebp, %eax C high cmp divisor
+
+ cmovnc( %eax, %edx) C 0 if skip div, src high if not
+ movl %edx, (%edi,%ecx,4) C dst high limb
+
+ movl $0, %edx
+ cmovc( %eax, %edx) C high is carry if high<divisor
+
+ sbbl $0, %ecx C size-1 if high<divisor
+L(no_skip_div):
+
+
+L(start_1c):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ leal (%ebx,%ecx), %eax C size+xsize
+ leal -4(%esi,%ecx,4), %esi C &src[size-1]
+ leal (%edi,%ecx,4), %edi C &dst[size+xsize-1]
+
+ cmpl $MUL_THRESHOLD, %eax
+ jae L(mul_by_inverse)
+
+
+ orl %ecx, %ecx
+ jz L(divide_no_integer) C if size==0
+
+L(divide_integer):
+ C eax scratch (quotient)
+ C ebx xsize
+ C ecx counter
+ C edx carry
+ C esi src, decrementing
+ C edi dst, decrementing
+ C ebp divisor
+
+ movl (%esi), %eax
+ subl $4, %esi
+
+ divl %ebp
+
+ movl %eax, (%edi)
+ subl $4, %edi
+
+ subl $1, %ecx
+ jnz L(divide_integer)
+
+
+L(divide_no_integer):
+ orl %ebx, %ebx
+ jnz L(divide_fraction) C if xsize!=0
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+ movl %edx, %eax
+ ret
+
+
+L(divide_fraction):
+ C eax scratch (quotient)
+ C ebx counter
+ C ecx
+ C edx carry
+ C esi
+ C edi dst, decrementing
+ C ebp divisor
+
+ movl $0, %eax
+
+ divl %ebp
+
+ movl %eax, (%edi)
+ subl $4, %edi
+
+ subl $1, %ebx
+ jnz L(divide_fraction)
+
+ jmp L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi &src[size-1]
+ C edi &dst[size+xsize-1]
+ C ebp divisor
+
+ bsrl %ebp, %eax C 31-l
+ movd %edx, %mm0 C carry
+ movd %edx, %mm1 C carry
+ movl %ecx, %edx C size
+ movl $31, %ecx
+
+ C
+
+ xorl %eax, %ecx C l = leading zeros on d
+ addl $1, %eax
+
+ shll %cl, %ebp C d normalized
+ movd %ecx, %mm7 C l
+ movl %edx, %ecx C size
+
+ movd %eax, %mm6 C 32-l
+ movl $-1, %edx
+ movl $-1, %eax
+
+ C
+
+ subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1 / d)
+ movd %ebp, %mm5 C d
+
+ C
+
+ movd %eax, %mm4 C m
+
+
+L(start_preinv):
+ C eax inverse
+ C ebx xsize
+ C ecx size
+ C edx
+ C esi &src[size-1]
+ C edi &dst[size+xsize-1]
+ C ebp
+ C
+ C mm0 carry
+ C mm1 carry
+ C mm2
+ C mm4 m
+ C mm5 d
+ C mm6 31-l
+ C mm7 l
+
+ psllq %mm7, %mm0 C n2 = carry << l, for size==0
+
+ subl $1, %ecx
+ jb L(integer_none)
+
+ movd (%esi), %mm0 C src high limb
+ punpckldq %mm1, %mm0
+ psrlq %mm6, %mm0 C n2 = high (carry:srchigh << l)
+ jz L(integer_last)
+
+
+C The dependent chain here consists of
+C
+C 2 paddd n1+n2
+C 8 pmuludq m*(n1+n2)
+C 2 paddq n2:nadj + m*(n1+n2)
+C 2 psrlq q1
+C 8 pmuludq d*q1
+C 2 psubq (n-d)-q1*d
+C 2 psrlq high n-(q1+1)*d mask
+C 2 pand d masked
+C 2 paddd n2+d addback
+C --
+C 30
+C
+C But it seems to run at 32 cycles, so presumably there's something else
+C going on.
+
+ ALIGN(16)
+L(integer_top):
+ C eax
+ C ebx
+ C ecx counter, size-1 to 0
+ C edx
+ C esi src, decrementing
+ C edi dst, decrementing
+ C
+ C mm0 n2
+ C mm4 m
+ C mm5 d
+ C mm6 32-l
+ C mm7 l
+
+ ASSERT(b,`C n2<d
+ movd %mm0, %eax
+ movd %mm5, %edx
+ cmpl %edx, %eax')
+
+ movd -4(%esi), %mm1 C next src limbs
+ movd (%esi), %mm2
+ leal -4(%esi), %esi
+
+ punpckldq %mm2, %mm1
+ psrlq %mm6, %mm1 C n10
+
+ movq %mm1, %mm2 C n10
+ movq %mm1, %mm3 C n10
+ psrad $31, %mm1 C -n1
+ pand %mm5, %mm1 C -n1 & d
+ paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
+
+ psrld $31, %mm2 C n1
+ paddd %mm0, %mm2 C n2+n1
+ punpckldq %mm0, %mm1 C n2:nadj
+
+ pmuludq %mm4, %mm2 C m*(n2+n1)
+
+ C
+
+ paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
+ pxor %mm2, %mm2 C break dependency, saves 4 cycles
+ pcmpeqd %mm2, %mm2 C FF...FF
+ psrlq $63, %mm2 C 1
+
+ psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
+
+ paddd %mm1, %mm2 C q1+1
+ pmuludq %mm5, %mm1 C q1*d
+
+ punpckldq %mm0, %mm3 C n = n2:n10
+ pxor %mm0, %mm0
+
+ psubq %mm5, %mm3 C n - d
+
+ C
+
+ psubq %mm1, %mm3 C n - (q1+1)*d
+
+ por %mm3, %mm0 C copy remainder -> new n2
+ psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
+
+ ASSERT(be,`C 0 or -1
+ movd %mm3, %eax
+ addl $1, %eax
+ cmpl $1, %eax')
+
+ paddd %mm3, %mm2 C q
+ pand %mm5, %mm3 C mask & d
+
+ paddd %mm3, %mm0 C addback if necessary
+ movd %mm2, (%edi)
+ leal -4(%edi), %edi
+
+ subl $1, %ecx
+ ja L(integer_top)
+
+
+L(integer_last):
+ C eax
+ C ebx xsize
+ C ecx
+ C edx
+ C esi &src[0]
+ C edi &dst[xsize]
+ C
+ C mm0 n2
+ C mm4 m
+ C mm5 d
+ C mm6
+ C mm7 l
+
+ ASSERT(b,`C n2<d
+ movd %mm0, %eax
+ movd %mm5, %edx
+ cmpl %edx, %eax')
+
+ movd (%esi), %mm1 C src[0]
+ psllq %mm7, %mm1 C n10
+
+ movq %mm1, %mm2 C n10
+ movq %mm1, %mm3 C n10
+ psrad $31, %mm1 C -n1
+ pand %mm5, %mm1 C -n1 & d
+ paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
+
+ psrld $31, %mm2 C n1
+ paddd %mm0, %mm2 C n2+n1
+ punpckldq %mm0, %mm1 C n2:nadj
+
+ pmuludq %mm4, %mm2 C m*(n2+n1)
+
+ C
+
+ paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
+ pcmpeqd %mm2, %mm2 C FF...FF
+ psrlq $63, %mm2 C 1
+
+ psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
+ paddd %mm1, %mm2 C q1
+
+ pmuludq %mm5, %mm1 C q1*d
+ punpckldq %mm0, %mm3 C n
+ psubq %mm5, %mm3 C n - d
+ pxor %mm0, %mm0
+
+ C
+
+ psubq %mm1, %mm3 C n - (q1+1)*d
+
+ por %mm3, %mm0 C remainder -> n2
+ psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
+
+ ASSERT(be,`C 0 or -1
+ movd %mm3, %eax
+ addl $1, %eax
+ cmpl $1, %eax')
+
+ paddd %mm3, %mm2 C q
+ pand %mm5, %mm3 C mask & d
+
+ paddd %mm3, %mm0 C addback if necessary
+ movd %mm2, (%edi)
+ leal -4(%edi), %edi
+
+
+L(integer_none):
+ C eax
+ C ebx xsize
+
+ orl %ebx, %ebx
+ jnz L(fraction_some) C if xsize!=0
+
+
+L(fraction_done):
+ movl SAVE_EBP, %ebp
+ psrld %mm7, %mm0 C remainder
+
+ movl SAVE_EDI, %edi
+ movd %mm0, %eax
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+ emms
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C
+
+L(fraction_some):
+ C eax
+ C ebx xsize
+ C ecx
+ C edx
+ C esi
+ C edi &dst[xsize-1]
+ C ebp
+
+
+L(fraction_top):
+ C eax
+ C ebx counter, xsize iterations
+ C ecx
+ C edx
+ C esi src, decrementing
+ C edi dst, decrementing
+ C
+ C mm0 n2
+ C mm4 m
+ C mm5 d
+ C mm6 32-l
+ C mm7 l
+
+ ASSERT(b,`C n2<d
+ movd %mm0, %eax
+ movd %mm5, %edx
+ cmpl %edx, %eax')
+
+ movq %mm0, %mm1 C n2
+ pmuludq %mm4, %mm0 C m*n2
+
+ pcmpeqd %mm2, %mm2
+ psrlq $63, %mm2
+
+ C
+
+ psrlq $32, %mm0 C high(m*n2)
+
+ paddd %mm1, %mm0 C q1 = high(n2:0 + m*n2)
+
+ paddd %mm0, %mm2 C q1+1
+ pmuludq %mm5, %mm0 C q1*d
+
+ psllq $32, %mm1 C n = n2:0
+ psubq %mm5, %mm1 C n - d
+
+ C
+
+ psubq %mm0, %mm1 C r = n - (q1+1)*d
+ pxor %mm0, %mm0
+
+ por %mm1, %mm0 C r -> n2
+ psrlq $32, %mm1 C high n - (q1+1)*d, 0 or -1
+
+ ASSERT(be,`C 0 or -1
+ movd %mm1, %eax
+ addl $1, %eax
+ cmpl $1, %eax')
+
+ paddd %mm1, %mm2 C q
+ pand %mm5, %mm1 C mask & d
+
+ paddd %mm1, %mm0 C addback if necessary
+ movd %mm2, (%edi)
+ leal -4(%edi), %edi
+
+ subl $1, %ebx
+ jne L(fraction_top)
+
+
+ jmp L(fraction_done)
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h
new file mode 100644
index 0000000..a047a51
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h
@@ -0,0 +1,213 @@
+/* Intel Pentium-4 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2600 MHz P4 Northwood */
+/* FFT tuning limit = 23,700,309 */
+/* Generated by tuneup.c, 2019-11-09, gcc 8.2 */
+
+#define MOD_1_NORM_THRESHOLD 5
+#define MOD_1_UNNORM_THRESHOLD 14
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 2 /* 4.36% faster than 1 */
+#define DIV_QR_1_NORM_THRESHOLD 16
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 21
+
+#define DIV_1_VS_MUL_1_PERCENT 358
+
+#define MUL_TOOM22_THRESHOLD 26
+#define MUL_TOOM33_THRESHOLD 101
+#define MUL_TOOM44_THRESHOLD 284
+#define MUL_TOOM6H_THRESHOLD 406
+#define MUL_TOOM8H_THRESHOLD 592
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 101
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 191
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 189
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 195
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 151
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 51
+#define SQR_TOOM3_THRESHOLD 163
+#define SQR_TOOM4_THRESHOLD 254
+#define SQR_TOOM6_THRESHOLD 614
+#define SQR_TOOM8_THRESHOLD 842
+
+#define MULMID_TOOM42_THRESHOLD 58
+
+#define MULMOD_BNM1_THRESHOLD 19
+#define SQRMOD_BNM1_THRESHOLD 23
+
+#define MUL_FFT_MODF_THRESHOLD 824 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 824, 5}, { 29, 6}, { 15, 5}, { 33, 6}, \
+ { 17, 5}, { 36, 6}, { 19, 5}, { 39, 6}, \
+ { 29, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \
+ { 36, 7}, { 19, 6}, { 39, 7}, { 21, 6}, \
+ { 43, 7}, { 23, 6}, { 48, 7}, { 29, 8}, \
+ { 15, 7}, { 37, 8}, { 19, 7}, { 43, 8}, \
+ { 23, 7}, { 49, 8}, { 27, 7}, { 55, 8}, \
+ { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
+ { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 99, 9}, { 55,10}, \
+ { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \
+ { 31,10}, { 63, 9}, { 143,10}, { 79, 9}, \
+ { 167,10}, { 95, 9}, { 191,10}, { 111,11}, \
+ { 63,10}, { 127, 9}, { 255,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 271,11}, { 159,10}, { 319, 9}, { 639,10}, \
+ { 335,11}, { 191,10}, { 383, 9}, { 799,10}, \
+ { 415,11}, { 223,12}, { 127,11}, { 255,10}, \
+ { 527,11}, { 287,10}, { 607, 9}, { 1215,11}, \
+ { 319,10}, { 671,12}, { 191,11}, { 383,10}, \
+ { 799,11}, { 415,10}, { 863,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1119, 9}, { 2239,11}, \
+ { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \
+ { 1343,11}, { 735,10}, { 1471, 9}, { 2943,12}, \
+ { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \
+ { 447,11}, { 927,10}, { 1855,11}, { 959,13}, \
+ { 255,12}, { 511,11}, { 1119,12}, { 575,11}, \
+ { 1215,10}, { 2431,11}, { 1247,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1471,10}, { 2943,13}, \
+ { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
+ { 1727,10}, { 3455,12}, { 895,14}, { 255,13}, \
+ { 511,12}, { 1087,11}, { 2239,10}, { 4479,12}, \
+ { 1215,11}, { 2431,13}, { 639,12}, { 1343,11}, \
+ { 2687,12}, { 1471,11}, { 2943,13}, { 767,12}, \
+ { 1727,11}, { 3455,13}, { 895,12}, { 1983,14}, \
+ { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \
+ { 2495,11}, { 4991,13}, { 1407,12}, { 2943,14}, \
+ { 767,13}, { 1535,12}, { 3135,13}, { 1663,12}, \
+ { 3455,13}, { 1919,12}, { 3967,15}, { 511,14}, \
+ { 1023,13}, { 2175,12}, { 4479,13}, { 2431,12}, \
+ { 4991,14}, { 1279,13}, { 2687,12}, { 5503,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 167
+#define MUL_FFT_THRESHOLD 7808
+
+#define SQR_FFT_MODF_THRESHOLD 560 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 560, 5}, { 33, 6}, { 17, 5}, { 35, 6}, \
+ { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \
+ { 39, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 7}, { 55, 8}, \
+ { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
+ { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
+ { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 111,11}, { 63,10}, { 143, 9}, \
+ { 287,10}, { 159,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127, 9}, { 511, 8}, { 1023, 9}, \
+ { 527,11}, { 159,10}, { 319, 9}, { 639,10}, \
+ { 351,11}, { 191,10}, { 431,11}, { 223,12}, \
+ { 127,11}, { 255,10}, { 543,11}, { 287,10}, \
+ { 607, 9}, { 1215,11}, { 319,10}, { 639,11}, \
+ { 351,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,10}, { 831,13}, { 127,12}, { 255,11}, \
+ { 543,10}, { 1119,11}, { 607,12}, { 319,11}, \
+ { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \
+ { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \
+ { 927,10}, { 1855,11}, { 991,13}, { 255,12}, \
+ { 511,11}, { 1055,10}, { 2111,11}, { 1087,12}, \
+ { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1407,13}, { 383,12}, \
+ { 767,11}, { 1599,12}, { 831,11}, { 1727,10}, \
+ { 3455,12}, { 895,11}, { 1855,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,11}, { 2239,12}, \
+ { 1215,11}, { 2431,13}, { 639,12}, { 1471,11}, \
+ { 2943,13}, { 767,12}, { 1727,11}, { 3455,13}, \
+ { 895,12}, { 1983,14}, { 511,13}, { 1023,12}, \
+ { 2239,13}, { 1151,12}, { 2495,11}, { 4991,13}, \
+ { 1279,12}, { 2623,13}, { 1407,12}, { 2943,14}, \
+ { 767,13}, { 1663,12}, { 3455,13}, { 1919,12}, \
+ { 3839,15}, { 511,14}, { 1023,13}, { 2175,12}, \
+ { 4479,13}, { 2431,12}, { 4991,14}, { 1279,13}, \
+ { 2687,12}, { 5503,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 149
+#define SQR_FFT_THRESHOLD 4800
+
+#define MULLO_BASECASE_THRESHOLD 12
+#define MULLO_DC_THRESHOLD 44
+#define MULLO_MUL_N_THRESHOLD 14281
+#define SQRLO_BASECASE_THRESHOLD 13
+#define SQRLO_DC_THRESHOLD 42
+#define SQRLO_SQR_THRESHOLD 9449
+
+#define DC_DIV_QR_THRESHOLD 38
+#define DC_DIVAPPR_Q_THRESHOLD 105
+#define DC_BDIV_QR_THRESHOLD 52
+#define DC_BDIV_Q_THRESHOLD 83
+
+#define INV_MULMOD_BNM1_THRESHOLD 50
+#define INV_NEWTON_THRESHOLD 158
+#define INV_APPR_THRESHOLD 118
+
+#define BINV_NEWTON_THRESHOLD 342
+#define REDC_1_TO_REDC_N_THRESHOLD 67
+
+#define MU_DIV_QR_THRESHOLD 2130
+#define MU_DIVAPPR_Q_THRESHOLD 1895
+#define MUPI_DIV_QR_THRESHOLD 60
+#define MU_BDIV_QR_THRESHOLD 1652
+#define MU_BDIV_Q_THRESHOLD 2089
+
+#define POWM_SEC_TABLE 1,22,96,446,723,1378
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 20
+#define SET_STR_DC_THRESHOLD 298
+#define SET_STR_PRECOMPUTE_THRESHOLD 960
+
+#define FAC_DSC_THRESHOLD 212
+#define FAC_ODD_THRESHOLD 71
+
+#define MATRIX22_STRASSEN_THRESHOLD 26
+#define HGCD2_DIV1_METHOD 3 /* 0.68% faster than 1 */
+#define HGCD_THRESHOLD 80
+#define HGCD_APPR_THRESHOLD 138
+#define HGCD_REDUCE_THRESHOLD 4455
+#define GCD_DC_THRESHOLD 365
+#define GCDEXT_DC_THRESHOLD 245
+#define JACOBI_BASE_METHOD 4 /* 23.41% faster than 1 */
+
+/* Tuneup completed successfully, took 63807 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm
new file mode 100644
index 0000000..ee88bab
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm
@@ -0,0 +1,166 @@
+dnl x86-32 mpn_mod_1_1p for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C * Optimize. The present code was written quite straightforwardly.
+C * Optimize post-loop reduction code; it is from mod_1s_4p, thus overkill.
+C * Write a cps function that uses sse2 insns.
+
+C cycles/limb
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) ?
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) 16
+C P4 model 3-4 (Prescott) 18
+
+C INPUT PARAMETERS
+C ap sp + 4
+C n sp + 8
+C b sp + 12
+C cps sp + 16
+
+define(`B1modb', `%mm1')
+define(`B2modb', `%mm2')
+define(`ap', `%edx')
+define(`n', `%eax')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_1_1p)
+ push %ebx
+ mov 8(%esp), ap
+ mov 12(%esp), n
+ mov 20(%esp), %ecx
+ movd 8(%ecx), B1modb
+ movd 12(%ecx), B2modb
+
+ lea -4(ap,n,4), ap
+
+C FIXME: See comment in generic/mod_1_1.c.
+ movd (ap), %mm7
+ movd -4(ap), %mm4
+ pmuludq B1modb, %mm7
+ paddq %mm4, %mm7
+ add $-2, n
+ jz L(end)
+
+ ALIGN(8)
+L(top): movq %mm7, %mm6
+ psrlq $32, %mm7 C rh
+ movd -8(ap), %mm0
+ add $-4, ap
+ pmuludq B2modb, %mm7
+ pmuludq B1modb, %mm6
+ add $-1, n
+ paddq %mm0, %mm7
+ paddq %mm6, %mm7
+ jnz L(top)
+
+L(end): pcmpeqd %mm4, %mm4
+ psrlq $32, %mm4 C 0x00000000FFFFFFFF
+ pand %mm7, %mm4 C rl
+ psrlq $32, %mm7 C rh
+ pmuludq B1modb, %mm7 C rh,cl
+ paddq %mm4, %mm7 C rh,rl
+ movd 4(%ecx), %mm4 C cnt
+ psllq %mm4, %mm7 C rh,rl normalized
+ movq %mm7, %mm2 C rl in low half
+ psrlq $32, %mm7 C rh
+ movd (%ecx), %mm1 C bi
+ pmuludq %mm7, %mm1 C qh,ql
+ paddq %mm2, %mm1 C qh-1,ql
+ movd %mm1, %ecx C ql
+ psrlq $32, %mm1 C qh-1
+ movd 16(%esp), %mm3 C b
+ pmuludq %mm1, %mm3 C (qh-1) * b
+ psubq %mm3, %mm2 C r in low half (could use psubd)
+ movd %mm2, %eax C r
+ mov 16(%esp), %ebx
+ sub %ebx, %eax C r
+ cmp %eax, %ecx
+ lea (%eax,%ebx), %edx
+ cmovc( %edx, %eax)
+ movd %mm4, %ecx C cnt
+ cmp %ebx, %eax
+ jae L(fix)
+ emms
+ pop %ebx
+ shr %cl, %eax
+ ret
+
+L(fix): sub %ebx, %eax
+ emms
+ pop %ebx
+ shr %cl, %eax
+ ret
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1_1p_cps)
+C CAUTION: This is the same code as in k7/mod_1_1.asm
+ push %ebp
+ mov 12(%esp), %ebp
+ push %esi
+ bsr %ebp, %ecx
+ push %ebx
+ xor $31, %ecx
+ mov 16(%esp), %esi
+ sal %cl, %ebp
+ mov %ebp, %edx
+ not %edx
+ mov $-1, %eax
+ div %ebp
+ mov %eax, (%esi) C store bi
+ mov %ecx, 4(%esi) C store cnt
+ xor %ebx, %ebx
+ sub %ebp, %ebx
+ mov $1, %edx
+ shld %cl, %eax, %edx
+ imul %edx, %ebx
+ mul %ebx
+ add %ebx, %edx
+ not %edx
+ imul %ebp, %edx
+ add %edx, %ebp
+ cmp %edx, %eax
+ cmovc( %ebp, %edx)
+ shr %cl, %ebx
+ mov %ebx, 8(%esi) C store B1modb
+ shr %cl, %edx
+ mov %edx, 12(%esi) C store B2modb
+ pop %ebx
+ pop %esi
+ pop %ebp
+ ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm
new file mode 100644
index 0000000..eb2edb6
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm
@@ -0,0 +1,269 @@
+dnl x86-32 mpn_mod_1s_4p for Pentium 4 and P6 models with SSE2 (i.e. 9,D,E,F).
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C * Optimize. The present code was written quite straightforwardly.
+C * Optimize post-loop reduction code.
+C * Write a cps function that uses sse2 insns.
+
+C cycles/limb
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 3.4
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) 4
+C P4 model 3-4 (Prescott) 4.5
+
+C INPUT PARAMETERS
+C ap sp + 4
+C n sp + 8
+C b sp + 12
+C cps sp + 16
+
+define(`B1modb', `%mm1')
+define(`B2modb', `%mm2')
+define(`B3modb', `%mm3')
+define(`B4modb', `%mm4')
+define(`B5modb', `%mm5')
+define(`ap', `%edx')
+define(`n', `%eax')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p)
+ push %ebx
+ mov 8(%esp), ap
+ mov 12(%esp), n
+ mov 20(%esp), %ecx
+
+ movd 8(%ecx), B1modb
+ movd 12(%ecx), B2modb
+ movd 16(%ecx), B3modb
+ movd 20(%ecx), B4modb
+ movd 24(%ecx), B5modb
+
+ mov n, %ebx
+ lea -4(ap,n,4), ap
+ and $3, %ebx
+ je L(b0)
+ cmp $2, %ebx
+ jc L(b1)
+ je L(b2)
+
+L(b3): movd -4(ap), %mm7
+ pmuludq B1modb, %mm7
+ movd -8(ap), %mm6
+ paddq %mm6, %mm7
+ movd (ap), %mm6
+ pmuludq B2modb, %mm6
+ paddq %mm6, %mm7
+ lea -24(ap), ap
+ add $-3, n
+ jz L(end)
+ jmp L(top)
+
+L(b0): movd -8(ap), %mm7
+ pmuludq B1modb, %mm7
+ movd -12(ap), %mm6
+ paddq %mm6, %mm7
+ movd -4(ap), %mm6
+ pmuludq B2modb, %mm6
+ paddq %mm6, %mm7
+ movd (ap), %mm6
+ pmuludq B3modb, %mm6
+ paddq %mm6, %mm7
+ lea -28(ap), ap
+ add $-4, n
+ jz L(end)
+ jmp L(top)
+
+L(b1): movd (ap), %mm7
+ lea -16(ap), ap
+ dec n
+ jz L(x)
+ jmp L(top)
+
+L(b2): movd -4(ap), %mm7 C rl
+ punpckldq (ap), %mm7 C rh
+ lea -20(ap), ap
+ add $-2, n
+ jz L(end)
+
+ ALIGN(8)
+L(top): movd 4(ap), %mm0
+ pmuludq B1modb, %mm0
+ movd 0(ap), %mm6
+ paddq %mm6, %mm0
+
+ movd 8(ap), %mm6
+ pmuludq B2modb, %mm6
+ paddq %mm6, %mm0
+
+ movd 12(ap), %mm6
+ pmuludq B3modb, %mm6
+ paddq %mm6, %mm0
+
+ movq %mm7, %mm6
+ psrlq $32, %mm7 C rh
+ pmuludq B5modb, %mm7
+ pmuludq B4modb, %mm6
+
+ paddq %mm0, %mm7
+ paddq %mm6, %mm7
+
+ add $-16, ap
+ add $-4, n
+ jnz L(top)
+
+L(end): pcmpeqd %mm4, %mm4
+ psrlq $32, %mm4 C 0x00000000FFFFFFFF
+ pand %mm7, %mm4 C rl
+ psrlq $32, %mm7 C rh
+ pmuludq B1modb, %mm7 C rh,cl
+ paddq %mm4, %mm7 C rh,rl
+L(x): movd 4(%ecx), %mm4 C cnt
+ psllq %mm4, %mm7 C rh,rl normalized
+ movq %mm7, %mm2 C rl in low half
+ psrlq $32, %mm7 C rh
+ movd (%ecx), %mm1 C bi
+ pmuludq %mm7, %mm1 C qh,ql
+ paddq %mm2, %mm1 C qh-1,ql
+ movd %mm1, %ecx C ql
+ psrlq $32, %mm1 C qh-1
+ movd 16(%esp), %mm3 C b
+ pmuludq %mm1, %mm3 C (qh-1) * b
+ psubq %mm3, %mm2 C r in low half (could use psubd)
+ movd %mm2, %eax C r
+ mov 16(%esp), %ebx
+ sub %ebx, %eax C r
+ cmp %eax, %ecx
+ lea (%eax,%ebx), %edx
+ cmovc( %edx, %eax)
+ movd %mm4, %ecx C cnt
+ cmp %ebx, %eax
+ jae L(fix)
+ emms
+ pop %ebx
+ shr %cl, %eax
+ ret
+
+L(fix): sub %ebx, %eax
+ emms
+ pop %ebx
+ shr %cl, %eax
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p_cps)
+C CAUTION: This is the same code as in k7/mod_1_4.asm
+ push %ebp
+ push %edi
+ push %esi
+ push %ebx
+ mov 20(%esp), %ebp C FIXME: avoid bp for 0-idx
+ mov 24(%esp), %ebx
+ bsr %ebx, %ecx
+ xor $31, %ecx
+ sal %cl, %ebx C b << cnt
+ mov %ebx, %edx
+ not %edx
+ mov $-1, %eax
+ div %ebx
+ xor %edi, %edi
+ sub %ebx, %edi
+ mov $1, %esi
+ mov %eax, (%ebp) C store bi
+ mov %ecx, 4(%ebp) C store cnt
+ shld %cl, %eax, %esi
+ imul %edi, %esi
+ mov %eax, %edi
+ mul %esi
+
+ add %esi, %edx
+ shr %cl, %esi
+ mov %esi, 8(%ebp) C store B1modb
+
+ not %edx
+ imul %ebx, %edx
+ lea (%edx,%ebx), %esi
+ cmp %edx, %eax
+ cmovnc( %edx, %esi)
+ mov %edi, %eax
+ mul %esi
+
+ add %esi, %edx
+ shr %cl, %esi
+ mov %esi, 12(%ebp) C store B2modb
+
+ not %edx
+ imul %ebx, %edx
+ lea (%edx,%ebx), %esi
+ cmp %edx, %eax
+ cmovnc( %edx, %esi)
+ mov %edi, %eax
+ mul %esi
+
+ add %esi, %edx
+ shr %cl, %esi
+ mov %esi, 16(%ebp) C store B3modb
+
+ not %edx
+ imul %ebx, %edx
+ lea (%edx,%ebx), %esi
+ cmp %edx, %eax
+ cmovnc( %edx, %esi)
+ mov %edi, %eax
+ mul %esi
+
+ add %esi, %edx
+ shr %cl, %esi
+ mov %esi, 20(%ebp) C store B4modb
+
+ not %edx
+ imul %ebx, %edx
+ add %edx, %ebx
+ cmp %edx, %eax
+ cmovnc( %edx, %ebx)
+
+ shr %cl, %ebx
+ mov %ebx, 24(%ebp) C store B5modb
+
+ pop %ebx
+ pop %esi
+ pop %edi
+ pop %ebp
+ ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm
new file mode 100644
index 0000000..31e25b7
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm
@@ -0,0 +1,175 @@
+dnl Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl Copyright 2000-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C Pentium4: 1.0 cycles/limb
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C Enhancements:
+C
+C There might a couple of cycles to save by using plain integer code for
+C more small sizes. 2 limbs measures about 20 cycles, but 3 limbs jumps to
+C about 46 (inclusive of some function call overheads).
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX, `PARAM_SRC')
+define(SAVE_ESI, `PARAM_SIZE')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %edx
+ movl (%edx), %eax
+
+ subl $2, %ecx
+ ja L(three_or_more)
+ jne L(one)
+
+ movl 4(%edx), %edx
+ movl %eax, %ecx
+ shrl $24, %eax C src[0] high
+
+ andl $0x00FFFFFF, %ecx C src[0] low
+ addl %ecx, %eax
+
+ movl %edx, %ecx
+ shll $8, %edx
+
+ shrl $16, %ecx C src[1] low
+ addl %ecx, %eax
+
+ andl $0x00FFFF00, %edx C src[1] high
+ addl %edx, %eax
+
+L(one):
+ ret
+
+
+L(three_or_more):
+ pxor %mm0, %mm0
+ pxor %mm1, %mm1
+ pxor %mm2, %mm2
+
+ pcmpeqd %mm7, %mm7
+ psrlq $32, %mm7 C 0x00000000FFFFFFFF, low 32 bits
+
+ pcmpeqd %mm6, %mm6
+ psrlq $40, %mm6 C 0x0000000000FFFFFF, low 24 bits
+
+L(top):
+ C eax
+ C ebx
+ C ecx counter, size-2 to 0, -1 or -2
+ C edx src, incrementing
+ C
+ C mm0 sum 0mod3
+ C mm1 sum 1mod3
+ C mm2 sum 2mod3
+ C mm3
+ C mm4
+ C mm5
+ C mm6 0x0000000000FFFFFF
+ C mm7 0x00000000FFFFFFFF
+
+ movd (%edx), %mm3
+ paddq %mm3, %mm0
+
+ movd 4(%edx), %mm3
+ paddq %mm3, %mm1
+
+ movd 8(%edx), %mm3
+ paddq %mm3, %mm2
+
+ addl $12, %edx
+ subl $3, %ecx
+ ja L(top)
+
+
+ C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
+
+ addl $1, %ecx
+ js L(combine) C 0 more
+
+ movd (%edx), %mm3
+ paddq %mm3, %mm0
+
+ jz L(combine) C 1 more
+
+ movd 4(%edx), %mm3
+ paddq %mm3, %mm1
+
+L(combine):
+ movq %mm7, %mm3 C low halves
+ pand %mm0, %mm3
+
+ movq %mm7, %mm4
+ pand %mm1, %mm4
+
+ movq %mm7, %mm5
+ pand %mm2, %mm5
+
+ psrlq $32, %mm0 C high halves
+ psrlq $32, %mm1
+ psrlq $32, %mm2
+
+ paddq %mm0, %mm4 C fold high halves to give 33 bits each
+ paddq %mm1, %mm5
+ paddq %mm2, %mm3
+
+ psllq $8, %mm4 C combine at respective offsets
+ psllq $16, %mm5
+ paddq %mm4, %mm3
+ paddq %mm5, %mm3 C 0x000cxxxxxxxxxxxx, 50 bits
+
+ pand %mm3, %mm6 C fold at 24 bits
+ psrlq $24, %mm3
+
+ paddq %mm6, %mm3
+ movd %mm3, %eax
+
+ ASSERT(z, C nothing left in high dword
+ `psrlq $32, %mm3
+ movd %mm3, %ecx
+ orl %ecx, %ecx')
+
+ emms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm
new file mode 100644
index 0000000..aa9ef31
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm
@@ -0,0 +1,175 @@
+dnl Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
+
+dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 19.0 cycles/limb
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+ movd PARAM_CARRY, %mm1
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+ pxor %mm1, %mm1 C carry limb
+L(start_1c):
+ movl PARAM_DIVISOR, %eax
+
+ movd PARAM_DIVISOR, %mm7
+
+ shrl %eax
+
+ andl $127, %eax C d/2, 7 bits
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %edx)
+ movzbl (%eax,%edx), %eax C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ C
+
+ movd %eax, %mm6 C inv
+
+ movd %eax, %mm0 C inv
+
+ pmuludq %mm6, %mm6 C inv*inv
+
+ C
+
+ pmuludq %mm7, %mm6 C inv*inv*d
+ paddd %mm0, %mm0 C 2*inv
+
+ C
+
+ psubd %mm6, %mm0 C inv = 2*inv - inv*inv*d
+ pxor %mm6, %mm6
+
+ paddd %mm0, %mm6
+ pmuludq %mm0, %mm0 C inv*inv
+
+ C
+
+ pmuludq %mm7, %mm0 C inv*inv*d
+ paddd %mm6, %mm6 C 2*inv
+
+
+ movl PARAM_SRC, %eax
+ movl PARAM_SIZE, %ecx
+
+ C
+
+ psubd %mm0, %mm6 C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ movd %mm6, %eax
+ imul PARAM_DIVISOR, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ pxor %mm0, %mm0 C carry bit
+
+
+C The dependent chain here is as follows.
+C
+C latency
+C psubq s = (src-cbit) - climb 2
+C pmuludq q = s*inverse 8
+C pmuludq prod = q*divisor 8
+C psrlq climb = high(prod) 2
+C --
+C 20
+C
+C Yet the loop measures 19.0 c/l, so obviously there's something gained
+C there over a straight reading of the chip documentation.
+
+L(top):
+ C eax src, incrementing
+ C ebx
+ C ecx counter, limbs
+ C edx
+ C
+ C mm0 carry bit
+ C mm1 carry limb
+ C mm6 inverse
+ C mm7 divisor
+
+ movd (%eax), %mm2
+ addl $4, %eax
+
+ psubq %mm0, %mm2 C src - cbit
+
+ psubq %mm1, %mm2 C src - cbit - climb
+ movq %mm2, %mm0
+ psrlq $63, %mm0 C new cbit
+
+ pmuludq %mm6, %mm2 C s*inverse
+
+ movq %mm7, %mm1
+ pmuludq %mm2, %mm1 C q*divisor
+ psrlq $32, %mm1 C new climb
+
+ subl $1, %ecx
+ jnz L(top)
+
+
+L(done):
+ paddq %mm1, %mm0
+ movd %mm0, %eax
+ emms
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm
new file mode 100644
index 0000000..6347b8b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm
@@ -0,0 +1,164 @@
+dnl mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) 4.17
+C P6 model 13 (Dothan) 4.17
+C P4 model 0-1 (Willamette) 4
+C P4 model 2 (Northwood) 4
+C P4 model 3-4 (Prescott) 4.55
+
+C TODO:
+C * Tweak eax/edx offsets in loop as to save some lea's
+C * Perhaps software pipeline small-case code
+
+C INPUT PARAMETERS
+C rp sp + 4
+C up sp + 8
+C n sp + 12
+C v0 sp + 16
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_1)
+ pxor %mm6, %mm6
+L(ent): mov 4(%esp), %edx
+ mov 8(%esp), %eax
+ mov 12(%esp), %ecx
+ movd 16(%esp), %mm7
+ cmp $4, %ecx
+ jnc L(big)
+
+L(lp0): movd (%eax), %mm0
+ lea 4(%eax), %eax
+ lea 4(%edx), %edx
+ pmuludq %mm7, %mm0
+ paddq %mm0, %mm6
+ movd %mm6, -4(%edx)
+ psrlq $32, %mm6
+ dec %ecx
+ jnz L(lp0)
+ movd %mm6, %eax
+ emms
+ ret
+
+L(big): and $3, %ecx
+ je L(0)
+ cmp $2, %ecx
+ jc L(1)
+ je L(2)
+ jmp L(3) C FIXME: one case should fall through
+
+L(0): movd (%eax), %mm3
+ sub 12(%esp), %ecx C loop count
+ lea -16(%eax), %eax
+ lea -12(%edx), %edx
+ pmuludq %mm7, %mm3
+ movd 20(%eax), %mm0
+ pmuludq %mm7, %mm0
+ movd 24(%eax), %mm1
+ jmp L(00)
+
+L(1): movd (%eax), %mm2
+ sub 12(%esp), %ecx
+ lea -12(%eax), %eax
+ lea -8(%edx), %edx
+ pmuludq %mm7, %mm2
+ movd 16(%eax), %mm3
+ pmuludq %mm7, %mm3
+ movd 20(%eax), %mm0
+ jmp L(01)
+
+L(2): movd (%eax), %mm1
+ sub 12(%esp), %ecx
+ lea -8(%eax), %eax
+ lea -4(%edx), %edx
+ pmuludq %mm7, %mm1
+ movd 12(%eax), %mm2
+ pmuludq %mm7, %mm2
+ movd 16(%eax), %mm3
+ jmp L(10)
+
+L(3): movd (%eax), %mm0
+ sub 12(%esp), %ecx
+ lea -4(%eax), %eax
+ pmuludq %mm7, %mm0
+ movd 8(%eax), %mm1
+ pmuludq %mm7, %mm1
+ movd 12(%eax), %mm2
+
+ ALIGN(16)
+L(top): pmuludq %mm7, %mm2
+ paddq %mm0, %mm6
+ movd 16(%eax), %mm3
+ movd %mm6, 0(%edx)
+ psrlq $32, %mm6
+L(10): pmuludq %mm7, %mm3
+ paddq %mm1, %mm6
+ movd 20(%eax), %mm0
+ movd %mm6, 4(%edx)
+ psrlq $32, %mm6
+L(01): pmuludq %mm7, %mm0
+ paddq %mm2, %mm6
+ movd 24(%eax), %mm1
+ movd %mm6, 8(%edx)
+ psrlq $32, %mm6
+L(00): pmuludq %mm7, %mm1
+ paddq %mm3, %mm6
+ movd 28(%eax), %mm2
+ movd %mm6, 12(%edx)
+ psrlq $32, %mm6
+ lea 16(%eax), %eax
+ lea 16(%edx), %edx
+ add $4, %ecx
+ ja L(top)
+
+L(end): pmuludq %mm7, %mm2
+ paddq %mm0, %mm6
+ movd %mm6, 0(%edx)
+ psrlq $32, %mm6
+ paddq %mm1, %mm6
+ movd %mm6, 4(%edx)
+ psrlq $32, %mm6
+ paddq %mm2, %mm6
+ movd %mm6, 8(%edx)
+ psrlq $32, %mm6
+ movd %mm6, %eax
+ emms
+ ret
+EPILOGUE()
+PROLOGUE(mpn_mul_1c)
+ movd 20(%esp), %mm6
+ jmp L(ent)
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm
new file mode 100644
index 0000000..6e3775a
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm
@@ -0,0 +1,662 @@
+dnl mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C * Improve ad-hoc outer loop code and register handling. Some feed-in
+C scheduling could improve things by several cycles per outer iteration.
+C * In code for un <= 3, try keeping accumulation operands in registers,
+C without storing intermediates to rp.
+C * We might want to keep 32 in a free mm register, since the register form is
+C 3 bytes and the immediate form is 4 bytes. About 70 bytes to save.
+C * Look into different loop alignment, we now expand the code about 50 bytes
+C with possibly needless alignment.
+C * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry.
+C * Use OSP, should solve feed-in latency problems.
+C * Save a few tens of bytes by doing cross-jumping for Loel0, etc.
+C * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers
+C so that they can share feed-in code, and changing the branch targets from
+C L<n> to Lm<nn>.
+
+C cycles/limb
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 5.24
+C P6 model 14 (Yonah) ?
+C P4 model 0-1 (Willamette): 5
+C P4 model 2 (Northwood): 4.60 at 32 limbs
+C P4 model 3-4 (Prescott): 4.94 at 32 limbs
+
+C INPUT PARAMETERS
+C rp sp + 4
+C up sp + 8
+C un sp + 12
+C vp sp + 16
+C vn sp + 20
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ push %esi
+ push %ebx
+ mov 12(%esp), %edx C rp
+ mov 16(%esp), %eax C up
+ mov 20(%esp), %ecx C un
+ mov 24(%esp), %esi C vp
+ mov 28(%esp), %ebx C vn
+ movd (%esi), %mm7 C
+L(ent): cmp $3, %ecx
+ ja L(big)
+ movd (%eax), %mm6
+ pmuludq %mm7, %mm6
+ jz L(un3)
+ cmp $2, %ecx
+ jz L(un2)
+
+L(un1): movd %mm6, (%edx) C un=1
+ psrlq $32, %mm6 C un=1
+ movd %mm6, 4(%edx) C un=1
+ jmp L(rtr) C un=1
+
+L(un2): movd 4(%eax), %mm1 C un=2
+ pmuludq %mm7, %mm1 C un=2
+ movd %mm6, (%edx) C un=2
+ psrlq $32, %mm6 C un=2
+ paddq %mm1, %mm6 C un=2
+ movd %mm6, 4(%edx) C un=2
+ psrlq $32, %mm6 C un=2
+ movd %mm6, 8(%edx) C un=2
+ dec %ebx C un=2
+ jz L(rtr) C un=2
+ movd 4(%esi), %mm7 C un=2
+ movd (%eax), %mm6 C un=2
+ pmuludq %mm7, %mm6 C un=2
+ movd 4(%eax), %mm1 C un=2
+ movd 4(%edx), %mm4 C un=2
+ pmuludq %mm7, %mm1 C un=2
+ movd 8(%edx), %mm5 C un=2
+ paddq %mm4, %mm6 C un=2
+ paddq %mm1, %mm5 C un=2
+ movd %mm6, 4(%edx) C un=2
+ psrlq $32, %mm6 C un=2
+ paddq %mm5, %mm6 C un=2
+ movd %mm6, 8(%edx) C un=2
+ psrlq $32, %mm6 C un=2
+ movd %mm6, 12(%edx) C un=2
+L(rtr): emms
+ pop %ebx
+ pop %esi
+ ret
+
+L(un3): movd 4(%eax), %mm1 C un=3
+ pmuludq %mm7, %mm1 C un=3
+ movd 8(%eax), %mm2 C un=3
+ pmuludq %mm7, %mm2 C un=3
+ movd %mm6, (%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ paddq %mm1, %mm6 C un=3
+ movd %mm6, 4(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ paddq %mm2, %mm6 C un=3
+ movd %mm6, 8(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ movd %mm6, 12(%edx) C un=3
+ dec %ebx C un=3
+ jz L(rtr) C un=3
+ movd 4(%esi), %mm7 C un=3
+ movd (%eax), %mm6 C un=3
+ pmuludq %mm7, %mm6 C un=3
+ movd 4(%eax), %mm1 C un=3
+ movd 4(%edx), %mm4 C un=3
+ pmuludq %mm7, %mm1 C un=3
+ movd 8(%eax), %mm2 C un=3
+ movd 8(%edx), %mm5 C un=3
+ pmuludq %mm7, %mm2 C un=3
+ paddq %mm4, %mm6 C un=3
+ paddq %mm1, %mm5 C un=3
+ movd 12(%edx), %mm4 C un=3
+ movd %mm6, 4(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ paddq %mm5, %mm6 C un=3
+ paddq %mm2, %mm4 C un=3
+ movd %mm6, 8(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ paddq %mm4, %mm6 C un=3
+ movd %mm6, 12(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ movd %mm6, 16(%edx) C un=3
+ dec %ebx C un=3
+ jz L(rtr) C un=3
+ movd 8(%esi), %mm7 C un=3
+ movd (%eax), %mm6 C un=3
+ pmuludq %mm7, %mm6 C un=3
+ movd 4(%eax), %mm1 C un=3
+ movd 8(%edx), %mm4 C un=3
+ pmuludq %mm7, %mm1 C un=3
+ movd 8(%eax), %mm2 C un=3
+ movd 12(%edx), %mm5 C un=3
+ pmuludq %mm7, %mm2 C un=3
+ paddq %mm4, %mm6 C un=3
+ paddq %mm1, %mm5 C un=3
+ movd 16(%edx), %mm4 C un=3
+ movd %mm6, 8(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ paddq %mm5, %mm6 C un=3
+ paddq %mm2, %mm4 C un=3
+ movd %mm6, 12(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ paddq %mm4, %mm6 C un=3
+ movd %mm6, 16(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ movd %mm6, 20(%edx) C un=3
+ jmp L(rtr)
+
+
+L(big): push %edi
+ pxor %mm6, %mm6
+ lea 4(%esi), %esi
+ and $3, %ecx
+ jz L(0)
+ cmp $2, %ecx
+ jc L(1)
+ jz L(2)
+ jmp L(3) C FIXME: one case should fall through
+
+
+L(0): movd (%eax), %mm3 C m 0
+ sub 24(%esp), %ecx C inner loop count m 0
+ mov %ecx, 24(%esp) C update loop count for later m 0
+ pmuludq %mm7, %mm3 C m 0
+ movd 4(%eax), %mm0 C m 0
+ pmuludq %mm7, %mm0 C m 0
+ movd 8(%eax), %mm1 C m 0
+ jmp L(m00) C m 0
+ ALIGN(16) C m 0
+L(lpm0):
+ pmuludq %mm7, %mm4 C m 0
+ paddq %mm0, %mm6 C m 0
+ movd (%eax), %mm3 C m 0
+ movd %mm6, -12(%edx) C m 0
+ psrlq $32, %mm6 C m 0
+ pmuludq %mm7, %mm3 C m 0
+ paddq %mm1, %mm6 C m 0
+ movd 4(%eax), %mm0 C m 0
+ movd %mm6, -8(%edx) C m 0
+ psrlq $32, %mm6 C m 0
+ pmuludq %mm7, %mm0 C m 0
+ paddq %mm4, %mm6 C m 0
+ movd 8(%eax), %mm1 C m 0
+ movd %mm6, -4(%edx) C m 0
+ psrlq $32, %mm6 C m 0
+L(m00): pmuludq %mm7, %mm1 C m 0
+ paddq %mm3, %mm6 C m 0
+ movd 12(%eax), %mm4 C m 0
+ movd %mm6, (%edx) C m 0
+ psrlq $32, %mm6 C m 0
+ lea 16(%eax), %eax C m 0
+ lea 16(%edx), %edx C m 0
+ add $4, %ecx C m 0
+ ja L(lpm0) C m 0
+ pmuludq %mm7, %mm4 C m 0
+ paddq %mm0, %mm6 C m 0
+ movd %mm6, -12(%edx) C m 0
+ psrlq $32, %mm6 C m 0
+ paddq %mm1, %mm6 C m 0
+ mov 16(%esp), %edi C rp 0
+ jmp L(x0)
+
+L(olp0):
+ lea 4(%edi), %edi C am 0
+ movd (%esi), %mm7 C am 0
+ lea 4(%esi), %esi C am 0
+ mov %edi, %edx C rp am 0
+ mov 20(%esp), %eax C up am 0
+ movd (%eax), %mm3 C am 0
+ mov 24(%esp), %ecx C inner loop count am 0
+ pxor %mm6, %mm6 C am 0
+ pmuludq %mm7, %mm3 C am 0
+ movd 4(%eax), %mm0 C am 0
+ movd (%edx), %mm5 C am 0
+ pmuludq %mm7, %mm0 C am 0
+ movd 8(%eax), %mm1 C am 0
+ paddq %mm3, %mm5 C am 0
+ movd 4(%edx), %mm4 C am 0
+ jmp L(am00) C am 0
+ ALIGN(16) C mm 0
+L(lam0):
+ pmuludq %mm7, %mm2 C am 0
+ paddq %mm4, %mm6 C am 0
+ movd (%eax), %mm3 C am 0
+ paddq %mm1, %mm5 C am 0
+ movd -4(%edx), %mm4 C am 0
+ movd %mm6, -12(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ pmuludq %mm7, %mm3 C am 0
+ paddq %mm5, %mm6 C am 0
+ movd 4(%eax), %mm0 C am 0
+ paddq %mm2, %mm4 C am 0
+ movd (%edx), %mm5 C am 0
+ movd %mm6, -8(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ pmuludq %mm7, %mm0 C am 0
+ paddq %mm4, %mm6 C am 0
+ movd 8(%eax), %mm1 C am 0
+ paddq %mm3, %mm5 C am 0
+ movd 4(%edx), %mm4 C am 0
+ movd %mm6, -4(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+L(am00):
+ pmuludq %mm7, %mm1 C am 0
+ paddq %mm5, %mm6 C am 0
+ movd 12(%eax), %mm2 C am 0
+ paddq %mm0, %mm4 C am 0
+ movd 8(%edx), %mm5 C am 0
+ movd %mm6, (%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ lea 16(%eax), %eax C am 0
+ lea 16(%edx), %edx C am 0
+ add $4, %ecx C am 0
+ jnz L(lam0) C am 0
+ pmuludq %mm7, %mm2 C am 0
+ paddq %mm4, %mm6 C am 0
+ paddq %mm1, %mm5 C am 0
+ movd -4(%edx), %mm4 C am 0
+ movd %mm6, -12(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ paddq %mm5, %mm6 C am 0
+ paddq %mm2, %mm4 C am 0
+L(x0): movd %mm6, -8(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ paddq %mm4, %mm6 C am 0
+ movd %mm6, -4(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ movd %mm6, (%edx) C am 0
+ dec %ebx C am 0
+ jnz L(olp0) C am 0
+L(oel0):
+ emms C 0
+ pop %edi C 0
+ pop %ebx C 0
+ pop %esi C 0
+ ret C 0
+
+
+L(1): movd (%eax), %mm4 C m 1
+ sub 24(%esp), %ecx C m 1
+ mov %ecx, 24(%esp) C update loop count for later m 1
+ pmuludq %mm7, %mm4 C m 1
+ movd 4(%eax), %mm3 C m 1
+ pmuludq %mm7, %mm3 C m 1
+ movd 8(%eax), %mm0 C m 1
+ jmp L(m01) C m 1
+ ALIGN(16) C m 1
+L(lpm1):
+ pmuludq %mm7, %mm4 C m 1
+ paddq %mm0, %mm6 C m 1
+ movd 4(%eax), %mm3 C m 1
+ movd %mm6, -8(%edx) C m 1
+ psrlq $32, %mm6 C m 1
+ pmuludq %mm7, %mm3 C m 1
+ paddq %mm1, %mm6 C m 1
+ movd 8(%eax), %mm0 C m 1
+ movd %mm6, -4(%edx) C m 1
+ psrlq $32, %mm6 C m 1
+L(m01): pmuludq %mm7, %mm0 C m 1
+ paddq %mm4, %mm6 C m 1
+ movd 12(%eax), %mm1 C m 1
+ movd %mm6, (%edx) C m 1
+ psrlq $32, %mm6 C m 1
+ pmuludq %mm7, %mm1 C m 1
+ paddq %mm3, %mm6 C m 1
+ movd 16(%eax), %mm4 C m 1
+ movd %mm6, 4(%edx) C m 1
+ psrlq $32, %mm6 C m 1
+ lea 16(%eax), %eax C m 1
+ lea 16(%edx), %edx C m 1
+ add $4, %ecx C m 1
+ ja L(lpm1) C m 1
+ pmuludq %mm7, %mm4 C m 1
+ paddq %mm0, %mm6 C m 1
+ movd %mm6, -8(%edx) C m 1
+ psrlq $32, %mm6 C m 1
+ paddq %mm1, %mm6 C m 1
+ mov 16(%esp), %edi C rp 1
+ jmp L(x1)
+
+L(olp1):
+ lea 4(%edi), %edi C am 1
+ movd (%esi), %mm7 C am 1
+ lea 4(%esi), %esi C am 1
+ mov %edi, %edx C rp am 1
+ mov 20(%esp), %eax C up am 1
+ movd (%eax), %mm2 C am 1
+ mov 24(%esp), %ecx C inner loop count am 1
+ pxor %mm6, %mm6 C am 1
+ pmuludq %mm7, %mm2 C am 1
+ movd 4(%eax), %mm3 C am 1
+ movd (%edx), %mm4 C am 1
+ pmuludq %mm7, %mm3 C am 1
+ movd 8(%eax), %mm0 C am 1
+ paddq %mm2, %mm4 C am 1
+ movd 4(%edx), %mm5 C am 1
+ jmp L(am01) C am 1
+ ALIGN(16) C am 1
+L(lam1):
+ pmuludq %mm7, %mm2 C am 1
+ paddq %mm4, %mm6 C am 1
+ movd 4(%eax), %mm3 C am 1
+ paddq %mm1, %mm5 C am 1
+ movd (%edx), %mm4 C am 1
+ movd %mm6, -8(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ pmuludq %mm7, %mm3 C am 1
+ paddq %mm5, %mm6 C am 1
+ movd 8(%eax), %mm0 C am 1
+ paddq %mm2, %mm4 C am 1
+ movd 4(%edx), %mm5 C am 1
+ movd %mm6, -4(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+L(am01):
+ pmuludq %mm7, %mm0 C am 1
+ paddq %mm4, %mm6 C am 1
+ movd 12(%eax), %mm1 C am 1
+ paddq %mm3, %mm5 C am 1
+ movd 8(%edx), %mm4 C am 1
+ movd %mm6, (%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ pmuludq %mm7, %mm1 C am 1
+ paddq %mm5, %mm6 C am 1
+ movd 16(%eax), %mm2 C am 1
+ paddq %mm0, %mm4 C am 1
+ movd 12(%edx), %mm5 C am 1
+ movd %mm6, 4(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ lea 16(%eax), %eax C am 1
+ lea 16(%edx), %edx C am 1
+ add $4, %ecx C am 1
+ jnz L(lam1) C am 1
+ pmuludq %mm7, %mm2 C am 1
+ paddq %mm4, %mm6 C am 1
+ paddq %mm1, %mm5 C am 1
+ movd (%edx), %mm4 C am 1
+ movd %mm6, -8(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ paddq %mm5, %mm6 C am 1
+ paddq %mm2, %mm4 C am 1
+L(x1): movd %mm6, -4(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ paddq %mm4, %mm6 C am 1
+ movd %mm6, (%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ movd %mm6, 4(%edx) C am 1
+ dec %ebx C am 1
+ jnz L(olp1) C am 1
+L(oel1):
+ emms C 1
+ pop %edi C 1
+ pop %ebx C 1
+ pop %esi C 1
+ ret C 1
+
+
+L(2): movd (%eax), %mm1 C m 2
+ sub 24(%esp), %ecx C m 2
+ mov %ecx, 24(%esp) C update loop count for later m 2
+ pmuludq %mm7, %mm1 C m 2
+ movd 4(%eax), %mm4 C m 2
+ pmuludq %mm7, %mm4 C m 2
+ movd 8(%eax), %mm3 C m 2
+ jmp L(m10) C m 2
+ ALIGN(16) C m 2
+L(lpm2):
+ pmuludq %mm7, %mm4 C m 2
+ paddq %mm0, %mm6 C m 2
+ movd 8(%eax), %mm3 C m 2
+ movd %mm6, -4(%edx) C m 2
+ psrlq $32, %mm6 C m 2
+L(m10): pmuludq %mm7, %mm3 C m 2
+ paddq %mm1, %mm6 C m 2
+ movd 12(%eax), %mm0 C m 2
+ movd %mm6, (%edx) C m 2
+ psrlq $32, %mm6 C m 2
+ pmuludq %mm7, %mm0 C m 2
+ paddq %mm4, %mm6 C m 2
+ movd 16(%eax), %mm1 C m 2
+ movd %mm6, 4(%edx) C m 2
+ psrlq $32, %mm6 C m 2
+ pmuludq %mm7, %mm1 C m 2
+ paddq %mm3, %mm6 C m 2
+ movd 20(%eax), %mm4 C m 2
+ movd %mm6, 8(%edx) C m 2
+ psrlq $32, %mm6 C m 2
+ lea 16(%eax), %eax C m 2
+ lea 16(%edx), %edx C m 2
+ add $4, %ecx C m 2
+ ja L(lpm2) C m 2
+ pmuludq %mm7, %mm4 C m 2
+ paddq %mm0, %mm6 C m 2
+ movd %mm6, -4(%edx) C m 2
+ psrlq $32, %mm6 C m 2
+ paddq %mm1, %mm6 C m 2
+ mov 16(%esp), %edi C rp 2
+ jmp L(x2)
+
+L(olp2):
+ lea 4(%edi), %edi C am 2
+ movd (%esi), %mm7 C am 2
+ lea 4(%esi), %esi C am 2
+ mov %edi, %edx C rp am 2
+ mov 20(%esp), %eax C up am 2
+ movd (%eax), %mm1 C am 2
+ mov 24(%esp), %ecx C inner loop count am 2
+ pxor %mm6, %mm6 C am 2
+ pmuludq %mm7, %mm1 C am 2
+ movd 4(%eax), %mm2 C am 2
+ movd (%edx), %mm5 C am 2
+ pmuludq %mm7, %mm2 C am 2
+ movd 8(%eax), %mm3 C am 2
+ paddq %mm1, %mm5 C am 2
+ movd 4(%edx), %mm4 C am 2
+ jmp L(am10) C am 2
+ ALIGN(16) C am 2
+L(lam2):
+ pmuludq %mm7, %mm2 C am 2
+ paddq %mm4, %mm6 C am 2
+ movd 8(%eax), %mm3 C am 2
+ paddq %mm1, %mm5 C am 2
+ movd 4(%edx), %mm4 C am 2
+ movd %mm6, -4(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+L(am10):
+ pmuludq %mm7, %mm3 C am 2
+ paddq %mm5, %mm6 C am 2
+ movd 12(%eax), %mm0 C am 2
+ paddq %mm2, %mm4 C am 2
+ movd 8(%edx), %mm5 C am 2
+ movd %mm6, (%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ pmuludq %mm7, %mm0 C am 2
+ paddq %mm4, %mm6 C am 2
+ movd 16(%eax), %mm1 C am 2
+ paddq %mm3, %mm5 C am 2
+ movd 12(%edx), %mm4 C am 2
+ movd %mm6, 4(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ pmuludq %mm7, %mm1 C am 2
+ paddq %mm5, %mm6 C am 2
+ movd 20(%eax), %mm2 C am 2
+ paddq %mm0, %mm4 C am 2
+ movd 16(%edx), %mm5 C am 2
+ movd %mm6, 8(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ lea 16(%eax), %eax C am 2
+ lea 16(%edx), %edx C am 2
+ add $4, %ecx C am 2
+ jnz L(lam2) C am 2
+ pmuludq %mm7, %mm2 C am 2
+ paddq %mm4, %mm6 C am 2
+ paddq %mm1, %mm5 C am 2
+ movd 4(%edx), %mm4 C am 2
+ movd %mm6, -4(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ paddq %mm5, %mm6 C am 2
+ paddq %mm2, %mm4 C am 2
+L(x2): movd %mm6, (%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ paddq %mm4, %mm6 C am 2
+ movd %mm6, 4(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ movd %mm6, 8(%edx) C am 2
+ dec %ebx C am 2
+ jnz L(olp2) C am 2
+L(oel2):
+ emms C 2
+ pop %edi C 2
+ pop %ebx C 2
+ pop %esi C 2
+ ret C 2
+
+
+L(3): movd (%eax), %mm0 C m 3
+ sub 24(%esp), %ecx C m 3
+ mov %ecx, 24(%esp) C update loop count for later m 3
+ pmuludq %mm7, %mm0 C m 3
+ movd 4(%eax), %mm1 C m 3
+ pmuludq %mm7, %mm1 C m 3
+ movd 8(%eax), %mm4 C m 3
+ jmp L(lpm3) C m 3
+ ALIGN(16) C m 3
+L(lpm3):
+ pmuludq %mm7, %mm4 C m 3
+ paddq %mm0, %mm6 C m 3
+ movd 12(%eax), %mm3 C m 3
+ movd %mm6, (%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ pmuludq %mm7, %mm3 C m 3
+ paddq %mm1, %mm6 C m 3
+ movd 16(%eax), %mm0 C m 3
+ movd %mm6, 4(%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ pmuludq %mm7, %mm0 C m 3
+ paddq %mm4, %mm6 C m 3
+ movd 20(%eax), %mm1 C m 3
+ movd %mm6, 8(%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ pmuludq %mm7, %mm1 C m 3
+ paddq %mm3, %mm6 C m 3
+ movd 24(%eax), %mm4 C m 3
+ movd %mm6, 12(%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ lea 16(%eax), %eax C m 3
+ lea 16(%edx), %edx C m 3
+ add $4, %ecx C m 3
+ ja L(lpm3) C m 3
+ pmuludq %mm7, %mm4 C m 3
+ paddq %mm0, %mm6 C m 3
+ movd %mm6, (%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ paddq %mm1, %mm6 C m 3
+ mov 16(%esp), %edi C rp 3
+ jmp L(x3)
+
+L(olp3):
+ lea 4(%edi), %edi C am 3
+ movd (%esi), %mm7 C am 3
+ lea 4(%esi), %esi C am 3
+ mov %edi, %edx C rp am 3
+ mov 20(%esp), %eax C up am 3
+ movd (%eax), %mm0 C am 3
+ mov 24(%esp), %ecx C inner loop count am 3
+ pxor %mm6, %mm6 C am 3
+ pmuludq %mm7, %mm0 C am 3
+ movd 4(%eax), %mm1 C am 3
+ movd (%edx), %mm4 C am 3
+ pmuludq %mm7, %mm1 C am 3
+ movd 8(%eax), %mm2 C am 3
+ paddq %mm0, %mm4 C am 3
+ movd 4(%edx), %mm5 C am 3
+ jmp L(lam3) C am 3
+ ALIGN(16) C am 3
+L(lam3):
+ pmuludq %mm7, %mm2 C am 3
+ paddq %mm4, %mm6 C am 3
+ movd 12(%eax), %mm3 C am 3
+ paddq %mm1, %mm5 C am 3
+ movd 8(%edx), %mm4 C am 3
+ movd %mm6, (%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ pmuludq %mm7, %mm3 C am 3
+ paddq %mm5, %mm6 C am 3
+ movd 16(%eax), %mm0 C am 3
+ paddq %mm2, %mm4 C am 3
+ movd 12(%edx), %mm5 C am 3
+ movd %mm6, 4(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ pmuludq %mm7, %mm0 C am 3
+ paddq %mm4, %mm6 C am 3
+ movd 20(%eax), %mm1 C am 3
+ paddq %mm3, %mm5 C am 3
+ movd 16(%edx), %mm4 C am 3
+ movd %mm6, 8(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ pmuludq %mm7, %mm1 C am 3
+ paddq %mm5, %mm6 C am 3
+ movd 24(%eax), %mm2 C am 3
+ paddq %mm0, %mm4 C am 3
+ movd 20(%edx), %mm5 C am 3
+ movd %mm6, 12(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ lea 16(%eax), %eax C am 3
+ lea 16(%edx), %edx C am 3
+ add $4, %ecx C am 3
+ jnz L(lam3) C am 3
+ pmuludq %mm7, %mm2 C am 3
+ paddq %mm4, %mm6 C am 3
+ paddq %mm1, %mm5 C am 3
+ movd 8(%edx), %mm4 C am 3
+ movd %mm6, (%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ paddq %mm5, %mm6 C am 3
+ paddq %mm2, %mm4 C am 3
+L(x3): movd %mm6, 4(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ paddq %mm4, %mm6 C am 3
+ movd %mm6, 8(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ movd %mm6, 12(%edx) C am 3
+ dec %ebx C am 3
+ jnz L(olp3) C am 3
+L(oel3):
+ emms C 3
+ pop %edi C 3
+ pop %ebx C 3
+ pop %esi C 3
+ ret C 3
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm
new file mode 100644
index 0000000..c7f4426
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm
@@ -0,0 +1,281 @@
+dnl X86-32 and X86-64 mpn_popcount using SSE2.
+
+dnl Copyright 2006, 2007, 2011, 2015, 2020 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C 32-bit popcount hamdist
+C cycles/limb cycles/limb
+C P5 -
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 4
+C P4 model 0 (Willamette) ?
+C P4 model 1 (?) ?
+C P4 model 2 (Northwood) 3.9
+C P4 model 3 (Prescott) ?
+C P4 model 4 (Nocona) ?
+C AMD K6 -
+C AMD K7 -
+C AMD K8 ?
+
+C 64-bit popcount hamdist
+C cycles/limb cycles/limb
+C P4 model 4 (Nocona): 8
+C AMD K8,K9 7.5
+C AMD K10 3.5
+C Intel core2 3.68
+C Intel corei 3.15
+C Intel atom 10.8
+C VIA nano 6.5
+
+C TODO
+C * Make an mpn_hamdist based on this. Alignment could either be handled by
+C using movdqu for one operand and movdqa for the other, or by painfully
+C shifting as we go. Unfortunately, there seem to be no usable shift
+C instruction, except for one that takes an immediate count.
+C * It would probably be possible to cut a few cycles/limb using software
+C pipelining.
+C * There are 35 decode slots unused by the SSE2 instructions. Loop control
+C needs just 2 or 3 slots, leaving around 32 slots. This allows a parallel
+C integer based popcount. Such a combined loop would handle 6 limbs in
+C about 30 cycles on K8.
+C * We could save a byte or two by using 32-bit operations on areg.
+C * Check if using movdqa to a temp of and then register-based pand is faster.
+
+ifelse(GMP_LIMB_BITS,`32',
+` define(`up', `%edx')
+ define(`n', `%ecx')
+ define(`areg',`%eax')
+ define(`breg',`%ebx')
+ define(`zero',`%xmm4')
+ define(`LIMB32',` $1')
+ define(`LIMB64',`dnl')
+',`
+ define(`up', `%rdi')
+ define(`n', `%rsi')
+ define(`areg',`%rax')
+ define(`breg',`%rdx')
+ define(`zero',`%xmm8')
+ define(`LIMB32',`dnl')
+ define(`LIMB64',` $1')
+')
+
+define(`mm01010101',`%xmm6')
+define(`mm00110011',`%xmm7')
+define(`mm00001111',`%xmm2')
+
+define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
+define(`LIMBS_PER_XMM', eval(16/GMP_LIMB_BYTES))
+define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))
+
+undefine(`psadbw') C override inherited m4 version
+
+C This file is shared between 32-bit and 64-bit builds. Only the former has
+C LEAL. Default LEAL as an alias of LEA.
+ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')')
+
+ASM_START()
+
+C Make cnsts global to work around Apple relocation bug.
+ifdef(`DARWIN',`
+ define(`cnsts', MPN(popccnsts))
+ GLOBL cnsts')
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_popcount)
+
+LIMB32(`mov 4(%esp), up ')
+LIMB32(`mov 8(%esp), n ')
+LIMB32(`push %ebx ')
+
+ pxor %xmm3, %xmm3 C zero grand total count
+LIMB64(`pxor zero, zero ')
+
+ LEAL( cnsts, breg)
+
+ movdqa -48(breg), mm01010101
+ movdqa -32(breg), mm00110011
+ movdqa -16(breg), mm00001111
+
+ mov up, areg
+ and $-16, up C round `up' down to 128-bit boundary
+ and $12, areg C 32:areg = 0, 4, 8, 12
+ C 64:areg = 0, 8
+ movdqa (up), %xmm0
+ pand 64(breg,areg,4), %xmm0
+ shr $m4_log2(GMP_LIMB_BYTES), %eax
+ add areg, n C compensate n for rounded down `up'
+
+ pxor %xmm4, %xmm4
+ sub $LIMBS_PER_XMM, n
+ jbe L(sum)
+
+ sub $LIMBS_PER_XMM, n
+ ja L(ent)
+ jmp L(lsum)
+
+ ALIGN(16)
+L(top): movdqa (up), %xmm0
+L(ent): movdqa 16(up), %xmm4
+
+ movdqa %xmm0, %xmm1
+ movdqa %xmm4, %xmm5
+ psrld $1, %xmm0
+ psrld $1, %xmm4
+ pand mm01010101, %xmm0
+ pand mm01010101, %xmm4
+ psubd %xmm0, %xmm1
+ psubd %xmm4, %xmm5
+
+ movdqa %xmm1, %xmm0
+ movdqa %xmm5, %xmm4
+ psrlq $2, %xmm1
+ psrlq $2, %xmm5
+ pand mm00110011, %xmm0
+ pand mm00110011, %xmm4
+ pand mm00110011, %xmm1
+ pand mm00110011, %xmm5
+ paddq %xmm0, %xmm1
+ paddq %xmm4, %xmm5
+
+LIMB32(`pxor zero, zero ')
+
+ add $32, up
+ sub $LIMBS_PER_2XMM, n
+
+ paddq %xmm5, %xmm1
+ movdqa %xmm1, %xmm0
+ psrlq $4, %xmm1
+ pand mm00001111, %xmm0
+ pand mm00001111, %xmm1
+ paddq %xmm0, %xmm1
+
+ psadbw zero, %xmm1
+ paddq %xmm1, %xmm3 C add to grand total
+
+ jnc L(top)
+L(end):
+ add $LIMBS_PER_2XMM, n
+ jz L(rt)
+ movdqa (up), %xmm0
+ pxor %xmm4, %xmm4
+ sub $LIMBS_PER_XMM, n
+ jbe L(sum)
+L(lsum):
+ movdqa %xmm0, %xmm4
+ movdqa 16(up), %xmm0
+L(sum):
+ shl $m4_log2(GMP_LIMB_BYTES), n
+ and $12, n
+ pand (breg,n,4), %xmm0
+
+ movdqa %xmm0, %xmm1
+ movdqa %xmm4, %xmm5
+ psrld $1, %xmm0
+ psrld $1, %xmm4
+ pand mm01010101, %xmm0
+ pand mm01010101, %xmm4
+ psubd %xmm0, %xmm1
+ psubd %xmm4, %xmm5
+
+ movdqa %xmm1, %xmm0
+ movdqa %xmm5, %xmm4
+ psrlq $2, %xmm1
+ psrlq $2, %xmm5
+ pand mm00110011, %xmm0
+ pand mm00110011, %xmm4
+ pand mm00110011, %xmm1
+ pand mm00110011, %xmm5
+ paddq %xmm0, %xmm1
+ paddq %xmm4, %xmm5
+
+LIMB32(`pxor zero, zero ')
+
+ paddq %xmm5, %xmm1
+ movdqa %xmm1, %xmm0
+ psrlq $4, %xmm1
+ pand mm00001111, %xmm0
+ pand mm00001111, %xmm1
+ paddq %xmm0, %xmm1
+
+ psadbw zero, %xmm1
+ paddq %xmm1, %xmm3 C add to grand total
+
+
+C Add the two 64-bit halves of the grand total counter
+L(rt): movdqa %xmm3, %xmm0
+ psrldq $8, %xmm3
+ paddq %xmm3, %xmm0
+ movd %xmm0, areg C movq avoided due to gas bug
+
+LIMB32(`pop %ebx ')
+ ret
+
+EPILOGUE()
+DEF_OBJECT(dummy,16)
+C Three magic constants used for masking out bits
+ .byte 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+ .byte 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+
+ .byte 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+ .byte 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+cnsts:
+C Masks for high end of number
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+ .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+ .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+C Masks for low end of number
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+ .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+ .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+ .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+END_OBJECT(dummy)
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm
new file mode 100644
index 0000000..f421d13
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm
@@ -0,0 +1,126 @@
+dnl Intel Pentium-4 mpn_rsh1add_n -- mpn (x+y)/2
+
+dnl Copyright 2001-2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb (approx)
+C dst!=src1,2 dst==src1 dst==src2
+C P4: 4.5 6.5 6.5
+
+
+C mp_limb_t mpn_rsh1add_n (mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
+C mp_size_t size);
+C
+C The slightly strange combination of indexing and pointer incrementing
+C that's used seems to work best. Not sure why, but for instance leal
+C incrementing on %esi is a 1 or 2 cycle slowdown.
+C
+C The dependent chain is paddq combining the carry and next (shifted) part,
+C plus psrlq to move the new carry down. That, and just 4 mmx instructions
+C in total, makes 4 c/l the target speed, which is almost achieved for
+C separate src/dst but when src==dst the write combining anomalies slow it
+C down.
+
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_YP, 12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX,`PARAM_XP')
+define(SAVE_ESI,`PARAM_YP')
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_rsh1add_n)
+deflit(`FRAME',0)
+
+ movl PARAM_XP, %edx
+ movl %ebx, SAVE_EBX
+
+ movl PARAM_YP, %ebx
+ movl %esi, SAVE_ESI
+
+ movl PARAM_WP, %esi
+
+ movd (%edx), %mm0 C xp[0]
+
+ movd (%ebx), %mm1 C yp[0]
+ movl PARAM_SIZE, %ecx
+
+ movl (%edx), %eax C xp[0]
+
+ addl (%ebx), %eax C xp[0]+yp[0]
+
+ paddq %mm1, %mm0 C xp[0]+yp[0]
+ leal (%esi,%ecx,4), %esi C wp end
+ negl %ecx C -size
+
+ psrlq $1, %mm0 C (xp[0]+yp[0])/2
+ and $1, %eax C return value, rsh1 bit of xp[0]+yp[0]
+ addl $1, %ecx C -(size-1)
+ jz L(done)
+
+
+L(top):
+ C eax return value
+ C ebx yp end
+ C ecx counter, limbs, -(size-1) to -1 inclusive
+ C edx xp end
+ C esi wp end
+ C mm0 carry (32 bits)
+
+ movd 4(%edx), %mm1 C xp[i+1]
+ movd 4(%ebx), %mm2 C yp[i+1]
+ leal 4(%edx), %edx
+ leal 4(%ebx), %ebx
+ paddq %mm2, %mm1 C xp[i+1]+yp[i+1]
+ psllq $31, %mm1 C low bit at 31, further 32 above
+
+ paddq %mm1, %mm0 C 31 and carry from prev add
+ movd %mm0, -4(%esi,%ecx,4) C low ready to store dst[i]
+
+ psrlq $32, %mm0 C high becomes new carry
+
+ addl $1, %ecx
+ jnz L(top)
+
+
+L(done):
+ movd %mm0, -4(%esi) C dst[size-1]
+ movl SAVE_EBX, %ebx
+
+ movl SAVE_ESI, %esi
+ emms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm
new file mode 100644
index 0000000..0d548e0
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm
@@ -0,0 +1,705 @@
+dnl mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C * Improve ad-hoc outer loop code and register handling. Some feed-in
+C scheduling could improve things by several cycles per outer iteration.
+C * In Lam3...Lam1 code for, keep accumulation operands in registers, without
+C storing intermediates to rp.
+C * We might want to keep 32 in a free mm register, since the register form is
+C 3 bytes and the immediate form is 4 bytes. About 80 bytes to save.
+C * Look into different loop alignment, we now expand the code about 50 bytes
+C with possibly needless alignment.
+C * Use OSP, should solve feed-in latency problems.
+C * Address relative slowness for un<=3 for Pentium M. The old code is there
+C considerably faster. (1:20/14, 2:34/32, 3:66/57)
+
+C INPUT PARAMETERS
+C rp sp + 4
+C up sp + 8
+C un sp + 12
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+ mov 4(%esp), %edx C rp
+ mov 8(%esp), %eax C up
+ mov 12(%esp), %ecx C un
+
+ cmp $2, %ecx
+ jc L(un1)
+ jz L(un2)
+ cmp $4, %ecx
+ jc L(un3)
+ jz L(un4)
+ jmp L(big)
+
+L(un1): mov (%eax), %eax
+ mov %edx, %ecx
+ mul %eax
+ mov %eax, (%ecx)
+ mov %edx, 4(%ecx)
+ ret
+L(un2): movd (%eax), %mm0 C un=2
+ movd (%eax), %mm2 C un=2
+ movd 4(%eax), %mm1 C un=2
+ pmuludq %mm0, %mm0 C 64b weight 0 un=2
+ pmuludq %mm1, %mm2 C 64b weight 32 un=2
+ pmuludq %mm1, %mm1 C 64b weight 64 un=2
+ movd %mm0, (%edx) C un=2
+ psrlq $32, %mm0 C 32b weight 32 un=2
+ pcmpeqd %mm7, %mm7 C un=2
+ psrlq $33, %mm7 C 0x000000007FFFFFFF un=2
+ pand %mm2, %mm7 C 31b weight 32 un=2
+ psrlq $31, %mm2 C 33b weight 65 un=2
+ psllq $1, %mm7 C 31b weight 33 un=2
+ paddq %mm7, %mm0 C un=2
+ movd %mm0, 4(%edx) C un=2
+ psrlq $32, %mm0 C un=2
+ paddq %mm2, %mm1 C un=2
+ paddq %mm0, %mm1 C un=2
+ movd %mm1, 8(%edx) C un=2
+ psrlq $32, %mm1 C un=2
+ movd %mm1, 12(%edx) C un=2
+ emms
+ ret
+L(un3): movd (%eax), %mm7 C un=3
+ movd 4(%eax), %mm6 C un=3
+ pmuludq %mm7, %mm6 C un=3
+ movd 8(%eax), %mm2 C un=3
+ pmuludq %mm7, %mm2 C un=3
+ movd %mm6, 4(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ paddq %mm2, %mm6 C un=3
+ movd %mm6, 8(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ movd %mm6, 12(%edx) C un=3
+ lea 4(%edx), %edx C un=3
+ lea 4(%eax), %eax C un=3
+ jmp L(am1)
+L(un4): movd (%eax), %mm7 C un=4
+ movd 4(%eax), %mm6 C un=4
+ pmuludq %mm7, %mm6 C un=4
+ movd 8(%eax), %mm0 C un=4
+ pmuludq %mm7, %mm0 C un=4
+ movd 12(%eax), %mm1 C un=4
+ pmuludq %mm7, %mm1 C un=4
+ movd %mm6, 4(%edx) C un=4
+ psrlq $32, %mm6 C un=4
+ paddq %mm0, %mm6 C un=4
+ movd %mm6, 8(%edx) C un=4
+ psrlq $32, %mm6 C un=4
+ paddq %mm1, %mm6 C un=4
+ movd %mm6, 12(%edx) C un=4
+ psrlq $32, %mm6 C un=4
+ movd %mm6, 16(%edx) C un=4
+ lea 4(%edx), %edx C un=4
+ lea 4(%eax), %eax C un=4
+ jmp L(am2)
+
+L(big): push %esi
+ push %ebx
+ push %edi
+ pxor %mm6, %mm6
+ movd (%eax), %mm7 C
+ lea 4(%eax), %esi C init up, up++
+ lea 4(%eax), %eax C up2++ FIXME: should fix offsets
+ lea 4(%edx), %edi C init rp, rp++
+ lea 4(%edx), %edx C rp2++
+ lea -4(%ecx), %ebx C loop count
+ and $3, %ecx
+ jz L(3m)
+ cmp $2, %ecx
+ ja L(2m)
+ jb L(0m)
+
+L(1m):
+ movd (%eax), %mm4 C m 1
+ lea (%ebx), %ecx C inner loop count m 1
+ pmuludq %mm7, %mm4 C m 1
+ movd 4(%eax), %mm3 C m 1
+ pmuludq %mm7, %mm3 C m 1
+ movd 8(%eax), %mm0 C m 1
+ jmp L(m01) C m 1
+ ALIGN(16) C m 1
+L(lpm1):
+ pmuludq %mm7, %mm4 C m 1
+ paddq %mm0, %mm6 C m 1
+ movd 4(%eax), %mm3 C m 1
+ movd %mm6, -8(%edx) C m 1
+ psrlq $32, %mm6 C m 1
+ pmuludq %mm7, %mm3 C m 1
+ paddq %mm1, %mm6 C m 1
+ movd 8(%eax), %mm0 C m 1
+ movd %mm6, -4(%edx) C m 1
+ psrlq $32, %mm6 C m 1
+L(m01): pmuludq %mm7, %mm0 C m 1
+ paddq %mm4, %mm6 C m 1
+ movd 12(%eax), %mm1 C m 1
+ movd %mm6, (%edx) C m 1
+ psrlq $32, %mm6 C m 1
+ pmuludq %mm7, %mm1 C m 1
+ paddq %mm3, %mm6 C m 1
+ movd 16(%eax), %mm4 C m 1
+ movd %mm6, 4(%edx) C m 1
+ psrlq $32, %mm6 C m 1
+ lea 16(%eax), %eax C m 1
+ lea 16(%edx), %edx C m 1
+ sub $4, %ecx C m 1
+ ja L(lpm1) C m 1
+ pmuludq %mm7, %mm4 C m 1
+ paddq %mm0, %mm6 C m 1
+ movd %mm6, -8(%edx) C m 1
+ psrlq $32, %mm6 C m 1
+ paddq %mm1, %mm6 C m 1
+ jmp L(0)
+
+L(2m):
+ movd (%eax), %mm1 C m 2
+ lea (%ebx), %ecx C inner loop count m 2
+ pmuludq %mm7, %mm1 C m 2
+ movd 4(%eax), %mm4 C m 2
+ pmuludq %mm7, %mm4 C m 2
+ movd 8(%eax), %mm3 C m 2
+ jmp L(m10) C m 2
+ ALIGN(16) C m 2
+L(lpm2):
+ pmuludq %mm7, %mm4 C m 2
+ paddq %mm0, %mm6 C m 2
+ movd 8(%eax), %mm3 C m 2
+ movd %mm6, -4(%edx) C m 2
+ psrlq $32, %mm6 C m 2
+L(m10): pmuludq %mm7, %mm3 C m 2
+ paddq %mm1, %mm6 C m 2
+ movd 12(%eax), %mm0 C m 2
+ movd %mm6, (%edx) C m 2
+ psrlq $32, %mm6 C m 2
+ pmuludq %mm7, %mm0 C m 2
+ paddq %mm4, %mm6 C m 2
+ movd 16(%eax), %mm1 C m 2
+ movd %mm6, 4(%edx) C m 2
+ psrlq $32, %mm6 C m 2
+ pmuludq %mm7, %mm1 C m 2
+ paddq %mm3, %mm6 C m 2
+ movd 20(%eax), %mm4 C m 2
+ movd %mm6, 8(%edx) C m 2
+ psrlq $32, %mm6 C m 2
+ lea 16(%eax), %eax C m 2
+ lea 16(%edx), %edx C m 2
+ sub $4, %ecx C m 2
+ ja L(lpm2) C m 2
+ pmuludq %mm7, %mm4 C m 2
+ paddq %mm0, %mm6 C m 2
+ movd %mm6, -4(%edx) C m 2
+ psrlq $32, %mm6 C m 2
+ paddq %mm1, %mm6 C m 2
+ jmp L(1)
+
+L(3m):
+ movd (%eax), %mm0 C m 3
+ lea (%ebx), %ecx C inner loop count m 3
+ pmuludq %mm7, %mm0 C m 3
+ movd 4(%eax), %mm1 C m 3
+ pmuludq %mm7, %mm1 C m 3
+ movd 8(%eax), %mm4 C m 3
+ jmp L(lpm3) C m 3
+ ALIGN(16) C m 3
+L(lpm3):
+ pmuludq %mm7, %mm4 C m 3
+ paddq %mm0, %mm6 C m 3
+ movd 12(%eax), %mm3 C m 3
+ movd %mm6, (%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ pmuludq %mm7, %mm3 C m 3
+ paddq %mm1, %mm6 C m 3
+ movd 16(%eax), %mm0 C m 3
+ movd %mm6, 4(%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ pmuludq %mm7, %mm0 C m 3
+ paddq %mm4, %mm6 C m 3
+ movd 20(%eax), %mm1 C m 3
+ movd %mm6, 8(%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ pmuludq %mm7, %mm1 C m 3
+ paddq %mm3, %mm6 C m 3
+ movd 24(%eax), %mm4 C m 3
+ movd %mm6, 12(%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ lea 16(%eax), %eax C m 3
+ lea 16(%edx), %edx C m 3
+ sub $4, %ecx C m 3
+ ja L(lpm3) C m 3
+ pmuludq %mm7, %mm4 C m 3
+ paddq %mm0, %mm6 C m 3
+ movd %mm6, (%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ paddq %mm1, %mm6 C m 3
+ jmp L(2)
+
+L(0m):
+ movd (%eax), %mm3 C m 0
+ lea (%ebx), %ecx C inner loop count m 0
+ pmuludq %mm7, %mm3 C m 0
+ movd 4(%eax), %mm0 C m 0
+ pmuludq %mm7, %mm0 C m 0
+ movd 8(%eax), %mm1 C m 0
+ jmp L(m00) C m 0
+ ALIGN(16) C m 0
+L(lpm0):
+ pmuludq %mm7, %mm4 C m 0
+ paddq %mm0, %mm6 C m 0
+ movd (%eax), %mm3 C m 0
+ movd %mm6, -12(%edx) C m 0
+ psrlq $32, %mm6 C m 0
+ pmuludq %mm7, %mm3 C m 0
+ paddq %mm1, %mm6 C m 0
+ movd 4(%eax), %mm0 C m 0
+ movd %mm6, -8(%edx) C m 0
+ psrlq $32, %mm6 C m 0
+ pmuludq %mm7, %mm0 C m 0
+ paddq %mm4, %mm6 C m 0
+ movd 8(%eax), %mm1 C m 0
+ movd %mm6, -4(%edx) C m 0
+ psrlq $32, %mm6 C m 0
+L(m00): pmuludq %mm7, %mm1 C m 0
+ paddq %mm3, %mm6 C m 0
+ movd 12(%eax), %mm4 C m 0
+ movd %mm6, (%edx) C m 0
+ psrlq $32, %mm6 C m 0
+ lea 16(%eax), %eax C m 0
+ lea 16(%edx), %edx C m 0
+ sub $4, %ecx C m 0
+ ja L(lpm0) C m 0
+ pmuludq %mm7, %mm4 C m 0
+ paddq %mm0, %mm6 C m 0
+ movd %mm6, -12(%edx) C m 0
+ psrlq $32, %mm6 C m 0
+ paddq %mm1, %mm6 C m 0
+ jmp L(3)
+
+L(outer):
+ lea 8(%edi), %edi C rp += 2
+ movd (%esi), %mm7 C am 3
+ mov %edi, %edx C rp2 = rp am 3
+ lea 4(%esi), %esi C up++ am 3
+ lea (%esi), %eax C up2 = up am 3
+ movd (%eax), %mm0 C am 3
+ lea (%ebx), %ecx C inner loop count am 3
+ pxor %mm6, %mm6 C am 3
+ pmuludq %mm7, %mm0 C am 3
+ movd 4(%eax), %mm1 C am 3
+ movd (%edx), %mm4 C am 3
+ pmuludq %mm7, %mm1 C am 3
+ movd 8(%eax), %mm2 C am 3
+ paddq %mm0, %mm4 C am 3
+ movd 4(%edx), %mm5 C am 3
+ jmp L(lam3) C am 3
+ ALIGN(16) C am 3
+L(lam3):
+ pmuludq %mm7, %mm2 C am 3
+ paddq %mm4, %mm6 C am 3
+ movd 12(%eax), %mm3 C am 3
+ paddq %mm1, %mm5 C am 3
+ movd 8(%edx), %mm4 C am 3
+ movd %mm6, (%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ pmuludq %mm7, %mm3 C am 3
+ paddq %mm5, %mm6 C am 3
+ movd 16(%eax), %mm0 C am 3
+ paddq %mm2, %mm4 C am 3
+ movd 12(%edx), %mm5 C am 3
+ movd %mm6, 4(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ pmuludq %mm7, %mm0 C am 3
+ paddq %mm4, %mm6 C am 3
+ movd 20(%eax), %mm1 C am 3
+ paddq %mm3, %mm5 C am 3
+ movd 16(%edx), %mm4 C am 3
+ movd %mm6, 8(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ pmuludq %mm7, %mm1 C am 3
+ paddq %mm5, %mm6 C am 3
+ movd 24(%eax), %mm2 C am 3
+ paddq %mm0, %mm4 C am 3
+ movd 20(%edx), %mm5 C am 3
+ movd %mm6, 12(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ lea 16(%eax), %eax C am 3
+ lea 16(%edx), %edx C am 3
+ sub $4, %ecx C am 3
+ ja L(lam3) C am 3
+ pmuludq %mm7, %mm2 C am 3
+ paddq %mm4, %mm6 C am 3
+ paddq %mm1, %mm5 C am 3
+ movd 8(%edx), %mm4 C am 3
+ movd %mm6, (%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ paddq %mm5, %mm6 C am 3
+ paddq %mm2, %mm4 C am 3
+L(2): movd %mm6, 4(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ paddq %mm4, %mm6 C am 3
+ movd %mm6, 8(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ movd %mm6, 12(%edx) C am 3
+
+ lea 8(%edi), %edi C rp += 2
+ movd (%esi), %mm7 C am 2
+ mov %edi, %edx C rp2 = rp am 2
+ lea 4(%esi), %esi C up++ am 2
+ lea (%esi), %eax C up2 = up am 2
+ movd (%eax), %mm1 C am 2
+ lea (%ebx), %ecx C inner loop count am 2
+ pxor %mm6, %mm6 C am 2
+ pmuludq %mm7, %mm1 C am 2
+ movd 4(%eax), %mm2 C am 2
+ movd (%edx), %mm5 C am 2
+ pmuludq %mm7, %mm2 C am 2
+ movd 8(%eax), %mm3 C am 2
+ paddq %mm1, %mm5 C am 2
+ movd 4(%edx), %mm4 C am 2
+ jmp L(am10) C am 2
+ ALIGN(16) C am 2
+L(lam2):
+ pmuludq %mm7, %mm2 C am 2
+ paddq %mm4, %mm6 C am 2
+ movd 8(%eax), %mm3 C am 2
+ paddq %mm1, %mm5 C am 2
+ movd 4(%edx), %mm4 C am 2
+ movd %mm6, -4(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+L(am10):
+ pmuludq %mm7, %mm3 C am 2
+ paddq %mm5, %mm6 C am 2
+ movd 12(%eax), %mm0 C am 2
+ paddq %mm2, %mm4 C am 2
+ movd 8(%edx), %mm5 C am 2
+ movd %mm6, (%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ pmuludq %mm7, %mm0 C am 2
+ paddq %mm4, %mm6 C am 2
+ movd 16(%eax), %mm1 C am 2
+ paddq %mm3, %mm5 C am 2
+ movd 12(%edx), %mm4 C am 2
+ movd %mm6, 4(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ pmuludq %mm7, %mm1 C am 2
+ paddq %mm5, %mm6 C am 2
+ movd 20(%eax), %mm2 C am 2
+ paddq %mm0, %mm4 C am 2
+ movd 16(%edx), %mm5 C am 2
+ movd %mm6, 8(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ lea 16(%eax), %eax C am 2
+ lea 16(%edx), %edx C am 2
+ sub $4, %ecx C am 2
+ ja L(lam2) C am 2
+ pmuludq %mm7, %mm2 C am 2
+ paddq %mm4, %mm6 C am 2
+ paddq %mm1, %mm5 C am 2
+ movd 4(%edx), %mm4 C am 2
+ movd %mm6, -4(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ paddq %mm5, %mm6 C am 2
+ paddq %mm2, %mm4 C am 2
+L(1): movd %mm6, (%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ paddq %mm4, %mm6 C am 2
+ movd %mm6, 4(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ movd %mm6, 8(%edx) C am 2
+
+ lea 8(%edi), %edi C rp += 2
+ movd (%esi), %mm7 C am 1
+ mov %edi, %edx C rp2 = rp am 1
+ lea 4(%esi), %esi C up++ am 1
+ lea (%esi), %eax C up2 = up am 1
+ movd (%eax), %mm2 C am 1
+ lea (%ebx), %ecx C inner loop count am 1
+ pxor %mm6, %mm6 C am 1
+ pmuludq %mm7, %mm2 C am 1
+ movd 4(%eax), %mm3 C am 1
+ movd (%edx), %mm4 C am 1
+ pmuludq %mm7, %mm3 C am 1
+ movd 8(%eax), %mm0 C am 1
+ paddq %mm2, %mm4 C am 1
+ movd 4(%edx), %mm5 C am 1
+ jmp L(am01) C am 1
+ ALIGN(16) C am 1
+L(lam1):
+ pmuludq %mm7, %mm2 C am 1
+ paddq %mm4, %mm6 C am 1
+ movd 4(%eax), %mm3 C am 1
+ paddq %mm1, %mm5 C am 1
+ movd (%edx), %mm4 C am 1
+ movd %mm6, -8(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ pmuludq %mm7, %mm3 C am 1
+ paddq %mm5, %mm6 C am 1
+ movd 8(%eax), %mm0 C am 1
+ paddq %mm2, %mm4 C am 1
+ movd 4(%edx), %mm5 C am 1
+ movd %mm6, -4(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+L(am01):
+ pmuludq %mm7, %mm0 C am 1
+ paddq %mm4, %mm6 C am 1
+ movd 12(%eax), %mm1 C am 1
+ paddq %mm3, %mm5 C am 1
+ movd 8(%edx), %mm4 C am 1
+ movd %mm6, (%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ pmuludq %mm7, %mm1 C am 1
+ paddq %mm5, %mm6 C am 1
+ movd 16(%eax), %mm2 C am 1
+ paddq %mm0, %mm4 C am 1
+ movd 12(%edx), %mm5 C am 1
+ movd %mm6, 4(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ lea 16(%eax), %eax C am 1
+ lea 16(%edx), %edx C am 1
+ sub $4, %ecx C am 1
+ ja L(lam1) C am 1
+ pmuludq %mm7, %mm2 C am 1
+ paddq %mm4, %mm6 C am 1
+ paddq %mm1, %mm5 C am 1
+ movd (%edx), %mm4 C am 1
+ movd %mm6, -8(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ paddq %mm5, %mm6 C am 1
+ paddq %mm2, %mm4 C am 1
+L(0): movd %mm6, -4(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ paddq %mm4, %mm6 C am 1
+ movd %mm6, (%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ movd %mm6, 4(%edx) C am 1
+
+ lea 8(%edi), %edi C rp += 2
+ movd (%esi), %mm7 C am 0
+ mov %edi, %edx C rp2 = rp am 0
+ lea 4(%esi), %esi C up++ am 0
+ lea (%esi), %eax C up2 = up am 0
+ movd (%eax), %mm3 C am 0
+ lea (%ebx), %ecx C inner loop count am 0
+ pxor %mm6, %mm6 C am 0
+ pmuludq %mm7, %mm3 C am 0
+ movd 4(%eax), %mm0 C am 0
+ movd (%edx), %mm5 C am 0
+ pmuludq %mm7, %mm0 C am 0
+ movd 8(%eax), %mm1 C am 0
+ paddq %mm3, %mm5 C am 0
+ movd 4(%edx), %mm4 C am 0
+ jmp L(am00) C am 0
+ ALIGN(16) C am 0
+L(lam0):
+ pmuludq %mm7, %mm2 C am 0
+ paddq %mm4, %mm6 C am 0
+ movd (%eax), %mm3 C am 0
+ paddq %mm1, %mm5 C am 0
+ movd -4(%edx), %mm4 C am 0
+ movd %mm6, -12(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ pmuludq %mm7, %mm3 C am 0
+ paddq %mm5, %mm6 C am 0
+ movd 4(%eax), %mm0 C am 0
+ paddq %mm2, %mm4 C am 0
+ movd (%edx), %mm5 C am 0
+ movd %mm6, -8(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ pmuludq %mm7, %mm0 C am 0
+ paddq %mm4, %mm6 C am 0
+ movd 8(%eax), %mm1 C am 0
+ paddq %mm3, %mm5 C am 0
+ movd 4(%edx), %mm4 C am 0
+ movd %mm6, -4(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+L(am00):
+ pmuludq %mm7, %mm1 C am 0
+ paddq %mm5, %mm6 C am 0
+ movd 12(%eax), %mm2 C am 0
+ paddq %mm0, %mm4 C am 0
+ movd 8(%edx), %mm5 C am 0
+ movd %mm6, (%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ lea 16(%eax), %eax C am 0
+ lea 16(%edx), %edx C am 0
+ sub $4, %ecx C am 0
+ ja L(lam0) C am 0
+ pmuludq %mm7, %mm2 C am 0
+ paddq %mm4, %mm6 C am 0
+ paddq %mm1, %mm5 C am 0
+ movd -4(%edx), %mm4 C am 0
+ movd %mm6, -12(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ paddq %mm5, %mm6 C am 0
+ paddq %mm2, %mm4 C am 0
+L(3): movd %mm6, -8(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ paddq %mm4, %mm6 C am 0
+ movd %mm6, -4(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ movd %mm6, (%edx) C am 0
+ sub $4, %ebx C am 0
+ ja L(outer) C am 0
+
+ mov %edi, %edx
+ mov %esi, %eax
+ pop %edi
+ pop %ebx
+ pop %esi
+
+L(am3): C up[un-1..un-3] x up[un-4]
+ lea 8(%edx), %edx C rp2 += 2
+ movd (%eax), %mm7
+ movd 4(%eax), %mm1
+ movd 8(%eax), %mm2
+ movd 12(%eax), %mm3
+ movd (%edx), %mm4
+ pmuludq %mm7, %mm1
+ movd 4(%edx), %mm5
+ pmuludq %mm7, %mm2
+ movd 8(%edx), %mm6
+ pmuludq %mm7, %mm3
+ paddq %mm1, %mm4
+ paddq %mm2, %mm5
+ paddq %mm3, %mm6
+ movd %mm4, (%edx)
+ psrlq $32, %mm4
+ paddq %mm5, %mm4
+ movd %mm4, 4(%edx)
+ psrlq $32, %mm4
+ paddq %mm6, %mm4
+ movd %mm4, 8(%edx)
+ psrlq $32, %mm4
+ movd %mm4, 12(%edx) C FIXME feed through!
+ lea 4(%eax), %eax
+
+L(am2): C up[un-1..un-2] x up[un-3]
+ lea 8(%edx), %edx C rp2 += 2
+ movd (%eax), %mm7
+ movd 4(%eax), %mm1
+ movd 8(%eax), %mm2
+ movd (%edx), %mm4
+ movd 4(%edx), %mm5
+ pmuludq %mm7, %mm1
+ pmuludq %mm7, %mm2
+ paddq %mm1, %mm4
+ paddq %mm2, %mm5
+ movd %mm4, (%edx)
+ psrlq $32, %mm4
+ paddq %mm5, %mm4
+ movd %mm4, 4(%edx)
+ psrlq $32, %mm4
+ movd %mm4, 8(%edx) C FIXME feed through!
+ lea 4(%eax), %eax
+
+L(am1): C up[un-1] x up[un-2]
+ lea 8(%edx), %edx C rp2 += 2
+ movd (%eax), %mm7
+ movd 4(%eax), %mm2
+ movd (%edx), %mm4
+ pmuludq %mm7, %mm2
+ paddq %mm2, %mm4
+ movd %mm4, (%edx)
+ psrlq $32, %mm4
+ movd %mm4, 4(%edx)
+
+C *** diag stuff, use elementary code for now
+
+ mov 4(%esp), %edx C rp
+ mov 8(%esp), %eax C up
+ mov 12(%esp), %ecx C un
+
+ movd (%eax), %mm2
+ pmuludq %mm2, %mm2 C src[0]^2
+
+ pcmpeqd %mm7, %mm7
+ psrlq $32, %mm7
+
+ movd 4(%edx), %mm3 C dst[1]
+
+ movd %mm2, (%edx)
+ psrlq $32, %mm2
+
+ psllq $1, %mm3 C 2*dst[1]
+ paddq %mm3, %mm2
+ movd %mm2, 4(%edx)
+ psrlq $32, %mm2
+
+ sub $2, %ecx
+
+L(diag):
+ movd 4(%eax), %mm0 C src limb
+ add $4, %eax
+ pmuludq %mm0, %mm0
+ movq %mm7, %mm1
+ pand %mm0, %mm1 C diagonal low
+ psrlq $32, %mm0 C diagonal high
+
+ movd 8(%edx), %mm3
+ psllq $1, %mm3 C 2*dst[i]
+ paddq %mm3, %mm1
+ paddq %mm1, %mm2
+ movd %mm2, 8(%edx)
+ psrlq $32, %mm2
+
+ movd 12(%edx), %mm3
+ psllq $1, %mm3 C 2*dst[i+1]
+ paddq %mm3, %mm0
+ paddq %mm0, %mm2
+ movd %mm2, 12(%edx)
+ add $8, %edx
+ psrlq $32, %mm2
+
+ sub $1, %ecx
+ jnz L(diag)
+
+ movd 4(%eax), %mm0 C src[size-1]
+ pmuludq %mm0, %mm0
+ pand %mm0, %mm7 C diagonal low
+ psrlq $32, %mm0 C diagonal high
+
+ movd 8(%edx), %mm3 C dst[2*size-2]
+ psllq $1, %mm3
+ paddq %mm3, %mm7
+ paddq %mm7, %mm2
+ movd %mm2, 8(%edx)
+ psrlq $32, %mm2
+
+ paddq %mm0, %mm2
+ movd %mm2, 12(%edx) C dst[2*size-1]
+
+ emms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm
new file mode 100644
index 0000000..5ba1c01
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm
@@ -0,0 +1,119 @@
+dnl Intel Pentium-4 mpn_sub_n -- mpn subtraction.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C dst!=src1,2 dst==src1 dst==src2
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) ?
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) 4 6 6
+C P4 model 3-4 (Prescott) 4.25 7.5 7.5
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_sub_nc)
+deflit(`FRAME',0)
+ movd PARAM_CARRY, %mm0
+ jmp L(start_nc)
+EPILOGUE()
+
+ ALIGN(8)
+PROLOGUE(mpn_sub_n)
+deflit(`FRAME',0)
+ pxor %mm0, %mm0
+L(start_nc):
+ mov PARAM_SRC1, %eax
+ mov %ebx, SAVE_EBX
+ mov PARAM_SRC2, %ebx
+ mov PARAM_DST, %edx
+ mov PARAM_SIZE, %ecx
+
+ lea (%eax,%ecx,4), %eax C src1 end
+ lea (%ebx,%ecx,4), %ebx C src2 end
+ lea (%edx,%ecx,4), %edx C dst end
+ neg %ecx C -size
+
+L(top):
+ C eax src1 end
+ C ebx src2 end
+ C ecx counter, limbs, negative
+ C edx dst end
+ C mm0 carry bit
+
+ movd (%eax,%ecx,4), %mm1
+ movd (%ebx,%ecx,4), %mm2
+ psubq %mm2, %mm1
+
+ psubq %mm0, %mm1
+ movd %mm1, (%edx,%ecx,4)
+
+ psrlq $63, %mm1
+
+ add $1, %ecx
+ jz L(done_mm1)
+
+ movd (%eax,%ecx,4), %mm0
+ movd (%ebx,%ecx,4), %mm2
+ psubq %mm2, %mm0
+
+ psubq %mm1, %mm0
+ movd %mm0, (%edx,%ecx,4)
+
+ psrlq $63, %mm0
+
+ add $1, %ecx
+ jnz L(top)
+
+ movd %mm0, %eax
+ mov SAVE_EBX, %ebx
+ emms
+ ret
+
+L(done_mm1):
+ movd %mm1, %eax
+ mov SAVE_EBX, %ebx
+ emms
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm
new file mode 100644
index 0000000..020675b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm
@@ -0,0 +1,182 @@
+dnl Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) 6.8
+C P6 model 13 (Dothan) 6.9
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) 5.87
+C P4 model 3-4 (Prescott) 6.5
+
+C This code represents a step forwards compared to the code available before
+C GMP 5.1, but it is not carefully tuned for either P6 or P4. In fact, it is
+C not good for P6. For P4 it saved a bit over 1 c/l for both Northwood and
+C Prescott compared to the old code.
+C
+C The arrangements made here to get a two instruction dependent chain are
+C slightly subtle. In the loop the carry (or borrow rather) is a negative so
+C that a paddq can be used to give a low limb ready to store, and a high limb
+C ready to become the new carry after a psrlq.
+C
+C If the carry was a simple twos complement negative then the psrlq shift would
+C need to bring in 0 bits or 1 bits according to whether the high was zero or
+C non-zero, since a non-zero value would represent a negative needing sign
+C extension. That wouldn't be particularly easy to arrange and certainly would
+C add an instruction to the dependent chain, so instead an offset is applied so
+C that the high limb will be 0xFFFFFFFF+c. With c in the range -0xFFFFFFFF to
+C 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and is therefore
+C always positive and can always have 0 bits shifted in, which is what psrlq
+C does.
+C
+C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be
+C done off the dependent chain. The total adjustment then is to add
+C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF
+C to remove the offset from the current carry, for a net add of
+C 0xFFFFFFFE00000001. In the code this is applied to the destination limb when
+C fetched.
+C
+C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement
+C negative, which is how it's undone for the return value, but that doesn't
+C seem as clear.
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_submul_1c)
+deflit(`FRAME',0)
+ movd PARAM_CARRY, %mm1
+ jmp L(start_1c)
+EPILOGUE()
+
+PROLOGUE(mpn_submul_1)
+deflit(`FRAME',0)
+ pxor %mm1, %mm1 C initial borrow
+
+L(start_1c):
+ mov PARAM_SRC, %eax
+ pcmpeqd %mm0, %mm0
+
+ movd PARAM_MULTIPLIER, %mm7
+ pcmpeqd %mm6, %mm6
+
+ mov PARAM_DST, %edx
+ psrlq $32, %mm0 C 0x00000000FFFFFFFF
+
+ mov PARAM_SIZE, %ecx
+ psllq $32, %mm6 C 0xFFFFFFFF00000000
+
+ psubq %mm0, %mm6 C 0xFFFFFFFE00000001
+
+ psubq %mm1, %mm0 C 0xFFFFFFFF - borrow
+
+
+ movd (%eax), %mm3 C up
+ movd (%edx), %mm4 C rp
+
+ add $-1, %ecx
+ paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
+ pmuludq %mm7, %mm3
+ jnz L(gt1)
+ psubq %mm3, %mm4 C prod
+ paddq %mm4, %mm0 C borrow
+ movd %mm0, (%edx) C result
+ jmp L(rt)
+
+L(gt1): movd 4(%eax), %mm1 C up
+ movd 4(%edx), %mm2 C rp
+
+ add $-1, %ecx
+ jz L(eev)
+
+ ALIGN(16)
+L(top): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001
+ pmuludq %mm7, %mm1
+ psubq %mm3, %mm4 C prod
+ movd 8(%eax), %mm3 C up
+ paddq %mm4, %mm0 C borrow
+ movd 8(%edx), %mm4 C rp
+ movd %mm0, (%edx) C result
+ psrlq $32, %mm0
+
+ add $-1, %ecx
+ jz L(eod)
+
+ paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
+ pmuludq %mm7, %mm3
+ psubq %mm1, %mm2 C prod
+ movd 12(%eax), %mm1 C up
+ paddq %mm2, %mm0 C borrow
+ movd 12(%edx), %mm2 C rp
+ movd %mm0, 4(%edx) C result
+ psrlq $32, %mm0
+
+ lea 8(%eax), %eax
+ lea 8(%edx), %edx
+ add $-1, %ecx
+ jnz L(top)
+
+
+L(eev): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001
+ pmuludq %mm7, %mm1
+ psubq %mm3, %mm4 C prod
+ paddq %mm4, %mm0 C borrow
+ movd %mm0, (%edx) C result
+ psrlq $32, %mm0
+ psubq %mm1, %mm2 C prod
+ paddq %mm2, %mm0 C borrow
+ movd %mm0, 4(%edx) C result
+L(rt): psrlq $32, %mm0
+ movd %mm0, %eax
+ not %eax
+ emms
+ ret
+
+L(eod): paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
+ pmuludq %mm7, %mm3
+ psubq %mm1, %mm2 C prod
+ paddq %mm2, %mm0 C borrow
+ movd %mm0, 4(%edx) C result
+ psrlq $32, %mm0
+ psubq %mm3, %mm4 C prod
+ paddq %mm4, %mm0 C borrow
+ movd %mm0, 8(%edx) C result
+ jmp L(rt)
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/rshift.asm
new file mode 100644
index 0000000..a60dcaa
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/rshift.asm
@@ -0,0 +1,108 @@
+dnl x86 mpn_rshift -- mpn right shift.
+
+dnl Copyright 1992, 1994, 1996, 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P54 7.5
+C P55 7.0
+C P6 2.5
+C K6 4.5
+C K7 5.0
+C P4 16.5
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_rshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+deflit(`FRAME',12)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%edx
+ movl PARAM_SHIFT,%ecx
+
+ leal -4(%edi,%edx,4),%edi
+ leal (%esi,%edx,4),%esi
+ negl %edx
+
+ movl (%esi,%edx,4),%ebx C read least significant limb
+ xorl %eax,%eax
+ shrdl( %cl, %ebx, %eax) C compute carry limb
+ incl %edx
+ jz L(end)
+ pushl %eax C push carry limb onto stack
+ testb $1,%dl
+ jnz L(1) C enter loop in the middle
+ movl %ebx,%eax
+
+ ALIGN(8)
+L(oop): movl (%esi,%edx,4),%ebx C load next higher limb
+ shrdl( %cl, %ebx, %eax) C compute result limb
+ movl %eax,(%edi,%edx,4) C store it
+ incl %edx
+L(1): movl (%esi,%edx,4),%eax
+ shrdl( %cl, %eax, %ebx)
+ movl %ebx,(%edi,%edx,4)
+ incl %edx
+ jnz L(oop)
+
+ shrl %cl,%eax C compute most significant limb
+ movl %eax,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+L(end): shrl %cl,%ebx C compute most significant limb
+ movl %ebx,(%edi) C store it
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/sec_tabselect.asm b/vendor/gmp-6.3.0/mpn/x86/sec_tabselect.asm
new file mode 100644
index 0000000..d9d9952
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/sec_tabselect.asm
@@ -0,0 +1,106 @@
+dnl x86 mpn_sec_tabselect.
+
+dnl Copyright 2011, 2021 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5 ?
+C P6 model 0-8,10-12 ?
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) ?
+C P4 model 0 (Willamette) ?
+C P4 model 1 (?) ?
+C P4 model 2 (Northwood) 4.5
+C P4 model 3 (Prescott) ?
+C P4 model 4 (Nocona) ?
+C Intel Atom ?
+C AMD K6 ?
+C AMD K7 3.4
+C AMD K8 ?
+C AMD K10 ?
+
+C NOTES
+C * This has not been tuned for any specific processor. Its speed should not
+C be too bad, though.
+C * Using SSE2 could result in many-fold speedup.
+
+C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp', `%edi')
+define(`tp', `%esi')
+define(`n', `%ebx')
+define(`nents', `32(%esp)')
+define(`which', `36(%esp)')
+
+define(`i', `%ebp')
+define(`mask', `%ecx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sec_tabselect)
+ push %edi
+ push %esi
+ push %ebx
+ push %ebp
+ mov 20(%esp), rp
+ mov 24(%esp), tp
+ mov 28(%esp), n
+
+ lea (rp,n,4), rp
+ lea (tp,n,4), tp
+L(outer):
+ subl $1, which
+ sbb mask, mask
+
+ mov n, i
+ neg i
+
+ ALIGN(16)
+L(top): mov (tp,i,4), %eax
+ mov (rp,i,4), %edx
+ xor %edx, %eax
+ and mask, %eax
+ xor %edx, %eax
+ mov %eax, (rp,i,4)
+ inc i
+ js L(top)
+
+L(end): lea (tp,n,4), tp
+ decl nents
+ jne L(outer)
+
+L(outer_end):
+ pop %ebp
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/silvermont/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/silvermont/gmp-mparam.h
new file mode 100644
index 0000000..e9f1d8f
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/silvermont/gmp-mparam.h
@@ -0,0 +1,222 @@
+/* Intel Silvermont/32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2400 MHz Intel Atom C2758 Silvermont/Rangeley */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-30, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 3
+#define MOD_1_UNNORM_THRESHOLD 5
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 16
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1 /* 64.62% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 32
+
+#define DIV_1_VS_MUL_1_PERCENT 204
+
+#define MUL_TOOM22_THRESHOLD 26
+#define MUL_TOOM33_THRESHOLD 105
+#define MUL_TOOM44_THRESHOLD 236
+#define MUL_TOOM6H_THRESHOLD 351
+#define MUL_TOOM8H_THRESHOLD 502
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 105
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 163
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 174
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 215
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 36
+#define SQR_TOOM3_THRESHOLD 138
+#define SQR_TOOM4_THRESHOLD 360
+#define SQR_TOOM6_THRESHOLD 494
+#define SQR_TOOM8_THRESHOLD 620
+
+#define MULMID_TOOM42_THRESHOLD 58
+
+#define MULMOD_BNM1_THRESHOLD 15
+#define SQRMOD_BNM1_THRESHOLD 19
+
+#define MUL_FFT_MODF_THRESHOLD 460 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 460, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 31, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 21, 8}, \
+ { 11, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
+ { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
+ { 47, 8}, { 95,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95,11}, \
+ { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \
+ { 287,10}, { 159,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
+ { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \
+ { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \
+ { 671,10}, { 351, 9}, { 703,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 415, 9}, { 831,12}, \
+ { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \
+ { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \
+ { 671,11}, { 351,10}, { 735,12}, { 191,11}, \
+ { 383,10}, { 767,11}, { 415,10}, { 831,13}, \
+ { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \
+ { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \
+ { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \
+ { 863,10}, { 1727,12}, { 447,11}, { 959,13}, \
+ { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
+ { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \
+ { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \
+ { 1535,12}, { 831,11}, { 1727,10}, { 3455,12}, \
+ { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \
+ { 1215,11}, { 2431,13}, { 639,12}, { 1471,11}, \
+ { 2943,13}, { 767,12}, { 1727,11}, { 3455,13}, \
+ { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \
+ { 2111,13}, { 1151,12}, { 2431,13}, { 1407,12}, \
+ { 2943,14}, { 767,13}, { 1663,12}, { 3455,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \
+ { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \
+ { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \
+ { 3967,12}, { 7935,15}, { 1023,14}, { 2047,13}, \
+ { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \
+ { 2815,13}, { 5887,15}, { 1535,14}, { 3839,13}, \
+ { 7935,16} }
+#define MUL_FFT_TABLE3_SIZE 177
+#define MUL_FFT_THRESHOLD 4544
+
+#define SQR_FFT_MODF_THRESHOLD 400 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 400, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \
+ { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \
+ { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 21, 8}, \
+ { 11, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
+ { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
+ { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 127,10}, \
+ { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 143, 9}, { 287, 8}, \
+ { 575,10}, { 159,11}, { 95,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543,10}, { 287, 9}, { 575,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \
+ { 351, 9}, { 735,11}, { 191,10}, { 383, 9}, \
+ { 799,10}, { 415, 9}, { 831,11}, { 223,12}, \
+ { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \
+ { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \
+ { 671,11}, { 351,10}, { 735, 9}, { 1471,12}, \
+ { 191,11}, { 383,10}, { 799,11}, { 415,10}, \
+ { 863,13}, { 127,12}, { 255,11}, { 543,10}, \
+ { 1087,11}, { 607,10}, { 1215,12}, { 319,11}, \
+ { 671,10}, { 1343,11}, { 735,10}, { 1471,12}, \
+ { 383,11}, { 863,10}, { 1727,12}, { 447,11}, \
+ { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \
+ { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 831,11}, { 1727,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \
+ { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \
+ { 1919,14}, { 511,13}, { 1023,12}, { 2111,13}, \
+ { 1151,12}, { 2431,13}, { 1407,12}, { 2943,14}, \
+ { 767,13}, { 1663,12}, { 3455,13}, { 1919,15}, \
+ { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \
+ { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \
+ { 1535,13}, { 3455,14}, { 1791,13}, { 3967,15}, \
+ { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \
+ { 4991,12}, { 9983,14}, { 2815,13}, { 5887,15}, \
+ { 1535,14}, { 3839,13}, { 7679,16} }
+#define SQR_FFT_TABLE3_SIZE 175
+#define SQR_FFT_THRESHOLD 3712
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 56
+#define MULLO_MUL_N_THRESHOLD 8907
+#define SQRLO_BASECASE_THRESHOLD 0 /* always */
+#define SQRLO_DC_THRESHOLD 137
+#define SQRLO_SQR_THRESHOLD 7373
+
+#define DC_DIV_QR_THRESHOLD 76
+#define DC_DIVAPPR_Q_THRESHOLD 336
+#define DC_BDIV_QR_THRESHOLD 66
+#define DC_BDIV_Q_THRESHOLD 218
+
+#define INV_MULMOD_BNM1_THRESHOLD 50
+#define INV_NEWTON_THRESHOLD 345
+#define INV_APPR_THRESHOLD 342
+
+#define BINV_NEWTON_THRESHOLD 366
+#define REDC_1_TO_REDC_N_THRESHOLD 91
+
+#define MU_DIV_QR_THRESHOLD 1652
+#define MU_DIVAPPR_Q_THRESHOLD 1858
+#define MUPI_DIV_QR_THRESHOLD 171
+#define MU_BDIV_QR_THRESHOLD 1442
+#define MU_BDIV_Q_THRESHOLD 1830
+
+#define POWM_SEC_TABLE 3,17,102,404,1185
+
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 21
+#define SET_STR_DC_THRESHOLD 272
+#define SET_STR_PRECOMPUTE_THRESHOLD 788
+
+#define FAC_DSC_THRESHOLD 132
+#define FAC_ODD_THRESHOLD 34
+
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD2_DIV1_METHOD 1 /* 0.59% faster than 3 */
+#define HGCD_THRESHOLD 142
+#define HGCD_APPR_THRESHOLD 181
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 492
+#define GCDEXT_DC_THRESHOLD 365
+#define JACOBI_BASE_METHOD 1 /* 0.41% faster than 2 */
+
+/* Tuneup completed successfully, took 147027 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/skylake/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/skylake/gmp-mparam.h
new file mode 100644
index 0000000..fb87957
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/skylake/gmp-mparam.h
@@ -0,0 +1,211 @@
+/* x86/skylake gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3600-4000 MHz Intel Xeon E3-1270v5 Skylake */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-21, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 15
+#define MOD_1_UNNORM_THRESHOLD 16
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 8
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1 /* 5.63% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD 12
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 17
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 18
+
+#define DIV_1_VS_MUL_1_PERCENT 348
+
+#define MUL_TOOM22_THRESHOLD 24
+#define MUL_TOOM33_THRESHOLD 81
+#define MUL_TOOM44_THRESHOLD 208
+#define MUL_TOOM6H_THRESHOLD 303
+#define MUL_TOOM8H_THRESHOLD 454
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 149
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 196
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 40
+#define SQR_TOOM3_THRESHOLD 129
+#define SQR_TOOM4_THRESHOLD 220
+#define SQR_TOOM6_THRESHOLD 354
+#define SQR_TOOM8_THRESHOLD 608
+
+#define MULMID_TOOM42_THRESHOLD 72
+
+#define MULMOD_BNM1_THRESHOLD 17
+#define SQRMOD_BNM1_THRESHOLD 21
+
+#define MUL_FFT_MODF_THRESHOLD 530 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 530, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \
+ { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 36, 7}, { 19, 6}, { 39, 7}, { 21, 6}, \
+ { 43, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \
+ { 23, 7}, { 49, 8}, { 27, 7}, { 55, 9}, \
+ { 15, 8}, { 31, 7}, { 63, 8}, { 43, 9}, \
+ { 23, 8}, { 51, 9}, { 31, 8}, { 67, 9}, \
+ { 39, 8}, { 83, 9}, { 47, 8}, { 95, 9}, \
+ { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95, 9}, { 191,10}, \
+ { 111,11}, { 63,10}, { 143, 9}, { 287,10}, \
+ { 159,11}, { 95,10}, { 191,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543,10}, { 287,11}, { 159,10}, { 351,11}, \
+ { 191,10}, { 415,12}, { 127,11}, { 255,10}, \
+ { 543,11}, { 287,10}, { 607,11}, { 319,10}, \
+ { 671,11}, { 351,12}, { 191,11}, { 383,10}, \
+ { 799,11}, { 415,13}, { 127,12}, { 255,11}, \
+ { 543,10}, { 1087,11}, { 607,12}, { 319,11}, \
+ { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \
+ { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \
+ { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \
+ { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \
+ { 703,13}, { 383,12}, { 767,11}, { 1599,12}, \
+ { 831,11}, { 1727,12}, { 959,14}, { 255,13}, \
+ { 511,12}, { 1087,11}, { 2239,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1471,13}, { 767,12}, \
+ { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \
+ { 1023,12}, { 2239,13}, { 1151,12}, { 2431,13}, \
+ { 1279,12}, { 2623,13}, { 1407,12}, { 2815,14}, \
+ { 767,13}, { 1663,12}, { 3455,13}, { 1919,15}, \
+ { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \
+ { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \
+ { 1535,13}, { 3455,14}, { 1791,13}, { 3967,15}, \
+ { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \
+ { 4991,12}, { 9983,14}, { 2815,13}, { 5887,15}, \
+ { 1535,14}, { 3839,16} }
+#define MUL_FFT_TABLE3_SIZE 154
+#define MUL_FFT_THRESHOLD 6784
+
+#define SQR_FFT_MODF_THRESHOLD 460 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 460, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \
+ { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 36, 7}, { 19, 6}, { 39, 7}, { 29, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 49, 8}, { 27, 7}, { 55, 9}, \
+ { 15, 8}, { 31, 7}, { 63, 8}, { 43, 9}, \
+ { 23, 8}, { 55,10}, { 15, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95,11}, \
+ { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \
+ { 287,10}, { 159,11}, { 95,12}, { 63,11}, \
+ { 127,10}, { 271, 9}, { 543,10}, { 287,11}, \
+ { 159,10}, { 319, 9}, { 639,10}, { 351,11}, \
+ { 191,10}, { 415,12}, { 127,11}, { 255,10}, \
+ { 543,11}, { 287,10}, { 575,11}, { 319,10}, \
+ { 671,11}, { 351,10}, { 703,12}, { 191,11}, \
+ { 383,10}, { 799,11}, { 415,10}, { 831,13}, \
+ { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \
+ { 543,10}, { 1087,11}, { 607,12}, { 319,11}, \
+ { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \
+ { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \
+ { 927,13}, { 255,12}, { 511,11}, { 1087,12}, \
+ { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \
+ { 703,11}, { 1407,13}, { 383,12}, { 767,11}, \
+ { 1599,12}, { 831,11}, { 1727,12}, { 895,11}, \
+ { 1791,14}, { 255,13}, { 511,12}, { 1087,11}, \
+ { 2239,12}, { 1215,13}, { 639,12}, { 1471,13}, \
+ { 767,12}, { 1727,13}, { 895,12}, { 1919,14}, \
+ { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \
+ { 2431,13}, { 1279,12}, { 2623,13}, { 1407,12}, \
+ { 2815,14}, { 767,13}, { 1663,12}, { 3455,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \
+ { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \
+ { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \
+ { 3967,15}, { 1023,14}, { 2047,13}, { 4479,14}, \
+ { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \
+ { 5887,15}, { 1535,14}, { 3839,16} }
+#define SQR_FFT_TABLE3_SIZE 155
+#define SQR_FFT_THRESHOLD 5568
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 68
+#define MULLO_MUL_N_THRESHOLD 13555
+#define SQRLO_BASECASE_THRESHOLD 0 /* always */
+#define SQRLO_DC_THRESHOLD 117
+#define SQRLO_SQR_THRESHOLD 10988
+
+#define DC_DIV_QR_THRESHOLD 42
+#define DC_DIVAPPR_Q_THRESHOLD 163
+#define DC_BDIV_QR_THRESHOLD 66
+#define DC_BDIV_Q_THRESHOLD 160
+
+#define INV_MULMOD_BNM1_THRESHOLD 46
+#define INV_NEWTON_THRESHOLD 165
+#define INV_APPR_THRESHOLD 157
+
+#define BINV_NEWTON_THRESHOLD 300
+#define REDC_1_TO_REDC_N_THRESHOLD 68
+
+#define MU_DIV_QR_THRESHOLD 1718
+#define MU_DIVAPPR_Q_THRESHOLD 1685
+#define MUPI_DIV_QR_THRESHOLD 62
+#define MU_BDIV_QR_THRESHOLD 1589
+#define MU_BDIV_Q_THRESHOLD 1830
+
+#define POWM_SEC_TABLE 1,17,129,547,1317
+
+#define GET_STR_DC_THRESHOLD 10
+#define GET_STR_PRECOMPUTE_THRESHOLD 16
+#define SET_STR_DC_THRESHOLD 354
+#define SET_STR_PRECOMPUTE_THRESHOLD 860
+
+#define FAC_DSC_THRESHOLD 141
+#define FAC_ODD_THRESHOLD 34
+
+#define MATRIX22_STRASSEN_THRESHOLD 20
+#define HGCD2_DIV1_METHOD 5 /* 1.04% faster than 3 */
+#define HGCD_THRESHOLD 114
+#define HGCD_APPR_THRESHOLD 132
+#define HGCD_REDUCE_THRESHOLD 3524
+#define GCD_DC_THRESHOLD 474
+#define GCDEXT_DC_THRESHOLD 379
+#define JACOBI_BASE_METHOD 1 /* 27.39% faster than 4 */
+
+/* Tuneup completed successfully, took 31721 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/sqr_basecase.asm
new file mode 100644
index 0000000..39f8a89
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/sqr_basecase.asm
@@ -0,0 +1,359 @@
+dnl x86 generic mpn_sqr_basecase -- square an mpn number.
+
+dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C cycles/crossproduct cycles/triangleproduct
+C P5
+C P6
+C K6
+C K7
+C P4
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the size is
+C small.
+C
+C The mul1 loop is not unrolled like mul_1.asm, it doesn't seem worth the
+C code size to do so here.
+C
+C Enhancements:
+C
+C The addmul loop here is also not unrolled like aorsmul_1.asm and
+C mul_basecase.asm are. Perhaps it should be done. It'd add to the
+C complexity, but if it's worth doing in the other places then it should be
+C worthwhile here.
+C
+C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6)
+C might be worth considering. That'd add quite a bit to the code size, but
+C only as much as is used would be dragged into L1 cache.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+
+ movl PARAM_SRC, %eax
+
+ cmpl $2, %edx
+ movl PARAM_DST, %ecx
+
+ je L(two_limbs)
+ ja L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+ C eax src
+ C ebx
+ C ecx dst
+ C edx
+
+ movl (%eax), %eax
+ mull %eax
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx dst
+ C edx
+
+ pushl %ebx
+ pushl %ebp
+
+ movl %eax, %ebx
+ movl (%eax), %eax
+
+ mull %eax C src[0]^2
+
+ pushl %esi
+ pushl %edi
+
+ movl %edx, %esi C dst[1]
+ movl %eax, (%ecx) C dst[0]
+
+ movl 4(%ebx), %eax
+ mull %eax C src[1]^2
+
+ movl %eax, %edi C dst[2]
+ movl %edx, %ebp C dst[3]
+
+ movl (%ebx), %eax
+ mull 4(%ebx) C src[0]*src[1]
+
+ addl %eax, %esi
+
+ adcl %edx, %edi
+
+ adcl $0, %ebp
+ addl %esi, %eax
+
+ adcl %edi, %edx
+ movl %eax, 4(%ecx)
+
+ adcl $0, %ebp
+
+ movl %edx, 8(%ecx)
+ movl %ebp, 12(%ecx)
+
+ popl %edi
+ popl %esi
+
+ popl %ebp
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(three_or_more):
+deflit(`FRAME',0)
+ C eax src
+ C ebx
+ C ecx dst
+ C edx size
+
+ pushl %ebx FRAME_pushl()
+ pushl %edi FRAME_pushl()
+
+ pushl %esi FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+
+ leal (%ecx,%edx,4), %edi C &dst[size], end of this mul1
+ leal (%eax,%edx,4), %esi C &src[size]
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+ movl (%eax), %ebp C src[0], multiplier
+ movl %edx, %ecx
+
+ negl %ecx C -size
+ xorl %ebx, %ebx C clear carry limb
+
+ incl %ecx C -(size-1)
+
+L(mul1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, limbs, negative
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+ mull %ebp
+ addl %eax, %ebx
+ adcl $0, %edx
+ movl %ebx, (%edi,%ecx,4)
+ movl %edx, %ebx
+ incl %ecx
+ jnz L(mul1)
+
+ movl %ebx, (%edi)
+
+
+ C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
+ C n=1..size-2.
+ C
+ C The last products src[size-2]*src[size-1], which is the end corner
+ C of the product triangle, is handled separately at the end to save
+ C looping overhead. If size is 3 then it's only this that needs to
+ C be done.
+ C
+ C In the outer loop %esi is a constant, and %edi just advances by 1
+ C limb each time. The size of the operation decreases by 1 limb
+ C each time.
+
+ C eax
+ C ebx carry (needing carry flag added)
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+ subl $3, %ecx
+ jz L(corner)
+
+ negl %ecx
+
+dnl re-use parameter space
+define(VAR_OUTER,`PARAM_DST')
+
+L(outer):
+ C eax
+ C ebx
+ C ecx
+ C edx outer loop counter, -(size-3) to -1
+ C esi &src[size]
+ C edi dst, pointing at stored carry limb of previous loop
+ C ebp
+
+ movl %ecx, VAR_OUTER
+ addl $4, %edi C advance dst end
+
+ movl -8(%esi,%ecx,4), %ebp C next multiplier
+ subl $1, %ecx
+
+ xorl %ebx, %ebx C initial carry limb
+
+L(inner):
+ C eax scratch
+ C ebx carry (needing carry flag added)
+ C ecx counter, -n-1 to -1
+ C edx scratch
+ C esi &src[size]
+ C edi dst end of this addmul
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+ mull %ebp
+ addl %ebx, %eax
+ adcl $0, %edx
+ addl %eax, (%edi,%ecx,4)
+ adcl $0, %edx
+ movl %edx, %ebx
+ addl $1, %ecx
+ jl L(inner)
+
+
+ movl %ebx, (%edi)
+ movl VAR_OUTER, %ecx
+ incl %ecx
+ jnz L(outer)
+
+
+L(corner):
+ C esi &src[size]
+ C edi &dst[2*size-3]
+
+ movl -4(%esi), %eax
+ mull -8(%esi) C src[size-1]*src[size-2]
+ addl %eax, 0(%edi)
+ adcl $0, %edx
+ movl %edx, 4(%edi) C dst high limb
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+ movl PARAM_SIZE, %eax
+ negl %eax
+ addl $1, %eax C -(size-1) and clear carry
+
+L(lshift):
+ C eax counter, negative
+ C ebx next limb
+ C ecx
+ C edx
+ C esi
+ C edi &dst[2*size-4]
+ C ebp
+
+ rcll 8(%edi,%eax,8)
+ rcll 12(%edi,%eax,8)
+ incl %eax
+ jnz L(lshift)
+
+
+ adcl %eax, %eax C high bit out
+ movl %eax, 8(%edi) C dst most significant limb
+
+
+C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+ movl PARAM_SRC, %esi
+ movl (%esi), %eax C src[0]
+ mull %eax C src[0]^2
+
+ movl PARAM_SIZE, %ecx
+ leal (%esi,%ecx,4), %esi C src end
+
+ negl %ecx C -size
+ movl %edx, %ebx C initial carry
+
+ movl %eax, 12(%edi,%ecx,8) C dst[0]
+ incl %ecx C -(size-1)
+
+L(diag):
+ C eax scratch (low product)
+ C ebx carry limb
+ C ecx counter, -(size-1) to -1
+ C edx scratch (high product)
+ C esi &src[size]
+ C edi &dst[2*size-3]
+ C ebp scratch (fetched dst limbs)
+
+ movl (%esi,%ecx,4), %eax
+ mull %eax
+
+ addl %ebx, 8(%edi,%ecx,8)
+ movl %edx, %ebx
+
+ adcl %eax, 12(%edi,%ecx,8)
+ adcl $0, %ebx
+
+ incl %ecx
+ jnz L(diag)
+
+
+ addl %ebx, 8(%edi) C dst most significant limb
+
+ popl %ebp
+ popl %esi
+
+ popl %edi
+ popl %ebx
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/t-zdisp.sh b/vendor/gmp-6.3.0/mpn/x86/t-zdisp.sh
new file mode 100755
index 0000000..61efdd6
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/t-zdisp.sh
@@ -0,0 +1,71 @@
+#! /bin/sh
+#
+# Copyright 2000 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of either:
+#
+# * the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# or
+#
+# * the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any
+# later version.
+#
+# or both in parallel, as here.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received copies of the GNU General Public License and the
+# GNU Lesser General Public License along with the GNU MP Library. If not,
+# see https://www.gnu.org/licenses/.
+
+
+# Usage: cd $(builddir)/mpn
+# $(srcdir)/x86/t-zdisp.sh
+#
+# Run the Zdisp() macro instructions through the assembler to check
+# the encodings used. Mismatches are printed, no output means all ok.
+#
+# This program is only meant for use during development. It can be
+# run in the mpn build directory of any x86 configuration.
+#
+# For this test the assembler needs to generate byte sized 0
+# displacements when given something like 0(%eax). Recent versions of
+# gas are suitable (eg. 2.9.x or 2.10.x).
+
+set -e
+
+cat >tmp-zdisptest.asm <<\EOF
+
+include(`../config.m4')
+
+dnl Redefine Zdisp_match to output its pattern and encoding.
+define(`Zdisp_match',
+`define(`Zdisp_found',1)dnl
+ifelse(`$2',0,` $1 $2$3, $4')`'dnl
+ifelse(`$3',0,` $1 $2, $3$4')`'dnl
+
+ .byte $5
+')
+ .text
+ Zdisp()
+EOF
+
+m4 tmp-zdisptest.asm >tmp-zdisptest.s
+as -o tmp-zdisptest.o tmp-zdisptest.s
+
+# Demand duplicates from the instruction patterns and byte encodings.
+objdump -d tmp-zdisptest.o | awk '
+/^ *[a-z0-9]+:/ {
+ sub(/^ *[a-z0-9]+:/,"")
+ print
+}' | sort | uniq -u
diff --git a/vendor/gmp-6.3.0/mpn/x86/t-zdisp2.pl b/vendor/gmp-6.3.0/mpn/x86/t-zdisp2.pl
new file mode 100755
index 0000000..b441b65
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/t-zdisp2.pl
@@ -0,0 +1,147 @@
+#!/usr/bin/perl -w
+#
+# Copyright 2001, 2002 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of either:
+#
+# * the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# or
+#
+# * the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any
+# later version.
+#
+# or both in parallel, as here.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received copies of the GNU General Public License and the
+# GNU Lesser General Public License along with the GNU MP Library. If not,
+# see https://www.gnu.org/licenses/.
+
+
+# Usage: cd $(builddir)/mpn
+# $(srcdir)/x86/t-zdisp2.pl
+#
+# Grep for any "0(reg...)" addressing modes coming out of the x86 .asm
+# files. Additive expressions like "12+4-16" are recognised too.
+#
+# Old gas doesn't preserve the "0" displacement, so if it's wanted then
+# Zdisp ought to be used to give explicit .byte sequences. See
+# mpn/x86/README.
+#
+# No output means everything is ok. All the asm files are put through m4 in
+# PIC and non-PIC modes, and in each multi-function form, all of which can
+# take a while to run.
+#
+# This program is only meant for use during development.
+
+use strict;
+use File::Find;
+use File::Basename;
+use Getopt::Std;
+
+my %opt;
+getopts('t', \%opt);
+
+
+my $srcdir;
+open IN, '<Makefile' or die;
+while (<IN>) {
+ if (/^srcdir[ \t]*=[ \t]*(.*)/) {
+ $srcdir = $1;
+ last;
+ }
+}
+close IN or die;
+defined $srcdir or die "Cannot find \$srcdir in Makefile\n";
+
+my $filecount = 0;
+
+my $tempfile = 't-zdisp2.tmp';
+open KARA, ">$tempfile" or die;
+close KARA or die;
+
+find({ wanted => \&process, preprocess => \&process_mparam, no_chdir => 1 },
+ "$srcdir/x86");
+
+sub process {
+ if (/gmp-mparam.h$/) {
+ process_mparam($_);
+ } elsif (/\.asm$/) {
+ process_asm($_);
+ }
+}
+
+# Ensure we're using the right SQR_TOOM2_THRESHOLD for the part of the
+# tree being processed.
+sub process_mparam {
+ my $file = "$File::Find::dir/gmp-mparam.h";
+ if (-f $file) {
+ print "$file\n" if $opt{'t'};
+ open MPARAM, "<$file" or die;
+ while (<MPARAM>) {
+ if (/^#define SQR_TOOM2_THRESHOLD[ \t]*([0-9][0-9]*)/) {
+ open KARA, ">$tempfile" or die;
+ print KARA "define(\`SQR_TOOM2_THRESHOLD',$1)\n\n";
+ print "define(\`SQR_TOOM2_THRESHOLD',$1)\n" if $opt{'t'};
+ close KARA or die;
+ last;
+ }
+ }
+ close MPARAM or die;
+ }
+ return @_;
+}
+
+sub process_asm {
+ my ($file) = @_;
+ my $base = basename ($file, '.asm');
+
+ my @funs;
+ if ($base eq 'aors_n') { @funs = qw(add_n sub_n); }
+ elsif ($base eq 'aorsmul_1') { @funs = qw(addmul_1 submul_1); }
+ elsif ($base eq 'popham') { @funs = qw(popcount hamdist); }
+ elsif ($base eq 'logops_n') { @funs = qw(and_n andn_n nand_n ior_n iorn_n nior_n xor_n xnor_n); }
+ elsif ($base eq 'lorrshift') { @funs = qw(lshift rshift); }
+ else { @funs = ($base); }
+
+ foreach my $fun (@funs) {
+ foreach my $pic ('', ' -DPIC') {
+ my $header = "$file: 0: $pic\n";
+ $filecount++;
+
+ my $m4 = "m4 -DHAVE_HOST_CPU_athlon -DOPERATION_$fun $pic ../config.m4 $tempfile $file";
+ print "$m4\n" if $opt{'t'};
+
+ open IN, "$m4 |" or die;
+ while (<IN>) {
+ next unless /([0-9+-][0-9 \t+-]*)\(%/;
+ my $pat=$1;
+ $pat = eval($pat);
+ next if ($pat != 0);
+ print "$header$_";
+ $header='';
+ }
+ close IN or die;
+ }
+ }
+}
+
+unlink($tempfile);
+print "total $filecount processed\n";
+exit 0;
+
+
+# Local variables:
+# perl-indent-level: 2
+# End:
diff --git a/vendor/gmp-6.3.0/mpn/x86/udiv.asm b/vendor/gmp-6.3.0/mpn/x86/udiv.asm
new file mode 100644
index 0000000..a3ee088
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/udiv.asm
@@ -0,0 +1,52 @@
+dnl x86 mpn_udiv_qrnnd -- 2 by 1 limb division
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_udiv_qrnnd (mp_limb_t *remptr, mp_limb_t high, mp_limb_t low,
+C mp_limb_t divisor);
+
+defframe(PARAM_DIVISOR, 16)
+defframe(PARAM_LOW, 12)
+defframe(PARAM_HIGH, 8)
+defframe(PARAM_REMPTR, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_udiv_qrnnd)
+deflit(`FRAME',0)
+ movl PARAM_LOW, %eax
+ movl PARAM_HIGH, %edx
+ divl PARAM_DIVISOR
+ movl PARAM_REMPTR, %ecx
+ movl %edx, (%ecx)
+ ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/umul.asm b/vendor/gmp-6.3.0/mpn/x86/umul.asm
new file mode 100644
index 0000000..34fe434
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/umul.asm
@@ -0,0 +1,51 @@
+dnl mpn_umul_ppmm -- 1x1->2 limb multiplication
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2);
+C
+
+defframe(PARAM_M2, 12)
+defframe(PARAM_M1, 8)
+defframe(PARAM_LOWPTR, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_umul_ppmm)
+deflit(`FRAME',0)
+ movl PARAM_LOWPTR, %ecx
+ movl PARAM_M1, %eax
+ mull PARAM_M2
+ movl %eax, (%ecx)
+ movl %edx, %eax
+ ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/x86-defs.m4 b/vendor/gmp-6.3.0/mpn/x86/x86-defs.m4
new file mode 100644
index 0000000..81309b2
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/x86-defs.m4
@@ -0,0 +1,1024 @@
+divert(-1)
+
+dnl m4 macros for x86 assembler.
+
+dnl Copyright 1999-2003, 2007, 2010, 2012, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl Notes:
+dnl
+dnl m4 isn't perfect for processing BSD style x86 assembler code, the main
+dnl problems are,
+dnl
+dnl 1. Doing define(foo,123) and then using foo in an addressing mode like
+dnl foo(%ebx) expands as a macro rather than a constant. This is worked
+dnl around by using deflit() from asm-defs.m4, instead of define().
+dnl
+dnl 2. Immediates in macro definitions need a space or `' to stop the $
+dnl looking like a macro parameter. For example,
+dnl
+dnl define(foo, `mov $ 123, %eax')
+dnl
+dnl This is only a problem in macro definitions, not in ordinary text,
+dnl and not in macro parameters like text passed to forloop() or ifdef().
+
+
+deflit(GMP_LIMB_BYTES, 4)
+
+
+dnl Libtool gives -DPIC -DDLL_EXPORT to indicate a cygwin or mingw DLL. We
+dnl undefine PIC since we don't need to be position independent in this
+dnl case and definitely don't want the ELF style _GLOBAL_OFFSET_TABLE_ etc.
+
+ifdef(`DLL_EXPORT',`undefine(`PIC')')
+
+
+dnl Usage: CPUVEC_FUNCS_LIST
+dnl
+dnl A list of the functions from gmp-impl.h x86 struct cpuvec_t, in the
+dnl order they appear in that structure.
+
+define(CPUVEC_FUNCS_LIST,
+``add_n',
+`addlsh1_n',
+`addlsh2_n',
+`addmul_1',
+`addmul_2',
+`bdiv_dbm1c',
+`cnd_add_n',
+`cnd_sub_n',
+`com',
+`copyd',
+`copyi',
+`divexact_1',
+`divrem_1',
+`gcd_11',
+`lshift',
+`lshiftc',
+`mod_1',
+`mod_1_1p',
+`mod_1_1p_cps',
+`mod_1s_2p',
+`mod_1s_2p_cps',
+`mod_1s_4p',
+`mod_1s_4p_cps',
+`mod_34lsub1',
+`modexact_1c_odd',
+`mul_1',
+`mul_basecase',
+`mullo_basecase',
+`preinv_divrem_1',
+`preinv_mod_1',
+`redc_1',
+`redc_2',
+`rshift',
+`sqr_basecase',
+`sub_n',
+`sublsh1_n',
+`submul_1'')
+
+
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl In the x86 code we use explicit TEXT and ALIGN() calls in the code,
+dnl since different alignments are wanted in various circumstances. So for
+dnl instance,
+dnl
+dnl TEXT
+dnl ALIGN(16)
+dnl PROLOGUE(mpn_add_n)
+dnl ...
+dnl EPILOGUE()
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+m4_assert_defined(`WANT_PROFILING')
+ `GLOBL $1
+ TYPE($1,`function')
+ COFF_TYPE($1)
+$1:
+ifelse(WANT_PROFILING,`prof', ` call_mcount')
+ifelse(WANT_PROFILING,`gprof', ` call_mcount')
+ifelse(WANT_PROFILING,`instrument',` call_instrument(enter)')
+')
+
+
+dnl Usage: COFF_TYPE(GSYM_PREFIX`'foo)
+dnl
+dnl Emit COFF style ".def ... .endef" type information for a function, when
+dnl supported. The argument should include any GSYM_PREFIX.
+dnl
+dnl See autoconf macro GMP_ASM_COFF_TYPE for HAVE_COFF_TYPE.
+
+define(COFF_TYPE,
+m4_assert_numargs(1)
+m4_assert_defined(`HAVE_COFF_TYPE')
+`ifelse(HAVE_COFF_TYPE,yes,
+ `.def $1
+ .scl 2
+ .type 32
+ .endef')')
+
+
+dnl Usage: call_mcount
+dnl
+dnl For `gprof' style profiling, %ebp is setup as a frame pointer. None of
+dnl the assembler routines use %ebp this way, so it's done only for the
+dnl benefit of mcount. glibc sysdeps/i386/i386-mcount.S shows how mcount
+dnl gets the current function from (%esp) and the parent from 4(%ebp).
+dnl
+dnl For `prof' style profiling gcc generates mcount calls without setting
+dnl up %ebp, and the same is done here.
+
+define(`call_mcount',
+m4_assert_numargs(-1)
+m4_assert_defined(`WANT_PROFILING')
+m4_assert_defined(`MCOUNT_PIC_REG')
+m4_assert_defined(`MCOUNT_NONPIC_REG')
+m4_assert_defined(`MCOUNT_PIC_CALL')
+m4_assert_defined(`MCOUNT_NONPIC_CALL')
+`ifelse(ifdef(`PIC',`MCOUNT_PIC_REG',`MCOUNT_NONPIC_REG'),,,
+` DATA
+ ALIGN(4)
+L(mcount_data_`'mcount_counter):
+ W32 0
+ TEXT
+')dnl
+ifelse(WANT_PROFILING,`gprof',
+` pushl %ebp
+ movl %esp, %ebp
+')dnl
+ifdef(`PIC',
+` pushl %ebx
+ call_movl_eip_to_ebx
+L(mcount_here_`'mcount_counter):
+ addl $_GLOBAL_OFFSET_TABLE_+[.-L(mcount_here_`'mcount_counter)], %ebx
+ifelse(MCOUNT_PIC_REG,,,
+` leal L(mcount_data_`'mcount_counter)@GOTOFF(%ebx), MCOUNT_PIC_REG')
+MCOUNT_PIC_CALL
+ popl %ebx
+',`dnl non-PIC
+ifelse(MCOUNT_NONPIC_REG,,,
+` movl `$'L(mcount_data_`'mcount_counter), MCOUNT_NONPIC_REG
+')dnl
+MCOUNT_NONPIC_CALL
+')dnl
+ifelse(WANT_PROFILING,`gprof',
+` popl %ebp
+')
+define(`mcount_counter',incr(mcount_counter))
+')
+
+define(mcount_counter,1)
+
+
+dnl Usage: call_instrument(enter|exit)
+dnl
+dnl Call __cyg_profile_func_enter or __cyg_profile_func_exit.
+dnl
+dnl For PIC, most routines don't require _GLOBAL_OFFSET_TABLE_ themselves
+dnl so %ebx is just setup for these calls. It's a bit wasteful to repeat
+dnl the setup for the exit call having done it earlier for the enter, but
+dnl there's nowhere very convenient to hold %ebx through the length of a
+dnl routine, in general.
+dnl
+dnl For PIC, because instrument_current_function will be within the current
+dnl object file we can get it just as an offset from %eip, there's no need
+dnl to use the GOT.
+dnl
+dnl No attempt is made to maintain the stack alignment gcc generates with
+dnl -mpreferred-stack-boundary. This wouldn't be hard, but it seems highly
+dnl unlikely the instrumenting functions would be doing anything that'd
+dnl benefit from alignment, in particular they're unlikely to be using
+dnl doubles or long doubles on the stack.
+dnl
+dnl The FRAME scheme is used to conveniently account for the register saves
+dnl before accessing the return address. Any previous value is saved and
+dnl restored, since plenty of code keeps a value across a "ret" in the
+dnl middle of a routine.
+
+define(call_instrument,
+m4_assert_numargs(1)
+` pushdef(`FRAME',0)
+ifelse($1,exit,
+` pushl %eax FRAME_pushl() C return value
+')
+ifdef(`PIC',
+` pushl %ebx FRAME_pushl()
+ call_movl_eip_to_ebx
+L(instrument_here_`'instrument_count):
+ movl %ebx, %ecx
+ addl $_GLOBAL_OFFSET_TABLE_+[.-L(instrument_here_`'instrument_count)], %ebx
+ C use addl rather than leal to avoid old gas bugs, see mpn/x86/README
+ addl $instrument_current_function-L(instrument_here_`'instrument_count), %ecx
+ pushl m4_empty_if_zero(FRAME)(%esp) FRAME_pushl() C return addr
+ pushl %ecx FRAME_pushl() C this function
+ call GSYM_PREFIX`'__cyg_profile_func_$1@PLT
+ addl $`'8, %esp
+ popl %ebx
+',
+` C non-PIC
+ pushl m4_empty_if_zero(FRAME)(%esp) FRAME_pushl() C return addr
+ pushl $instrument_current_function FRAME_pushl() C this function
+ call GSYM_PREFIX`'__cyg_profile_func_$1
+ addl $`'8, %esp
+')
+ifelse($1,exit,
+` popl %eax C return value
+')
+ popdef(`FRAME')
+define(`instrument_count',incr(instrument_count))
+')
+define(instrument_count,1)
+
+
+dnl Usage: instrument_current_function
+dnl
+dnl Return the current function name for instrumenting purposes. This is
+dnl PROLOGUE_current_function, but it sticks at the first such name seen.
+dnl
+dnl Sticking to the first name seen ensures that multiple-entrypoint
+dnl functions like mpn_add_nc and mpn_add_n will make enter and exit calls
+dnl giving the same function address.
+
+define(instrument_current_function,
+m4_assert_numargs(-1)
+`ifdef(`instrument_current_function_seen',
+`instrument_current_function_seen',
+`define(`instrument_current_function_seen',PROLOGUE_current_function)dnl
+PROLOGUE_current_function')')
+
+
+dnl Usage: call_movl_eip_to_ebx
+dnl
+dnl Generate a call to L(movl_eip_to_ebx), and record the need for that
+dnl routine.
+
+define(call_movl_eip_to_ebx,
+m4_assert_numargs(-1)
+`call L(movl_eip_to_ebx)
+define(`movl_eip_to_ebx_needed',1)')
+
+dnl Usage: generate_movl_eip_to_ebx
+dnl
+dnl Emit a L(movl_eip_to_ebx) routine, if needed and not already generated.
+
+define(generate_movl_eip_to_ebx,
+m4_assert_numargs(-1)
+`ifelse(movl_eip_to_ebx_needed,1,
+`ifelse(movl_eip_to_ebx_done,1,,
+`L(movl_eip_to_ebx):
+ movl (%esp), %ebx
+ ret_internal
+define(`movl_eip_to_ebx_done',1)
+')')')
+
+
+dnl Usage: ret
+dnl
+dnl Generate a "ret", but if doing instrumented profiling then call
+dnl __cyg_profile_func_exit first.
+
+define(ret,
+m4_assert_numargs(-1)
+m4_assert_defined(`WANT_PROFILING')
+`ifelse(WANT_PROFILING,instrument,
+`ret_instrument',
+`ret_internal')
+generate_movl_eip_to_ebx
+')
+
+
+dnl Usage: ret_internal
+dnl
+dnl A plain "ret", without any __cyg_profile_func_exit call. This can be
+dnl used for a return which is internal to some function, such as when
+dnl getting %eip for PIC.
+
+define(ret_internal,
+m4_assert_numargs(-1)
+``ret'')
+
+
+dnl Usage: ret_instrument
+dnl
+dnl Generate call to __cyg_profile_func_exit and then a ret. If a ret has
+dnl already been seen from this function then jump to that chunk of code,
+dnl rather than emitting it again.
+
+define(ret_instrument,
+m4_assert_numargs(-1)
+`ifelse(m4_unquote(ret_instrument_seen_`'instrument_current_function),1,
+`jmp L(instrument_exit_`'instrument_current_function)',
+`define(ret_instrument_seen_`'instrument_current_function,1)
+L(instrument_exit_`'instrument_current_function):
+call_instrument(exit)
+ ret_internal')')
+
+
+dnl Usage: _GLOBAL_OFFSET_TABLE_
+dnl
+dnl Expand to _GLOBAL_OFFSET_TABLE_ plus any necessary underscore prefix.
+dnl This lets us write plain _GLOBAL_OFFSET_TABLE_ in SVR4 style, but still
+dnl work with systems requiring an extra underscore such as OpenBSD.
+dnl
+dnl deflit is used so "leal _GLOBAL_OFFSET_TABLE_(%eax), %ebx" will come
+dnl out right, though that form doesn't work properly in gas (see
+dnl mpn/x86/README).
+
+deflit(_GLOBAL_OFFSET_TABLE_,
+m4_assert_defined(`GOT_GSYM_PREFIX')
+`GOT_GSYM_PREFIX`_GLOBAL_OFFSET_TABLE_'')
+
+
+dnl --------------------------------------------------------------------------
+dnl Various x86 macros.
+dnl
+
+
+dnl Usage: ALIGN_OFFSET(bytes,offset)
+dnl
+dnl Align to `offset' away from a multiple of `bytes'.
+dnl
+dnl This is useful for testing, for example align to something very strict
+dnl and see what effect offsets from it have, "ALIGN_OFFSET(256,32)".
+dnl
+dnl Generally you wouldn't execute across the padding, but it's done with
+dnl nop's so it'll work.
+
+define(ALIGN_OFFSET,
+m4_assert_numargs(2)
+`ALIGN($1)
+forloop(`i',1,$2,` nop
+')')
+
+
+dnl Usage: defframe(name,offset)
+dnl
+dnl Make a definition like the following with which to access a parameter
+dnl or variable on the stack.
+dnl
+dnl define(name,`FRAME+offset(%esp)')
+dnl
+dnl Actually m4_empty_if_zero(FRAME+offset) is used, which will save one
+dnl byte if FRAME+offset is zero, by putting (%esp) rather than 0(%esp).
+dnl Use define(`defframe_empty_if_zero_disabled',1) if for some reason the
+dnl zero offset is wanted.
+dnl
+dnl The new macro also gets a check that when it's used FRAME is actually
+dnl defined, and that the final %esp offset isn't negative, which would
+dnl mean an attempt to access something below the current %esp.
+dnl
+dnl deflit() is used rather than a plain define(), so the new macro won't
+dnl delete any following parenthesized expression. name(%edi) will come
+dnl out say as 16(%esp)(%edi). This isn't valid assembler and should
+dnl provoke an error, which is better than silently giving just 16(%esp).
+dnl
+dnl See README for more on the suggested way to access the stack frame.
+
+define(defframe,
+m4_assert_numargs(2)
+`deflit(`$1',
+m4_assert_defined(`FRAME')
+`defframe_check_notbelow(`$1',$2,FRAME)dnl
+defframe_empty_if_zero(FRAME+($2))(%esp)')')
+
+dnl Called: defframe_empty_if_zero(expression)
+define(defframe_empty_if_zero,
+m4_assert_numargs(1)
+`ifelse(defframe_empty_if_zero_disabled,1,
+`eval($1)',
+`m4_empty_if_zero($1)')')
+
+dnl Called: defframe_check_notbelow(`name',offset,FRAME)
+define(defframe_check_notbelow,
+m4_assert_numargs(3)
+`ifelse(eval(($3)+($2)<0),1,
+`m4_error(`$1 at frame offset $2 used when FRAME is only $3 bytes
+')')')
+
+
+dnl Usage: FRAME_pushl()
+dnl FRAME_popl()
+dnl FRAME_addl_esp(n)
+dnl FRAME_subl_esp(n)
+dnl
+dnl Adjust FRAME appropriately for a pushl or popl, or for an addl or subl
+dnl %esp of n bytes.
+dnl
+dnl Using these macros is completely optional. Sometimes it makes more
+dnl sense to put explicit deflit(`FRAME',N) forms, especially when there's
+dnl jumps and different sequences of FRAME values need to be used in
+dnl different places.
+
+define(FRAME_pushl,
+m4_assert_numargs(0)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME+4))')
+
+define(FRAME_popl,
+m4_assert_numargs(0)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME-4))')
+
+define(FRAME_addl_esp,
+m4_assert_numargs(1)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME-($1)))')
+
+define(FRAME_subl_esp,
+m4_assert_numargs(1)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME+($1)))')
+
+
+dnl Usage: defframe_pushl(name)
+dnl
+dnl Do a combination FRAME_pushl() and a defframe() to name the stack
+dnl location just pushed. This should come after a pushl instruction.
+dnl Putting it on the same line works and avoids lengthening the code. For
+dnl example,
+dnl
+dnl pushl %eax defframe_pushl(VAR_COUNTER)
+dnl
+dnl Notice the defframe() is done with an unquoted -FRAME thus giving its
+dnl current value without tracking future changes.
+
+define(defframe_pushl,
+m4_assert_numargs(1)
+`FRAME_pushl()defframe(`$1',-FRAME)')
+
+
+dnl --------------------------------------------------------------------------
+dnl Assembler instruction macros.
+dnl
+
+
+dnl Usage: emms_or_femms
+dnl femms_available_p
+dnl
+dnl femms_available_p expands to 1 or 0 according to whether the AMD 3DNow
+dnl femms instruction is available. emms_or_femms expands to femms if
+dnl available, or emms if not.
+dnl
+dnl emms_or_femms is meant for use in the K6 directory where plain K6
+dnl (without femms) and K6-2 and K6-3 (with a slightly faster femms) are
+dnl supported together.
+dnl
+dnl On K7 femms is no longer faster and is just an alias for emms, so plain
+dnl emms may as well be used.
+
+define(femms_available_p,
+m4_assert_numargs(-1)
+`m4_ifdef_anyof_p(
+ `HAVE_HOST_CPU_k62',
+ `HAVE_HOST_CPU_k63',
+ `HAVE_HOST_CPU_athlon')')
+
+define(emms_or_femms,
+m4_assert_numargs(-1)
+`ifelse(femms_available_p,1,`femms',`emms')')
+
+
+dnl Usage: femms
+dnl
+dnl Gas 2.9.1 which comes with FreeBSD 3.4 doesn't support femms, so the
+dnl following is a replacement using .byte.
+
+define(femms,
+m4_assert_numargs(-1)
+`.byte 15,14 C AMD 3DNow femms')
+
+
+dnl Usage: jadcl0(op)
+dnl
+dnl Generate a jnc/incl as a substitute for adcl $0,op. Note this isn't an
+dnl exact replacement, since it doesn't set the flags like adcl does.
+dnl
+dnl This finds a use in K6 mpn_addmul_1, mpn_submul_1, mpn_mul_basecase and
+dnl mpn_sqr_basecase because on K6 an adcl is slow, the branch
+dnl misprediction penalty is small, and the multiply algorithm used leads
+dnl to a carry bit on average only 1/4 of the time.
+dnl
+dnl jadcl0_disabled can be set to 1 to instead generate an ordinary adcl
+dnl for comparison. For example,
+dnl
+dnl define(`jadcl0_disabled',1)
+dnl
+dnl When using a register operand, eg. "jadcl0(%edx)", the jnc/incl code is
+dnl the same size as an adcl. This makes it possible to use the exact same
+dnl computed jump code when testing the relative speed of the two.
+
+define(jadcl0,
+m4_assert_numargs(1)
+`ifelse(jadcl0_disabled,1,
+ `adcl $`'0, $1',
+ `jnc L(jadcl0_`'jadcl0_counter)
+ incl $1
+L(jadcl0_`'jadcl0_counter):
+define(`jadcl0_counter',incr(jadcl0_counter))')')
+
+define(jadcl0_counter,1)
+
+
+dnl Usage: x86_lookup(target, key,value, key,value, ...)
+dnl x86_lookup_p(target, key,value, key,value, ...)
+dnl
+dnl Look for `target' among the `key' parameters.
+dnl
+dnl x86_lookup expands to the corresponding `value', or generates an error
+dnl if `target' isn't found.
+dnl
+dnl x86_lookup_p expands to 1 if `target' is found, or 0 if not.
+
+define(x86_lookup,
+m4_assert_numargs_range(1,999)
+`ifelse(eval($#<3),1,
+`m4_error(`unrecognised part of x86 instruction: $1
+')',
+`ifelse(`$1',`$2', `$3',
+`x86_lookup(`$1',shift(shift(shift($@))))')')')
+
+define(x86_lookup_p,
+m4_assert_numargs_range(1,999)
+`ifelse(eval($#<3),1, `0',
+`ifelse(`$1',`$2', `1',
+`x86_lookup_p(`$1',shift(shift(shift($@))))')')')
+
+
+dnl Usage: x86_opcode_reg32(reg)
+dnl x86_opcode_reg32_p(reg)
+dnl
+dnl x86_opcode_reg32 expands to the standard 3 bit encoding for the given
+dnl 32-bit register, eg. `%ebp' turns into 5.
+dnl
+dnl x86_opcode_reg32_p expands to 1 if reg is a valid 32-bit register, or 0
+dnl if not.
+
+define(x86_opcode_reg32,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_reg32_list)')
+
+define(x86_opcode_reg32_p,
+m4_assert_onearg()
+`x86_lookup_p(`$1',x86_opcode_reg32_list)')
+
+define(x86_opcode_reg32_list,
+``%eax',0,
+`%ecx',1,
+`%edx',2,
+`%ebx',3,
+`%esp',4,
+`%ebp',5,
+`%esi',6,
+`%edi',7')
+
+
+dnl Usage: x86_opcode_tttn(cond)
+dnl
+dnl Expand to the 4-bit "tttn" field value for the given x86 branch
+dnl condition (like `c', `ae', etc).
+
+define(x86_opcode_tttn,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_ttn_list)')
+
+define(x86_opcode_tttn_list,
+``o', 0,
+`no', 1,
+`b', 2, `c', 2, `nae',2,
+`nb', 3, `nc', 3, `ae', 3,
+`e', 4, `z', 4,
+`ne', 5, `nz', 5,
+`be', 6, `na', 6,
+`nbe', 7, `a', 7,
+`s', 8,
+`ns', 9,
+`p', 10, `pe', 10, `npo',10,
+`np', 11, `npe',11, `po', 11,
+`l', 12, `nge',12,
+`nl', 13, `ge', 13,
+`le', 14, `ng', 14,
+`nle',15, `g', 15')
+
+
+dnl Usage: cmovCC(%srcreg,%dstreg)
+dnl
+dnl Emit a cmov instruction, using a .byte sequence, since various past
+dnl versions of gas don't know cmov. For example,
+dnl
+dnl cmovz( %eax, %ebx)
+dnl
+dnl The source operand can only be a plain register. (m4 code implementing
+dnl full memory addressing modes exists, believe it or not, but isn't
+dnl currently needed and isn't included.)
+dnl
+dnl All the standard conditions are defined. Attempting to use one without
+dnl the macro parentheses, such as just "cmovbe %eax, %ebx", will provoke
+dnl an error. This protects against writing something old gas wouldn't
+dnl understand.
+
+dnl Called: define_cmov_many(cond,tttn,cond,tttn,...)
+define(define_cmov_many,
+`ifelse(m4_length(`$1'),0,,
+`define_cmov(`$1',`$2')define_cmov_many(shift(shift($@)))')')
+
+dnl Called: define_cmov(cond,tttn)
+dnl Emit basically define(cmov<cond>,`cmov_internal(<cond>,<ttn>,`$1',`$2')')
+define(define_cmov,
+m4_assert_numargs(2)
+`define(`cmov$1',
+m4_instruction_wrapper()
+m4_assert_numargs(2)
+`cmov_internal'(m4_doublequote($`'0),``$2'',dnl
+m4_doublequote($`'1),m4_doublequote($`'2)))')
+
+define_cmov_many(x86_opcode_tttn_list)
+
+dnl Called: cmov_internal(name,tttn,src,dst)
+define(cmov_internal,
+m4_assert_numargs(4)
+`.byte dnl
+15, dnl
+eval(64+$2), dnl
+eval(192+8*x86_opcode_reg32(`$4')+x86_opcode_reg32(`$3')) dnl
+ C `$1 $3, $4'')
+
+
+dnl Usage: x86_opcode_regmmx(reg)
+dnl
+dnl Validate the given mmx register, and return its number, 0 to 7.
+
+define(x86_opcode_regmmx,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_regmmx_list)')
+
+define(x86_opcode_regmmx_list,
+``%mm0',0,
+`%mm1',1,
+`%mm2',2,
+`%mm3',3,
+`%mm4',4,
+`%mm5',5,
+`%mm6',6,
+`%mm7',7')
+
+
+dnl Usage: psadbw(%srcreg,%dstreg)
+dnl
+dnl Oldish versions of gas don't know psadbw, in particular gas 2.9.1 on
+dnl FreeBSD 3.3 and 3.4 doesn't, so instead emit .byte sequences. For
+dnl example,
+dnl
+dnl psadbw( %mm1, %mm2)
+dnl
+dnl Only register->register forms are supported here, which suffices for
+dnl the current code.
+
+define(psadbw,
+m4_instruction_wrapper()
+m4_assert_numargs(2)
+`.byte 0x0f,0xf6,dnl
+eval(192+x86_opcode_regmmx(`$2')*8+x86_opcode_regmmx(`$1')) dnl
+ C `psadbw $1, $2'')
+
+
+dnl Usage: Zdisp(inst,op,op,op)
+dnl
+dnl Generate explicit .byte sequences if necessary to force a byte-sized
+dnl zero displacement on an instruction. For example,
+dnl
+dnl Zdisp( movl, 0,(%esi), %eax)
+dnl
+dnl expands to
+dnl
+dnl .byte 139,70,0 C movl 0(%esi), %eax
+dnl
+dnl If the displacement given isn't 0, then normal assembler code is
+dnl generated. For example,
+dnl
+dnl Zdisp( movl, 4,(%esi), %eax)
+dnl
+dnl expands to
+dnl
+dnl movl 4(%esi), %eax
+dnl
+dnl This means a single Zdisp() form can be used with an expression for the
+dnl displacement, and .byte will be used only if necessary. The
+dnl displacement argument is eval()ed.
+dnl
+dnl Because there aren't many places a 0(reg) form is wanted, Zdisp is
+dnl implemented with a table of instructions and encodings. A new entry is
+dnl needed for any different operation or registers. The table is split
+dnl into separate macros to avoid overflowing BSD m4 macro expansion space.
+
+define(Zdisp,
+m4_assert_numargs(4)
+`define(`Zdisp_found',0)dnl
+Zdisp_1($@)dnl
+Zdisp_2($@)dnl
+Zdisp_3($@)dnl
+Zdisp_4($@)dnl
+ifelse(Zdisp_found,0,
+`m4_error(`unrecognised instruction in Zdisp: $1 $2 $3 $4
+')')')
+
+define(Zdisp_1,`dnl
+Zdisp_match( adcl, 0,(%edx), %eax, `0x13,0x42,0x00', $@)`'dnl
+Zdisp_match( adcl, 0,(%edx), %ebx, `0x13,0x5a,0x00', $@)`'dnl
+Zdisp_match( adcl, 0,(%edx), %esi, `0x13,0x72,0x00', $@)`'dnl
+Zdisp_match( addl, %ebx, 0,(%edi), `0x01,0x5f,0x00', $@)`'dnl
+Zdisp_match( addl, %ecx, 0,(%edi), `0x01,0x4f,0x00', $@)`'dnl
+Zdisp_match( addl, %esi, 0,(%edi), `0x01,0x77,0x00', $@)`'dnl
+Zdisp_match( sbbl, 0,(%edx), %eax, `0x1b,0x42,0x00', $@)`'dnl
+Zdisp_match( sbbl, 0,(%edx), %esi, `0x1b,0x72,0x00', $@)`'dnl
+Zdisp_match( subl, %ecx, 0,(%edi), `0x29,0x4f,0x00', $@)`'dnl
+Zdisp_match( movzbl, 0,(%eax,%ebp), %eax, `0x0f,0xb6,0x44,0x28,0x00', $@)`'dnl
+Zdisp_match( movzbl, 0,(%ecx,%edi), %edi, `0x0f,0xb6,0x7c,0x39,0x00', $@)`'dnl
+Zdisp_match( adc, 0,(%ebx,%ecx,4), %eax, `0x13,0x44,0x8b,0x00', $@)`'dnl
+Zdisp_match( sbb, 0,(%ebx,%ecx,4), %eax, `0x1b,0x44,0x8b,0x00', $@)`'dnl
+')
+define(Zdisp_2,`dnl
+Zdisp_match( movl, %eax, 0,(%edi), `0x89,0x47,0x00', $@)`'dnl
+Zdisp_match( movl, %ebx, 0,(%edi), `0x89,0x5f,0x00', $@)`'dnl
+Zdisp_match( movl, %esi, 0,(%edi), `0x89,0x77,0x00', $@)`'dnl
+Zdisp_match( movl, 0,(%ebx), %eax, `0x8b,0x43,0x00', $@)`'dnl
+Zdisp_match( movl, 0,(%ebx), %esi, `0x8b,0x73,0x00', $@)`'dnl
+Zdisp_match( movl, 0,(%edx), %eax, `0x8b,0x42,0x00', $@)`'dnl
+Zdisp_match( movl, 0,(%esi), %eax, `0x8b,0x46,0x00', $@)`'dnl
+Zdisp_match( movl, 0,(%esi,%ecx,4), %eax, `0x8b,0x44,0x8e,0x00', $@)`'dnl
+Zdisp_match( mov, 0,(%esi,%ecx,4), %eax, `0x8b,0x44,0x8e,0x00', $@)`'dnl
+Zdisp_match( mov, %eax, 0,(%edi,%ecx,4), `0x89,0x44,0x8f,0x00', $@)`'dnl
+')
+define(Zdisp_3,`dnl
+Zdisp_match( movq, 0,(%eax,%ecx,8), %mm0, `0x0f,0x6f,0x44,0xc8,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%ebx,%eax,4), %mm0, `0x0f,0x6f,0x44,0x83,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%ebx,%eax,4), %mm2, `0x0f,0x6f,0x54,0x83,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%ebx,%ecx,4), %mm0, `0x0f,0x6f,0x44,0x8b,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%edx), %mm0, `0x0f,0x6f,0x42,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%esi), %mm0, `0x0f,0x6f,0x46,0x00', $@)`'dnl
+Zdisp_match( movq, %mm0, 0,(%edi), `0x0f,0x7f,0x47,0x00', $@)`'dnl
+Zdisp_match( movq, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7f,0x54,0x81,0x00', $@)`'dnl
+Zdisp_match( movq, %mm2, 0,(%edx,%eax,4), `0x0f,0x7f,0x54,0x82,0x00', $@)`'dnl
+Zdisp_match( movq, %mm0, 0,(%edx,%ecx,8), `0x0f,0x7f,0x44,0xca,0x00', $@)`'dnl
+')
+define(Zdisp_4,`dnl
+Zdisp_match( movd, 0,(%eax,%ecx,4), %mm0, `0x0f,0x6e,0x44,0x88,0x00', $@)`'dnl
+Zdisp_match( movd, 0,(%eax,%ecx,8), %mm1, `0x0f,0x6e,0x4c,0xc8,0x00', $@)`'dnl
+Zdisp_match( movd, 0,(%edx,%ecx,8), %mm0, `0x0f,0x6e,0x44,0xca,0x00', $@)`'dnl
+Zdisp_match( movd, %mm0, 0,(%eax,%ecx,4), `0x0f,0x7e,0x44,0x88,0x00', $@)`'dnl
+Zdisp_match( movd, %mm0, 0,(%ecx,%eax,4), `0x0f,0x7e,0x44,0x81,0x00', $@)`'dnl
+Zdisp_match( movd, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7e,0x54,0x81,0x00', $@)`'dnl
+Zdisp_match( movd, %mm0, 0,(%edx,%ecx,4), `0x0f,0x7e,0x44,0x8a,0x00', $@)`'dnl
+')
+
+define(Zdisp_match,
+m4_assert_numargs(9)
+`ifelse(eval(m4_stringequal_p(`$1',`$6')
+ && m4_stringequal_p(`$2',0)
+ && m4_stringequal_p(`$3',`$8')
+ && m4_stringequal_p(`$4',`$9')),1,
+`define(`Zdisp_found',1)dnl
+ifelse(eval(`$7'),0,
+` .byte $5 C `$1 0$3, $4'',
+` $6 $7$8, $9')',
+
+`ifelse(eval(m4_stringequal_p(`$1',`$6')
+ && m4_stringequal_p(`$2',`$7')
+ && m4_stringequal_p(`$3',0)
+ && m4_stringequal_p(`$4',`$9')),1,
+`define(`Zdisp_found',1)dnl
+ifelse(eval(`$8'),0,
+` .byte $5 C `$1 $2, 0$4'',
+` $6 $7, $8$9')')')')
+
+
+dnl Usage: shldl(count,src,dst)
+dnl shrdl(count,src,dst)
+dnl shldw(count,src,dst)
+dnl shrdw(count,src,dst)
+dnl
+dnl Generate a double-shift instruction, possibly omitting a %cl count
+dnl parameter if that's what the assembler requires, as indicated by
+dnl WANT_SHLDL_CL in config.m4. For example,
+dnl
+dnl shldl( %cl, %eax, %ebx)
+dnl
+dnl turns into either
+dnl
+dnl shldl %cl, %eax, %ebx
+dnl or
+dnl shldl %eax, %ebx
+dnl
+dnl Immediate counts are always passed through unchanged. For example,
+dnl
+dnl shrdl( $2, %esi, %edi)
+dnl becomes
+dnl shrdl $2, %esi, %edi
+dnl
+dnl
+dnl If you forget to use the macro form "shldl( ...)" and instead write
+dnl just a plain "shldl ...", an error results. This ensures the necessary
+dnl variant treatment of %cl isn't accidentally bypassed.
+
+define(define_shd_instruction,
+m4_assert_numargs(1)
+`define($1,
+m4_instruction_wrapper()
+m4_assert_numargs(3)
+`shd_instruction'(m4_doublequote($`'0),m4_doublequote($`'1),dnl
+m4_doublequote($`'2),m4_doublequote($`'3)))')
+
+dnl Effectively: define(shldl,`shd_instruction(`$0',`$1',`$2',`$3')') etc
+define_shd_instruction(shldl)
+define_shd_instruction(shrdl)
+define_shd_instruction(shldw)
+define_shd_instruction(shrdw)
+
+dnl Called: shd_instruction(op,count,src,dst)
+define(shd_instruction,
+m4_assert_numargs(4)
+m4_assert_defined(`WANT_SHLDL_CL')
+`ifelse(eval(m4_stringequal_p(`$2',`%cl') && !WANT_SHLDL_CL),1,
+``$1' `$3', `$4'',
+``$1' `$2', `$3', `$4'')')
+
+
+dnl Usage: ASSERT([cond][,instructions])
+dnl
+dnl If WANT_ASSERT is 1, output the given instructions and expect the given
+dnl flags condition to then be satisfied. For example,
+dnl
+dnl ASSERT(ne, `cmpl %eax, %ebx')
+dnl
+dnl The instructions can be omitted to just assert a flags condition with
+dnl no extra calculation. For example,
+dnl
+dnl ASSERT(nc)
+dnl
+dnl When `instructions' is not empty, a pushf/popf is added to preserve the
+dnl flags, but the instructions themselves must preserve any registers that
+dnl matter. FRAME is adjusted for the push and pop, so the instructions
+dnl given can use defframe() stack variables.
+dnl
+dnl The condition can be omitted to just output the given instructions when
+dnl assertion checking is wanted. In this case the pushf/popf is omitted.
+dnl For example,
+dnl
+dnl ASSERT(, `movl %eax, VAR_KEEPVAL')
+
+define(ASSERT,
+m4_assert_numargs_range(1,2)
+m4_assert_defined(`WANT_ASSERT')
+`ifelse(WANT_ASSERT,1,
+`ifelse(`$1',,
+ `$2',
+ `C ASSERT
+ifelse(`$2',,,` pushf ifdef(`FRAME',`FRAME_pushl()')')
+ $2
+ j`$1' L(ASSERT_ok`'ASSERT_counter)
+ ud2 C assertion failed
+L(ASSERT_ok`'ASSERT_counter):
+ifelse(`$2',,,` popf ifdef(`FRAME',`FRAME_popl()')')
+define(`ASSERT_counter',incr(ASSERT_counter))')')')
+
+define(ASSERT_counter,1)
+
+
+dnl Usage: movl_text_address(label,register)
+dnl
+dnl Get the address of a text segment label, using either a plain movl or a
+dnl position-independent calculation, as necessary. For example,
+dnl
+dnl movl_code_address(L(foo),%eax)
+dnl
+dnl This macro is only meant for use in ASSERT()s or when testing, since
+dnl the PIC sequence it generates will want to be done with a ret balancing
+dnl the call on CPUs with return address branch prediction.
+dnl
+dnl The addl generated here has a backward reference to the label, and so
+dnl won't suffer from the two forwards references bug in old gas (described
+dnl in mpn/x86/README).
+
+define(movl_text_address,
+m4_assert_numargs(2)
+`ifdef(`PIC',
+ `call L(movl_text_address_`'movl_text_address_counter)
+L(movl_text_address_`'movl_text_address_counter):
+ popl $2 C %eip
+ addl `$'$1-L(movl_text_address_`'movl_text_address_counter), $2
+define(`movl_text_address_counter',incr(movl_text_address_counter))',
+ `movl `$'$1, $2')')
+
+define(movl_text_address_counter,1)
+
+
+dnl Usage: notl_or_xorl_GMP_NUMB_MASK(reg)
+dnl
+dnl Expand to either "notl `reg'" or "xorl $GMP_NUMB_BITS,`reg'" as
+dnl appropriate for nails in use or not.
+
+define(notl_or_xorl_GMP_NUMB_MASK,
+m4_assert_numargs(1)
+`ifelse(GMP_NAIL_BITS,0,
+`notl `$1'',
+`xorl $GMP_NUMB_MASK, `$1'')')
+
+
+dnl Usage LEA(symbol,reg)
+dnl Usage LEAL(symbol_local_to_file,reg)
+
+define(`LEA',
+m4_assert_numargs(2)
+`ifdef(`PIC',`dnl
+ifelse(index(defn(`load_eip'), `$2'),-1,
+`m4append(`load_eip',
+` TEXT
+ ALIGN(16)
+L(movl_eip_`'substr($2,1)):
+ movl (%esp), $2
+ ret_internal
+')')dnl
+ call L(movl_eip_`'substr($2,1))
+ addl $_GLOBAL_OFFSET_TABLE_, $2
+ movl $1@GOT($2), $2
+',`
+ movl `$'$1, $2
+')')
+
+define(`LEAL',
+m4_assert_numargs(2)
+`ifdef(`PIC',`dnl
+ifelse(index(defn(`load_eip'), `$2'),-1,
+`m4append(`load_eip',
+` TEXT
+ ALIGN(16)
+L(movl_eip_`'substr($2,1)):
+ movl (%esp), $2
+ ret_internal
+')')dnl
+ call L(movl_eip_`'substr($2,1))
+ addl $_GLOBAL_OFFSET_TABLE_, $2
+ leal $1@GOTOFF($2), $2
+',`
+ movl `$'$1, $2
+')')
+
+dnl ASM_END
+
+define(`ASM_END',`load_eip')
+
+define(`load_eip', `') dnl updated in LEA/LEAL
+
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+ `RODATA
+ ALIGN(ifelse($#,1,2,$2))
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1)
+` SIZE(`$1',.-`$1')')
+
+dnl Usage: CALL(funcname)
+dnl
+
+define(`CALL',
+m4_assert_numargs(1)
+`ifdef(`PIC',
+ `call GSYM_PREFIX`'$1@PLT',
+ `call GSYM_PREFIX`'$1')')
+
+ifdef(`PIC',
+`define(`PIC_WITH_EBX')',
+`undefine(`PIC_WITH_EBX')')
+
+divert`'dnl
diff --git a/vendor/gmp-6.3.0/mpn/x86/zn1/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/zn1/gmp-mparam.h
new file mode 100644
index 0000000..8e6c052
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/zn1/gmp-mparam.h
@@ -0,0 +1,220 @@
+/* AMD zn1/32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3700-4300 MHz Pinnacle Ridge */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-21, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1 /* 14.00% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD 4
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 22
+
+#define DIV_1_VS_MUL_1_PERCENT 248
+
+#define MUL_TOOM22_THRESHOLD 28
+#define MUL_TOOM33_THRESHOLD 91
+#define MUL_TOOM44_THRESHOLD 137
+#define MUL_TOOM6H_THRESHOLD 222
+#define MUL_TOOM8H_THRESHOLD 454
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 85
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 103
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 88
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 105
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 63
+#define SQR_TOOM3_THRESHOLD 98
+#define SQR_TOOM4_THRESHOLD 172
+#define SQR_TOOM6_THRESHOLD 286
+#define SQR_TOOM8_THRESHOLD 478
+
+#define MULMID_TOOM42_THRESHOLD 64
+
+#define MULMOD_BNM1_THRESHOLD 21
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define MUL_FFT_MODF_THRESHOLD 606 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 606, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 15, 5}, { 31, 6}, { 27, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 43, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \
+ { 95,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 127,10}, \
+ { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
+ { 159,11}, { 95,10}, { 191,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543, 8}, { 1087,11}, { 159,10}, { 319, 9}, \
+ { 639,10}, { 335, 9}, { 671,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 399,11}, { 223,12}, \
+ { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \
+ { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \
+ { 671, 9}, { 1343,11}, { 351,12}, { 191,11}, \
+ { 383,10}, { 799,11}, { 415,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \
+ { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \
+ { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \
+ { 1599,11}, { 863,10}, { 1727,12}, { 447,11}, \
+ { 959,10}, { 1919,11}, { 991,13}, { 255,12}, \
+ { 511,11}, { 1087,12}, { 575,11}, { 1215,10}, \
+ { 2431,12}, { 639,11}, { 1343,12}, { 703,11}, \
+ { 1471,10}, { 2943,13}, { 383,12}, { 767,11}, \
+ { 1599,12}, { 831,11}, { 1727,10}, { 3455,12}, \
+ { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \
+ { 1087,11}, { 2239,12}, { 1215,11}, { 2431,13}, \
+ { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \
+ { 1727,11}, { 3455,13}, { 895,12}, { 1983,14}, \
+ { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \
+ { 2495,13}, { 1279,12}, { 2623,13}, { 1407,12}, \
+ { 2943,14}, { 767,13}, { 1663,12}, { 3455,13}, \
+ { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \
+ { 2175,12}, { 4479,13}, { 2431,14}, { 1279,13}, \
+ { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \
+ { 1791,13}, { 3967,12}, { 7935,11}, { 15871,15}, \
+ { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \
+ { 4991,12}, { 9983,14}, { 2815,13}, { 5887,15}, \
+ { 1535,14}, { 3839,13}, { 7935,12}, { 15871,16} }
+#define MUL_FFT_TABLE3_SIZE 172
+#define MUL_FFT_THRESHOLD 5760
+
+#define SQR_FFT_MODF_THRESHOLD 464 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 464, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 25, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \
+ { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
+ { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
+ { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
+ { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \
+ { 159,11}, { 95,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \
+ { 287, 9}, { 575,11}, { 159, 9}, { 639,10}, \
+ { 335, 9}, { 671,10}, { 351, 9}, { 703,11}, \
+ { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \
+ { 799,10}, { 415,12}, { 127,11}, { 255,10}, \
+ { 543,11}, { 287,10}, { 607,11}, { 319,10}, \
+ { 671,11}, { 351,10}, { 703,12}, { 191,11}, \
+ { 383,10}, { 799,11}, { 415,10}, { 831,13}, \
+ { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \
+ { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \
+ { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \
+ { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \
+ { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \
+ { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \
+ { 767,11}, { 1599,12}, { 831,11}, { 1727,10}, \
+ { 3455,12}, { 959,11}, { 1919,14}, { 255,13}, \
+ { 511,12}, { 1087,11}, { 2239,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \
+ { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \
+ { 1919,14}, { 511,13}, { 1023,12}, { 2239,13}, \
+ { 1151,12}, { 2431,13}, { 1279,12}, { 2623,13}, \
+ { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \
+ { 3455,13}, { 1919,12}, { 3839,15}, { 511,14}, \
+ { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \
+ { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \
+ { 3455,14}, { 1791,13}, { 3839,12}, { 7679,13}, \
+ { 3967,12}, { 7935,15}, { 1023,14}, { 2047,13}, \
+ { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \
+ { 2815,13}, { 5887,15}, { 1535,14}, { 3839,13}, \
+ { 7935,16} }
+#define SQR_FFT_TABLE3_SIZE 173
+#define SQR_FFT_THRESHOLD 4736
+
+#define MULLO_BASECASE_THRESHOLD 3
+#define MULLO_DC_THRESHOLD 60
+#define MULLO_MUL_N_THRESHOLD 11278
+#define SQRLO_BASECASE_THRESHOLD 8
+#define SQRLO_DC_THRESHOLD 161
+#define SQRLO_SQR_THRESHOLD 9335
+
+#define DC_DIV_QR_THRESHOLD 71
+#define DC_DIVAPPR_Q_THRESHOLD 206
+#define DC_BDIV_QR_THRESHOLD 63
+#define DC_BDIV_Q_THRESHOLD 126
+
+#define INV_MULMOD_BNM1_THRESHOLD 78
+#define INV_NEWTON_THRESHOLD 274
+#define INV_APPR_THRESHOLD 228
+
+#define BINV_NEWTON_THRESHOLD 274
+#define REDC_1_TO_REDC_N_THRESHOLD 71
+
+#define MU_DIV_QR_THRESHOLD 1652
+#define MU_DIVAPPR_Q_THRESHOLD 1718
+#define MUPI_DIV_QR_THRESHOLD 122
+#define MU_BDIV_QR_THRESHOLD 1470
+#define MU_BDIV_Q_THRESHOLD 1589
+
+#define POWM_SEC_TABLE 3,28,54,386,1337
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 19
+#define SET_STR_DC_THRESHOLD 262
+#define SET_STR_PRECOMPUTE_THRESHOLD 558
+
+#define FAC_DSC_THRESHOLD 109
+#define FAC_ODD_THRESHOLD 39
+
+#define MATRIX22_STRASSEN_THRESHOLD 21
+#define HGCD2_DIV1_METHOD 1 /* 7.49% faster than 3 */
+#define HGCD_THRESHOLD 74
+#define HGCD_APPR_THRESHOLD 70
+#define HGCD_REDUCE_THRESHOLD 3389
+#define GCD_DC_THRESHOLD 440
+#define GCDEXT_DC_THRESHOLD 327
+#define JACOBI_BASE_METHOD 1 /* 11.98% faster than 3 */
+
+/* Tuneup completed successfully, took 36916 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86/zn2/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/zn2/gmp-mparam.h
new file mode 100644
index 0000000..152e6b7
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/zn2/gmp-mparam.h
@@ -0,0 +1,226 @@
+/* AMD zn2/32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3600-4400 MHz Matisse */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 15
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1 /* 4.78% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD 3
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 7
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 23
+
+#define DIV_1_VS_MUL_1_PERCENT 274
+
+#define MUL_TOOM22_THRESHOLD 24
+#define MUL_TOOM33_THRESHOLD 85
+#define MUL_TOOM44_THRESHOLD 166
+#define MUL_TOOM6H_THRESHOLD 290
+#define MUL_TOOM8H_THRESHOLD 430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 26
+#define SQR_TOOM3_THRESHOLD 153
+#define SQR_TOOM4_THRESHOLD 214
+#define SQR_TOOM6_THRESHOLD 318
+#define SQR_TOOM8_THRESHOLD 478
+
+#define MULMID_TOOM42_THRESHOLD 48
+
+#define MULMOD_BNM1_THRESHOLD 18
+#define SQRMOD_BNM1_THRESHOLD 24
+
+#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 444, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
+ { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
+ { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 127,10}, \
+ { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \
+ { 287, 8}, { 575,10}, { 159,11}, { 95,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
+ { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \
+ { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \
+ { 671, 8}, { 1343,10}, { 351, 9}, { 703,10}, \
+ { 367, 9}, { 735,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 415,11}, { 223,10}, { 447,12}, \
+ { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \
+ { 287,10}, { 607,11}, { 319,10}, { 671, 9}, \
+ { 1343,11}, { 351,10}, { 735,12}, { 191,11}, \
+ { 383,10}, { 767,11}, { 415,10}, { 831,11}, \
+ { 447,13}, { 127,12}, { 255,11}, { 543,10}, \
+ { 1087,11}, { 607,10}, { 1215,12}, { 319,11}, \
+ { 671,10}, { 1343,11}, { 735,10}, { 1471, 9}, \
+ { 2943,12}, { 383,11}, { 799,10}, { 1599,11}, \
+ { 863,12}, { 447,11}, { 959,10}, { 1919,13}, \
+ { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
+ { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \
+ { 703,11}, { 1471,10}, { 2943,13}, { 383,12}, \
+ { 767,11}, { 1599,12}, { 831,11}, { 1727,10}, \
+ { 3455,12}, { 959,11}, { 1919,10}, { 3839,14}, \
+ { 255,13}, { 511,12}, { 1215,11}, { 2431,13}, \
+ { 639,12}, { 1471,11}, { 2943,10}, { 5887,13}, \
+ { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \
+ { 1919,11}, { 3839,14}, { 511,13}, { 1023,12}, \
+ { 2111,13}, { 1151,12}, { 2431,13}, { 1407,12}, \
+ { 2943,11}, { 5887,14}, { 767,13}, { 1663,12}, \
+ { 3455,13}, { 1919,12}, { 3839,15}, { 511,14}, \
+ { 1023,13}, { 2431,14}, { 1279,13}, { 2943,12}, \
+ { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \
+ { 3839,12}, { 7679,13}, { 3967,12}, { 7935,11}, \
+ { 15871,15}, { 1023,14}, { 2047,13}, { 4351,14}, \
+ { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \
+ { 5887,15}, { 1535,14}, { 3839,13}, { 7935,12}, \
+ { 15871,16} }
+#define MUL_FFT_TABLE3_SIZE 189
+#define MUL_FFT_THRESHOLD 4736
+
+#define SQR_FFT_MODF_THRESHOLD 404 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 404, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \
+ { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
+ { 47,10}, { 15, 9}, { 31, 8}, { 63, 9}, \
+ { 39, 8}, { 79, 9}, { 47,10}, { 31, 9}, \
+ { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \
+ { 127,10}, { 95,11}, { 63,10}, { 127, 9}, \
+ { 255, 8}, { 511, 9}, { 271,10}, { 143, 9}, \
+ { 287, 8}, { 607, 7}, { 1215,11}, { 95,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
+ { 271, 9}, { 543, 8}, { 1087, 9}, { 607, 8}, \
+ { 1215,11}, { 159, 9}, { 671, 8}, { 1343,10}, \
+ { 351, 9}, { 735, 8}, { 1471,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 415,11}, { 223,12}, \
+ { 127,11}, { 255,10}, { 543, 9}, { 1087,10}, \
+ { 607, 9}, { 1215, 8}, { 2431,10}, { 671, 9}, \
+ { 1343,10}, { 735, 9}, { 1471,12}, { 191,11}, \
+ { 383,10}, { 767,11}, { 415,10}, { 831,13}, \
+ { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \
+ { 607,10}, { 1215, 9}, { 2431,11}, { 671,10}, \
+ { 1343,11}, { 735,10}, { 1471, 9}, { 2943,12}, \
+ { 383,11}, { 863,12}, { 447,11}, { 959,10}, \
+ { 1919,13}, { 255,12}, { 511,11}, { 1087,12}, \
+ { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1471,10}, { 2943, 9}, \
+ { 5887,12}, { 767,11}, { 1599,12}, { 831,11}, \
+ { 1727,12}, { 959,11}, { 1919,10}, { 3839,14}, \
+ { 255,13}, { 511,12}, { 1215,11}, { 2431,13}, \
+ { 639,12}, { 1471,11}, { 2943,10}, { 5887,13}, \
+ { 767,12}, { 1727,13}, { 895,12}, { 1919,11}, \
+ { 3839,14}, { 511,13}, { 1023,12}, { 2111,13}, \
+ { 1151,12}, { 2431,13}, { 1279,12}, { 2623,13}, \
+ { 1407,12}, { 2943,11}, { 5887,14}, { 767,13}, \
+ { 1663,12}, { 3455,13}, { 1919,12}, { 3839,15}, \
+ { 511,14}, { 1023,13}, { 2431,14}, { 1279,13}, \
+ { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \
+ { 1791,13}, { 3839,12}, { 7679,13}, { 3967,12}, \
+ { 7935,11}, { 15871,15}, { 1023,14}, { 2047,13}, \
+ { 4223,14}, { 2303,13}, { 4991,12}, { 9983,14}, \
+ { 2815,13}, { 5887,15}, { 1535,14}, { 3839,13}, \
+ { 7935,12}, { 15871,16} }
+#define SQR_FFT_TABLE3_SIZE 178
+#define SQR_FFT_THRESHOLD 3712
+
+#define MULLO_BASECASE_THRESHOLD 4
+#define MULLO_DC_THRESHOLD 62
+#define MULLO_MUL_N_THRESHOLD 8907
+#define SQRLO_BASECASE_THRESHOLD 8
+#define SQRLO_DC_THRESHOLD 107
+#define SQRLO_SQR_THRESHOLD 6633
+
+#define DC_DIV_QR_THRESHOLD 54
+#define DC_DIVAPPR_Q_THRESHOLD 206
+#define DC_BDIV_QR_THRESHOLD 55
+#define DC_BDIV_Q_THRESHOLD 136
+
+#define INV_MULMOD_BNM1_THRESHOLD 74
+#define INV_NEWTON_THRESHOLD 212
+#define INV_APPR_THRESHOLD 204
+
+#define BINV_NEWTON_THRESHOLD 292
+#define REDC_1_TO_REDC_N_THRESHOLD 67
+
+#define MU_DIV_QR_THRESHOLD 1442
+#define MU_DIVAPPR_Q_THRESHOLD 1528
+#define MUPI_DIV_QR_THRESHOLD 97
+#define MU_BDIV_QR_THRESHOLD 1142
+#define MU_BDIV_Q_THRESHOLD 1470
+
+#define POWM_SEC_TABLE 1,16,96,386,1555
+
+#define GET_STR_DC_THRESHOLD 10
+#define GET_STR_PRECOMPUTE_THRESHOLD 16
+#define SET_STR_DC_THRESHOLD 303
+#define SET_STR_PRECOMPUTE_THRESHOLD 748
+
+#define FAC_DSC_THRESHOLD 141
+#define FAC_ODD_THRESHOLD 55
+
+#define MATRIX22_STRASSEN_THRESHOLD 20
+#define HGCD2_DIV1_METHOD 1 /* 14.03% faster than 3 */
+#define HGCD_THRESHOLD 103
+#define HGCD_APPR_THRESHOLD 127
+#define HGCD_REDUCE_THRESHOLD 3014
+#define GCD_DC_THRESHOLD 396
+#define GCDEXT_DC_THRESHOLD 265
+#define JACOBI_BASE_METHOD 1 /* 47.88% faster than 4 */
+
+/* Tuneup completed successfully, took 29014 seconds */