Diffstat (limited to 'vendor/gmp-6.3.0/mpn/x86/p6')
28 files changed, 5020 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/README b/vendor/gmp-6.3.0/mpn/x86/p6/README new file mode 100644 index 0000000..f19d47b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/README @@ -0,0 +1,125 @@ +Copyright 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + + INTEL P6 MPN SUBROUTINES + + + +This directory contains code optimized for Intel P6 class CPUs, meaning +PentiumPro, Pentium II and Pentium III. The mmx and p3mmx subdirectories +have routines using MMX instructions. + + + +STATUS + +Times for the loops, with all code and data in L1 cache, are as follows. +Some of these might be able to be improved. + + cycles/limb + + mpn_add_n/sub_n 3.7 + + mpn_copyi 0.75 + mpn_copyd 1.75 (or 0.75 if no overlap) + + mpn_divrem_1 39.0 + mpn_mod_1 21.5 + mpn_divexact_by3 8.5 + + mpn_mul_1 5.5 + mpn_addmul/submul_1 6.35 + + mpn_l/rshift 2.5 + + mpn_mul_basecase 8.2 cycles/crossproduct (approx) + mpn_sqr_basecase 4.0 cycles/crossproduct (approx) + or 7.75 cycles/triangleproduct (approx) + +Pentium II and III have MMX and get the following improvements. + + mpn_divrem_1 25.0 integer part, 17.5 fractional part + + mpn_l/rshift 1.75 + + + + +NOTES + +Write-allocate L1 data cache means prefetching of destinations is unnecessary. + +Mispredicted branches have a penalty of between 9 and 15 cycles, and even up +to 26 cycles depending how far speculative execution has gone. The 9 cycle +minimum penalty comes from the issue pipeline being 9 stages. + +A copy with rep movs seems to copy 16 bytes at a time, since speeds for 4, +5, 6 or 7 limb operations are all the same. The 0.75 cycles/limb would be 3 +cycles per 16 byte block. + + + + +CODING + +Instructions in general code have been shown grouped if they can execute +together, which means up to three instructions with no successive +dependencies, and with only the first being a multiple micro-op. + +P6 has out-of-order execution, so the groupings are really only showing +dependent paths where some shuffling might allow some latencies to be +hidden. + + + + +REFERENCES + +"Intel Architecture Optimization Reference Manual", 1999, revision 001 dated +02/99, order number 245127 (order number 730795-001 is in the document too). +Available on-line: + + http://download.intel.com/design/PentiumII/manuals/245127.htm + +"Intel Architecture Optimization Manual", 1997, order number 242816. This +is an older document mostly about P5 and not as good as the above. 
+Available on-line: + + http://download.intel.com/design/PentiumII/manuals/242816.htm + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/aors_n.asm b/vendor/gmp-6.3.0/mpn/x86/p6/aors_n.asm new file mode 100644 index 0000000..df51c2e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/aors_n.asm @@ -0,0 +1,156 @@ +dnl Intel P6 mpn_add_n/mpn_sub_n -- mpn add or subtract. + +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO: +C * Avoid indexed addressing, it makes us stall on the two-ported register +C file. + +C cycles/limb +C P6 model 0-8,10-12 3.17 +C P6 model 9 (Banias) 2.15 +C P6 model 13 (Dothan) 2.25 + + +define(`rp', `%edi') +define(`up', `%esi') +define(`vp', `%ebx') +define(`n', `%ecx') + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() + + TEXT + ALIGN(16) + +PROLOGUE(func) + xor %edx, %edx +L(start): + push %edi + push %esi + push %ebx + + mov 16(%esp), rp + mov 20(%esp), up + mov 24(%esp), vp + mov 28(%esp), n + + lea (up,n,4), up + lea (vp,n,4), vp + lea (rp,n,4), rp + + neg n + mov n, %eax + and $-8, n + and $7, %eax + shl $2, %eax C 4x +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + lea L(ent) (%eax,%eax,2), %eax C 12x +') + + shr %edx C set cy flag + jmp *%eax + +ifdef(`PIC',` +L(pic_calc): + C See mpn/x86/README about old gas bugs + lea (%eax,%eax,2), %eax + add $L(ent)-L(here), %eax + add (%esp), %eax + ret_internal +') + +L(end): + sbb %eax, %eax + neg %eax + pop %ebx + pop %esi + pop %edi + ret + + ALIGN(16) +L(top): + jecxz L(end) +L(ent): +Zdisp( mov, 0,(up,n,4), %eax) +Zdisp( ADCSBB, 0,(vp,n,4), %eax) +Zdisp( mov, %eax, 0,(rp,n,4)) + + mov 4(up,n,4), %edx + ADCSBB 4(vp,n,4), %edx + mov %edx, 4(rp,n,4) + + mov 8(up,n,4), %eax + ADCSBB 8(vp,n,4), %eax + mov %eax, 8(rp,n,4) + + mov 12(up,n,4), %edx + ADCSBB 12(vp,n,4), %edx + mov %edx, 12(rp,n,4) + + mov 16(up,n,4), %eax + ADCSBB 16(vp,n,4), %eax + mov %eax, 16(rp,n,4) + + mov 20(up,n,4), %edx + ADCSBB 20(vp,n,4), %edx + mov %edx, 20(rp,n,4) + + mov 24(up,n,4), %eax + ADCSBB 24(vp,n,4), %eax + mov %eax, 24(rp,n,4) + + mov 28(up,n,4), %edx + ADCSBB 28(vp,n,4), %edx + mov %edx, 28(rp,n,4) + + lea 8(n), n + jmp L(top) + +EPILOGUE() + +PROLOGUE(func_nc) + movl 
20(%esp), %edx + jmp L(start) +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/aorsmul_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/aorsmul_1.asm new file mode 100644 index 0000000..bc8c49c --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/aorsmul_1.asm @@ -0,0 +1,320 @@ +dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. + +dnl Copyright 1999-2002, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5 +C P6 model 0-8,10-12 6.44 +C P6 model 9 (Banias) 6.15 +C P6 model 13 (Dothan) 6.11 +C P4 model 0 (Willamette) +C P4 model 1 (?) +C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C AMD K6 +C AMD K7 +C AMD K8 + + +dnl P6 UNROLL_COUNT cycles/limb +dnl 8 6.7 +dnl 16 6.35 +dnl 32 6.3 +dnl 64 6.3 +dnl Maximum possible with the current code is 64. + +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_addmul_1', ` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + define(M4_function_1c, mpn_addmul_1c) + define(M4_description, add it to) + define(M4_desc_retval, carry) +',`ifdef(`OPERATION_submul_1', ` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + define(M4_function_1c, mpn_submul_1c) + define(M4_description, subtract it from) + define(M4_desc_retval, borrow) +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C +C Calculate src,size multiplied by mult and M4_description dst,size. +C Return the M4_desc_retval limb from the top of the result. +C +C This code is pretty much the same as the K6 code. The unrolled loop is +C the same, but there's just a few scheduling tweaks in the setups and the +C simple loop. +C +C A number of variations have been tried for the unrolled loop, with one or +C two carries, and with loads scheduled earlier, but nothing faster than 6 +C cycles/limb has been found. 
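For reference, the operation these entry points implement can be written in
a few lines of C (an illustrative sketch only, not the optimized routine;
"ref_addmul_1" is a made-up name, and a 32-bit mp_limb_t with a 64-bit
intermediate is assumed, per the prototypes above; submul_1 is analogous
with subtraction):

/* Multiply {src,size} by mult and add it to {dst,size}, returning the
   carry limb from the top of the result. */
mp_limb_t
ref_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t mult)
{
  mp_limb_t carry = 0;
  mp_size_t i;
  for (i = 0; i < size; i++)
    {
      unsigned long long p = (unsigned long long) src[i] * mult
                             + dst[i] + carry;
      dst[i] = (mp_limb_t) p;           /* low limb of the sum */
      carry = (mp_limb_t) (p >> 32);    /* high limb carries on */
    }
  return carry;
}

The unrolled loop below computes exactly this, keeping two carry limbs in
flight (ebx and ecx) to help hide the mul latency.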
+ +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 5) +',` +deflit(UNROLL_THRESHOLD, 5) +') + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(32) + +PROLOGUE(M4_function_1c) + pushl %ebx +deflit(`FRAME',4) + movl PARAM_CARRY, %ebx + jmp L(start_nc) +EPILOGUE() + +PROLOGUE(M4_function_1) + push %ebx +deflit(`FRAME',4) + xorl %ebx, %ebx C initial carry + +L(start_nc): + movl PARAM_SIZE, %ecx + pushl %esi +deflit(`FRAME',8) + + movl PARAM_SRC, %esi + pushl %edi +deflit(`FRAME',12) + + movl PARAM_DST, %edi + pushl %ebp +deflit(`FRAME',16) + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_MULTIPLIER, %ebp + jae L(unroll) + + + C simple loop + C this is offset 0x22, so close enough to aligned +L(simple): + C eax scratch + C ebx carry + C ecx counter + C edx scratch + C esi src + C edi dst + C ebp multiplier + + movl (%esi), %eax + addl $4, %edi + + mull %ebp + + addl %ebx, %eax + adcl $0, %edx + + M4_inst %eax, -4(%edi) + movl %edx, %ebx + + adcl $0, %ebx + decl %ecx + + leal 4(%esi), %esi + jnz L(simple) + + + popl %ebp + popl %edi + + popl %esi + movl %ebx, %eax + + popl %ebx + ret + + + +C------------------------------------------------------------------------------ +C VAR_JUMP holds the computed jump temporarily because there's not enough +C registers when doing the mul for the initial two carry limbs. +C +C The add/adc for the initial carry in %ebx is necessary only for the +C mpn_add/submul_1c entry points. Duplicating the startup code to +C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good +C idea. + +dnl overlapping with parameters already fetched +define(VAR_COUNTER,`PARAM_SIZE') +define(VAR_JUMP, `PARAM_DST') + + C this is offset 0x43, so close enough to aligned +L(unroll): + C eax + C ebx initial carry + C ecx size + C edx + C esi src + C edi dst + C ebp + + movl %ecx, %edx + decl %ecx + + subl $2, %edx + negl %ecx + + shrl $UNROLL_LOG2, %edx + andl $UNROLL_MASK, %ecx + + movl %edx, VAR_COUNTER + movl %ecx, %edx + + C 15 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + shll $4, %edx + negl %ecx + + leal L(entry) (%edx,%ecx,1), %edx +') + movl (%esi), %eax C src low limb + + movl %edx, VAR_JUMP + leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi + + mull %ebp + + addl %ebx, %eax C initial carry (from _1c) + adcl $0, %edx + + movl %edx, %ebx C high carry + leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi + + movl VAR_JUMP, %edx + testl $1, %ecx + movl %eax, %ecx C low carry + + cmovnz( %ebx, %ecx) C high,low carry other way around + cmovnz( %eax, %ebx) + + jmp *%edx + + +ifdef(`PIC',` +L(pic_calc): + shll $4, %edx + negl %ecx + + C See mpn/x86/README about old gas bugs + leal (%edx,%ecx,1), %edx + addl $L(entry)-L(here), %edx + + addl (%esp), %edx + + ret_internal +') + + +C ----------------------------------------------------------- + ALIGN(32) +L(top): +deflit(`FRAME',16) + C eax scratch + C ebx carry hi + C ecx carry lo + C edx scratch + C esi src + C edi dst + C ebp multiplier + C + C VAR_COUNTER loop counter + C + C 15 code bytes per limb + + addl $UNROLL_BYTES, %edi + +L(entry): +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%esi), %eax) + mull %ebp +Zdisp( M4_inst,%ecx, disp0,(%edi)) + adcl %eax, %ebx + movl %edx, %ecx + adcl $0, %ecx + + movl disp1(%esi), %eax + mull %ebp + M4_inst %ebx, 
disp1(%edi) + adcl %eax, %ecx + movl %edx, %ebx + adcl $0, %ebx +') + + decl VAR_COUNTER + leal UNROLL_BYTES(%esi), %esi + + jns L(top) + + +deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))) + + M4_inst %ecx, disp0(%edi) + movl %ebx, %eax + + popl %ebp + popl %edi + + popl %esi + popl %ebx + adcl $0, %eax + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm new file mode 100644 index 0000000..a0a9d90 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm @@ -0,0 +1,287 @@ +dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder. + +dnl Rearranged from mpn/x86/p6/dive_1.asm by Marco Bodrato. + +dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C odd even divisor +C P6: 10.0 12.0 cycles/limb + +C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) + +C The odd case is basically the same as mpn_modexact_1_odd, just with an +C extra store, and it runs at the same 10 cycles which is the dependent +C chain. +C +C The shifts for the even case aren't on the dependent chain so in principle +C it could run the same too, but nothing running at 10 has been found. +C Perhaps there's too many uops (an extra 4 over the odd case). 
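The inverse used here satisfies d*inv == 1 mod 2^32 for the odd part of d.
The setup at the end of this file (in mpn_bdiv_q_1) starts from an 8-bit
binvert_limb_table lookup and doubles the precision with two Newton steps;
as a C sketch (illustrative; "ref_binvert_limb" is a made-up name, 32-bit
limbs assumed):

/* Compute inv with d*inv == 1 mod 2^32, for odd d.  Each step
   inv = 2*inv - inv*inv*d doubles the number of correct low bits. */
mp_limb_t
ref_binvert_limb (mp_limb_t d)
{
  mp_limb_t inv = binvert_limb_table[(d >> 1) & 0x7F];  /* 8 bits */
  inv = 2 * inv - inv * inv * d;                        /* 16 bits */
  inv = 2 * inv - inv * inv * d;                        /* 32 bits */
  return inv;
}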
+ +defframe(PARAM_SHIFT, 24) +defframe(PARAM_INVERSE,20) +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +deflit(STACK_SPACE, 16) + +dnl re-use parameter space +define(VAR_INVERSE,`PARAM_SRC') + + TEXT + +C mp_limb_t +C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t inverse, int shift) + + ALIGN(16) +PROLOGUE(mpn_pi1_bdiv_q_1) +deflit(`FRAME',0) + + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebx, SAVE_EBX + movl PARAM_SIZE, %ebx + + movl %ebp, SAVE_EBP + movl PARAM_INVERSE, %ebp + + movl PARAM_SHIFT, %ecx C trailing twos + +L(common): + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + leal (%esi,%ebx,4), %esi C src end + + leal (%edi,%ebx,4), %edi C dst end + negl %ebx C -size + + movl (%esi,%ebx,4), %eax C src[0] + + orl %ecx, %ecx + jz L(odd_entry) + + movl %edi, PARAM_DST + movl %ebp, VAR_INVERSE + +L(even): + C eax src[0] + C ebx counter, limbs, negative + C ecx shift + C edx + C esi + C edi + C ebp + + xorl %ebp, %ebp C initial carry bit + xorl %edx, %edx C initial carry limb (for size==1) + + incl %ebx + jz L(even_one) + + movl (%esi,%ebx,4), %edi C src[1] + + shrdl( %cl, %edi, %eax) + + jmp L(even_entry) + + +L(even_top): + C eax scratch + C ebx counter, limbs, negative + C ecx shift + C edx scratch + C esi &src[size] + C edi &dst[size] and scratch + C ebp carry bit + + movl (%esi,%ebx,4), %edi + + mull PARAM_DIVISOR + + movl -4(%esi,%ebx,4), %eax + shrdl( %cl, %edi, %eax) + + subl %ebp, %eax + + sbbl %ebp, %ebp + subl %edx, %eax + + sbbl $0, %ebp + +L(even_entry): + imull VAR_INVERSE, %eax + + movl PARAM_DST, %edi + negl %ebp + + movl %eax, -4(%edi,%ebx,4) + incl %ebx + jnz L(even_top) + + mull PARAM_DIVISOR + + movl -4(%esi), %eax + +L(even_one): + shrl %cl, %eax + movl SAVE_ESI, %esi + + subl %ebp, %eax + movl SAVE_EBP, %ebp + + subl %edx, %eax + movl SAVE_EBX, %ebx + + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi) + movl SAVE_EDI, %edi + addl $STACK_SPACE, %esp + + ret + +C The dependent chain here is +C +C subl %edx, %eax 1 +C imull %ebp, %eax 4 +C mull PARAM_DIVISOR 5 +C ---- +C total 10 +C +C and this is the measured speed. No special scheduling is necessary, out +C of order execution hides the load latency. 
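In C terms the odd-divisor loop below amounts to the following sketch
(illustrative, with forward indexing; the asm keeps the borrow bit and the
carry limb in separate registers, while here they are folded into one value
c, which cannot overflow since high(q*d) <= 2^32 - 2):

/* q[i] = (src[i] - c) * inv mod 2^32, then carry the high half of
   q[i]*d plus the borrow from the subtraction into the next step. */
mp_limb_t c = 0;
mp_size_t i;
for (i = 0; i < size; i++)
  {
    mp_limb_t s = src[i];
    mp_limb_t borrow = (s < c);
    mp_limb_t q = (s - c) * inv;        /* both mod 2^32 */
    dst[i] = q;
    c = (mp_limb_t) (((unsigned long long) q * d) >> 32) + borrow;
  }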
+ +L(odd_top): + C eax scratch (src limb) + C ebx counter, limbs, negative + C ecx carry bit + C edx carry limb, high of last product + C esi &src[size] + C edi &dst[size] + C ebp inverse + + mull PARAM_DIVISOR + + movl (%esi,%ebx,4), %eax + subl %ecx, %eax + + sbbl %ecx, %ecx + subl %edx, %eax + + sbbl $0, %ecx + +L(odd_entry): + imull %ebp, %eax + + movl %eax, (%edi,%ebx,4) + negl %ecx + + incl %ebx + jnz L(odd_top) + + + movl SAVE_ESI, %esi + + movl SAVE_EDI, %edi + + movl SAVE_EBP, %ebp + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() + +C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C + + ALIGN(16) +PROLOGUE(mpn_bdiv_q_1) +deflit(`FRAME',0) + + movl PARAM_DIVISOR, %eax + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebx, SAVE_EBX + movl PARAM_SIZE, %ebx + + bsfl %eax, %ecx C trailing twos + + movl %ebp, SAVE_EBP + + shrl %cl, %eax C d without twos + + movl %eax, %edx + shrl %eax C d/2 without twos + + movl %edx, PARAM_DIVISOR + andl $127, %eax + +ifdef(`PIC',` + LEA( binvert_limb_table, %ebp) + movzbl (%eax,%ebp), %ebp C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %ebp C inv 8 bits +') + + leal (%ebp,%ebp), %eax C 2*inv + + imull %ebp, %ebp C inv*inv + imull %edx, %ebp C inv*inv*d + + subl %ebp, %eax C inv = 2*inv - inv*inv*d + leal (%eax,%eax), %ebp C 2*inv + + imull %eax, %eax C inv*inv + imull %edx, %eax C inv*inv*d + + subl %eax, %ebp C inv = 2*inv - inv*inv*d + + jmp L(common) + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/copyd.asm b/vendor/gmp-6.3.0/mpn/x86/p6/copyd.asm new file mode 100644 index 0000000..1be7636 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/copyd.asm @@ -0,0 +1,178 @@ +dnl Intel P6 mpn_copyd -- copy limb vector backwards. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P6: 1.75 cycles/limb, or 0.75 if no overlap + + +C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C An explicit loop is used because a decrementing rep movsl is a bit slow at +C 2.4 c/l. That rep movsl also has about a 40 cycle startup time, and the +C code here stands a chance of being faster if the branches predict well. +C +C The slightly strange loop form seems necessary for the claimed speed. +C Maybe load/store ordering affects it. 
+C +C The source and destination are checked to see if they're actually +C overlapping, since it might be possible to use an incrementing rep movsl +C at 0.75 c/l. (It doesn't suffer the bad startup time of the decrementing +C version.) +C +C Enhancements: +C +C Top speed for an all-integer copy is probably 1.0 c/l, being one load and +C one store each cycle. Unrolling the loop below would approach 1.0, but +C it'd be good to know why something like store/load/subl + store/load/jnz +C doesn't already run at 1.0 c/l. It looks like it should decode in 2 +C cycles, but doesn't run that way. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-using parameter space +define(SAVE_ESI,`PARAM_SIZE') +define(SAVE_EDI,`PARAM_SRC') + + TEXT + ALIGN(16) + +PROLOGUE(mpn_copyd) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + subl $1, %ecx + jb L(zero) + + movl (%esi,%ecx,4), %eax C src[size-1] + jz L(one) + + movl -4(%esi,%ecx,4), %edx C src[size-2] + subl $2, %ecx + jbe L(done_loop) C 2 or 3 limbs only + + + C The usual overlap is + C + C high low + C +------------------+ + C | dst| + C +------------------+ + C +------------------+ + C | src| + C +------------------+ + C + C We can use an incrementing copy in the following circumstances. + C + C src+4*size<=dst, since then the regions are disjoint + C + C src==dst, clearly (though this shouldn't occur normally) + C + C src>dst, since in that case it's a requirement of the + C parameters that src>=dst+size*4, and hence the + C regions are disjoint + C + + leal (%edi,%ecx,4), %edx + cmpl %edi, %esi + jae L(use_movsl) C src >= dst + + cmpl %edi, %edx + movl 4(%esi,%ecx,4), %edx C src[size-2] again + jbe L(use_movsl) C src+4*size <= dst + + +L(top): + C eax prev high limb + C ebx + C ecx counter, size-3 down to 0 or -1, inclusive, by 2s + C edx prev low limb + C esi src + C edi dst + C ebp + + movl %eax, 8(%edi,%ecx,4) + movl (%esi,%ecx,4), %eax + + movl %edx, 4(%edi,%ecx,4) + movl -4(%esi,%ecx,4), %edx + + subl $2, %ecx + jnbe L(top) + + +L(done_loop): + movl %eax, 8(%edi,%ecx,4) + movl %edx, 4(%edi,%ecx,4) + + C copy low limb (needed if size was odd, but will already have been + C done in the loop if size was even) + movl (%esi), %eax +L(one): + movl %eax, (%edi) + movl SAVE_EDI, %edi + movl SAVE_ESI, %esi + + ret + + +L(use_movsl): + C eax + C ebx + C ecx size-3 + C edx + C esi src + C edi dst + C ebp + + addl $3, %ecx + + cld C better safe than sorry, see mpn/x86/README + + rep + movsl + +L(zero): + movl SAVE_ESI, %esi + movl SAVE_EDI, %edi + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/dive_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/dive_1.asm new file mode 100644 index 0000000..7d61a18 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/dive_1.asm @@ -0,0 +1,267 @@ +dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder. + +dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C odd even divisor +C P6: 10.0 12.0 cycles/limb + + +C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C +C The odd case is basically the same as mpn_modexact_1_odd, just with an +C extra store, and it runs at the same 10 cycles which is the dependent +C chain. +C +C The shifts for the even case aren't on the dependent chain so in principle +C it could run the same too, but nothing running at 10 has been found. +C Perhaps there's too many uops (an extra 4 over the odd case). + +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +defframe(VAR_INVERSE, -20) +deflit(STACK_SPACE, 20) + + TEXT + + ALIGN(16) +PROLOGUE(mpn_divexact_1) +deflit(`FRAME',0) + + movl PARAM_DIVISOR, %eax + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebx, SAVE_EBX + movl PARAM_SIZE, %ebx + + bsfl %eax, %ecx C trailing twos + + movl %ebp, SAVE_EBP + + shrl %cl, %eax C d without twos + + movl %eax, %edx + shrl %eax C d/2 without twos + + movl %edx, PARAM_DIVISOR + andl $127, %eax + +ifdef(`PIC',` + LEA( binvert_limb_table, %ebp) + movzbl (%eax,%ebp), %ebp C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %ebp C inv 8 bits +') + + leal (%ebp,%ebp), %eax C 2*inv + + imull %ebp, %ebp C inv*inv + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + leal (%esi,%ebx,4), %esi C src end + + imull PARAM_DIVISOR, %ebp C inv*inv*d + + subl %ebp, %eax C inv = 2*inv - inv*inv*d + leal (%eax,%eax), %ebp C 2*inv + + imull %eax, %eax C inv*inv + + leal (%edi,%ebx,4), %edi C dst end + negl %ebx C -size + + movl %edi, PARAM_DST + + imull PARAM_DIVISOR, %eax C inv*inv*d + + subl %eax, %ebp C inv = 2*inv - inv*inv*d + + ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS + movl PARAM_DIVISOR, %eax + imull %ebp, %eax + cmpl $1, %eax') + + movl %ebp, VAR_INVERSE + movl (%esi,%ebx,4), %eax C src[0] + + orl %ecx, %ecx + jnz L(even) + + C ecx initial carry is zero + jmp L(odd_entry) + + +C The dependent chain here is +C +C subl %edx, %eax 1 +C imull %ebp, %eax 4 +C mull PARAM_DIVISOR 5 +C ---- +C total 10 +C +C and this is the measured speed. No special scheduling is necessary, out +C of order execution hides the load latency. 
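The even path further below (L(even)) strips the twos from d and pre-shifts
the source limbs in pairs with shrdl, then reuses the same
subtract/multiply step on the odd part d0. A C sketch (illustrative;
1 <= shift <= 31 here, since shift==0 takes the odd path, and inv is the
inverse of d0):

/* Exact division by d = d0 << shift, d0 odd: divide the pre-shifted
   source {src,size} >> shift by d0 with the modexact-style loop. */
mp_limb_t c = 0, s;
mp_size_t i;
for (i = 0; i < size - 1; i++)
  {
    s = (src[i] >> shift) | (src[i + 1] << (32 - shift));
    mp_limb_t borrow = (s < c);
    mp_limb_t q = (s - c) * inv;
    dst[i] = q;
    c = (mp_limb_t) (((unsigned long long) q * d0) >> 32) + borrow;
  }
s = src[size - 1] >> shift;             /* top limb has no partner */
dst[size - 1] = (s - c) * inv;          /* exactness means no borrow out */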
+ +L(odd_top): + C eax scratch (src limb) + C ebx counter, limbs, negative + C ecx carry bit + C edx carry limb, high of last product + C esi &src[size] + C edi &dst[size] + C ebp + + mull PARAM_DIVISOR + + movl (%esi,%ebx,4), %eax + subl %ecx, %eax + + sbbl %ecx, %ecx + subl %edx, %eax + + sbbl $0, %ecx + +L(odd_entry): + imull VAR_INVERSE, %eax + + movl %eax, (%edi,%ebx,4) + negl %ecx + + incl %ebx + jnz L(odd_top) + + + movl SAVE_ESI, %esi + + movl SAVE_EDI, %edi + + movl SAVE_EBP, %ebp + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + ret + + +L(even): + C eax src[0] + C ebx counter, limbs, negative + C ecx shift + C edx + C esi + C edi + C ebp + + xorl %ebp, %ebp C initial carry bit + xorl %edx, %edx C initial carry limb (for size==1) + + incl %ebx + jz L(even_one) + + movl (%esi,%ebx,4), %edi C src[1] + + shrdl( %cl, %edi, %eax) + + jmp L(even_entry) + + +L(even_top): + C eax scratch + C ebx counter, limbs, negative + C ecx shift + C edx scratch + C esi &src[size] + C edi &dst[size] and scratch + C ebp carry bit + + movl (%esi,%ebx,4), %edi + + mull PARAM_DIVISOR + + movl -4(%esi,%ebx,4), %eax + shrdl( %cl, %edi, %eax) + + subl %ebp, %eax + + sbbl %ebp, %ebp + subl %edx, %eax + + sbbl $0, %ebp + +L(even_entry): + imull VAR_INVERSE, %eax + + movl PARAM_DST, %edi + negl %ebp + + movl %eax, -4(%edi,%ebx,4) + incl %ebx + jnz L(even_top) + + + + mull PARAM_DIVISOR + + movl -4(%esi), %eax + +L(even_one): + shrl %cl, %eax + movl SAVE_ESI, %esi + + subl %ebp, %eax + movl SAVE_EBP, %ebp + + subl %edx, %eax + movl SAVE_EBX, %ebx + + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi) + movl SAVE_EDI, %edi + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/gcd_11.asm b/vendor/gmp-6.3.0/mpn/x86/p6/gcd_11.asm new file mode 100644 index 0000000..80e055e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/gcd_11.asm @@ -0,0 +1,83 @@ +dnl x86 mpn_gcd_11 optimised for processors with fast BSF. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked by Torbjorn Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2015 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C AMD K7 7.80 +C AMD K8,K9 7.79 +C AMD K10 4.08 +C AMD bd1 ? +C AMD bobcat 7.82 +C Intel P4-2 14.9 +C Intel P4-3/4 14.0 +C Intel P6/13 5.09 +C Intel core2 4.22 +C Intel NHM 5.00 +C Intel SBR 5.00 +C Intel atom 17.1 +C VIA nano ? 
+C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1 + + +define(`u0', `%eax') +define(`v0', `%edx') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_gcd_11) + push %edi + push %esi + + mov 12(%esp), %eax + mov 16(%esp), %edx + jmp L(odd) + + ALIGN(16) C K10 BD C2 NHM SBR +L(top): cmovc( %esi, %eax) C u = |v - u| 0,3 0,3 0,6 0,5 0,5 + cmovc( %edi, %edx) C v = min(u,v) 0,3 0,3 2,8 1,7 1,7 + shr %cl, %eax C 1,7 1,6 2,8 2,8 2,8 +L(odd): mov %edx, %esi C 1 1 4 3 3 + sub %eax, %esi C 2 2 5 4 4 + bsf %esi, %ecx C 3 3 6 5 5 + mov %eax, %edi C 2 2 3 3 4 + sub %edx, %eax C 2 2 4 3 4 + jnz L(top) C + +L(end): mov %edx, %eax + pop %esi + pop %edi + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/p6/gmp-mparam.h new file mode 100644 index 0000000..96c96fd --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/gmp-mparam.h @@ -0,0 +1,194 @@ +/* Intel P6 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2003, 2008-2010, 2012 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the + value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard limit in + mpn/x86/p6/sqr_basecase.asm. 
*/ + + +/* 1867 MHz P6 model 13 */ + +#define MOD_1_NORM_THRESHOLD 4 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 21 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 74 +#define MUL_TOOM44_THRESHOLD 181 +#define MUL_TOOM6H_THRESHOLD 252 +#define MUL_TOOM8H_THRESHOLD 363 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 115 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 101 +#define SQR_TOOM4_THRESHOLD 154 +#define SQR_TOOM6_THRESHOLD 222 +#define SQR_TOOM8_THRESHOLD 527 + +#define MULMID_TOOM42_THRESHOLD 58 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define POWM_SEC_TABLE 4,23,258,768,2388 + +#define MUL_FFT_MODF_THRESHOLD 565 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 565, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 39, 9}, { 23, 5}, \ + { 383, 4}, { 991, 5}, { 511, 6}, { 267, 7}, \ + { 157, 8}, { 91, 9}, { 47, 8}, { 111, 9}, \ + { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \ + { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \ + { 159,10}, { 335, 9}, { 671,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 399, 9}, { 799,10}, \ + { 415,11}, { 223,12}, { 127,11}, { 255,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 607,11}, \ + { 319,10}, { 671,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 831,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \ + { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \ + { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \ + { 1599,11}, { 863,12}, { 447,11}, { 959,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \ + { 1471,13}, { 383,12}, { 831,11}, { 1727,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \ + { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \ + { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \ + { 1407,12}, { 2815,14}, { 767,13}, { 1663,12}, \ + { 3455,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 132 +#define MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 472 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 472, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 63, 4}, { 1023, 8}, { 67, 9}, \ + { 39, 5}, { 639, 4}, { 1471, 6}, { 383, 7}, \ + { 209, 8}, { 119, 9}, { 63, 7}, { 255, 8}, \ + { 139, 9}, { 71, 8}, { 143, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 
79, 9}, { 159, 8}, { 319, 9}, \ + { 167,10}, { 95,11}, { 63,10}, { 143, 9}, \ + { 287,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 543, 8}, \ + { 1087,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \ + { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 399, 9}, { 799,10}, { 415, 9}, \ + { 831,11}, { 223,12}, { 127,11}, { 255,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 671, 9}, { 1343,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 831,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,12}, \ + { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \ + { 447,11}, { 959,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \ + { 639,12}, { 1471,13}, { 767,12}, { 1727,13}, \ + { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \ + { 2111,13}, { 1151,12}, { 2431,13}, { 1407,14}, \ + { 767,13}, { 1663,12}, { 3455,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 146 +#define SQR_FFT_THRESHOLD 5760 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 33 +#define MULLO_MUL_N_THRESHOLD 13463 + +#define DC_DIV_QR_THRESHOLD 20 +#define DC_DIVAPPR_Q_THRESHOLD 56 +#define DC_BDIV_QR_THRESHOLD 60 +#define DC_BDIV_Q_THRESHOLD 134 + +#define INV_MULMOD_BNM1_THRESHOLD 38 +#define INV_NEWTON_THRESHOLD 66 +#define INV_APPR_THRESHOLD 63 + +#define BINV_NEWTON_THRESHOLD 250 +#define REDC_1_TO_REDC_N_THRESHOLD 63 + +#define MU_DIV_QR_THRESHOLD 1164 +#define MU_DIVAPPR_Q_THRESHOLD 979 +#define MUPI_DIV_QR_THRESHOLD 38 +#define MU_BDIV_QR_THRESHOLD 1442 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD_THRESHOLD 64 +#define HGCD_APPR_THRESHOLD 105 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 386 +#define GCDEXT_DC_THRESHOLD 309 +#define JACOBI_BASE_METHOD 1 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 26 +#define SET_STR_DC_THRESHOLD 587 +#define SET_STR_PRECOMPUTE_THRESHOLD 1104 diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/lshsub_n.asm b/vendor/gmp-6.3.0/mpn/x86/p6/lshsub_n.asm new file mode 100644 index 0000000..7ada213 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/lshsub_n.asm @@ -0,0 +1,169 @@ +dnl Intel P6 mpn_lshsub_n -- mpn papillion support. + +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C P6/13: 3.35 cycles/limb (separate mpn_sub_n + mpn_lshift needs 4.12) + +C (1) The loop is not scheduled in any way, and scheduling attempts have not +C improved speed on P6/13. Presumably, the K7 will want scheduling, if it +C at all wants to use MMX. +C (2) We could save a register by not alternatingly using eax and edx in the +C loop. + +define(`rp', `%edi') +define(`up', `%esi') +define(`vp', `%ebx') +define(`n', `%ecx') +define(`cnt', `%mm7') + +ASM_START() + + TEXT + ALIGN(16) + +PROLOGUE(mpn_lshsub_n) + push %edi + push %esi + push %ebx + + mov 16(%esp), rp + mov 20(%esp), up + mov 24(%esp), vp + mov 28(%esp), n + mov $32, %eax + sub 32(%esp), %eax + movd %eax, cnt + + lea (up,n,4), up + lea (vp,n,4), vp + lea (rp,n,4), rp + + neg n + mov n, %eax + and $-8, n + and $7, %eax + shl %eax C eax = 2x + lea (%eax,%eax,4), %edx C edx = 10x +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + lea L(ent)(%eax,%edx,2), %eax C eax = 22x +') + + pxor %mm1, %mm1 + pxor %mm0, %mm0 + + jmp *%eax + +ifdef(`PIC',` +L(pic_calc): + C See mpn/x86/README about old gas bugs + lea (%eax,%edx,2), %eax + add $L(ent)-L(here), %eax + add (%esp), %eax + ret_internal +') + +L(end): C compute (cy<<cnt) | (edx>>(32-cnt)) + sbb %eax, %eax + neg %eax + mov 32(%esp), %ecx + shld %cl, %edx, %eax + + emms + + pop %ebx + pop %esi + pop %edi + ret + ALIGN(16) +L(top): jecxz L(end) +L(ent): mov 0(up,n,4), %eax + sbb 0(vp,n,4), %eax + movd %eax, %mm0 + punpckldq %mm0, %mm1 + psrlq %mm7, %mm1 + movd %mm1, 0(rp,n,4) + + mov 4(up,n,4), %edx + sbb 4(vp,n,4), %edx + movd %edx, %mm1 + punpckldq %mm1, %mm0 + psrlq %mm7, %mm0 + movd %mm0, 4(rp,n,4) + + mov 8(up,n,4), %eax + sbb 8(vp,n,4), %eax + movd %eax, %mm0 + punpckldq %mm0, %mm1 + psrlq %mm7, %mm1 + movd %mm1, 8(rp,n,4) + + mov 12(up,n,4), %edx + sbb 12(vp,n,4), %edx + movd %edx, %mm1 + punpckldq %mm1, %mm0 + psrlq %mm7, %mm0 + movd %mm0, 12(rp,n,4) + + mov 16(up,n,4), %eax + sbb 16(vp,n,4), %eax + movd %eax, %mm0 + punpckldq %mm0, %mm1 + psrlq %mm7, %mm1 + movd %mm1, 16(rp,n,4) + + mov 20(up,n,4), %edx + sbb 20(vp,n,4), %edx + movd %edx, %mm1 + punpckldq %mm1, %mm0 + psrlq %mm7, %mm0 + movd %mm0, 20(rp,n,4) + + mov 24(up,n,4), %eax + sbb 24(vp,n,4), %eax + movd %eax, %mm0 + punpckldq %mm0, %mm1 + psrlq %mm7, %mm1 + movd %mm1, 24(rp,n,4) + + mov 28(up,n,4), %edx + sbb 28(vp,n,4), %edx + movd %edx, %mm1 + punpckldq %mm1, %mm0 + psrlq %mm7, %mm0 + movd %mm0, 28(rp,n,4) + + lea 8(n), n + jmp L(top) + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm new file mode 100644 index 0000000..5300616 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm @@ -0,0 +1,767 @@ +dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part. + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t inverse, +C unsigned shift); +C +C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm, +C see that file for some comments. It's possible what's here can be improved. + + +dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by +dnl inverse method is used, rather than plain "divl"s. Minimum value 1. +dnl +dnl The different speeds of the integer and fraction parts means that using +dnl xsize+size isn't quite right. The threshold wants to be a bit higher +dnl for the integer part and a bit lower for the fraction part. (Or what's +dnl really wanted is to speed up the integer part!) +dnl +dnl The threshold is set to make the integer part right. At 4 limbs the +dnl div and mul are about the same there, but on the fractional part the +dnl mul is much faster. 
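The reciprocal used by the multiply-by-inverse method is computed at
L(mul_by_inverse) below as floor((b*(b-d)-1)/d) with a single divl, where
b = 2^32 and d is the normalized (high bit set) divisor. As a C sketch
(illustrative; "ref_invert_limb" is a made-up name):

/* inverse = floor ((b*(b-d) - 1) / d): the dividend ~d:0xFFFFFFFF
   equals (b-d-1)*b + (b-1) = b*(b-d) - 1. */
mp_limb_t
ref_invert_limb (mp_limb_t d)   /* d >= 0x80000000 */
{
  unsigned long long n = ((unsigned long long) ~d << 32) | 0xFFFFFFFF;
  return (mp_limb_t) (n / d);
}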
+ +deflit(MUL_THRESHOLD, 4) + + +defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1 +defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1 +defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC, -28) +defframe(VAR_DST, -32) +defframe(VAR_DST_STOP,-36) + +deflit(STACK_SPACE, 36) + + TEXT + ALIGN(16) + +PROLOGUE(mpn_preinv_divrem_1) +deflit(`FRAME',0) + movl PARAM_XSIZE, %ecx + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebx, SAVE_EBX + movl PARAM_SIZE, %ebx + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %edi, SAVE_EDI + movl PARAM_DST, %edx + + movl -4(%esi,%ebx,4), %eax C src high limb + xorl %edi, %edi C initial carry (if can't skip a div) + + C + + leal 8(%edx,%ecx,4), %edx C &dst[xsize+2] + xor %ecx, %ecx + + movl %edx, VAR_DST_STOP C &dst[xsize+2] + cmpl %ebp, %eax C high cmp divisor + + cmovc( %eax, %edi) C high is carry if high<divisor + + cmovnc( %eax, %ecx) C 0 if skip div, src high if not + C (the latter in case src==dst) + + movl %ecx, -12(%edx,%ebx,4) C dst high limb + + sbbl $0, %ebx C skip one division if high<divisor + movl PARAM_PREINV_SHIFT, %ecx + + leal -8(%edx,%ebx,4), %edx C &dst[xsize+size] + movl $32, %eax + + movl %edx, VAR_DST C &dst[xsize+size] + + shll %cl, %ebp C d normalized + subl %ecx, %eax + movl %ecx, VAR_NORM + + movd %eax, %mm7 C rshift + movl PARAM_PREINV_INVERSE, %eax + jmp L(start_preinv) + +EPILOGUE() + + + + ALIGN(16) + +PROLOGUE(mpn_divrem_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + leal -4(%edi,%ebx,4), %edi + jmp L(start_1c) + +EPILOGUE() + + + C offset 0x31, close enough to aligned +PROLOGUE(mpn_divrem_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl $0, %edx C initial carry (if can't skip a div) + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + orl %ecx, %ecx C size + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + leal -4(%edi,%ebx,4), %edi C &dst[xsize-1] + jz L(no_skip_div) C if size==0 + + movl -4(%esi,%ecx,4), %eax C src high limb + xorl %esi, %esi + cmpl %ebp, %eax C high cmp divisor + + cmovc( %eax, %edx) C high is carry if high<divisor + + cmovnc( %eax, %esi) C 0 if skip div, src high if not + C (the latter in case src==dst) + + movl %esi, (%edi,%ecx,4) C dst high limb + + sbbl $0, %ecx C size-1 if high<divisor + movl PARAM_SRC, %esi C reload +L(no_skip_div): + + +L(start_1c): + C eax + C ebx xsize + C ecx size + C edx carry + C esi src + C edi &dst[xsize-1] + C ebp divisor + + leal (%ebx,%ecx), %eax C size+xsize + cmpl $MUL_THRESHOLD, %eax + jae L(mul_by_inverse) + + orl %ecx, %ecx + jz L(divide_no_integer) + +L(divide_integer): + C eax scratch (quotient) + C ebx xsize + C ecx counter + C edx scratch (remainder) + C esi src + C edi &dst[xsize-1] + C ebp divisor + + movl -4(%esi,%ecx,4), %eax + + divl %ebp + + movl %eax, 
(%edi,%ecx,4) + decl %ecx + jnz L(divide_integer) + + +L(divide_no_integer): + movl PARAM_DST, %edi + orl %ebx, %ebx + jnz L(divide_fraction) + +L(divide_done): + movl SAVE_ESI, %esi + + movl SAVE_EDI, %edi + + movl SAVE_EBX, %ebx + movl %edx, %eax + + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + + +L(divide_fraction): + C eax scratch (quotient) + C ebx counter + C ecx + C edx scratch (remainder) + C esi + C edi dst + C ebp divisor + + movl $0, %eax + + divl %ebp + + movl %eax, -4(%edi,%ebx,4) + decl %ebx + jnz L(divide_fraction) + + jmp L(divide_done) + + + +C ----------------------------------------------------------------------------- + +L(mul_by_inverse): + C eax + C ebx xsize + C ecx size + C edx carry + C esi src + C edi &dst[xsize-1] + C ebp divisor + + leal 12(%edi), %ebx C &dst[xsize+2], loop dst stop + + movl %ebx, VAR_DST_STOP + leal 4(%edi,%ecx,4), %edi C &dst[xsize+size] + + movl %edi, VAR_DST + movl %ecx, %ebx C size + + bsrl %ebp, %ecx C 31-l + movl %edx, %edi C carry + + leal 1(%ecx), %eax C 32-l + xorl $31, %ecx C l + + movl %ecx, VAR_NORM + movl $-1, %edx + + shll %cl, %ebp C d normalized + movd %eax, %mm7 + + movl $-1, %eax + subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1 + + divl %ebp C floor (b*(b-d)-1) / d + +L(start_preinv): + C eax inverse + C ebx size + C ecx shift + C edx + C esi src + C edi carry + C ebp divisor + C + C mm7 rshift + + movl %eax, VAR_INVERSE + orl %ebx, %ebx C size + leal -12(%esi,%ebx,4), %eax C &src[size-3] + + movl %eax, VAR_SRC + jz L(start_zero) + + movl 8(%eax), %esi C src high limb + cmpl $1, %ebx + jz L(start_one) + +L(start_two_or_more): + movl 4(%eax), %edx C src second highest limb + + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shldl( %cl, %edx, %esi) C n10 = high,second << l + + cmpl $2, %ebx + je L(integer_two_left) + jmp L(integer_top) + + +L(start_one): + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shll %cl, %esi C n10 = high << l + jmp L(integer_one_left) + + +L(start_zero): + C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and + C skipped a division. + + shll %cl, %edi C n2 = carry << l + movl %edi, %eax C return value for zero_done + cmpl $0, PARAM_XSIZE + + je L(zero_done) + jmp L(fraction_some) + + + +C ----------------------------------------------------------------------------- +C +C This loop runs at about 25 cycles, which is probably sub-optimal, and +C certainly more than the dependent chain would suggest. A better loop, or +C a better rough analysis of what's possible, would be welcomed. +C +C In the current implementation, the following successively dependent +C micro-ops seem to exist. +C +C uops +C n2+n1 1 (addl) +C mul 5 +C q1+1 3 (addl/adcl) +C mul 5 +C sub 3 (subl/sbbl) +C addback 2 (cmov) +C --- +C 19 +C +C Lack of registers hinders explicit scheduling and it might be that the +C normal out of order execution isn't able to hide enough under the mul +C latencies. +C +C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than +C cmov (and takes one uop off the dependent chain). A sarl/andl/addl +C combination was tried for the addback (despite the fact it would lengthen +C the dependent chain) but found to be no faster. 
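One iteration of the loop below performs the following quotient step, shown
as a C sketch (illustrative; b = 2^32, n2:n10 is the current two-limb chunk
with n2 < d, m is VAR_INVERSE, and "div_step" is a made-up name; the
q1+1 == 0 branch corresponds to L(q1_ff) below):

/* Divide n2*b + n10 by the normalized d using the precomputed m,
   mirroring the nadj / q1+1 / addback scheme described above. */
static void
div_step (mp_limb_t n2, mp_limb_t n10, mp_limb_t d, mp_limb_t m,
          mp_limb_t *q, mp_limb_t *r)
{
  mp_limb_t mask = - (n10 >> 31);            /* -n1: 0 or all ones */
  mp_limb_t nadj = n10 + (mask & d);         /* n10 + (-n1 & d), mod b */
  unsigned long long p = (unsigned long long) m * (n2 + (n10 >> 31)) + nadj;
  mp_limb_t q1p1 = n2 + 1 + (mp_limb_t) (p >> 32);   /* q1 + 1 */

  if (q1p1 == 0)          /* q1+1 wrapped to 0: the L(q1_ff) case */
    {
      *q = 0xFFFFFFFF;    /* low limb of q*d is -d, so r = n10 + d */
      *r = n10 + d;
      return;
    }

  unsigned long long n = ((unsigned long long) n2 << 32) | n10;
  unsigned long long t = n - (unsigned long long) q1p1 * d;
  if (t >> 32)            /* borrow: q1+1 was one too many */
    {
      *q = q1p1 - 1;
      *r = (mp_limb_t) t + d;                /* add back d */
    }
  else
    {
      *q = q1p1;
      *r = (mp_limb_t) t;
    }
}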
+ + + ALIGN(16) +L(integer_top): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp d + C + C mm0 scratch (src qword) + C mm7 rshift for normalization + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl VAR_SRC, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + movq (%ecx), %mm0 C next src limb and the one below it + + mull VAR_INVERSE C m*(n2+n1) + + subl $4, %ecx + + movl %ecx, VAR_SRC + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + movl %ebp, %eax C d + leal 1(%edi), %ebx C n2+1 + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + jz L(q1_ff) + + mull %ebx C (q1+1)*d + + movl VAR_DST, %ecx + psrlq %mm7, %mm0 + + C + + C + + C + + subl %eax, %esi + movl VAR_DST_STOP, %eax + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + sbbl $0, %ebx C q + subl $4, %ecx + + movl %ebx, (%ecx) + cmpl %eax, %ecx + + movl %ecx, VAR_DST + jne L(integer_top) + + +L(integer_loop_done): + + +C ----------------------------------------------------------------------------- +C +C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz +C q1_ff special case. This make the code a bit smaller and simpler, and +C costs only 2 cycles (each). + +L(integer_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm7 rshift + + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl PARAM_SRC, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd (%ecx), %mm0 C src low limb + + movl VAR_DST_STOP, %ecx + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2+1 + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + sbbl $0, %ebx C q + + movl %ebx, -4(%ecx) + + +C ----------------------------------------------------------------------------- +L(integer_one_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm7 rshift + + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl VAR_DST_STOP, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2+1 + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx C q1 if q1+1 overflowed + + mull %ebx + + C + + C + + C + + C + + subl %eax, %esi + movl PARAM_XSIZE, %eax + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + + sbbl $0, %ebx C q + + movl %ebx, -8(%ecx) 
+ subl $8, %ecx + + + + orl %eax, %eax C xsize + jnz L(fraction_some) + + movl %edi, %eax +L(fraction_done): + movl VAR_NORM, %ecx +L(zero_done): + movl SAVE_EBP, %ebp + + movl SAVE_EDI, %edi + + movl SAVE_ESI, %esi + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + shrl %cl, %eax + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx + C edx + C esi n10 + C edi n2 + C ebp divisor + + movl VAR_DST, %ecx + movl VAR_DST_STOP, %edx + subl $4, %ecx + + movl %ecx, VAR_DST + psrlq %mm7, %mm0 + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + + movl $-1, (%ecx) + movd %mm0, %esi C next n10 + + cmpl %ecx, %edx + jne L(integer_top) + + jmp L(integer_loop_done) + + + +C ----------------------------------------------------------------------------- +C +C In the current implementation, the following successively dependent +C micro-ops seem to exist. +C +C uops +C mul 5 +C q1+1 1 (addl) +C mul 5 +C sub 3 (negl/sbbl) +C addback 2 (cmov) +C --- +C 16 +C +C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for +C the addback was found to be a touch slower. + + + ALIGN(16) +L(fraction_some): + C eax + C ebx + C ecx + C edx + C esi + C edi carry + C ebp divisor + + movl PARAM_DST, %esi + movl VAR_DST_STOP, %ecx C &dst[xsize+2] + movl %edi, %eax + + subl $8, %ecx C &dst[xsize] + + + ALIGN(16) +L(fraction_top): + C eax n2, then scratch + C ebx scratch (nadj, q1) + C ecx dst, decrementing + C edx scratch + C esi dst stop point + C edi n2 + C ebp divisor + + mull VAR_INVERSE C m*n2 + + movl %ebp, %eax C d + subl $4, %ecx C dst + leal 1(%edi), %ebx + + C + + C + + C + + addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1 + + mull %ebx C (q1+1)*d + + C + + C + + C + + C + + negl %eax C low of n - (q1+1)*d + + sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry + leal (%ebp,%eax), %edx + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + + sbbl $0, %ebx C q + movl %eax, %edi C remainder->n2 + cmpl %esi, %ecx + + movl %ebx, (%ecx) C previous q + jne L(fraction_top) + + + jmp L(fraction_done) + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h new file mode 100644 index 0000000..ef29061 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h @@ -0,0 +1,218 @@ +/* Intel P6/mmx gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991-2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the + value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard limit in + mpn/x86/p6/sqr_basecase.asm. */ + + +/* 800 MHz P6 model 8 */ +/* Generated by tuneup.c, 2017-02-03, gcc 4.8 */ + +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 30 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 14 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 62 + +#define DIV_1_VS_MUL_1_PERCENT 168 + +#define MUL_TOOM22_THRESHOLD 22 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 195 +#define MUL_TOOM6H_THRESHOLD 254 +#define MUL_TOOM8H_THRESHOLD 381 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 100 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 /* WRONG value, see comment above */ +#define SQR_TOOM3_THRESHOLD 83 +#define SQR_TOOM4_THRESHOLD 196 +#define SQR_TOOM6_THRESHOLD 214 +#define SQR_TOOM8_THRESHOLD 381 + +#define MULMID_TOOM42_THRESHOLD 56 + +#define MULMOD_BNM1_THRESHOLD 16 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 476, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \ + { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 167,10}, { 95, 9}, { 199,10}, \ + { 111,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511,10}, { 143, 9}, { 287, 8}, { 575,10}, \ + { 159,11}, { 95,10}, { 191, 9}, { 383,10}, \ + { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543, 8}, { 1087,10}, \ + { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 351, 9}, { 703,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 415, 9}, { 831,11}, \ + { 223,10}, { 447,12}, { 127,11}, { 255,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 671,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \ + { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \ + { 703,10}, { 1407,11}, { 735,12}, { 383,11}, \ + { 831,12}, { 447,11}, { 959,10}, { 1919,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 831,11}, { 1727,12}, { 
959,11}, \ + { 1919,14}, { 255,13}, { 511,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1727,13}, { 895,12}, { 1919,11}, \ + { 3839,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1279,12}, { 2559,13}, \ + { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \ + { 3327,13}, { 1919,12}, { 3839,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 160 +#define MUL_FFT_THRESHOLD 7040 + +#define SQR_FFT_MODF_THRESHOLD 376 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 376, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 135,10}, { 79, 9}, { 167,10}, \ + { 95, 9}, { 191, 8}, { 383,10}, { 111,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511, 9}, \ + { 271,10}, { 143, 9}, { 287, 8}, { 575, 9}, \ + { 303, 8}, { 607,10}, { 159, 9}, { 319,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287, 9}, { 575,10}, \ + { 303,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 479,12}, { 127,11}, { 255,10}, { 543, 9}, \ + { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \ + { 319,10}, { 671,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,11}, { 479,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 703,10}, \ + { 1407,11}, { 735,12}, { 383,11}, { 831,12}, \ + { 447,11}, { 959,10}, { 1919,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1215,10}, \ + { 2431,12}, { 639,11}, { 1343,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 831,11}, { 1727,12}, \ + { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1471,11}, \ + { 2943,13}, { 767,12}, { 1727,13}, { 895,12}, \ + { 1919,11}, { 3839,14}, { 511,13}, { 1023,12}, \ + { 2111,13}, { 1151,12}, { 2431,13}, { 1407,12}, \ + { 2943,14}, { 767,13}, { 1535,12}, { 3071,13}, \ + { 1663,12}, { 3455,13}, { 1919,12}, { 3839,15}, \ + { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 161 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 62 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 177 +#define SQRLO_SQR_THRESHOLD 8937 + +#define DC_DIV_QR_THRESHOLD 80 +#define DC_DIVAPPR_Q_THRESHOLD 240 +#define DC_BDIV_QR_THRESHOLD 76 +#define DC_BDIV_Q_THRESHOLD 166 + +#define INV_MULMOD_BNM1_THRESHOLD 42 +#define INV_NEWTON_THRESHOLD 262 +#define INV_APPR_THRESHOLD 250 + +#define BINV_NEWTON_THRESHOLD 272 +#define REDC_1_TO_REDC_N_THRESHOLD 72 + +#define MU_DIV_QR_THRESHOLD 1499 +#define MU_DIVAPPR_Q_THRESHOLD 1470 +#define MUPI_DIV_QR_THRESHOLD 124 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1341 + +#define POWM_SEC_TABLE 1,16,96,416,1259 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define SET_STR_DC_THRESHOLD 270 +#define SET_STR_PRECOMPUTE_THRESHOLD 1084 + +#define FAC_DSC_THRESHOLD 194 +#define FAC_ODD_THRESHOLD 25 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define 
HGCD_THRESHOLD 124 +#define HGCD_APPR_THRESHOLD 152 +#define HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 474 +#define GCDEXT_DC_THRESHOLD 321 +#define JACOBI_BASE_METHOD 1 diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm new file mode 100644 index 0000000..febd1c0 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm @@ -0,0 +1,38 @@ +dnl Intel Pentium-II mpn_lshift -- mpn left shift. + +dnl Copyright 2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl The P55 code runs well on P-II/III, but could stand some minor tweaks +dnl at some stage probably. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86/pentium/mmx/lshift.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mmx/popham.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/popham.asm new file mode 100644 index 0000000..fd340e4 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/popham.asm @@ -0,0 +1,39 @@ +dnl Intel Pentium-II mpn_popcount, mpn_hamdist -- population count and +dnl hamming distance. + +dnl Copyright 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+
+include(`../config.m4')
+
+
+C P6MMX: popcount 11 cycles/limb (approx), hamdist 11.5 cycles/limb (approx)
+
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm
new file mode 100644
index 0000000..77aa190
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm
@@ -0,0 +1,38 @@
+dnl Intel Pentium-II mpn_rshift -- mpn right shift.
+
+dnl Copyright 2001 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl The P55 code runs well on P-II/III, but could stand some minor tweaks
+dnl at some stage probably.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86/pentium/mmx/rshift.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mod_34lsub1.asm
new file mode 100644
index 0000000..b88ab5d
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/mod_34lsub1.asm
@@ -0,0 +1,190 @@
+dnl Intel P6 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl Copyright 2000-2002, 2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: 2.0 cycles/limb
+
+C TODO
+C Experiments with more unrolling indicate that 1.5 c/l is possible on P6-13
+C with the current carry handling scheme.
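+
+C As a reference for what this routine computes, here is an editor's hedged
+C C sketch (not GMP code; mod_34lsub1_ref and the final fold loop are
+C illustrative, types are from <stdint.h>). Since 2^32 == 2^8 mod 2^24-1,
+C limb i enters with weight 2^(8*(i mod 3)), which is why the code below
+C keeps three accumulators. For simplicity the sketch lets the weighted
+C sums grow in 64 bits, ignoring the overflow bookkeeping that the real
+C code does with its separate carries total.
+C
+C   uint32_t
+C   mod_34lsub1_ref (const uint32_t *src, size_t size)
+C   {
+C     uint64_t acc[3] = {0, 0, 0};   /* 0mod3, 1mod3, 2mod3 positions */
+C     uint64_t r;
+C     size_t i;
+C     for (i = 0; i < size; i++)
+C       acc[i % 3] += src[i];
+C     /* apply the weights, then fold since 2^24 == 1 mod 2^24-1 */
+C     r = acc[0] + (acc[1] << 8) + (acc[2] << 16);
+C     while (r >> 24)
+C       r = (r & 0xFFFFFF) + (r >> 24);
+C     return (uint32_t) r;   /* congruent to src mod 2^24-1 */
+C   }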
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C Groups of three limbs are handled, with carry bits propagating from 0mod3
+C into 1mod3 into 2mod3, and from there into a separate carries total, so
+C the carry flag is never kept live across the loop control. Avoiding decl
+C lets us get to 2.0 c/l, as compared to the generic x86 code at 3.66.
+C
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX, `PARAM_SIZE')
+define(SAVE_ESI, `PARAM_SRC')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %edx
+
+ subl $2, %ecx C size-2
+ movl (%edx), %eax C src[0]
+ ja L(three_or_more)
+ jb L(one)
+
+ C size==2
+
+ movl 4(%edx), %ecx C src[1]
+
+ movl %eax, %edx C src[0]
+ shrl $24, %eax C src[0] high
+
+ andl $0xFFFFFF, %edx C src[0] low
+
+ addl %edx, %eax
+ movl %ecx, %edx C src[1]
+ shrl $16, %ecx C src[1] high
+
+ andl $0xFFFF, %edx
+ addl %ecx, %eax
+
+ shll $8, %edx C src[1] low
+
+ addl %edx, %eax
+L(one):
+ ret
+
+
+L(three_or_more):
+ C eax src[0], initial acc 0mod3
+ C ebx
+ C ecx size-2
+ C edx src
+ C esi
+ C edi
+ C ebp
+
+ movl %ebx, SAVE_EBX
+ movl 4(%edx), %ebx C src[1], initial 1mod3
+ subl $3, %ecx C size-5
+
+ movl %esi, SAVE_ESI
+ movl 8(%edx), %esi C src[2], initial 2mod3
+
+ pushl %edi FRAME_pushl()
+ movl $0, %edi C initial carries 0mod3
+ jng L(done) C if size < 6
+
+
+L(top):
+ C eax acc 0mod3
+ C ebx acc 1mod3
+ C ecx counter, limbs
+ C edx src
+ C esi acc 2mod3
+ C edi carries into 0mod3
+ C ebp
+
+ addl 12(%edx), %eax
+ adcl 16(%edx), %ebx
+ adcl 20(%edx), %esi
+ leal 12(%edx), %edx
+ adcl $0, %edi
+
+ subl $3, %ecx
+ jg L(top) C at least 3 more to process
+
+
+L(done):
+ C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs respectively
+ cmpl $-1, %ecx
+ jl L(done_0) C if -2, meaning 0 more limbs
+
+ C 1 or 2 more limbs
+ movl $0, %ecx
+ je L(done_1) C if -1, meaning 1 more limb only
+ movl 16(%edx), %ecx
+L(done_1):
+ addl 12(%edx), %eax C 0mod3
+ adcl %ecx, %ebx C 1mod3
+ adcl $0, %esi C 2mod3
+ adcl $0, %edi C carries 0mod3
+
+L(done_0):
+ C eax acc 0mod3
+ C ebx acc 1mod3
+ C ecx
+ C edx
+ C esi acc 2mod3
+ C edi carries 0mod3
+ C ebp
+
+ movl %eax, %ecx C 0mod3
+ shrl $24, %eax C 0mod3 high initial total
+
+ andl $0xFFFFFF, %ecx C 0mod3 low
+ movl %edi, %edx C carries
+ shrl $24, %edi C carries high
+
+ addl %ecx, %eax C add 0mod3 low
+ andl $0xFFFFFF, %edx C carries 0mod3 low
+ movl %ebx, %ecx C 1mod3
+
+ shrl $16, %ebx C 1mod3 high
+ addl %edi, %eax C add carries high
+ addl %edx, %eax C add carries 0mod3 low
+
+ andl $0xFFFF, %ecx C 1mod3 low mask
+ addl %ebx, %eax C add 1mod3 high
+ movl SAVE_EBX, %ebx
+
+ shll $8, %ecx C 1mod3 low
+ movl %esi, %edx C 2mod3
+ popl %edi FRAME_popl()
+
+ shrl $8, %esi C 2mod3 high
+ andl $0xFF, %edx C 2mod3 low mask
+ addl %ecx, %eax C add 1mod3 low
+
+ shll $16, %edx C 2mod3 low
+ addl %esi, %eax C add 2mod3 high
+ movl SAVE_ESI, %esi
+
+ addl %edx, %eax C add 2mod3 low
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mode1o.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mode1o.asm
new file mode 100644
index 0000000..7083195
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/mode1o.asm
@@ -0,0 +1,170 @@
+dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder.
+
+dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P6: 10.0 cycles/limb + + +C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C +C It's not worth skipping a step at the end when high<divisor since the main +C loop is only 10 cycles. + +defframe(PARAM_CARRY, 16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +dnl Not enough room under modexact_1 to make these re-use the parameter +dnl space, unfortunately. +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +deflit(STACK_SPACE, 12) + + TEXT + + ALIGN(16) +PROLOGUE(mpn_modexact_1c_odd) +deflit(`FRAME',0) + + movl PARAM_CARRY, %ecx + jmp L(start_1c) + +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_modexact_1_odd) +deflit(`FRAME',0) + + xorl %ecx, %ecx +L(start_1c): + movl PARAM_DIVISOR, %eax + + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + shrl %eax C d/2 + movl %edi, SAVE_EDI + + andl $127, %eax + +ifdef(`PIC',` + LEA( binvert_limb_table, %edi) + movzbl (%eax,%edi), %edi C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %edi C inv 8 bits +') + + xorl %edx, %edx C initial extra carry + leal (%edi,%edi), %eax C 2*inv + + imull %edi, %edi C inv*inv + + movl %ebx, SAVE_EBX + movl PARAM_SIZE, %ebx + + imull PARAM_DIVISOR, %edi C inv*inv*d + + subl %edi, %eax C inv = 2*inv - inv*inv*d + leal (%eax,%eax), %edi C 2*inv + + imull %eax, %eax C inv*inv + + imull PARAM_DIVISOR, %eax C inv*inv*d + + leal (%esi,%ebx,4), %esi C src end + negl %ebx C -size + + subl %eax, %edi C inv = 2*inv - inv*inv*d + + ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS + movl PARAM_DIVISOR, %eax + imull %edi, %eax + cmpl $1, %eax') + + +C The dependent chain here is +C +C subl %edx, %eax 1 +C imull %edi, %eax 4 +C mull PARAM_DIVISOR 5 +C ---- +C total 10 +C +C and this is the measured speed. No special scheduling is necessary, out +C of order execution hides the load latency. 
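+
+C In C terms the whole routine behaves like this editor's hedged sketch
+C (not the GMP sources; modexact_1_odd_ref is an illustrative name,
+C binvert_table stands in for binvert_limb_table, types from <stdint.h>):
+C
+C   uint32_t
+C   modexact_1_odd_ref (const uint32_t *src, size_t size, uint32_t d)
+C   {
+C     /* Newton bootstrap for odd d, 8 -> 16 -> 32 bits: each step
+C        inv = 2*inv - inv*inv*d doubles the precision of the inverse */
+C     uint32_t inv = binvert_table[(d >> 1) & 127];
+C     uint32_t h = 0, cy = 0;   /* carry limb and borrow bit */
+C     size_t i;
+C     inv = 2*inv - inv*inv*d;
+C     inv = 2*inv - inv*inv*d;  /* now d*inv == 1 mod 2^32 */
+C     for (i = 0; i < size; i++)
+C       {
+C         uint32_t x = src[i] - cy;
+C         uint32_t y, q;
+C         cy = x > src[i];
+C         y = x - h;            /* the subl of the chain above */
+C         cy += y > x;          /* the two borrows can't both occur */
+C         q = y * inv;          /* imull: exact quotient limb */
+C         h = (uint32_t) (((uint64_t) q * d) >> 32);  /* mull: new carry */
+C       }
+C     return h + cy;
+C   }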
+ +L(top): + C eax scratch (src limb) + C ebx counter, limbs, negative + C ecx carry bit, 0 or 1 + C edx carry limb, high of last product + C esi &src[size] + C edi inverse + C ebp + + movl (%esi,%ebx,4), %eax + subl %ecx, %eax + + sbbl %ecx, %ecx + subl %edx, %eax + + sbbl $0, %ecx + + imull %edi, %eax + + negl %ecx + + mull PARAM_DIVISOR + + incl %ebx + jnz L(top) + + + movl SAVE_ESI, %esi + leal (%ecx,%edx), %eax + + movl SAVE_EDI, %edi + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/p6/mul_basecase.asm new file mode 100644 index 0000000..d87bc12 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/mul_basecase.asm @@ -0,0 +1,607 @@ +dnl Intel P6 mpn_mul_basecase -- multiply two mpn numbers. + +dnl Copyright 1999-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P6: approx 6.5 cycles per cross product (16 limbs/loop unrolling). + + +dnl P6 UNROLL_COUNT cycles/product (approx) +dnl 8 7 +dnl 16 6.5 +dnl 32 6.4 +dnl Maximum possible with the current code is 32. + +deflit(UNROLL_COUNT, 16) + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); +C +C This routine is essentially the same as mpn/generic/mul_basecase.c, but +C it's faster because it does most of the mpn_addmul_1() startup +C calculations only once. 
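+
+C In outline it computes the same thing as this C fragment (an editor's
+C hedged sketch; mul_basecase_ref is an illustrative name, while mpn_mul_1
+C and mpn_addmul_1 are the real mpn primitives):
+C
+C   void
+C   mul_basecase_ref (mp_ptr wp, mp_srcptr xp, mp_size_t xsize,
+C                     mp_srcptr yp, mp_size_t ysize)
+C   {
+C     mp_size_t i;
+C     /* row 0 writes wp[0..xsize] directly */
+C     wp[xsize] = mpn_mul_1 (wp, xp, xsize, yp[0]);
+C     /* rows 1..ysize-1 accumulate, each one limb further up */
+C     for (i = 1; i < ysize; i++)
+C       wp[xsize + i] = mpn_addmul_1 (wp + i, xp, xsize, yp[i]);
+C   }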
+ +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 5) +',` +deflit(UNROLL_THRESHOLD, 5) +') + +defframe(PARAM_YSIZE,20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE,12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + + TEXT + ALIGN(16) + +PROLOGUE(mpn_mul_basecase) +deflit(`FRAME',0) + + movl PARAM_XSIZE, %ecx + + movl PARAM_YP, %eax + + movl PARAM_XP, %edx + + movl (%eax), %eax C yp[0] + cmpl $2, %ecx + ja L(xsize_more_than_two) + je L(two_by_something) + + + C one limb by one limb + + mull (%edx) + + movl PARAM_WP, %ecx + movl %eax, (%ecx) + movl %edx, 4(%ecx) + ret + + +C ----------------------------------------------------------------------------- +L(two_by_something): +deflit(`FRAME',0) + +dnl re-use parameter space +define(SAVE_EBX, `PARAM_XSIZE') +define(SAVE_ESI, `PARAM_YSIZE') + + movl %ebx, SAVE_EBX + cmpl $1, PARAM_YSIZE + movl %eax, %ecx C yp[0] + + movl %esi, SAVE_ESI C save esi + movl PARAM_WP, %ebx + movl %edx, %esi C xp + + movl (%edx), %eax C xp[0] + jne L(two_by_two) + + + C two limbs by one limb + C + C eax xp[0] + C ebx wp + C ecx yp[0] + C edx + C esi xp + + mull %ecx + + movl %eax, (%ebx) + movl 4(%esi), %eax + movl %edx, %esi C carry + + mull %ecx + + addl %eax, %esi + + movl %esi, 4(%ebx) + movl SAVE_ESI, %esi + + adcl $0, %edx + + movl %edx, 8(%ebx) + movl SAVE_EBX, %ebx + + ret + + + +C ----------------------------------------------------------------------------- + + ALIGN(16) +L(two_by_two): + C eax xp[0] + C ebx wp + C ecx yp[0] + C edx + C esi xp + C edi + C ebp + +dnl more parameter space re-use +define(SAVE_EDI, `PARAM_WP') + + mull %ecx C xp[0] * yp[0] + + movl %edi, SAVE_EDI + movl %edx, %edi C carry, for wp[1] + + movl %eax, (%ebx) + movl 4(%esi), %eax + + mull %ecx C xp[1] * yp[0] + + addl %eax, %edi + movl PARAM_YP, %ecx + + adcl $0, %edx + movl 4(%ecx), %ecx C yp[1] + + movl %edi, 4(%ebx) + movl 4(%esi), %eax C xp[1] + movl %edx, %edi C carry, for wp[2] + + mull %ecx C xp[1] * yp[1] + + addl %eax, %edi + movl (%esi), %eax C xp[0] + + adcl $0, %edx + movl %edx, %esi C carry, for wp[3] + + mull %ecx C xp[0] * yp[1] + + addl %eax, 4(%ebx) + movl %esi, %eax + + adcl %edx, %edi + movl SAVE_ESI, %esi + + movl %edi, 8(%ebx) + + adcl $0, %eax + movl SAVE_EDI, %edi + + movl %eax, 12(%ebx) + movl SAVE_EBX, %ebx + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(xsize_more_than_two): + +C The first limb of yp is processed with a simple mpn_mul_1 loop running at +C about 6.2 c/l. Unrolling this doesn't seem worthwhile since it's only run +C once (whereas the addmul_1 below is run ysize-1 many times). A call to +C mpn_mul_1 would be slowed down by the parameter pushing and popping etc, +C and doesn't seem likely to be worthwhile on the typical sizes reaching +C here from the Karatsuba code. 
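+
+C Limb by limb, the mul1 loop below behaves like this C model (an editor's
+C hedged sketch with illustrative names; in the asm the carry c lives in
+C %ebx and the multiplier y0, which is yp[0], lives in %ebp):
+C
+C   uint32_t
+C   mul_1_ref (uint32_t *wp, const uint32_t *xp, size_t n, uint32_t y0)
+C   {
+C     uint32_t c = 0;
+C     size_t i;
+C     for (i = 0; i < n; i++)
+C       {
+C         uint64_t p = (uint64_t) xp[i] * y0 + c;   /* mull plus carry */
+C         wp[i] = (uint32_t) p;
+C         c = (uint32_t) (p >> 32);
+C       }
+C     return c;   /* stored to wp[n] after the loop */
+C   }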
+
+ C eax yp[0]
+ C ebx
+ C ecx xsize
+ C edx xp
+ C esi
+ C edi
+ C ebp
+
+defframe(`SAVE_EBX', -4)
+defframe(`SAVE_ESI', -8)
+defframe(`SAVE_EDI', -12)
+defframe(`SAVE_EBP', -16)
+defframe(VAR_COUNTER, -20) dnl for use in the unroll case
+defframe(VAR_ADJUST, -24)
+defframe(VAR_JMP, -28)
+defframe(VAR_SWAP, -32)
+defframe(VAR_XP_LOW, -36)
+deflit(STACK_SPACE, 36)
+
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %edi, SAVE_EDI
+ movl PARAM_WP, %edi
+
+ movl %ebx, SAVE_EBX
+
+ movl %ebp, SAVE_EBP
+ movl %eax, %ebp
+
+ movl %esi, SAVE_ESI
+ xorl %ebx, %ebx
+ leal (%edx,%ecx,4), %esi C xp end
+
+ leal (%edi,%ecx,4), %edi C wp end of mul1
+ negl %ecx
+
+
+L(mul1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, limbs, negative, -(size-1) to -1
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size-1]
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl %eax, (%edi,%ecx,4)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ incl %ecx
+ jnz L(mul1)
+
+
+ movl PARAM_YSIZE, %edx
+
+ movl %ebx, (%edi) C final carry
+ movl PARAM_XSIZE, %ecx
+ decl %edx
+
+ jz L(done) C if ysize==1
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_YP, %eax
+ jae L(unroll)
+
+
+C -----------------------------------------------------------------------------
+ C simple addmul looping
+ C
+ C eax yp
+ C ebx
+ C ecx xsize
+ C edx ysize-1
+ C esi xp end
+ C edi wp end of mul1
+ C ebp
+
+ leal 4(%eax,%edx,4), %ebp C yp end
+ negl %ecx
+ negl %edx
+
+ movl %edx, PARAM_YSIZE C -(ysize-1)
+ movl (%esi,%ecx,4), %eax C xp low limb
+ incl %ecx
+
+ movl %ecx, PARAM_XSIZE C -(xsize-1)
+ xorl %ebx, %ebx C initial carry
+
+ movl %ebp, PARAM_YP
+ movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier
+ jmp L(simple_outer_entry)
+
+
+L(simple_outer_top):
+ C ebp ysize counter, negative
+
+ movl PARAM_YP, %edx
+
+ movl PARAM_XSIZE, %ecx C -(xsize-1)
+ xorl %ebx, %ebx C carry
+
+ movl %ebp, PARAM_YSIZE
+ addl $4, %edi C next position in wp
+
+ movl (%edx,%ebp,4), %ebp C yp limb - multiplier
+
+ movl -4(%esi,%ecx,4), %eax C xp low limb
+
+
+L(simple_outer_entry):
+
+L(simple_inner_top):
+ C eax xp limb
+ C ebx carry limb
+ C ecx loop counter (negative)
+ C edx scratch
+ C esi xp end
+ C edi wp end
+ C ebp multiplier
+
+ mull %ebp
+
+ addl %eax, %ebx
+ adcl $0, %edx
+
+ addl %ebx, (%edi,%ecx,4)
+ movl (%esi,%ecx,4), %eax
+ adcl $0, %edx
+
+ incl %ecx
+ movl %edx, %ebx
+ jnz L(simple_inner_top)
+
+
+ C separate code for last limb so outer loop counter handling can be
+ C interleaved
+
+ mull %ebp
+
+ movl PARAM_YSIZE, %ebp
+ addl %eax, %ebx
+
+ adcl $0, %edx
+
+ addl %ebx, (%edi)
+
+ adcl $0, %edx
+ incl %ebp
+
+ movl %edx, 4(%edi)
+ jnz L(simple_outer_top)
+
+
+L(done):
+ movl SAVE_EBX, %ebx
+
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The unrolled loop is the same as in mpn_addmul_1, see that code for some
+C comments.
+C
+C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
+C increment xp and wp. This is used to adjust xp and wp, and is rshifted to
+C give an initial VAR_COUNTER at the top of the outer loop.
+C
+C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
+C up to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C VAR_SWAP is 0 if xsize odd or 0xFFFFFFFF if xsize even, used to swap the
+C initial ebx and ecx on entry to the unrolling.
+C +C VAR_XP_LOW is the least significant limb of xp, which is needed at the +C start of the unrolled loop. +C +C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1, +C inclusive. +C +C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be +C added to give the location of the next limb of yp, which is the multiplier +C in the unrolled loop. +C +C The trick with the VAR_ADJUST value means it's only necessary to do one +C fetch in the outer loop to take care of xp, wp and the inner loop counter. + + +L(unroll): + C eax yp + C ebx + C ecx xsize + C edx ysize-1 + C esi xp end + C edi wp end of mul1 + C ebp + + movl PARAM_XP, %esi + + movl 4(%eax), %ebp C multiplier (yp second limb) + leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing + + movl %eax, PARAM_YP + movl PARAM_WP, %edi + negl %edx + + movl %edx, PARAM_YSIZE + leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1 + decl %ecx C xsize-1 + + movl (%esi), %eax C xp low limb + andl $-UNROLL_MASK-1, %ebx + negl %ecx C -(xsize-1) + + negl %ebx + andl $UNROLL_MASK, %ecx + + movl %ebx, VAR_ADJUST + movl %ecx, %edx + shll $4, %ecx + + movl %eax, VAR_XP_LOW + sarl $UNROLL_LOG2, %ebx + negl %edx + + C 15 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(unroll_here): +',` + leal L(unroll_inner_entry) (%ecx,%edx,1), %ecx +') + + movl %ecx, VAR_JMP + movl %edx, %ecx + shll $31, %edx + + sarl $31, %edx C 0 or -1 as xsize odd or even + leal 4(%edi,%ecx,4), %edi C wp and xp, adjust for unrolling, + leal 4(%esi,%ecx,4), %esi C and start at second limb + + movl %edx, VAR_SWAP + jmp L(unroll_outer_entry) + + +ifdef(`PIC',` +L(pic_calc): + C See mpn/x86/README about old gas bugs + leal (%ecx,%edx,1), %ecx + addl $L(unroll_inner_entry)-L(unroll_here), %ecx + addl (%esp), %ecx + ret_internal +') + + +C -------------------------------------------------------------------------- + ALIGN(16) +L(unroll_outer_top): + C eax + C ebx + C ecx + C edx + C esi xp + offset + C edi wp + offset + C ebp ysize counter, negative + + movl VAR_ADJUST, %ebx + movl PARAM_YP, %edx + + movl VAR_XP_LOW, %eax + movl %ebp, PARAM_YSIZE C store incremented ysize counter + + leal eval(UNROLL_BYTES + 4) (%edi,%ebx,4), %edi + leal (%esi,%ebx,4), %esi + sarl $UNROLL_LOG2, %ebx + + movl (%edx,%ebp,4), %ebp C yp next multiplier + +L(unroll_outer_entry): + mull %ebp + + movl %ebx, VAR_COUNTER + movl %edx, %ebx C carry high + movl %eax, %ecx C carry low + + xorl %edx, %eax + movl VAR_JMP, %edx + + andl VAR_SWAP, %eax + + xorl %eax, %ebx C carries other way for odd index + xorl %eax, %ecx + + jmp *%edx + + +C ----------------------------------------------------------------------------- + +L(unroll_inner_top): + C eax xp limb + C ebx carry high + C ecx carry low + C edx scratch + C esi xp+8 + C edi wp + C ebp yp multiplier limb + C + C VAR_COUNTER loop counter, negative + C + C 15 bytes each limb + + addl $UNROLL_BYTES, %edi + +L(unroll_inner_entry): + +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%esi), %eax) + mull %ebp +Zdisp( addl, %ecx, disp0,(%edi)) + adcl %eax, %ebx C new carry low + movl %edx, %ecx + adcl $0, %ecx C new carry high + + movl disp1(%esi), %eax + mull %ebp + addl %ebx, disp1(%edi) + adcl %eax, %ecx C new carry low + movl %edx, %ebx + adcl $0, %ebx C new carry high +') + + + incl VAR_COUNTER + leal UNROLL_BYTES(%esi), %esi + jnz L(unroll_inner_top) + + + C eax + C ebx 
carry high + C ecx carry low + C edx + C esi + C edi wp, pointing at second last limb) + C ebp + +deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))) +deflit(`disp1', eval(disp0 + 4)) + + movl PARAM_YSIZE, %ebp + addl %ecx, disp0(%edi) C carry low + + adcl $0, %ebx + incl %ebp + + movl %ebx, disp1(%edi) C carry high + jnz L(unroll_outer_top) + + + movl SAVE_ESI, %esi + + movl SAVE_EBP, %ebp + + movl SAVE_EDI, %edi + + movl SAVE_EBX, %ebx + addl $FRAME, %esp + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/p3mmx/popham.asm b/vendor/gmp-6.3.0/mpn/x86/p6/p3mmx/popham.asm new file mode 100644 index 0000000..db2f260 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/p3mmx/popham.asm @@ -0,0 +1,42 @@ +dnl Intel Pentium-III mpn_popcount, mpn_hamdist -- population count and +dnl hamming distance. + +dnl Copyright 2000, 2002, 2004, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C popcount hamdist +C P3 generic 6.5 7 +C P3 model 9 (Banias) ? ? +C P3 model 13 (Dothan) 5.75 6 + + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) +include_mpn(`x86/k7/mmx/popham.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sqr_basecase.asm new file mode 100644 index 0000000..8fc7fdf --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/sqr_basecase.asm @@ -0,0 +1,649 @@ +dnl Intel P6 mpn_sqr_basecase -- square an mpn number. + +dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+
+include(`../config.m4')
+
+
+C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
+C product (measured on the speed difference between 20 and 40 limbs,
+C which is the Karatsuba recursing range).
+
+
+dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
+dnl a description. The only difference here is that UNROLL_COUNT can go up
+dnl to 64 (not 63) making SQR_TOOM2_THRESHOLD_MAX 67.
+
+deflit(SQR_TOOM2_THRESHOLD_MAX, 67)
+
+ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
+`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
+
+m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
+deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the given size
+C is small.
+C
+C The code size might look a bit excessive, but not all of it is executed so
+C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases
+C clearly apply only to those sizes; mid sizes like 10x10 only need part of
+C the unrolled addmul; and big sizes like 40x40 that do use the full
+C unrolling will at least be making good use of it, because 40x40 will take
+C something like 7000 cycles.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+
+ movl PARAM_SRC, %eax
+
+ cmpl $2, %edx
+ movl PARAM_DST, %ecx
+ je L(two_limbs)
+
+ movl (%eax), %eax
+ ja L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+ C eax src limb
+ C ebx
+ C ecx dst
+ C edx
+
+ mull %eax
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx dst
+ C edx
+
+defframe(SAVE_ESI, -4)
+defframe(SAVE_EBX, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(`STACK_SPACE',16)
+
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl %eax, %esi
+ movl (%eax), %eax
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx) C dst[0]
+ movl 4(%esi), %eax
+
+ movl %ebx, SAVE_EBX
+ movl %edx, %ebx C dst[1]
+
+ mull %eax C src[1]^2
+
+ movl %edi, SAVE_EDI
+ movl %eax, %edi C dst[2]
+ movl (%esi), %eax
+
+ movl %ebp, SAVE_EBP
+ movl %edx, %ebp C dst[3]
+
+ mull 4(%esi) C src[0]*src[1]
+
+ addl %eax, %ebx
+ movl SAVE_ESI, %esi
+
+ adcl %edx, %edi
+
+ adcl $0, %ebp
+ addl %ebx, %eax
+ movl SAVE_EBX, %ebx
+
+ adcl %edi, %edx
+ movl SAVE_EDI, %edi
+
+ adcl $0, %ebp
+
+ movl %eax, 4(%ecx)
+
+ movl %ebp, 12(%ecx)
+ movl SAVE_EBP, %ebp
+
+ movl %edx, 8(%ecx)
+ addl $FRAME, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(three_or_more):
+ C eax src low limb
+ C ebx
+ C ecx dst
+ C edx size
+deflit(`FRAME',0)
+
+ pushl %esi defframe_pushl(`SAVE_ESI')
+ cmpl $4, %edx
+
+ movl PARAM_SRC, %esi
+ jae L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+
+ C eax src low limb
+ C ebx
+ C ecx dst
+ C edx
+ C esi src
+ C edi
+ C ebp
+
+ pushl %ebp defframe_pushl(`SAVE_EBP')
+ pushl %edi defframe_pushl(`SAVE_EDI')
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ movl 4(%esi), %eax
+ xorl %ebp, %ebp
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl %edx, 12(%ecx)
+
movl 8(%esi), %eax + + pushl %ebx defframe_pushl(`SAVE_EBX') + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl %edx, 20(%ecx) + + movl (%esi), %eax + + mull 4(%esi) C src[0] * src[1] + + movl %eax, %ebx + movl %edx, %edi + + movl (%esi), %eax + + mull 8(%esi) C src[0] * src[2] + + addl %eax, %edi + movl %edx, %ebp + + adcl $0, %ebp + movl 4(%esi), %eax + + mull 8(%esi) C src[1] * src[2] + + xorl %esi, %esi + addl %eax, %ebp + + C eax + C ebx dst[1] + C ecx dst + C edx dst[4] + C esi zero, will be dst[5] + C edi dst[2] + C ebp dst[3] + + adcl $0, %edx + addl %ebx, %ebx + + adcl %edi, %edi + + adcl %ebp, %ebp + + adcl %edx, %edx + movl 4(%ecx), %eax + + adcl $0, %esi + addl %ebx, %eax + + movl %eax, 4(%ecx) + movl 8(%ecx), %eax + + adcl %edi, %eax + movl 12(%ecx), %ebx + + adcl %ebp, %ebx + movl 16(%ecx), %edi + + movl %eax, 8(%ecx) + movl SAVE_EBP, %ebp + + movl %ebx, 12(%ecx) + movl SAVE_EBX, %ebx + + adcl %edx, %edi + movl 20(%ecx), %eax + + movl %edi, 16(%ecx) + movl SAVE_EDI, %edi + + adcl %esi, %eax C no carry out of this + movl SAVE_ESI, %esi + + movl %eax, 20(%ecx) + addl $FRAME, %esp + + ret + + + +C ----------------------------------------------------------------------------- +defframe(VAR_COUNTER,-20) +defframe(VAR_JMP, -24) +deflit(`STACK_SPACE',24) + +L(four_or_more): + C eax src low limb + C ebx + C ecx + C edx size + C esi src + C edi + C ebp +deflit(`FRAME',4) dnl %esi already pushed + +C First multiply src[0]*src[1..size-1] and store at dst[1..size]. + + subl $STACK_SPACE-FRAME, %esp +deflit(`FRAME',STACK_SPACE) + movl $1, %ecx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebx, SAVE_EBX + subl %edx, %ecx C -(size-1) + + movl %ebp, SAVE_EBP + movl $0, %ebx C initial carry + + leal (%esi,%edx,4), %esi C &src[size] + movl %eax, %ebp C multiplier + + leal -4(%edi,%edx,4), %edi C &dst[size-1] + + +C This loop runs at just over 6 c/l. + +L(mul_1): + C eax scratch + C ebx carry + C ecx counter, limbs, negative, -(size-1) to -1 + C edx scratch + C esi &src[size] + C edi &dst[size-1] + C ebp multiplier + + movl %ebp, %eax + + mull (%esi,%ecx,4) + + addl %ebx, %eax + movl $0, %ebx + + adcl %edx, %ebx + movl %eax, 4(%edi,%ecx,4) + + incl %ecx + jnz L(mul_1) + + + movl %ebx, 4(%edi) + + +C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2. +C +C The last two addmuls, which are the bottom right corner of the product +C triangle, are left to the end. These are src[size-3]*src[size-2,size-1] +C and src[size-2]*src[size-1]. If size is 4 then it's only these corner +C cases that need to be done. +C +C The unrolled code is the same as mpn_addmul_1(), see that routine for some +C comments. +C +C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive. +C +C VAR_JMP is the computed jump into the unrolled code, stepped by one code +C chunk each outer loop. + +dnl This is also hard-coded in the address calculation below. +deflit(CODE_BYTES_PER_LIMB, 15) + +dnl With &src[size] and &dst[size-1] pointers, the displacements in the +dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above +dnl that an offset must be added to them. 
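+dnl
+dnl  A worked example (editor's note): with UNROLL_COUNT=40, OFFSET below
+dnl  comes out as (40-32)*4 = 32. After %esi and %edi are pulled back by
+dnl  those 32 bytes, the displacements -i*4+OFFSET run from -40*4+32 = -128
+dnl  up to -1*4+32 = 28, so every one still fits in a signed byte.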
+deflit(OFFSET, +ifelse(eval(UNROLL_COUNT>32),1, +eval((UNROLL_COUNT-32)*4), +0)) + + C eax + C ebx carry + C ecx + C edx + C esi &src[size] + C edi &dst[size-1] + C ebp + + movl PARAM_SIZE, %ecx + + subl $4, %ecx + jz L(corner) + + movl %ecx, %edx + negl %ecx + + shll $4, %ecx +ifelse(OFFSET,0,,`subl $OFFSET, %esi') + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx +') + negl %edx + +ifelse(OFFSET,0,,`subl $OFFSET, %edi') + + C The calculated jump mustn't be before the start of the available + C code. This is the limit that UNROLL_COUNT puts on the src operand + C size, but checked here using the jump address directly. + + ASSERT(ae, + `movl_text_address( L(unroll_inner_start), %eax) + cmpl %eax, %ecx') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll_outer_top): + C eax + C ebx high limb to store + C ecx VAR_JMP + C edx VAR_COUNTER, limbs, negative + C esi &src[size], constant + C edi dst ptr, second highest limb of last addmul + C ebp + + movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier + movl %edx, VAR_COUNTER + + movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand + + mull %ebp + +define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')') + + testb $1, %cl + + movl %edx, %ebx C high carry + leal 4(%edi), %edi + + movl %ecx, %edx C jump + + movl %eax, %ecx C low carry + leal CODE_BYTES_PER_LIMB(%edx), %edx + + cmovX( %ebx, %ecx) C high carry reverse + cmovX( %eax, %ebx) C low carry reverse + movl %edx, VAR_JMP + jmp *%edx + + + C Must be on an even address here so the low bit of the jump address + C will indicate which way around ecx/ebx should start. + + ALIGN(2) + +L(unroll_inner_start): + C eax scratch + C ebx carry high + C ecx carry low + C edx scratch + C esi src pointer + C edi dst pointer + C ebp multiplier + C + C 15 code bytes each limb + C ecx/ebx reversed on each chunk + +forloop(`i', UNROLL_COUNT, 1, ` + deflit(`disp_src', eval(-i*4 + OFFSET)) + deflit(`disp_dst', eval(disp_src)) + + m4_assert(`disp_src>=-128 && disp_src<128') + m4_assert(`disp_dst>=-128 && disp_dst<128') + +ifelse(eval(i%2),0,` +Zdisp( movl, disp_src,(%esi), %eax) + mull %ebp +Zdisp( addl, %ebx, disp_dst,(%edi)) + adcl %eax, %ecx + movl %edx, %ebx + adcl $0, %ebx +',` + dnl this one comes out last +Zdisp( movl, disp_src,(%esi), %eax) + mull %ebp +Zdisp( addl, %ecx, disp_dst,(%edi)) + adcl %eax, %ebx + movl %edx, %ecx + adcl $0, %ecx +') +') +L(unroll_inner_end): + + addl %ebx, m4_empty_if_zero(OFFSET)(%edi) + + movl VAR_COUNTER, %edx + adcl $0, %ecx + + movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi) + movl VAR_JMP, %ecx + + incl %edx + jnz L(unroll_outer_top) + + +ifelse(OFFSET,0,,` + addl $OFFSET, %esi + addl $OFFSET, %edi +') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(corner): + C eax + C ebx + C ecx + C edx + C esi &src[size] + C edi &dst[2*size-5] + C ebp + + movl -12(%esi), %eax + + mull -8(%esi) + + addl %eax, (%edi) + movl -12(%esi), %eax + movl $0, %ebx + + adcl %edx, %ebx + + mull -4(%esi) + + addl %eax, %ebx + movl -8(%esi), %eax + + adcl $0, %edx + + addl %ebx, 4(%edi) + movl $0, %ebx + + adcl %edx, %ebx + + mull -4(%esi) + + movl PARAM_SIZE, %ecx + addl %ebx, %eax + + adcl $0, %edx + + movl %eax, 8(%edi) + + movl %edx, 12(%edi) + movl PARAM_DST, %edi + + +C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1]. 
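+C
+C In C terms the shift and the diagonal pass below compute the following
+C (an editor's hedged sketch; sqr_finish_ref is an illustrative name, and
+C the asm interleaves the carry handling instead of making two clean
+C passes):
+C
+C   void
+C   sqr_finish_ref (uint32_t *dst, const uint32_t *src, size_t size)
+C   {
+C     uint32_t bit = 0, cy = 0;
+C     size_t i;
+C     dst[0] = 0;   /* not written yet, treat as zero */
+C     for (i = 1; i <= 2*size - 2; i++)   /* double the cross products */
+C       {
+C         uint32_t w = dst[i];
+C         dst[i] = (w << 1) | bit;
+C         bit = w >> 31;
+C       }
+C     dst[2*size - 1] = bit;   /* the bit shifted out */
+C     for (i = 0; i < size; i++)   /* add src[i]^2 at limb 2*i */
+C       {
+C         uint64_t sq = (uint64_t) src[i] * src[i];
+C         uint64_t lo = (uint64_t) dst[2*i] + (uint32_t) sq + cy;
+C         uint64_t hi = (uint64_t) dst[2*i+1] + (uint32_t) (sq >> 32)
+C                       + (uint32_t) (lo >> 32);
+C         dst[2*i] = (uint32_t) lo;
+C         dst[2*i+1] = (uint32_t) hi;
+C         cy = (uint32_t) (hi >> 32);   /* zero after the last limb */
+C       }
+C   }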
+
+ subl $1, %ecx C size-1
+ xorl %eax, %eax C ready for final adcl, and clear carry
+
+ movl %ecx, %edx
+ movl PARAM_SRC, %esi
+
+
+L(lshift):
+ C eax
+ C ebx
+ C ecx counter, size-1 to 1
+ C edx size-1 (for later use)
+ C esi src (for later use)
+ C edi dst, incrementing
+ C ebp
+
+ rcll 4(%edi)
+ rcll 8(%edi)
+
+ leal 8(%edi), %edi
+ decl %ecx
+ jnz L(lshift)
+
+
+ adcl %eax, %eax
+
+ movl %eax, 4(%edi) C dst most significant limb
+ movl (%esi), %eax C src[0]
+
+ leal 4(%esi,%edx,4), %esi C &src[size]
+ subl %edx, %ecx C -(size-1)
+
+
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+
+ mull %eax
+
+ movl %eax, (%edi,%ecx,8) C dst[0]
+
+
+L(diag):
+ C eax scratch
+ C ebx scratch
+ C ecx counter, negative
+ C edx carry
+ C esi &src[size]
+ C edi dst[2*size-2]
+ C ebp
+
+ movl (%esi,%ecx,4), %eax
+ movl %edx, %ebx
+
+ mull %eax
+
+ addl %ebx, 4(%edi,%ecx,8)
+ adcl %eax, 8(%edi,%ecx,8)
+ adcl $0, %edx
+
+ incl %ecx
+ jnz L(diag)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+
+ addl %edx, 4(%edi) C dst most significant limb
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ifdef(`PIC',`
+L(pic_calc):
+ addl (%esp), %ecx
+ addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
+ addl %edx, %ecx
+ ret_internal
+')
+
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/addmul_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/addmul_1.asm
new file mode 100644
index 0000000..144b627
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/addmul_1.asm
@@ -0,0 +1,37 @@
+dnl Intel P6/SSE2 mpn_addmul_1.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Write P6 specific SSE2 code.
+
+MULFUNC_PROLOGUE(mpn_addmul_1)
+include_mpn(`x86/pentium4/sse2/addmul_1.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/gmp-mparam.h
new file mode 100644
index 0000000..a1e261b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/gmp-mparam.h
@@ -0,0 +1,200 @@
+/* Intel P6/sse2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2008-2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the + value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard limit in + mpn/x86/p6/sqr_basecase.asm. */ + + +/* 1867 MHz P6 model 13 */ + +#define MOD_1_NORM_THRESHOLD 4 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 21 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 77 +#define MUL_TOOM44_THRESHOLD 169 +#define MUL_TOOM6H_THRESHOLD 246 +#define MUL_TOOM8H_THRESHOLD 381 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 101 +#define SQR_TOOM4_THRESHOLD 154 +#define SQR_TOOM6_THRESHOLD 222 +#define SQR_TOOM8_THRESHOLD 527 + +#define MULMID_TOOM42_THRESHOLD 58 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 690 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 565, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 39, 9}, { 23, 5}, \ + { 383, 4}, { 991, 5}, { 511, 6}, { 267, 7}, \ + { 157, 8}, { 91, 9}, { 47, 8}, { 111, 9}, \ + { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \ + { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \ + { 159,10}, { 335, 9}, { 671,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 399, 9}, { 799,10}, \ + { 415,11}, { 223,12}, { 127,11}, { 255,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 607,11}, \ + { 319,10}, { 671,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 831,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \ + { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \ + { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \ + { 
1599,11}, { 863,12}, { 447,11}, { 959,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \ + { 1471,13}, { 383,12}, { 831,11}, { 1727,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \ + { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \ + { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \ + { 1407,12}, { 2815,14}, { 767,13}, { 1663,12}, \ + { 3455,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 132 +#define MUL_FFT_THRESHOLD 7424 + +#define SQR_FFT_MODF_THRESHOLD 565 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 472, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 63, 4}, { 1023, 8}, { 67, 9}, \ + { 39, 5}, { 639, 4}, { 1471, 6}, { 383, 7}, \ + { 209, 8}, { 119, 9}, { 63, 7}, { 255, 8}, \ + { 139, 9}, { 71, 8}, { 143, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159, 8}, { 319, 9}, \ + { 167,10}, { 95,11}, { 63,10}, { 143, 9}, \ + { 287,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 543, 8}, \ + { 1087,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \ + { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 399, 9}, { 799,10}, { 415, 9}, \ + { 831,11}, { 223,12}, { 127,11}, { 255,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 671, 9}, { 1343,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 831,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,12}, \ + { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \ + { 447,11}, { 959,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \ + { 639,12}, { 1471,13}, { 767,12}, { 1727,13}, \ + { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \ + { 2111,13}, { 1151,12}, { 2431,13}, { 1407,14}, \ + { 767,13}, { 1663,12}, { 3455,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 146 +#define SQR_FFT_THRESHOLD 5760 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 31 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 100 +#define SQRLO_SQR_THRESHOLD 9236 + +#define DC_DIV_QR_THRESHOLD 25 +#define DC_DIVAPPR_Q_THRESHOLD 55 +#define DC_BDIV_QR_THRESHOLD 60 +#define DC_BDIV_Q_THRESHOLD 132 + +#define INV_MULMOD_BNM1_THRESHOLD 38 +#define INV_NEWTON_THRESHOLD 65 +#define INV_APPR_THRESHOLD 65 + +#define BINV_NEWTON_THRESHOLD 252 +#define REDC_1_TO_REDC_N_THRESHOLD 62 + +#define MU_DIV_QR_THRESHOLD 1164 +#define MU_DIVAPPR_Q_THRESHOLD 748 +#define MUPI_DIV_QR_THRESHOLD 38 +#define MU_BDIV_QR_THRESHOLD 1360 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define POWM_SEC_TABLE 2,23,258,879,2246 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 25 +#define SET_STR_DC_THRESHOLD 582 +#define SET_STR_PRECOMPUTE_THRESHOLD 1118 + +#define FAC_DSC_THRESHOLD 178 +#define FAC_ODD_THRESHOLD 34 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD_THRESHOLD 69 +#define HGCD_APPR_THRESHOLD 112 +#define 
HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 386 +#define GCDEXT_DC_THRESHOLD 303 +#define JACOBI_BASE_METHOD 1 diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_1.asm new file mode 100644 index 0000000..8b7b7ad --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_1.asm @@ -0,0 +1,34 @@ +dnl Intel P6/SSE2 mpn_mod_1_1. + +dnl Copyright 2009, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_mod_1_1p) +include_mpn(`x86/pentium4/sse2/mod_1_1.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_4.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_4.asm new file mode 100644 index 0000000..49c96c6 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_4.asm @@ -0,0 +1,34 @@ +dnl Intel P6/SSE2 mpn_mod_1_4. + +dnl Copyright 2009, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_mod_1s_4p) +include_mpn(`x86/pentium4/sse2/mod_1_4.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm new file mode 100644 index 0000000..50e5b69 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm @@ -0,0 +1,38 @@ +dnl Intel P6/SSE2 mpn_mul_1. + +dnl Copyright 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm
new file mode 100644
index 0000000..50e5b69
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm
@@ -0,0 +1,38 @@
+dnl Intel P6/SSE2 mpn_mul_1.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C  * Write P6 specific SSE2 code.  It should reach 3 c/l.
+C    The Pentium4 code runs at 4.2 c/l.
+
+MULFUNC_PROLOGUE(mpn_mul_1)
+include_mpn(`x86/pentium4/sse2/mul_1.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mul_basecase.asm
new file mode 100644
index 0000000..4687625
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/mul_basecase.asm
@@ -0,0 +1,35 @@
+dnl Intel P6/SSE2 mpn_mul_basecase.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_mul_basecase)
+include_mpn(`x86/pentium4/sse2/mul_basecase.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/popcount.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/popcount.asm
new file mode 100644
index 0000000..4c02b93
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/popcount.asm
@@ -0,0 +1,35 @@
+dnl Intel P6/SSE2 mpn_popcount -- population count.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86/pentium4/sse2/popcount.asm')
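
For reference, mpn_popcount counts the set bits across an array of limbs;
the included SSE2 code computes the same result, just faster. A portable C
sketch of the semantics (a reference fallback, not the vectorized algorithm
in the included file; ref_popcount is an illustrative name):

    /* Count set bits over n limbs, clearing the lowest set bit per
       iteration (Kernighan's trick).  Same result as mpn_popcount.  */
    typedef unsigned long mp_limb_t;

    unsigned long
    ref_popcount (const mp_limb_t *up, long n)
    {
      unsigned long cnt = 0;
      for (long i = 0; i < n; i++)
        for (mp_limb_t x = up[i]; x != 0; x &= x - 1)
          cnt++;
      return cnt;
    }
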
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/sqr_basecase.asm
new file mode 100644
index 0000000..76b574b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/sqr_basecase.asm
@@ -0,0 +1,35 @@
+dnl Intel P6/SSE2 mpn_sqr_basecase.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_sqr_basecase)
+include_mpn(`x86/pentium4/sse2/sqr_basecase.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/p6/sse2/submul_1.asm b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/submul_1.asm
new file mode 100644
index 0000000..69d940d
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/p6/sse2/submul_1.asm
@@ -0,0 +1,35 @@
+dnl Intel P6/SSE2 mpn_submul_1.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_submul_1)
+include_mpn(`x86/pentium4/sse2/submul_1.asm')