author      Thomas Voss <mail@thomasvoss.com>    2024-06-21 23:36:36 +0200
committer   Thomas Voss <mail@thomasvoss.com>    2024-06-21 23:42:26 +0200
commit      a89a14ef5da44684a16b204e7a70460cc8c4922a (patch)
tree        b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/x86/k7
parent      1db63fcedab0b288820d66e100b1877b1a5a8851 (diff)
Basic constant folding implementation
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/x86/k7')
24 files changed, 6741 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/README b/vendor/gmp-6.3.0/mpn/x86/k7/README new file mode 100644 index 0000000..5711b61 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/README @@ -0,0 +1,174 @@ +Copyright 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + AMD K7 MPN SUBROUTINES + + +This directory contains code optimized for the AMD Athlon CPU. + +The mmx subdirectory has routines using MMX instructions. All Athlons have +MMX, the separate directory is just so that configure can omit it if the +assembler doesn't support MMX. + + + +STATUS + +Times for the loops, with all code and data in L1 cache. + + cycles/limb + mpn_add/sub_n 1.6 + + mpn_copyi 0.75 or 1.0 \ varying with data alignment + mpn_copyd 0.75 or 1.0 / + + mpn_divrem_1 17.0 integer part, 15.0 fractional part + mpn_mod_1 17.0 + mpn_divexact_by3 8.0 + + mpn_l/rshift 1.2 + + mpn_mul_1 3.4 + mpn_addmul/submul_1 3.9 + + mpn_mul_basecase 4.42 cycles/crossproduct (approx) + mpn_sqr_basecase 2.3 cycles/crossproduct (approx) + or 4.55 cycles/triangleproduct (approx) + +Prefetching of sources hasn't yet been tried. + + + +NOTES + +cmov, MMX, 3DNow and some extensions to MMX and 3DNow are available. + +Write-allocate L1 data cache means prefetching of destinations is unnecessary. + +Floating point multiplications can be done in parallel with integer +multiplications, but there doesn't seem to be any way to make use of this. + +Unsigned "mul"s can be issued every 3 cycles. This suggests 3 is a limit on +the speed of the multiplication routines. The documentation shows mul +executing in IEU0 (or maybe in IEU0 and IEU1 together), so it might be that, +to get near 3 cycles code has to be arranged so that nothing else is issued +to IEU0. A busy IEU0 could explain why some code takes 4 cycles and other +apparently equivalent code takes 5. + + + +OPTIMIZATIONS + +Unrolled loops are used to reduce looping overhead. The unrolling is +configurable up to 32 limbs/loop for most routines and up to 64 for some. +The K7 has 64k L1 code cache so quite big unrolling is allowable. + +Computed jumps into the unrolling are used to handle sizes not a multiple of +the unrolling. An attractive feature of this is that times increase +smoothly with operand size, but it may be that some routines should just +have simple loops to finish up, especially when PIC adds between 2 and 16 +cycles to get %eip. 
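
The computed-jump idea can be pictured in C as a switch that drops into the
middle of an unrolled loop, in the style of Duff's device, so a size that is
not a multiple of the unrolling needs no separate fix-up loop.  The sketch
below is an illustration only, not code from this patch or from GMP, and the
name copy_limbs is made up.

    #include <stddef.h>

    /* Copy n limbs with a 4-way unrolled loop, entering the unrolled block
       at a computed position so any n is handled.  The switch plays the part
       of the assembly's computed jmp into the unrolling.  */
    void copy_limbs (unsigned long *dst, const unsigned long *src, size_t n)
    {
      if (n == 0)
        return;
      size_t iters = (n + 3) / 4;       /* passes through the unrolled block */
      switch (n % 4)                    /* computed entry point */
        {
        case 0: do { *dst++ = *src++;   /* fall through, like the jump table */
        case 3:      *dst++ = *src++;
        case 2:      *dst++ = *src++;
        case 1:      *dst++ = *src++;
                   } while (--iters > 0);
        }
    }

Handling an awkward size this way costs one indirect branch rather than a
second loop, which is why the times increase smoothly with operand size.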
+ +Position independent code is implemented using a call to get %eip for the +computed jumps and a ret is always done, rather than an addl $4,%esp or a +popl, so the CPU return address branch prediction stack stays synchronised +with the actual stack in memory. + +Branch prediction, in absence of any history, will guess forward jumps are +not taken and backward jumps are taken. Where possible it's arranged that +the less likely or less important case is under a taken forward jump. + + + +CODING + +Instructions in general code have been shown grouped if they can execute +together, which means up to three direct-path instructions which have no +successive dependencies. K7 always decodes three and has out-of-order +execution, but the groupings show what slots might be available and what +dependency chains exist. + +When there's vector-path instructions an effort is made to get triplets of +direct-path instructions in between them, even if there's dependencies, +since this maximizes decoding throughput and might save a cycle or two if +decoding is the limiting factor. + + + +INSTRUCTIONS + +adcl direct +divl 39 cycles back-to-back +lodsl,etc vector +loop 1 cycle vector (decl/jnz opens up one decode slot) +movd reg vector +movd mem direct +mull issue every 3 cycles, latency 4 cycles low word, 6 cycles high word +popl vector (use movl for more than one pop) +pushl direct, will pair with a load +shrdl %cl vector, 3 cycles, seems to be 3 decode too +xorl r,r false read dependency recognised + + + +REFERENCES + +"AMD Athlon Processor X86 Code Optimization Guide", AMD publication number +22007, revision K, February 2002. Available on-line, + +http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22007.pdf + +"3DNow Technology Manual", AMD publication number 21928G/0-March 2000. +This describes the femms and prefetch instructions. Available on-line, + +http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21928.pdf + +"AMD Extensions to the 3DNow and MMX Instruction Sets Manual", AMD +publication number 22466, revision D, March 2000. This describes +instructions added in the Athlon processor, such as pswapd and the extra +prefetch forms. Available on-line, + +http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22466.pdf + +"3DNow Instruction Porting Guide", AMD publication number 22621, revision B, +August 1999. This has some notes on general Athlon optimizations as well as +3DNow. Available on-line, + +http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22621.pdf + + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/addlsh1_n.asm b/vendor/gmp-6.3.0/mpn/x86/k7/addlsh1_n.asm new file mode 100644 index 0000000..2cba1eb --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/addlsh1_n.asm @@ -0,0 +1,196 @@ +dnl AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1) + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns. +C The innerloop is 2*3-way unrolled, which is best we can do with the available +C registers. It seems tricky to use the same structure for rsblsh1_n, since we +C cannot feed carry between operations there. + +C cycles/limb +C P5 +C P6 model 0-8,10-12 +C P6 model 9 (Banias) +C P6 model 13 (Dothan) 5.4 (worse than add_n + lshift) +C P4 model 0 (Willamette) +C P4 model 1 (?) +C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C Intel Atom 6 +C AMD K6 ? +C AMD K7 2.5 +C AMD K8 + +C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32 +C processors. It uses 2*3-way unrolling, for good reasons. Unfortunately, +C that means we need an initial magic multiply. +C +C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern. We +C cannot do rsblsh1_n since we feed carry from the shift blocks to the +C add/subtract blocks, which is right for addition but reversed for +C subtraction. We could perhaps do sublsh1_n, with some extra move insns, +C without losing any time, since we're not issue limited but carry recurrency +C latency. +C +C Breaking carry recurrency might be a good idea. We would then need separate +C registers for the shift carry and add/subtract carry, which in turn would +C force us to 2*2-way unrolling. 
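
As a plain C model of the operation and of the two carry chains discussed
above (the left-shift carry of vp[] and the add carry), something like the
following computes the same result.  It is an illustration only, not code
from this patch or from GMP; the name addlsh1_n_model is made up and a
32-bit limb is assumed.

    #include <stdint.h>
    #include <stddef.h>

    /* rp[] = up[] + (vp[] << 1), returning the carry out of the top limb
       (0, 1 or 2).  One carry chain for the shift, one for the addition.  */
    uint32_t addlsh1_n_model (uint32_t *rp, const uint32_t *up,
                              const uint32_t *vp, size_t n)
    {
      uint32_t shift_carry = 0, add_carry = 0;
      for (size_t i = 0; i < n; i++)
        {
          uint32_t shifted = (vp[i] << 1) | shift_carry;
          shift_carry = vp[i] >> 31;          /* bit shifted out of vp[i] */

          uint32_t sum = up[i] + shifted;
          uint32_t c1 = sum < up[i];          /* carry from the add */
          uint32_t res = sum + add_carry;
          uint32_t c2 = res < sum;            /* carry from the incoming carry */
          rp[i] = res;
          add_carry = c1 | c2;                /* at most one can be set */
        }
      return shift_carry + add_carry;
    }

The assembly keeps both of these carries in the single carry flag, parking
bits in %edx between the shift and add groups as its inline comments note;
holding them in separate variables as above corresponds to the "breaking
carry recurrency" idea mentioned in the comments.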
+ +defframe(PARAM_SIZE, 16) +defframe(PARAM_DBLD, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(VAR_COUNT,`PARAM_DST') +define(VAR_TMP,`PARAM_DBLD') + +ASM_START() + TEXT + ALIGN(8) +PROLOGUE(mpn_addlsh1_n) +deflit(`FRAME',0) + +define(`rp', `%edi') +define(`up', `%esi') +define(`vp', `%ebp') + + mov $0x2aaaaaab, %eax + + push %ebx FRAME_pushl() + mov PARAM_SIZE, %ebx C size + + push rp FRAME_pushl() + mov PARAM_DST, rp + + mul %ebx + + push up FRAME_pushl() + mov PARAM_SRC, up + + not %edx C count = -(size\8)-1 + mov %edx, VAR_COUNT + + push vp FRAME_pushl() + mov PARAM_DBLD, vp + + lea 3(%edx,%edx,2), %ecx C count*3+3 = -(size\6)*3 + xor %edx, %edx + lea (%ebx,%ecx,2), %ebx C size + (count*3+3)*2 = size % 6 + or %ebx, %ebx + jz L(exact) + +L(oop): +ifdef(`CPU_P6',` + shr %edx ') C restore 2nd saved carry bit + mov (vp), %eax + adc %eax, %eax + rcr %edx C restore 1st saved carry bit + lea 4(vp), vp + adc (up), %eax + lea 4(up), up + adc %edx, %edx C save a carry bit in edx +ifdef(`CPU_P6',` + adc %edx, %edx ') C save another carry bit in edx + dec %ebx + mov %eax, (rp) + lea 4(rp), rp + jnz L(oop) + mov vp, VAR_TMP +L(exact): + incl VAR_COUNT + jz L(end) + + ALIGN(16) +L(top): +ifdef(`CPU_P6',` + shr %edx ') C restore 2nd saved carry bit + mov (vp), %eax + adc %eax, %eax + mov 4(vp), %ebx + adc %ebx, %ebx + mov 8(vp), %ecx + adc %ecx, %ecx + + rcr %edx C restore 1st saved carry bit + + adc (up), %eax + mov %eax, (rp) + adc 4(up), %ebx + mov %ebx, 4(rp) + adc 8(up), %ecx + mov %ecx, 8(rp) + + mov 12(vp), %eax + adc %eax, %eax + mov 16(vp), %ebx + adc %ebx, %ebx + mov 20(vp), %ecx + adc %ecx, %ecx + + lea 24(vp), vp + adc %edx, %edx C save a carry bit in edx + + adc 12(up), %eax + mov %eax, 12(rp) + adc 16(up), %ebx + mov %ebx, 16(rp) + adc 20(up), %ecx + + lea 24(up), up + +ifdef(`CPU_P6',` + adc %edx, %edx ') C save another carry bit in edx + mov %ecx, 20(rp) + incl VAR_COUNT + lea 24(rp), rp + jne L(top) + +L(end): + pop vp FRAME_popl() + pop up FRAME_popl() + +ifdef(`CPU_P6',` + xor %eax, %eax + shr $1, %edx + adc %edx, %eax +',` + adc $0, %edx + mov %edx, %eax +') + pop rp FRAME_popl() + pop %ebx FRAME_popl() + ret +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/aors_n.asm b/vendor/gmp-6.3.0/mpn/x86/k7/aors_n.asm new file mode 100644 index 0000000..1a08072 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/aors_n.asm @@ -0,0 +1,258 @@ +dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract. + +dnl Copyright 1999-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K7: 1.64 cycles/limb (at 16 limbs/loop). + + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 8 1.9 +dnl 16 1.64 +dnl 32 1.7 +dnl 64 2.0 +dnl Maximum possible with the current code is 64. + +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_add_n', ` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + define(M4_description, add) +',`ifdef(`OPERATION_sub_n', ` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + define(M4_description, subtract) +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); +C +C Calculate src1,size M4_description src2,size, and store the result in +C dst,size. The return value is the carry bit from the top of the result (1 +C or 0). +C +C The _nc version accepts 1 or 0 for an initial carry into the low limb of +C the calculation. Note values other than 1 or 0 here will lead to garbage +C results. +C +C This code runs at 1.64 cycles/limb, which might be the best possible with +C plain integer operations. Each limb is 2 loads and 1 store, any 2 of +C which can be done each cycle, leading to 1.5 c/l. + +dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1. +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 8) +',` +deflit(UNROLL_THRESHOLD, 8) +') + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBP, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EBX, -12) +defframe(SAVE_EDI, -16) +deflit(STACK_SPACE, 16) + + TEXT + ALIGN(32) +deflit(`FRAME',0) + +PROLOGUE(M4_function_nc) + movl PARAM_CARRY, %eax + jmp L(start) +EPILOGUE() + +PROLOGUE(M4_function_n) + + xorl %eax, %eax C carry +L(start): + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %edi, SAVE_EDI + movl %ebx, SAVE_EBX + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_SRC2, %edx + movl PARAM_SRC1, %ebx + jae L(unroll) + + movl PARAM_DST, %edi + leal (%ebx,%ecx,4), %ebx + leal (%edx,%ecx,4), %edx + + leal (%edi,%ecx,4), %edi + negl %ecx + shrl %eax + + C This loop in in a single 16 byte code block already, so no + C alignment necessary. +L(simple): + C eax scratch + C ebx src1 + C ecx counter + C edx src2 + C esi + C edi dst + C ebp + + movl (%ebx,%ecx,4), %eax + M4_inst (%edx,%ecx,4), %eax + movl %eax, (%edi,%ecx,4) + incl %ecx + jnz L(simple) + + movl $0, %eax + movl SAVE_EDI, %edi + + movl SAVE_EBX, %ebx + setc %al + addl $STACK_SPACE, %esp + + ret + + +C ----------------------------------------------------------------------------- + C This is at 0x55, close enough to aligned. 
+L(unroll): +deflit(`FRAME',STACK_SPACE) + movl %ebp, SAVE_EBP + andl $-2, %ecx C size low bit masked out + andl $1, PARAM_SIZE C size low bit kept + + movl %ecx, %edi + decl %ecx + movl PARAM_DST, %ebp + + shrl $UNROLL_LOG2, %ecx + negl %edi + movl %esi, SAVE_ESI + + andl $UNROLL_MASK, %edi + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%edi,%edi,8), %esi C 9 bytes per +') + negl %edi + shrl %eax + + leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx + leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx + leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi + + jmp *%esi + + +ifdef(`PIC',` +L(pic_calc): + C See mpn/x86/README about old gas bugs + leal (%edi,%edi,8), %esi + addl $L(entry)-L(here), %esi + addl (%esp), %esi + ret_internal +') + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(top): + C eax zero + C ebx src1 + C ecx counter + C edx src2 + C esi scratch (was computed jump) + C edi dst + C ebp scratch + + leal UNROLL_BYTES(%edx), %edx + +L(entry): +deflit(CHUNK_COUNT, 2) +forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%ebx), %esi) + movl disp1(%ebx), %ebp +Zdisp( M4_inst,disp0,(%edx), %esi) +Zdisp( movl, %esi, disp0,(%edi)) + M4_inst disp1(%edx), %ebp + movl %ebp, disp1(%edi) +') + + decl %ecx + leal UNROLL_BYTES(%ebx), %ebx + leal UNROLL_BYTES(%edi), %edi + jns L(top) + + + mov PARAM_SIZE, %esi + movl SAVE_EBP, %ebp + movl $0, %eax + + decl %esi + js L(even) + + movl (%ebx), %ecx + M4_inst UNROLL_BYTES(%edx), %ecx + movl %ecx, (%edi) +L(even): + + movl SAVE_EDI, %edi + movl SAVE_EBX, %ebx + setc %al + + movl SAVE_ESI, %esi + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/aorsmul_1.asm b/vendor/gmp-6.3.0/mpn/x86/k7/aorsmul_1.asm new file mode 100644 index 0000000..eec8df6 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/aorsmul_1.asm @@ -0,0 +1,167 @@ +dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. + +dnl Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5 +C P6 model 0-8,10-12 +C P6 model 9 (Banias) 6.5 +C P6 model 13 (Dothan) +C P4 model 0 (Willamette) +C P4 model 1 (?) +C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C AMD K6 +C AMD K7 3.75 +C AMD K8 + +C TODO +C * Improve feed-in and wind-down code. 
We beat the old code for all n != 1, +C but lose by 2x for n == 1. + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + add $-16, %esp + mov %ebp, (%esp) + mov %ebx, 4(%esp) + mov %esi, 8(%esp) + mov %edi, 12(%esp) + + mov 20(%esp), %edi + mov 24(%esp), %esi + mov 28(%esp), %eax + mov 32(%esp), %ecx + mov %eax, %ebx + shr $2, %eax + mov %eax, 28(%esp) + mov (%esi), %eax + and $3, %ebx + jz L(b0) + cmp $2, %ebx + jz L(b2) + jg L(b3) + +L(b1): lea -4(%esi), %esi + lea -4(%edi), %edi + mul %ecx + mov %eax, %ebx + mov %edx, %ebp + cmpl $0, 28(%esp) + jz L(cj1) + mov 8(%esi), %eax + jmp L(1) + +L(b2): mul %ecx + mov %eax, %ebp + mov 4(%esi), %eax + mov %edx, %ebx + cmpl $0, 28(%esp) + jne L(2) + jmp L(cj2) + +L(b3): lea -12(%esi), %esi + lea -12(%edi), %edi + mul %ecx + mov %eax, %ebx + mov %edx, %ebp + mov 16(%esi), %eax + incl 28(%esp) + jmp L(3) + +L(b0): lea -8(%esi), %esi + lea -8(%edi), %edi + mul %ecx + mov %eax, %ebp + mov 12(%esi), %eax + mov %edx, %ebx + jmp L(0) + + ALIGN(16) +L(top): lea 16(%edi), %edi +L(2): mul %ecx + ADDSUB %ebp, 0(%edi) + mov $0, %ebp + adc %eax, %ebx + mov 8(%esi), %eax + adc %edx, %ebp +L(1): mul %ecx + ADDSUB %ebx, 4(%edi) + mov $0, %ebx + adc %eax, %ebp + mov 12(%esi), %eax + adc %edx, %ebx +L(0): mul %ecx + ADDSUB %ebp, 8(%edi) + mov $0, %ebp + adc %eax, %ebx + adc %edx, %ebp + mov 16(%esi), %eax +L(3): mul %ecx + ADDSUB %ebx, 12(%edi) + adc %eax, %ebp + mov 20(%esi), %eax + lea 16(%esi), %esi + mov $0, %ebx + adc %edx, %ebx + decl 28(%esp) + jnz L(top) + +L(end): lea 16(%edi), %edi +L(cj2): mul %ecx + ADDSUB %ebp, (%edi) + adc %eax, %ebx + adc $0, %edx +L(cj1): ADDSUB %ebx, 4(%edi) + adc $0, %edx + mov %edx, %eax + mov (%esp), %ebp + mov 4(%esp), %ebx + mov 8(%esp), %esi + mov 12(%esp), %edi + add $16, %esp + ret +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/x86/k7/bdiv_q_1.asm new file mode 100644 index 0000000..2af7bb9 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/bdiv_q_1.asm @@ -0,0 +1,245 @@ +dnl AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division. + +dnl Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato. + +dnl Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C cycles/limb +C Athlon: 11.0 +C Hammer: 9.0 + + +C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C +C The dependent chain is mul+imul+sub for 11 cycles and that speed is +C achieved with no special effort. The load and shrld latencies are hidden +C by out of order execution. +C +C It's a touch faster on size==1 to use the mul-by-inverse than divl. + +defframe(PARAM_SHIFT, 24) +defframe(PARAM_INVERSE,20) +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +defframe(VAR_INVERSE, -20) +defframe(VAR_DST_END, -24) + +deflit(STACK_SPACE, 24) + + TEXT + +C mp_limb_t +C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t inverse, int shift) + ALIGN(16) +PROLOGUE(mpn_pi1_bdiv_q_1) +deflit(`FRAME',0) + + subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) + movl PARAM_SHIFT, %ecx C shift count + + movl %ebp, SAVE_EBP + movl PARAM_SIZE, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebx, SAVE_EBX + + leal (%esi,%ebp,4), %esi C src end + leal (%edi,%ebp,4), %edi C dst end + negl %ebp C -size + + movl PARAM_INVERSE, %eax C inv + +L(common): + movl %eax, VAR_INVERSE + movl (%esi,%ebp,4), %eax C src[0] + + incl %ebp + jz L(one) + + movl (%esi,%ebp,4), %edx C src[1] + + shrdl( %cl, %edx, %eax) + + movl %edi, VAR_DST_END + xorl %ebx, %ebx + jmp L(entry) + + ALIGN(8) +L(top): + C eax q + C ebx carry bit, 0 or 1 + C ecx shift + C edx + C esi src end + C edi dst end + C ebp counter, limbs, negative + + mull PARAM_DIVISOR C carry limb in edx + + movl -4(%esi,%ebp,4), %eax + movl (%esi,%ebp,4), %edi + + shrdl( %cl, %edi, %eax) + + subl %ebx, %eax C apply carry bit + setc %bl + movl VAR_DST_END, %edi + + subl %edx, %eax C apply carry limb + adcl $0, %ebx + +L(entry): + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi,%ebp,4) + incl %ebp + jnz L(top) + + + mull PARAM_DIVISOR C carry limb in edx + + movl -4(%esi), %eax C src high limb + shrl %cl, %eax + movl SAVE_ESI, %esi + + subl %ebx, %eax C apply carry bit + movl SAVE_EBX, %ebx + movl SAVE_EBP, %ebp + + subl %edx, %eax C apply carry limb + + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi) + movl SAVE_EDI, %edi + addl $STACK_SPACE, %esp + + ret + +L(one): + shrl %cl, %eax + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + + imull VAR_INVERSE, %eax + + movl SAVE_EBP, %ebp + + movl %eax, -4(%edi) + movl SAVE_EDI, %edi + addl $STACK_SPACE, %esp + + ret +EPILOGUE() + +C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C + + ALIGN(16) +PROLOGUE(mpn_bdiv_q_1) +deflit(`FRAME',0) + + movl PARAM_DIVISOR, %eax + subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) + movl $-1, %ecx C shift count + + movl %ebp, SAVE_EBP + movl PARAM_SIZE, %ebp + + movl %esi, SAVE_ESI + movl %edi, SAVE_EDI + + C If there's usually only one or two trailing zero bits then this + C should be faster than bsfl. 
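
The method used by mpn_pi1_bdiv_q_1 and mpn_bdiv_q_1 here (and by
mpn_divexact_1 below, from which this file was rearranged) is exact division
by a multiplicative inverse: strip the trailing zero bits of the divisor,
compute inv with d*inv == 1 mod 2^32, then each quotient limb costs a
multiply instead of a divide.  The C below is a rough model of that method,
for illustration only and not code from this patch or from GMP; names are
made up, 32-bit limbs are assumed, and it seeds the Newton iteration from d
itself (correct to 3 bits) instead of the binvert_limb_table lookup the
assembly uses.

    #include <stdint.h>
    #include <stddef.h>

    static uint32_t inverse_mod_2_32 (uint32_t d)   /* d must be odd */
    {
      uint32_t inv = d;              /* d*d == 1 mod 8 for any odd d */
      inv *= 2 - d * inv;            /* correct to 6 bits  */
      inv *= 2 - d * inv;            /* 12 bits */
      inv *= 2 - d * inv;            /* 24 bits */
      inv *= 2 - d * inv;            /* full 32 bits */
      return inv;
    }

    /* qp[] = np[] / d, assuming the division is exact.  */
    void divexact_1_model (uint32_t *qp, const uint32_t *np,
                           size_t n, uint32_t d)
    {
      int shift = 0;
      while ((d & 1) == 0)           /* the strip_twos step */
        {
          d >>= 1;
          shift++;
        }
      uint32_t inv = inverse_mod_2_32 (d);

      uint32_t borrow = 0;
      for (size_t i = 0; i < n; i++)
        {
          /* next limb of the dividend, shifted right by the stripped twos */
          uint32_t s = np[i] >> shift;
          if (shift != 0 && i + 1 < n)
            s |= np[i + 1] << (32 - shift);

          uint32_t t = s - borrow;
          uint32_t q = t * inv;      /* exact quotient limb */
          qp[i] = q;
          /* low word of q*d reproduces t; its high word, plus the borrow
             from the subtraction, must come off the next limb */
          borrow = (uint32_t) (((uint64_t) q * d) >> 32) + (s < borrow);
        }
    }

The borrow update mirrors the "apply carry bit" and "apply carry limb" steps
in the loop below, and the repeated inv = 2*inv - inv*inv*d lines in the
assembly compute the same Newton step in a different algebraic form.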
+L(strip_twos): + incl %ecx + shrl %eax + jnc L(strip_twos) + + movl %ebx, SAVE_EBX + leal 1(%eax,%eax), %ebx C d without twos + andl $127, %eax C d/2, 7 bits + +ifdef(`PIC',` + LEA( binvert_limb_table, %edx) + movzbl (%eax,%edx), %eax C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + + leal (%eax,%eax), %edx C 2*inv + movl %ebx, PARAM_DIVISOR C d without twos + + imull %eax, %eax C inv*inv + + movl PARAM_SRC, %esi + movl PARAM_DST, %edi + + imull %ebx, %eax C inv*inv*d + + subl %eax, %edx C inv = 2*inv - inv*inv*d + leal (%edx,%edx), %eax C 2*inv + + imull %edx, %edx C inv*inv + + leal (%esi,%ebp,4), %esi C src end + leal (%edi,%ebp,4), %edi C dst end + negl %ebp C -size + + imull %ebx, %edx C inv*inv*d + + subl %edx, %eax C inv = 2*inv - inv*inv*d + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + imull PARAM_DIVISOR, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + jmp L(common) +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/dive_1.asm b/vendor/gmp-6.3.0/mpn/x86/k7/dive_1.asm new file mode 100644 index 0000000..458bd02 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/dive_1.asm @@ -0,0 +1,208 @@ +dnl AMD K7 mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C Athlon: 11.0 +C Hammer: 9.0 + + +C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C +C The dependent chain is mul+imul+sub for 11 cycles and that speed is +C achieved with no special effort. The load and shrld latencies are hidden +C by out of order execution. +C +C It's a touch faster on size==1 to use the mul-by-inverse than divl. + +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +defframe(VAR_INVERSE, -20) +defframe(VAR_DST_END, -24) + +deflit(STACK_SPACE, 24) + + TEXT + + ALIGN(16) +PROLOGUE(mpn_divexact_1) +deflit(`FRAME',0) + + movl PARAM_DIVISOR, %eax + subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) + movl $-1, %ecx C shift count + + movl %ebp, SAVE_EBP + movl PARAM_SIZE, %ebp + + movl %esi, SAVE_ESI + movl %edi, SAVE_EDI + + C If there's usually only one or two trailing zero bits then this + C should be faster than bsfl. 
+L(strip_twos): + incl %ecx + shrl %eax + jnc L(strip_twos) + + movl %ebx, SAVE_EBX + leal 1(%eax,%eax), %ebx C d without twos + andl $127, %eax C d/2, 7 bits + +ifdef(`PIC',` + LEA( binvert_limb_table, %edx) + movzbl (%eax,%edx), %eax C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + + leal (%eax,%eax), %edx C 2*inv + movl %ebx, PARAM_DIVISOR C d without twos + + imull %eax, %eax C inv*inv + + movl PARAM_SRC, %esi + movl PARAM_DST, %edi + + imull %ebx, %eax C inv*inv*d + + subl %eax, %edx C inv = 2*inv - inv*inv*d + leal (%edx,%edx), %eax C 2*inv + + imull %edx, %edx C inv*inv + + leal (%esi,%ebp,4), %esi C src end + leal (%edi,%ebp,4), %edi C dst end + negl %ebp C -size + + imull %ebx, %edx C inv*inv*d + + subl %edx, %eax C inv = 2*inv - inv*inv*d + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + imull PARAM_DIVISOR, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + movl %eax, VAR_INVERSE + movl (%esi,%ebp,4), %eax C src[0] + + incl %ebp + jz L(one) + + movl (%esi,%ebp,4), %edx C src[1] + + shrdl( %cl, %edx, %eax) + + movl %edi, VAR_DST_END + xorl %ebx, %ebx + jmp L(entry) + + ALIGN(8) +L(top): + C eax q + C ebx carry bit, 0 or 1 + C ecx shift + C edx + C esi src end + C edi dst end + C ebp counter, limbs, negative + + mull PARAM_DIVISOR C carry limb in edx + + movl -4(%esi,%ebp,4), %eax + movl (%esi,%ebp,4), %edi + + shrdl( %cl, %edi, %eax) + + subl %ebx, %eax C apply carry bit + setc %bl + movl VAR_DST_END, %edi + + subl %edx, %eax C apply carry limb + adcl $0, %ebx + +L(entry): + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi,%ebp,4) + incl %ebp + jnz L(top) + + + mull PARAM_DIVISOR C carry limb in edx + + movl -4(%esi), %eax C src high limb + shrl %cl, %eax + movl SAVE_ESI, %esi + + subl %ebx, %eax C apply carry bit + movl SAVE_EBX, %ebx + movl SAVE_EBP, %ebp + + subl %edx, %eax C apply carry limb + + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi) + movl SAVE_EDI, %edi + addl $STACK_SPACE, %esp + + ret + + +L(one): + shrl %cl, %eax + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + + imull VAR_INVERSE, %eax + + movl SAVE_EBP, %ebp + movl %eax, -4(%edi) + + movl SAVE_EDI, %edi + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/gcd_11.asm b/vendor/gmp-6.3.0/mpn/x86/k7/gcd_11.asm new file mode 100644 index 0000000..2648dfd --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/gcd_11.asm @@ -0,0 +1,107 @@ +dnl x86 mpn_gcd_11 optimised for AMD K7. + +dnl Contributed to the GNU project by by Kevin Ryde. Rehacked by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2014, 2015 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C AMD K7 5.31 +C AMD K8,K9 5.33 +C AMD K10 5.30 +C AMD bd1 ? +C AMD bobcat 7.02 +C Intel P4-2 10.1 +C Intel P4-3/4 10.0 +C Intel P6/13 5.88 +C Intel core2 6.26 +C Intel NHM 6.83 +C Intel SBR 8.50 +C Intel atom 8.90 +C VIA nano ? +C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1 + + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. + +deflit(MAXSHIFT, 6) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + +DEF_OBJECT(ctz_table,64) + .byte MAXSHIFT +forloop(i,1,MASK, +` .byte m4_count_trailing_zeros(i) +') +END_OBJECT(ctz_table) + + +define(`u0', `%eax') +define(`v0', `%edx') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_gcd_11) + push %edi + push %esi + + mov 12(%esp), %eax + mov 16(%esp), %edx + + LEAL( ctz_table, %esi) + jmp L(odd) + + ALIGN(16) C +L(top): cmovc( %ecx, %eax) C u = |v - u| + cmovc( %edi, %edx) C v = min(u,v) +L(mid): and $MASK, %ecx C + movzbl (%esi,%ecx), %ecx C + jz L(shift_alot) C + shr %cl, %eax C +L(odd): mov %eax, %edi C + mov %edx, %ecx C + sub %eax, %ecx C + sub %edx, %eax C + jnz L(top) C + +L(end): mov %edx, %eax + pop %esi + pop %edi + ret + +L(shift_alot): + shr $MAXSHIFT, %eax + mov %eax, %ecx + jmp L(mid) +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/k7/gmp-mparam.h new file mode 100644 index 0000000..a09507d --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/gmp-mparam.h @@ -0,0 +1,263 @@ +/* AMD K7 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 2083 MHz K7 Barton */ +/* FFT tuning limit = 49,770,069 */ +/* Generated by tuneup.c, 2019-11-09, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 3 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 24 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 1 /* native */ +/* From mati.gmplib.org, 2023-07-21 */ +#define DIV_QR_1N_PI1_METHOD 3 /* 9.52% faster than 1 */ +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 26 + +#define DIV_1_VS_MUL_1_PERCENT 182 + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 85 +#define MUL_TOOM44_THRESHOLD 154 +#define MUL_TOOM6H_THRESHOLD 208 +#define MUL_TOOM8H_THRESHOLD 309 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 102 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 121 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 50 +#define SQR_TOOM3_THRESHOLD 86 +#define SQR_TOOM4_THRESHOLD 220 +#define SQR_TOOM6_THRESHOLD 270 +#define SQR_TOOM8_THRESHOLD 446 + +#define MULMID_TOOM42_THRESHOLD 50 + +#define MULMOD_BNM1_THRESHOLD 18 +#define SQRMOD_BNM1_THRESHOLD 19 + +#define MUL_FFT_MODF_THRESHOLD 606 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 606, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 15, 5}, { 31, 6}, { 28, 7}, { 15, 6}, \ + { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 49, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 63, 7}, { 127, 8}, { 71, 9}, { 39, 6}, \ + { 319, 9}, { 47, 8}, { 99, 6}, { 399, 9}, \ + { 55,10}, { 31, 9}, { 63, 8}, { 127, 9}, \ + { 79,10}, { 47, 9}, { 95, 8}, { 191, 4}, \ + { 3135, 5}, { 1599, 4}, { 3455, 6}, { 959, 8}, \ + { 247,10}, { 79, 9}, { 167,10}, { 95, 9}, \ + { 199,10}, { 111,11}, { 63,10}, { 127, 9}, \ + { 255,10}, { 143, 9}, { 287, 8}, { 575,10}, \ + { 159, 9}, { 319, 8}, { 639, 7}, { 1279,11}, \ + { 95,10}, { 191, 9}, { 383, 8}, { 799,10}, \ + { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511, 8}, { 1023,10}, { 271, 9}, { 543, 8}, \ + { 1087, 9}, { 575,11}, { 159, 9}, { 639,10}, \ + { 335, 9}, { 671, 8}, { 1343,10}, { 351, 9}, \ + { 703,11}, { 191,10}, { 383, 9}, { 799, 8}, \ + { 1599,11}, { 223,10}, { 447,12}, { 127,11}, \ + { 255,10}, { 511, 9}, { 1023,10}, { 543, 9}, \ + { 1087,10}, { 575, 9}, { 1151,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 639, 9}, { 1343,10}, \ + { 703, 9}, { 1407,12}, { 191,11}, { 383,10}, \ + { 767, 9}, { 1535,10}, { 799, 9}, { 1599,10}, \ + { 831, 9}, { 1727, 8}, { 3455,11}, { 447,13}, \ + { 127,12}, { 255,11}, { 511,10}, { 1023, 9}, \ + { 2047,11}, { 543,10}, { 1087,11}, { 575,10}, \ + { 1151, 9}, { 2303,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 639,10}, { 1279,11}, { 671,10}, \ + { 1343,11}, { 703,10}, { 1407,11}, { 735,10}, \ + { 1471, 9}, { 2943,12}, { 383,11}, { 767,10}, \ + { 1535,11}, { 799,10}, { 1599,11}, { 831,10}, \ + { 1663,11}, { 863,10}, { 1727,12}, { 447,11}, \ + { 895,10}, { 1791,11}, { 959,10}, { 1919,13}, \ + { 255,12}, { 511,11}, { 
1023,10}, { 2111,11}, \ + { 1087,10}, { 2175,12}, { 575,11}, { 1151,10}, \ + { 2303,11}, { 1215,10}, { 2431,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1407,10}, { 2815,11}, \ + { 1471,10}, { 2943,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1663,10}, { 3327,11}, \ + { 1727,10}, { 3455,12}, { 895,11}, { 1855,12}, \ + { 959,11}, { 1919,10}, { 3839,14}, { 255,13}, \ + { 511,12}, { 1023,11}, { 2111,12}, { 1087,11}, \ + { 2239,12}, { 1151,11}, { 2303,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1343,11}, { 2687,12}, \ + { 1407,11}, { 2815,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1663,11}, { 3327,12}, { 1727,11}, \ + { 3455,13}, { 895,12}, { 1919,11}, { 3839,12}, \ + { 1983,14}, { 511,13}, { 1023,12}, { 2239,13}, \ + { 1151,12}, { 2495,13}, { 1279,12}, { 2687,13}, \ + { 1407,12}, { 2943,14}, { 767,13}, { 1535,12}, \ + { 3135,13}, { 1663,12}, { 3455,13}, { 1791,12}, \ + { 3583,13}, { 1919,12}, { 3967,15}, { 511,14}, \ + { 1023,13}, { 2047,12}, { 4095,13}, { 2175,12}, \ + { 4479,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2559,12}, { 5119,13}, { 2943,12}, { 5887,14}, \ + { 1535,13}, { 3455,14}, { 1791,13}, { 3967,15}, \ + { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \ + { 4991,14}, { 2559,13}, { 5119,14}, { 2815,13}, \ + { 5887,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 254 +#define MUL_FFT_THRESHOLD 7552 + +#define SQR_FFT_MODF_THRESHOLD 492 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 492, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \ + { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \ + { 31,10}, { 63, 9}, { 135, 8}, { 271, 9}, \ + { 143,10}, { 79, 9}, { 167,10}, { 95, 9}, \ + { 191, 8}, { 383,10}, { 111,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \ + { 303,10}, { 159, 9}, { 319, 8}, { 639,11}, \ + { 95,10}, { 191, 9}, { 383, 8}, { 767, 9}, \ + { 399,10}, { 207,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \ + { 1087,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671, 8}, { 1343, 9}, { 703,11}, { 191,10}, \ + { 383, 9}, { 767, 8}, { 1535,10}, { 399, 9}, \ + { 799, 8}, { 1599, 9}, { 863,11}, { 223,10}, \ + { 447,12}, { 127,11}, { 255,10}, { 511, 9}, \ + { 1087,10}, { 575, 9}, { 1215,10}, { 639, 9}, \ + { 1279,10}, { 671, 9}, { 1343,11}, { 351,10}, \ + { 703, 9}, { 1407,10}, { 735, 9}, { 1471,12}, \ + { 191,11}, { 383,10}, { 767, 9}, { 1535,10}, \ + { 799, 9}, { 1599,11}, { 415,10}, { 831, 9}, \ + { 1663,10}, { 863, 9}, { 1727, 8}, { 3455,11}, \ + { 447,10}, { 895,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023, 9}, { 2047,11}, { 543,10}, \ + { 1087, 9}, { 2175,11}, { 575,10}, { 1151, 9}, \ + { 2303,11}, { 607,10}, { 1215, 9}, { 2431,12}, \ + { 319,11}, { 639,10}, { 1279,11}, { 671,10}, \ + { 1343,11}, { 703,10}, { 1407, 9}, { 2815,11}, \ + { 735,10}, { 1471, 9}, { 2943,12}, { 383,11}, \ + { 767,10}, { 1599,11}, { 831,10}, { 1663, 9}, \ + { 3327,10}, { 1727,12}, { 447,11}, { 895,10}, \ + { 1791,11}, { 959,10}, { 1919,13}, { 255,12}, \ + { 511,11}, { 1023,10}, { 2111,11}, { 1087,10}, \ + { 2175,12}, { 575,11}, { 1151,10}, { 2303,11}, \ + { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1407,10}, { 2815,11}, { 1471,10}, \ + { 2943,13}, { 383,12}, { 767,11}, { 1599,12}, \ + { 
831,11}, { 1663,10}, { 3327,11}, { 1727,10}, \ + { 3455,12}, { 895,11}, { 1791,12}, { 959,11}, \ + { 1919,10}, { 3839,14}, { 255,13}, { 511,12}, \ + { 1023,11}, { 2111,12}, { 1087,11}, { 2239,12}, \ + { 1151,11}, { 2303,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1343,11}, { 2687,12}, { 1407,11}, \ + { 2815,12}, { 1471,11}, { 2943,13}, { 767,12}, \ + { 1599,11}, { 3199,12}, { 1663,11}, { 3327,12}, \ + { 1727,11}, { 3455,13}, { 895,12}, { 1791,11}, \ + { 3583,12}, { 1919,11}, { 3839,12}, { 1983,14}, \ + { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \ + { 2431,13}, { 1279,12}, { 2687,13}, { 1407,12}, \ + { 2943,14}, { 767,13}, { 1535,12}, { 3199,13}, \ + { 1663,12}, { 3455,13}, { 1791,12}, { 3583,13}, \ + { 1919,12}, { 3967,15}, { 511,14}, { 1023,13}, \ + { 2047,12}, { 4095,13}, { 2175,12}, { 4351,13}, \ + { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \ + { 1535,13}, { 3455,14}, { 1791,13}, { 3967,15}, \ + { 1023,14}, { 2047,13}, { 4351,14}, { 2303,13}, \ + { 4991,14}, { 2559,13}, { 5119,14}, { 2815,13}, \ + { 5887,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 258 +#define SQR_FFT_THRESHOLD 5504 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 34 +#define MULLO_MUL_N_THRESHOLD 14281 +#define SQRLO_BASECASE_THRESHOLD 6 +#define SQRLO_DC_THRESHOLD 137 +#define SQRLO_SQR_THRESHOLD 10821 + +#define DC_DIV_QR_THRESHOLD 45 +#define DC_DIVAPPR_Q_THRESHOLD 206 +#define DC_BDIV_QR_THRESHOLD 39 +#define DC_BDIV_Q_THRESHOLD 144 + +#define INV_MULMOD_BNM1_THRESHOLD 54 +#define INV_NEWTON_THRESHOLD 202 +#define INV_APPR_THRESHOLD 206 + +#define BINV_NEWTON_THRESHOLD 224 +#define REDC_1_TO_REDC_N_THRESHOLD 63 + +#define MU_DIV_QR_THRESHOLD 1442 +#define MU_DIVAPPR_Q_THRESHOLD 1387 +#define MUPI_DIV_QR_THRESHOLD 82 +#define MU_BDIV_QR_THRESHOLD 1308 +#define MU_BDIV_Q_THRESHOLD 1387 + +#define POWM_SEC_TABLE 1,16,102,428,1221 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 28 +#define SET_STR_DC_THRESHOLD 254 +#define SET_STR_PRECOMPUTE_THRESHOLD 890 + +#define FAC_DSC_THRESHOLD 206 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD2_DIV1_METHOD 3 /* 3.84% faster than 4 */ +#define HGCD_THRESHOLD 123 +#define HGCD_APPR_THRESHOLD 151 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 435 +#define GCDEXT_DC_THRESHOLD 318 +#define JACOBI_BASE_METHOD 4 /* 8.04% faster than 3 */ + +/* Tuneup completed successfully, took 175382 seconds */ diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/invert_limb.asm b/vendor/gmp-6.3.0/mpn/x86/k7/invert_limb.asm new file mode 100644 index 0000000..31a867e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/invert_limb.asm @@ -0,0 +1,194 @@ +dnl x86 mpn_invert_limb + +dnl Contributed to the GNU project by Niels Möller + +dnl Copyright 2009, 2011, 2015 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles (approx) div +C P5 ? +C P6 model 0-8,10-12 ? +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) ? +C P4 model 0 (Willamette) ? +C P4 model 1 (?) ? +C P4 model 2 (Northwood) ? +C P4 model 3 (Prescott) ? +C P4 model 4 (Nocona) ? +C AMD K6 ? +C AMD K7 41 53 +C AMD K8 ? + +C TODO +C * These c/l numbers are for a non-PIC build. Consider falling back to using +C the 'div' instruction for PIC builds. +C * Perhaps use this file--or at least the algorithm--for more machines than k7. + +C Register usage: +C Input D in %edi +C Current approximation is in %eax and/or %ecx +C %ebx and %edx are temporaries +C %esi and %ebp are unused + +defframe(PARAM_DIVISOR,4) + +ASM_START() + +C Make approx_tab global to work around Apple relocation bug. +ifdef(`DARWIN',` + deflit(`approx_tab', MPN(invert_limb_tab)) + GLOBL approx_tab') + + TEXT + ALIGN(16) +PROLOGUE(mpn_invert_limb) +deflit(`FRAME', 0) + mov PARAM_DIVISOR, %eax + C Avoid push/pop on k7. + sub $8, %esp FRAME_subl_esp(8) + mov %ebx, (%esp) + mov %edi, 4(%esp) + + mov %eax, %edi + shr $22, %eax +ifdef(`PIC',` + LEAL( approx_tab, %ebx) + movzwl -1024(%ebx, %eax, 2), %eax +',` + movzwl -1024+approx_tab(%eax, %eax), %eax C %eax = v0 +') + + C v1 = (v0 << 4) - ((v0*v0*d_21) >> 32) - 1 + mov %eax, %ecx + imul %eax, %eax + mov %edi, %ebx + shr $11, %ebx + inc %ebx + mul %ebx + mov %edi, %ebx C Prepare + shr %ebx + sbb %eax, %eax + sub %eax, %ebx C %ebx = d_31, %eax = mask + shl $4, %ecx + dec %ecx + sub %edx, %ecx C %ecx = v1 + + C v_2 = (v1 << 15) + ((v1 *(2^48 - v1 * d31 + (v1 >> 1) & mask)) >> 33) + imul %ecx, %ebx + and %ecx, %eax + shr %eax + sub %ebx, %eax + mul %ecx + mov %edi, %eax C Prepare for next mul + shl $15, %ecx + shr %edx + add %edx, %ecx C %ecx = v2 + + mul %ecx + add %edi, %eax + mov %ecx, %eax + adc %edi, %edx + sub %edx, %eax C %eax = v3 + + mov (%esp), %ebx + mov 4(%esp), %edi + add $8, %esp + + ret + +EPILOGUE() + +DEF_OBJECT(approx_tab,2) + .value 0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27 + .value 0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d + .value 0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61 + .value 0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894 + .value 0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3 + .value 0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520 + .value 0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379 + .value 0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de + .value 0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e + .value 0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8 + .value 0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e + .value 0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd + .value 0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76 + .value 0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918 + .value 0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3 + .value 0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676 + .value 0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532 + .value 
0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5 + .value 0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1 + .value 0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193 + .value 0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d + .value 0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d + .value 0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35 + .value 0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22 + .value 0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16 + .value 0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10 + .value 0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f + .value 0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914 + .value 0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f + .value 0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e + .value 0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643 + .value 0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d + .value 0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b + .value 0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e + .value 0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6 + .value 0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1 + .value 0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121 + .value 0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056 + .value 0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e + .value 0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca + .value 0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09 + .value 0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d + .value 0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93 + .value 0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde + .value 0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b + .value 0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c + .value 0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0 + .value 0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927 + .value 0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881 + .value 0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de + .value 0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e + .value 0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1 + .value 0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606 + .value 0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e + .value 0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8 + .value 0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445 + .value 0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5 + .value 0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327 + .value 0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b + .value 0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211 + .value 0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a + .value 0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104 + .value 0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081 + .value 0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000 +END_OBJECT(approx_tab) +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mmx/com.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/com.asm new file mode 100644 index 0000000..a258c22 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/com.asm @@ -0,0 +1,125 @@ +dnl AMD Athlon mpn_com -- mpn bitwise one's complement. + +dnl Copyright 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K7: 1.0 cycles/limb + + +C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The loop form below is necessary for the claimed speed. It needs to be +C aligned to a 16 byte boundary and only 16 bytes long. Maybe that's so it +C fits in a BTB entry. The adjustments to %eax and %edx avoid offsets on +C the movq's and achieve the necessary size. +C +C If both src and dst are 4mod8, the loop runs at 1.5 c/l. So long as one +C of the two is 0mod8, it runs at 1.0 c/l. On that basis dst is checked +C (offset by the size, as per the loop addressing) and one high limb +C processed separately to get alignment. +C +C The padding for the nails case is unattractive, but shouldn't cost any +C cycles. Explicit .byte's guarantee the desired instructions, at a point +C where we're probably stalled waiting for loads anyway. +C +C Enhancements: +C +C The combination load/pxor/store might be able to be unrolled to approach +C 0.5 c/l if desired. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(16) + +PROLOGUE(mpn_com) +deflit(`FRAME',0) + + movl PARAM_DST, %edx + movl PARAM_SIZE, %ecx + pcmpeqd %mm7, %mm7 + + leal (%edx,%ecx,4), %eax + andl $4, %eax +ifelse(GMP_NAIL_BITS,0,, +` psrld $GMP_NAIL_BITS, %mm7') C GMP_NUMB_MASK + + movl PARAM_SRC, %eax + movd -4(%eax,%ecx,4), %mm0 C src high limb + +ifelse(GMP_NAIL_BITS,0,, +` C padding for alignment below + .byte 0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00 C lea 0(%esi),%esi + .byte 0x8d, 0xbf, 0x00, 0x00, 0x00, 0x00 C lea 0(%edi),%edi +') + + jz L(aligned) + + pxor %mm7, %mm0 + movd %mm0, -4(%edx,%ecx,4) C dst high limb + decl %ecx + jz L(done) +L(aligned): + + addl $4, %eax + addl $4, %edx + decl %ecx + jz L(one) + + C offset 0x30 for no nails, or 0x40 for nails + ALIGN(16) +L(top): + C eax src + C ebx + C ecx counter + C edx dst + + subl $2, %ecx + movq (%eax,%ecx,4), %mm0 + pxor %mm7, %mm0 + movq %mm0, (%edx,%ecx,4) + jg L(top) + + jnz L(done) C if size even + +L(one): + movd -4(%eax), %mm0 C src low limb + pxor %mm7, %mm0 + movd %mm0, -4(%edx) C dst low limb + +L(done): + emms + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mmx/copyd.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/copyd.asm new file mode 100644 index 0000000..59ece40 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/copyd.asm @@ -0,0 +1,144 @@ +dnl AMD K7 mpn_copyd -- copy limb vector, decrementing. + +dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C alignment dst/src, A=0mod8 N=4mod8 +C A/A A/N N/A N/N +C K7 0.75 1.0 1.0 0.75 + + +C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The various comments in mpn/x86/k7/copyi.asm apply here too. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl parameter space reused +define(SAVE_EBX,`PARAM_SIZE') +define(SAVE_ESI,`PARAM_SRC') + +dnl minimum 5 since the unrolled code can't handle less than 5 +deflit(UNROLL_THRESHOLD, 5) + + TEXT + ALIGN(32) +PROLOGUE(mpn_copyd) + + movl PARAM_SIZE, %ecx + movl %ebx, SAVE_EBX + + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + + cmpl $UNROLL_THRESHOLD, %ecx + jae L(unroll) + + orl %ecx, %ecx + jz L(simple_done) + +L(simple): + C eax src + C ebx scratch + C ecx counter + C edx dst + C + C this loop is 2 cycles/limb + + movl -4(%eax,%ecx,4), %ebx + movl %ebx, -4(%edx,%ecx,4) + decl %ecx + jnz L(simple) + +L(simple_done): + movl SAVE_EBX, %ebx + ret + + +L(unroll): + movl %esi, SAVE_ESI + leal (%eax,%ecx,4), %ebx + leal (%edx,%ecx,4), %esi + + andl %esi, %ebx + movl SAVE_ESI, %esi + subl $4, %ecx C size-4 + + testl $4, %ebx C testl to pad code closer to 16 bytes for L(top) + jz L(aligned) + + C both src and dst unaligned, process one limb to align them + movl 12(%eax,%ecx,4), %ebx + movl %ebx, 12(%edx,%ecx,4) + decl %ecx +L(aligned): + + + ALIGN(16) +L(top): + C eax src + C ebx + C ecx counter, limbs + C edx dst + + movq 8(%eax,%ecx,4), %mm0 + movq (%eax,%ecx,4), %mm1 + subl $4, %ecx + movq %mm0, 16+8(%edx,%ecx,4) + movq %mm1, 16(%edx,%ecx,4) + jns L(top) + + + C now %ecx is -4 to -1 representing respectively 0 to 3 limbs remaining + + testb $2, %cl + jz L(finish_not_two) + + movq 8(%eax,%ecx,4), %mm0 + movq %mm0, 8(%edx,%ecx,4) +L(finish_not_two): + + testb $1, %cl + jz L(done) + + movl (%eax), %ebx + movl %ebx, (%edx) + +L(done): + movl SAVE_EBX, %ebx + emms + ret + + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mmx/copyi.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/copyi.asm new file mode 100644 index 0000000..9a28f92 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/copyi.asm @@ -0,0 +1,157 @@ +dnl AMD K7 mpn_copyi -- copy limb vector, incrementing. + +dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C alignment dst/src, A=0mod8 N=4mod8 +C A/A A/N N/A N/N +C K7 0.75 1.0 1.0 0.75 + + +C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Copy src,size to dst,size. +C +C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at +C 1.33 c/l. +C +C The K7 can do a 64-bit load and 64-bit store in one cycle (optimization +C guile 22007 appendix B), so 0.5 c/l should be possible, however nothing +C under 0.7 c/l is known. Apparently only two 32-bit stores can be done in +C one cycle, so perhaps some scheduling is needed to ensure it's a +C load+store in each cycle, not store+store. +C +C If both source and destination are unaligned then one limb is processed at +C the start to make them aligned and so get 0.75 c/l, whereas if they'd been +C used unaligned it would be 1.5 c/l. 
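As a point of reference for the comments above, here is a plain C sketch of the operation this file implements, an incrementing limb copy; the name `ref_copyi` and the fixed 32-bit limb type are illustrative assumptions, not GMP definitions. The MMX code reaches 0.75 c/l by moving two limbs per movq once the one-limb peel described above has made both pointers 0mod8.

```c
#include <stddef.h>
#include <stdint.h>

/* Reference copy, low limb first.  Copying in increasing address order is
   what lets an overlapping call with dst <= src behave as expected: each
   source limb is read before anything at or above it is written. */
void
ref_copyi (uint32_t *dst, const uint32_t *src, size_t size)
{
  for (size_t i = 0; i < size; i++)
    dst[i] = src[i];
}
```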
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl parameter space reused +define(SAVE_EBX,`PARAM_SIZE') + +dnl minimum 5 since the unrolled code can't handle less than 5 +deflit(UNROLL_THRESHOLD, 5) + + TEXT + ALIGN(32) +PROLOGUE(mpn_copyi) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl %ebx, SAVE_EBX + + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + + cmpl $UNROLL_THRESHOLD, %ecx + jae L(unroll) + + orl %ecx, %ecx + jz L(simple_done) + +L(simple): + C eax src, incrementing + C ebx scratch + C ecx counter + C edx dst, incrementing + C + C this loop is 2 cycles/limb + + movl (%eax), %ebx + movl %ebx, (%edx) + decl %ecx + leal 4(%eax), %eax + leal 4(%edx), %edx + jnz L(simple) + +L(simple_done): + movl SAVE_EBX, %ebx + ret + + +L(unroll): + movl %eax, %ebx + leal -12(%eax,%ecx,4), %eax C src end - 12 + subl $3, %ecx C size-3 + + andl %edx, %ebx + leal (%edx,%ecx,4), %edx C dst end - 12 + negl %ecx + + testl $4, %ebx C testl to pad code closer to 16 bytes for L(top) + jz L(aligned) + + C both src and dst unaligned, process one limb to align them + movl (%eax,%ecx,4), %ebx + movl %ebx, (%edx,%ecx,4) + incl %ecx +L(aligned): + + + ALIGN(16) +L(top): + C eax src end - 12 + C ebx + C ecx counter, negative, limbs + C edx dst end - 12 + + movq (%eax,%ecx,4), %mm0 + movq 8(%eax,%ecx,4), %mm1 + addl $4, %ecx + movq %mm0, -16(%edx,%ecx,4) + movq %mm1, -16+8(%edx,%ecx,4) + ja L(top) C jump no carry and not zero + + + C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining + + testb $2, %cl + jnz L(finish_not_two) + + movq (%eax,%ecx,4), %mm0 + movq %mm0, (%edx,%ecx,4) +L(finish_not_two): + + testb $1, %cl + jnz L(done) + + movl 8(%eax), %ebx + movl %ebx, 8(%edx) + +L(done): + movl SAVE_EBX, %ebx + emms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mmx/divrem_1.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/divrem_1.asm new file mode 100644 index 0000000..cf34328 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/divrem_1.asm @@ -0,0 +1,832 @@ +dnl AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb +dnl division. + +dnl Copyright 1999-2002, 2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part. 
+ + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t inverse, +C unsigned shift); +C +C Algorithm: +C +C The method and nomenclature follow part 8 of "Division by Invariant +C Integers using Multiplication" by Granlund and Montgomery, reference in +C gmp.texi. +C +C The "and"s shown in the paper are done here with "cmov"s. "m" is written +C for m', and "d" for d_norm, which won't cause any confusion since it's +C only the normalized divisor that's of any use in the code. "b" is written +C for 2^N, the size of a limb, N being 32 here. +C +C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as +C "n-(q1+1)*d"; this rearrangement gives the same two-limb answer. If +C q1==0xFFFFFFFF, then q1+1 would overflow. We branch to a special case +C "q1_ff" if this occurs. Since the true quotient is either q1 or q1+1 then +C if q1==0xFFFFFFFF that must be the right value. +C +C For the last and second last steps q1==0xFFFFFFFF is instead handled by an +C sbbl to go back to 0xFFFFFFFF if an overflow occurs when adding 1. This +C then goes through as normal, and finding no addback required. sbbl costs +C an extra cycle over what the main loop code does, but it keeps code size +C and complexity down. +C +C Notes: +C +C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high +C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero +C carry, since in normal circumstances that will be a very rare event. +C +C The test for skipping a division is branch free (once size>=1 is tested). +C The store to the destination high limb is 0 when a divide is skipped, or +C if it's not skipped then a copy of the src high limb is used. The latter +C is in case src==dst. +C +C There's a small bias towards expecting xsize==0, by having code for +C xsize==0 in a straight line and xsize!=0 under forward jumps. +C +C Alternatives: +C +C If the divisor is normalized (high bit set) then a division step can +C always be skipped, since the high destination limb is always 0 or 1 in +C that case. It doesn't seem worth checking for this though, since it +C probably occurs infrequently, in particular note that big_base for a +C decimal mpn_get_str is not normalized in a 32-bit limb. + + +dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by +dnl inverse method is used, rather than plain "divl"s. Minimum value 1. +dnl +dnl The inverse takes about 50 cycles to calculate, but after that the +dnl multiply is 17 c/l versus division at 42 c/l. +dnl +dnl At 3 limbs the mul is a touch faster than div on the integer part, and +dnl even more so on the fractional part. 
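For readers who prefer the method above in plain C rather than assembler, the following is a hedged sketch of the inverse computation and of one multiply-by-inverse step, assuming 32-bit limbs; the names `invert_limb_32` and `div_step_preinv` are illustrative and do not correspond to GMP's own C sources. Because the final subtraction is done in 64-bit arithmetic, the q1==0xFFFFFFFF overflow handled by the q1_ff path in the code below needs no special case here.

```c
#include <stdint.h>

/* m = floor((B*(B-d) - 1) / d) with B = 2^32 and d normalized (top bit set);
   the same quantity the code below computes with a divl. */
static uint32_t
invert_limb_32 (uint32_t d)
{
  uint64_t num = ((uint64_t) (0xFFFFFFFFu - d) << 32) | 0xFFFFFFFFu;
  return (uint32_t) (num / d);
}

/* One division step: divide n2*B + n10 by d (requires n2 < d, d normalized),
   with m from invert_limb_32.  Returns the quotient limb and leaves the
   remainder in *n2 for the next, lower step. */
static uint32_t
div_step_preinv (uint32_t *n2, uint32_t n10, uint32_t d, uint32_t m)
{
  uint32_t n1 = n10 >> 31;                   /* top bit of n10 */
  uint32_t nadj = n10 + (n1 ? d : 0);        /* mod 2^32, as in the code */
  uint64_t p = (uint64_t) m * (*n2 + n1) + nadj;
  uint32_t q1 = *n2 + (uint32_t) (p >> 32);  /* true quotient is q1 or q1+1 */

  uint64_t n = ((uint64_t) *n2 << 32) | n10;
  uint64_t s = (uint64_t) q1 * d + d;        /* (q1+1)*d, no 32-bit wrap */
  if (n < s)                                 /* q1+1 was too big: add back d */
    {
      *n2 = (uint32_t) (n - (s - d));
      return q1;
    }
  *n2 = (uint32_t) (n - s);
  return q1 + 1;
}
```

The roughly 50-cycle inverse is computed once per call, so it amortizes quickly against the 17 c/l loop, which is why MUL_THRESHOLD can be as low as 3.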
+ +deflit(MUL_THRESHOLD, 3) + + +defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1 +defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1 +defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC, -28) +defframe(VAR_DST, -32) +defframe(VAR_DST_STOP,-36) + +deflit(STACK_SPACE, 36) + + TEXT + ALIGN(32) + +PROLOGUE(mpn_preinv_divrem_1) +deflit(`FRAME',0) + movl PARAM_XSIZE, %ecx + movl PARAM_DST, %edx + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebx, SAVE_EBX + movl PARAM_SIZE, %ebx + + leal 8(%edx,%ecx,4), %edx C &dst[xsize+2] + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %edx, VAR_DST_STOP C &dst[xsize+2] + movl %edi, SAVE_EDI + xorl %edi, %edi C carry + + movl -4(%esi,%ebx,4), %eax C src high limb + xor %ecx, %ecx + + C + + C + + cmpl %ebp, %eax C high cmp divisor + + cmovc( %eax, %edi) C high is carry if high<divisor + cmovnc( %eax, %ecx) C 0 if skip div, src high if not + C (the latter in case src==dst) + + movl %ecx, -12(%edx,%ebx,4) C dst high limb + sbbl $0, %ebx C skip one division if high<divisor + movl PARAM_PREINV_SHIFT, %ecx + + leal -8(%edx,%ebx,4), %edx C &dst[xsize+size] + movl $32, %eax + + movl %edx, VAR_DST C &dst[xsize+size] + + shll %cl, %ebp C d normalized + subl %ecx, %eax + movl %ecx, VAR_NORM + + movd %eax, %mm7 C rshift + movl PARAM_PREINV_INVERSE, %eax + jmp L(start_preinv) + +EPILOGUE() + + + ALIGN(16) + +PROLOGUE(mpn_divrem_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + leal -4(%edi,%ebx,4), %edi C &dst[xsize-1] + jmp L(start_1c) + +EPILOGUE() + + + C offset 0xa1, close enough to aligned +PROLOGUE(mpn_divrem_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl $0, %edx C initial carry (if can't skip a div) + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + orl %ecx, %ecx C size + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + leal -4(%edi,%ebx,4), %edi C &dst[xsize-1] + + jz L(no_skip_div) C if size==0 + movl -4(%esi,%ecx,4), %eax C src high limb + xorl %esi, %esi + + cmpl %ebp, %eax C high cmp divisor + + cmovc( %eax, %edx) C high is carry if high<divisor + cmovnc( %eax, %esi) C 0 if skip div, src high if not + + movl %esi, (%edi,%ecx,4) C dst high limb + sbbl $0, %ecx C size-1 if high<divisor + movl PARAM_SRC, %esi C reload +L(no_skip_div): + + +L(start_1c): + C eax + C ebx xsize + C ecx size + C edx carry + C esi src + C edi &dst[xsize-1] + C ebp divisor + + leal (%ebx,%ecx), %eax C size+xsize + cmpl $MUL_THRESHOLD, %eax + jae L(mul_by_inverse) + + +C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs. +C It'd be possible to write them out without the looping, but no speedup +C would be expected. 
+C +C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the +C integer part, but curiously not on the fractional part, where %ebp is a +C (fixed) couple of cycles faster. + + orl %ecx, %ecx + jz L(divide_no_integer) + +L(divide_integer): + C eax scratch (quotient) + C ebx xsize + C ecx counter + C edx scratch (remainder) + C esi src + C edi &dst[xsize-1] + C ebp divisor + + movl -4(%esi,%ecx,4), %eax + + divl PARAM_DIVISOR + + movl %eax, (%edi,%ecx,4) + decl %ecx + jnz L(divide_integer) + + +L(divide_no_integer): + movl PARAM_DST, %edi + orl %ebx, %ebx + jnz L(divide_fraction) + +L(divide_done): + movl SAVE_ESI, %esi + movl SAVE_EDI, %edi + movl %edx, %eax + + movl SAVE_EBX, %ebx + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + + +L(divide_fraction): + C eax scratch (quotient) + C ebx counter + C ecx + C edx scratch (remainder) + C esi + C edi dst + C ebp divisor + + movl $0, %eax + + divl %ebp + + movl %eax, -4(%edi,%ebx,4) + decl %ebx + jnz L(divide_fraction) + + jmp L(divide_done) + + + +C ----------------------------------------------------------------------------- + +L(mul_by_inverse): + C eax + C ebx xsize + C ecx size + C edx carry + C esi src + C edi &dst[xsize-1] + C ebp divisor + + bsrl %ebp, %eax C 31-l + + leal 12(%edi), %ebx C &dst[xsize+2], loop dst stop + leal 4(%edi,%ecx,4), %edi C &dst[xsize+size] + + movl %edi, VAR_DST + movl %ebx, VAR_DST_STOP + + movl %ecx, %ebx C size + movl $31, %ecx + + movl %edx, %edi C carry + movl $-1, %edx + + C + + xorl %eax, %ecx C l + incl %eax C 32-l + + shll %cl, %ebp C d normalized + movl %ecx, VAR_NORM + + movd %eax, %mm7 + + movl $-1, %eax + subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1 + + divl %ebp C floor (b*(b-d)-1) / d + +L(start_preinv): + C eax inverse + C ebx size + C ecx shift + C edx + C esi src + C edi carry + C ebp divisor + C + C mm7 rshift + + orl %ebx, %ebx C size + movl %eax, VAR_INVERSE + leal -12(%esi,%ebx,4), %eax C &src[size-3] + + jz L(start_zero) + movl %eax, VAR_SRC + cmpl $1, %ebx + + movl 8(%eax), %esi C src high limb + jz L(start_one) + +L(start_two_or_more): + movl 4(%eax), %edx C src second highest limb + + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shldl( %cl, %edx, %esi) C n10 = high,second << l + + cmpl $2, %ebx + je L(integer_two_left) + jmp L(integer_top) + + +L(start_one): + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shll %cl, %esi C n10 = high << l + movl %eax, VAR_SRC + jmp L(integer_one_left) + + +L(start_zero): + C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and + C skipped a division. + + shll %cl, %edi C n2 = carry << l + movl %edi, %eax C return value for zero_done + cmpl $0, PARAM_XSIZE + + je L(zero_done) + jmp L(fraction_some) + + + +C ----------------------------------------------------------------------------- +C +C The multiply by inverse loop is 17 cycles, and relies on some out-of-order +C execution. The instruction scheduling is important, with various +C apparently equivalent forms running 1 to 5 cycles slower. +C +C A lower bound for the time would seem to be 16 cycles, based on the +C following successive dependencies. +C +C cycles +C n2+n1 1 +C mul 6 +C q1+1 1 +C mul 6 +C sub 1 +C addback 1 +C --- +C 16 +C +C This chain is what the loop has already, but 16 cycles isn't achieved. +C K7 has enough decode, and probably enough execute (depending maybe on what +C a mul actually consumes), but nothing running under 17 has been found. 
+C +C In theory n2+n1 could be done in the sub and addback stages (by +C calculating both n2 and n2+n1 there), but lack of registers makes this an +C unlikely proposition. +C +C The jz in the loop keeps the q1+1 stage to 1 cycle. Handling an overflow +C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent +C chain, and nothing better than 18 cycles has been found when using it. +C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will +C be an extremely rare event. +C +C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but +C if some special data is coming out with this always, the q1_ff special +C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to +C induce the q1_ff case, for speed measurements or testing. Note that +C 0xFFF...FFF divided by 1 or 2 doesn't induce it. +C +C The instruction groupings and empty comments show the cycles for a naive +C in-order view of the code (conveniently ignoring the load latency on +C VAR_INVERSE). This shows some of where the time is going, but is nonsense +C to the extent that out-of-order execution rearranges it. In this case +C there's 19 cycles shown, but it executes at 17. + + ALIGN(16) +L(integer_top): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src qword) + C mm7 rshift for normalization + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + movl VAR_SRC, %ecx + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movq (%ecx), %mm0 C next limb and the one below it + subl $4, %ecx + + movl %ecx, VAR_SRC + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2+1 + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + jz L(q1_ff) + movl VAR_DST, %ecx + + mull %ebx C (q1+1)*d + + psrlq %mm7, %mm0 + + leal -4(%ecx), %ecx + + C + + subl %eax, %esi + movl VAR_DST_STOP, %eax + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + cmpl %eax, %ecx + + movl %ebx, (%ecx) + movl %ecx, VAR_DST + jne L(integer_top) + + +L(integer_loop_done): + + +C ----------------------------------------------------------------------------- +C +C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz +C q1_ff special case. This make the code a bit smaller and simpler, and +C costs only 1 cycle (each). 
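In C terms the adcl/sbbl pair just described amounts to a saturating increment; a small sketch, with an illustrative name and 32-bit limbs assumed:

```c
#include <stdint.h>

/* Form q1+1, but if the increment carries out (q1 == 0xFFFFFFFF) the
   "sbbl $0" takes the carry off again, leaving 0xFFFFFFFF rather than a
   wrapped 0.  Since the true quotient is q1 or q1+1, q1 itself is the
   right answer in that case. */
static inline uint32_t
incr_saturating (uint32_t q1)
{
  uint32_t t = q1 + 1;
  return t == 0 ? q1 : t;
}
```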
+ +L(integer_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm7 rshift + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + movl PARAM_SRC, %ecx + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd (%ecx), %mm0 C src low limb + + movl VAR_DST_STOP, %ecx + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2+1 + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + subl %eax, %esi + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + + movl %ebx, -4(%ecx) + + +C ----------------------------------------------------------------------------- +L(integer_one_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx dst + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm7 rshift + + movl VAR_DST_STOP, %ecx + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2+1 + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx C q1 if q1+1 overflowed + + mull %ebx + + C + + C + + C + + subl %eax, %esi + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + + movl %ebx, -8(%ecx) + subl $8, %ecx + + + +L(integer_none): + cmpl $0, PARAM_XSIZE + jne L(fraction_some) + + movl %edi, %eax +L(fraction_done): + movl VAR_NORM, %ecx +L(zero_done): + movl SAVE_EBP, %ebp + + movl SAVE_EDI, %edi + movl SAVE_ESI, %esi + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + shrl %cl, %eax + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx + C edx + C esi n10 + C edi n2 + C ebp divisor + + movl VAR_DST, %ecx + movl VAR_DST_STOP, %edx + subl $4, %ecx + + psrlq %mm7, %mm0 + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + movl %ecx, VAR_DST + + movd %mm0, %esi C next n10 + + movl $-1, (%ecx) + cmpl %ecx, %edx + jne L(integer_top) + + jmp L(integer_loop_done) + + + +C ----------------------------------------------------------------------------- +C +C Being the fractional part, the "source" limbs are all zero, meaning +C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated. +C +C The loop runs at 15 cycles. The dependent chain is the same as the +C general case above, but without the n2+n1 stage (due to n1==0), so 15 +C would seem to be the lower bound. +C +C A not entirely obvious simplification is that q1+1 never overflows a limb, +C and so there's no need for the sbbl $0 or jz q1_ff from the general case. +C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always. 
+C rnd() means rounding down to a multiple of d. +C +C m*n2 + b*n2 <= m*(d-1) + b*(d-1) +C = m*d + b*d - m - b +C = floor((b(b-d)-1)/d)*d + b*d - m - b +C = rnd(b(b-d)-1) + b*d - m - b +C = rnd(b(b-d)-1 + b*d) - m - b +C = rnd(b*b-1) - m - b +C <= (b-2)*b +C +C Unchanged from the general case is that the final quotient limb q can be +C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from +C equation 8.4 of the paper which simplifies as follows when n1==0 and +C n0==0. +C +C n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b +C +C As before, the instruction groupings and empty comments show a naive +C in-order view of the code, which is made a nonsense by out of order +C execution. There's 17 cycles shown, but it executes at 15. +C +C Rotating the store q and remainder->n2 instructions up to the top of the +C loop gets the run time down from 16 to 15. + + ALIGN(16) +L(fraction_some): + C eax + C ebx + C ecx + C edx + C esi + C edi carry + C ebp divisor + + movl PARAM_DST, %esi + movl VAR_DST_STOP, %ecx C &dst[xsize+2] + movl %edi, %eax + + subl $8, %ecx C &dst[xsize] + jmp L(fraction_entry) + + + ALIGN(16) +L(fraction_top): + C eax n2 carry, then scratch + C ebx scratch (nadj, q1) + C ecx dst, decrementing + C edx scratch + C esi dst stop point + C edi (will be n2) + C ebp divisor + + movl %ebx, (%ecx) C previous q + movl %eax, %edi C remainder->n2 + +L(fraction_entry): + mull VAR_INVERSE C m*n2 + + movl %ebp, %eax C d + subl $4, %ecx C dst + leal 1(%edi), %ebx + + C + + C + + C + + C + + addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1 + + mull %ebx C (q1+1)*d + + C + + C + + C + + negl %eax C low of n - (q1+1)*d + + C + + sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry + leal (%ebp,%eax), %edx + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + cmpl %esi, %ecx + + jne L(fraction_top) + + + movl %ebx, (%ecx) + jmp L(fraction_done) + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/lshift.asm new file mode 100644 index 0000000..b3383cf --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/lshift.asm @@ -0,0 +1,481 @@ +dnl AMD K7 mpn_lshift -- mpn left shift. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K7: 1.21 cycles/limb (at 16 limbs/loop). + + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 4 1.51 +dnl 8 1.26 +dnl 16 1.21 +dnl 32 1.2 +dnl Maximum possible with the current code is 64. 
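For reference, a plain C sketch of the left shift this file implements; 32-bit limbs and 1 <= cnt <= 31 are assumed, and `ref_lshift` is an illustrative name rather than a GMP routine. The full operand description follows below.

```c
#include <stddef.h>
#include <stdint.h>

/* Shift {src,size} left by cnt bits, store in {dst,size}, and return the
   bits shifted out at the top.  Walking from the high limb downwards
   matches the direction of the assembler loops and keeps a dst >= src
   overlap safe. */
static uint32_t
ref_lshift (uint32_t *dst, const uint32_t *src, size_t size, unsigned cnt)
{
  uint32_t ret = src[size - 1] >> (32 - cnt);
  for (size_t i = size - 1; i > 0; i--)
    dst[i] = (src[i] << cnt) | (src[i - 1] >> (32 - cnt));
  dst[0] = src[0] << cnt;       /* zeros shifted in at the right */
  return ret;
}
```

mpn_rshift later in this directory is the mirror image, walking from the low limb upwards.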
+ +deflit(UNROLL_COUNT, 16) + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size left by shift many bits and store the result in dst,size. +C Zeros are shifted in at the right. The bits shifted out at the left are +C the return value. +C +C The comments in mpn_rshift apply here too. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 10) +',` +deflit(UNROLL_THRESHOLD, 10) +') + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EDI, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EBX, -12) +deflit(SAVE_SIZE, 12) + + TEXT + ALIGN(32) + +PROLOGUE(mpn_lshift) +deflit(`FRAME',0) + + movl PARAM_SIZE, %eax + movl PARAM_SRC, %edx + subl $SAVE_SIZE, %esp +deflit(`FRAME',SAVE_SIZE) + + movl PARAM_SHIFT, %ecx + movl %edi, SAVE_EDI + + movl PARAM_DST, %edi + decl %eax + jnz L(more_than_one_limb) + + movl (%edx), %edx + + shldl( %cl, %edx, %eax) C eax was decremented to zero + + shll %cl, %edx + + movl %edx, (%edi) + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + + ret + + +C ----------------------------------------------------------------------------- +L(more_than_one_limb): + C eax size-1 + C ebx + C ecx shift + C edx src + C esi + C edi dst + C ebp + + movd PARAM_SHIFT, %mm6 + movd (%edx,%eax,4), %mm5 C src high limb + cmp $UNROLL_THRESHOLD-1, %eax + + jae L(unroll) + negl %ecx + movd (%edx), %mm4 C src low limb + + addl $32, %ecx + + movd %ecx, %mm7 + +L(simple_top): + C eax loop counter, limbs + C ebx + C ecx + C edx src + C esi + C edi dst + C ebp + C + C mm0 scratch + C mm4 src low limb + C mm5 src high limb + C mm6 shift + C mm7 32-shift + + movq -4(%edx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + movd %mm0, 4(%edi,%eax,4) + jnz L(simple_top) + + + psllq %mm6, %mm5 + psllq %mm6, %mm4 + + psrlq $32, %mm5 + movd %mm4, (%edi) C dst low limb + + movd %mm5, %eax C return value + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx (saved) + C ecx shift + C edx src + C esi + C edi dst + C ebp + C + C mm5 src high limb, for return value + C mm6 lshift + + movl %esi, SAVE_ESI + movl %ebx, SAVE_EBX + leal -4(%edx,%eax,4), %edx C &src[size-2] + + testb $4, %dl + movq (%edx), %mm1 C src high qword + + jz L(start_src_aligned) + + + C src isn't aligned, process high limb (marked xxx) separately to + C make it so + C + C source -4(edx,%eax,4) + C | + C +-------+-------+-------+-- + C | xxx | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + C + C dest -4(edi,%eax,4) + C | + C +-------+-------+-- + C | xxx | | + C +-------+-------+-- + + psllq %mm6, %mm1 + subl $4, %edx + movl %eax, PARAM_SIZE C size-1 + + psrlq $32, %mm1 + decl %eax C size-2 is new size-1 + + movd %mm1, 4(%edi,%eax,4) + movq (%edx), %mm1 C new src high qword +L(start_src_aligned): + + + leal -4(%edi,%eax,4), %edi C &dst[size-2] + psllq %mm6, %mm5 + + testl $4, %edi + psrlq $32, %mm5 C return value + + jz L(start_dst_aligned) + + + C dst isn't aligned, subtract 4 bytes to make it so, and pretend the + C shift is 32 bits extra. High limb of dst (marked xxx) handled + C here separately. 
+ C + C source %edx + C +-------+-------+-- + C | mm1 | + C +-------+-------+-- + C 0mod8 4mod8 + C + C dest %edi + C +-------+-------+-------+-- + C | xxx | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + + movq %mm1, %mm0 + psllq %mm6, %mm1 + addl $32, %ecx C shift+32 + + psrlq $32, %mm1 + + movd %mm1, 4(%edi) + movq %mm0, %mm1 + subl $4, %edi + + movd %ecx, %mm6 C new lshift +L(start_dst_aligned): + + decl %eax C size-2, two last limbs handled at end + movq %mm1, %mm2 C copy of src high qword + negl %ecx + + andl $-2, %eax C round size down to even + addl $64, %ecx + + movl %eax, %ebx + negl %eax + + andl $UNROLL_MASK, %eax + decl %ebx + + shll %eax + + movd %ecx, %mm7 C rshift = 64-lshift + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%eax,%eax,4), %esi +') + shrl $UNROLL_LOG2, %ebx C loop counter + + leal ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx + leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi + movl PARAM_SIZE, %eax C for use at end + jmp *%esi + + +ifdef(`PIC',` +L(pic_calc): + C See mpn/x86/README about old gas bugs + leal (%eax,%eax,4), %esi + addl $L(entry)-L(here), %esi + addl (%esp), %esi + + ret_internal +') + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(top): + C eax size (for use at end) + C ebx loop counter + C ecx rshift + C edx src + C esi computed jump + C edi dst + C ebp + C + C mm0 scratch + C mm1 \ carry (alternating, mm2 first) + C mm2 / + C mm6 lshift + C mm7 rshift + C + C 10 code bytes/limb + C + C The two chunks differ in whether mm1 or mm2 hold the carry. + C The computed jump puts the initial carry in both mm1 and mm2. + +L(entry): +deflit(CHUNK_COUNT, 4) +forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 - 8)) + +Zdisp( movq, disp0,(%edx), %mm0) + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm2, %mm0 +Zdisp( movq, %mm0, disp0,(%edi)) + + +Zdisp( movq, disp1,(%edx), %mm0) + psllq %mm6, %mm1 + + movq %mm0, %mm2 + psrlq %mm7, %mm0 + + por %mm1, %mm0 +Zdisp( movq, %mm0, disp1,(%edi)) +') + + subl $UNROLL_BYTES, %edx + subl $UNROLL_BYTES, %edi + decl %ebx + + jns L(top) + + + +define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))') + +L(end): + testb $1, %al + movl SAVE_EBX, %ebx + psllq %mm6, %mm2 C wanted left shifted in all cases below + + movd %mm5, %eax + + movl SAVE_ESI, %esi + jz L(end_even) + + +L(end_odd): + + C Size odd, destination was aligned. + C + C source edx+8 edx+4 + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edi + C --+---------------+---------------+-------+ + C | written | | | + C --+---------------+---------------+-------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size odd, destination was unaligned. + C + C source edx+8 edx+4 + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edi + C --+---------------+---------------+ + C | written | | + C --+---------------+---------------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at (%edi), and in the aligned case + C there's an extra limb of dst to be formed from that extra src limb + C left shifted. 
+ + movd disp(4) (%edx), %mm0 + testb $32, %cl + + movq %mm0, %mm1 + psllq $32, %mm0 + + psrlq %mm7, %mm0 + psllq %mm6, %mm1 + + por %mm2, %mm0 + + movq %mm0, disp(0) (%edi) + jz L(end_odd_unaligned) + movd %mm1, disp(-4) (%edi) +L(end_odd_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +L(end_even): + + C Size even, destination was aligned. + C + C source edx+8 + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edi + C --+---------------+---------------+ + C | written | | + C --+---------------+---------------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size even, destination was unaligned. + C + C source edx+8 + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edi+4 + C --+---------------+-------+ + C | written | | + C --+---------------+-------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C The movq for the aligned case overwrites the movd for the + C unaligned case. + + movq %mm2, %mm0 + psrlq $32, %mm2 + + testb $32, %cl + movd %mm2, disp(4) (%edi) + + jz L(end_even_unaligned) + movq %mm0, disp(0) (%edi) +L(end_even_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mmx/popham.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/popham.asm new file mode 100644 index 0000000..95965b7 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/popham.asm @@ -0,0 +1,213 @@ +dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming +dnl distance. + +dnl Copyright 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C popcount hamdist +C P3 generic 6.5 7 +C P3 model 9 (Banias) 5.7 6.1 +C P3 model 13 (Dothan) 5.75 6 +C K7 5 6 + +C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); +C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); +C +C The code here is almost certainly not optimal, but is already a 3x speedup +C over the generic C code. The main improvement would be to interleave +C processing of two qwords in the loop so as to fully exploit the available +C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs). +C +C The loop is based on the example "Efficient 64-bit population count using +C MMX instructions" in the Athlon Optimization Guide, AMD document 22007, +C page 158 of rev E (reference in mpn/x86/k7/README). 
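The masking sequence in the loop below is the usual SWAR population count; here is a hedged scalar C equivalent for one 64-bit word. The names are illustrative, and the byte totals are summed with a multiply where the loop uses psadbw against zero.

```c
#include <stdint.h>

/* Fold to 2-bit, 4-bit and then 8-bit field sums (the same reduction the
   AAAA.../3333.../0F0F... masking steps perform), then add the eight
   byte counts. */
static unsigned
popcount_u64 (uint64_t x)
{
  x -= (x >> 1) & 0x5555555555555555ULL;                                /* 2-bit sums */
  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); /* 4-bit sums */
  x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;                           /* byte sums  */
  return (unsigned) ((x * 0x0101010101010101ULL) >> 56);                /* grand total */
}

/* Hamming distance is the population count of the XOR, mirroring the pxor
   done in the OPERATION_hamdist build. */
static unsigned
hamdist_u64 (uint64_t x, uint64_t y)
{
  return popcount_u64 (x ^ y);
}
```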
+ +ifdef(`OPERATION_popcount',, +`ifdef(`OPERATION_hamdist',, +`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined +')')') + +define(HAM, +m4_assert_numargs(1) +`ifdef(`OPERATION_hamdist',`$1')') + +define(POP, +m4_assert_numargs(1) +`ifdef(`OPERATION_popcount',`$1')') + +HAM(` +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC2, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_hamdist) +') +POP(` +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_popcount) +') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + + +ifdef(`PIC',,` + dnl non-PIC + + RODATA + ALIGN(8) + +L(rodata_AAAAAAAAAAAAAAAA): + .long 0xAAAAAAAA + .long 0xAAAAAAAA + +L(rodata_3333333333333333): + .long 0x33333333 + .long 0x33333333 + +L(rodata_0F0F0F0F0F0F0F0F): + .long 0x0F0F0F0F + .long 0x0F0F0F0F +') + + TEXT + ALIGN(32) + +PROLOGUE(M4_function) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + +ifdef(`PIC',` + movl $0xAAAAAAAA, %eax + movl $0x33333333, %edx + + movd %eax, %mm7 + movd %edx, %mm6 + + movl $0x0F0F0F0F, %eax + + punpckldq %mm7, %mm7 + punpckldq %mm6, %mm6 + + movd %eax, %mm5 + movd %edx, %mm4 + + punpckldq %mm5, %mm5 + +',` + movq L(rodata_AAAAAAAAAAAAAAAA), %mm7 + movq L(rodata_3333333333333333), %mm6 + movq L(rodata_0F0F0F0F0F0F0F0F), %mm5 +') + pxor %mm4, %mm4 + +define(REG_AAAAAAAAAAAAAAAA,%mm7) +define(REG_3333333333333333,%mm6) +define(REG_0F0F0F0F0F0F0F0F,%mm5) +define(REG_0000000000000000,%mm4) + + + movl PARAM_SRC, %eax +HAM(` movl PARAM_SRC2, %edx') + + pxor %mm2, %mm2 C total + + shrl %ecx + jnc L(top) + + movd (%eax,%ecx,8), %mm1 + +HAM(` movd (%edx,%ecx,8), %mm0 + pxor %mm0, %mm1 +') + orl %ecx, %ecx + jmp L(loaded) + + + ALIGN(16) +L(top): + C eax src + C ebx + C ecx counter, qwords, decrementing + C edx [hamdist] src2 + C + C mm0 (scratch) + C mm1 (scratch) + C mm2 total (low dword) + C mm3 + C mm4 \ + C mm5 | special constants + C mm6 | + C mm7 / + + movq -8(%eax,%ecx,8), %mm1 + +HAM(` pxor -8(%edx,%ecx,8), %mm1') + decl %ecx + +L(loaded): + movq %mm1, %mm0 + pand REG_AAAAAAAAAAAAAAAA, %mm1 + + psrlq $1, %mm1 + + psubd %mm1, %mm0 C bit pairs + + + movq %mm0, %mm1 + psrlq $2, %mm0 + + pand REG_3333333333333333, %mm0 + pand REG_3333333333333333, %mm1 + + paddd %mm1, %mm0 C nibbles + + + movq %mm0, %mm1 + psrlq $4, %mm0 + + pand REG_0F0F0F0F0F0F0F0F, %mm0 + pand REG_0F0F0F0F0F0F0F0F, %mm1 + + paddd %mm1, %mm0 C bytes + + + psadbw( %mm4, %mm0) + + paddd %mm0, %mm2 C add to total + jnz L(top) + + + movd %mm2, %eax + emms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mmx/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/rshift.asm new file mode 100644 index 0000000..345d23a --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/mmx/rshift.asm @@ -0,0 +1,480 @@ +dnl AMD K7 mpn_rshift -- mpn right shift. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K7: 1.21 cycles/limb (at 16 limbs/loop). + + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 4 1.51 +dnl 8 1.26 +dnl 16 1.21 +dnl 32 1.2 +dnl Maximum possible with the current code is 64. + +deflit(UNROLL_COUNT, 16) + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size right by shift many bits and store the result in dst,size. +C Zeros are shifted in at the left. The bits shifted out at the right are +C the return value. +C +C This code uses 64-bit MMX operations, which makes it possible to handle +C two limbs at a time, for a theoretical 1.0 cycles/limb. Plain integer +C code, on the other hand, suffers from shrd being a vector path decode and +C running at 3 cycles back-to-back. +C +C Full speed depends on source and destination being aligned, and some hairy +C setups and finish-ups are done to arrange this for the loop. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 10) +',` +deflit(UNROLL_THRESHOLD, 10) +') + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EDI, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EBX, -12) +deflit(SAVE_SIZE, 12) + + TEXT + ALIGN(32) + +PROLOGUE(mpn_rshift) +deflit(`FRAME',0) + + movl PARAM_SIZE, %eax + movl PARAM_SRC, %edx + subl $SAVE_SIZE, %esp +deflit(`FRAME',SAVE_SIZE) + + movl PARAM_SHIFT, %ecx + movl %edi, SAVE_EDI + + movl PARAM_DST, %edi + decl %eax + jnz L(more_than_one_limb) + + movl (%edx), %edx C src limb + + shrdl( %cl, %edx, %eax) C eax was decremented to zero + + shrl %cl, %edx + + movl %edx, (%edi) C dst limb + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + + ret + + +C ----------------------------------------------------------------------------- +L(more_than_one_limb): + C eax size-1 + C ebx + C ecx shift + C edx src + C esi + C edi dst + C ebp + + movd PARAM_SHIFT, %mm6 C rshift + movd (%edx), %mm5 C src low limb + cmp $UNROLL_THRESHOLD-1, %eax + + jae L(unroll) + leal (%edx,%eax,4), %edx C &src[size-1] + leal -4(%edi,%eax,4), %edi C &dst[size-2] + + movd (%edx), %mm4 C src high limb + negl %eax + + +L(simple_top): + C eax loop counter, limbs, negative + C ebx + C ecx shift + C edx carry + C edx &src[size-1] + C edi &dst[size-2] + C ebp + C + C mm0 scratch + C mm4 src high limb + C mm5 src low limb + C mm6 shift + + movq (%edx,%eax,4), %mm0 + incl %eax + + psrlq %mm6, %mm0 + + movd %mm0, (%edi,%eax,4) + jnz L(simple_top) + + + psllq $32, %mm5 + psrlq %mm6, %mm4 + + psrlq %mm6, %mm5 + movd %mm4, 4(%edi) C dst high limb + + movd %mm5, %eax C return value + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx + C ecx shift + C edx src + C esi + C edi dst + C ebp + C + C mm5 src low limb + C mm6 rshift + + testb $4, %dl + movl %esi, SAVE_ESI + movl %ebx, SAVE_EBX + + psllq $32, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process low limb separately (marked xxx) and + C step src and dst by one 
limb, making src aligned. + C + C source edx + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + C + C dest edi + C --+-------+-------+ + C | | xxx | + C --+-------+-------+ + + movq (%edx), %mm0 C src low two limbs + addl $4, %edx + movl %eax, PARAM_SIZE C size-1 + + addl $4, %edi + decl %eax C size-2 is new size-1 + + psrlq %mm6, %mm0 + movl %edi, PARAM_DST C new dst + + movd %mm0, -4(%edi) +L(start_src_aligned): + + + movq (%edx), %mm1 C src low two limbs + decl %eax C size-2, two last limbs handled at end + testl $4, %edi + + psrlq %mm6, %mm5 + jz L(start_dst_aligned) + + + C dst isn't aligned, add 4 to make it so, and pretend the shift is + C 32 bits extra. Low limb of dst (marked xxx) handled here separately. + C + C source edx + C --+-------+-------+ + C | mm1 | + C --+-------+-------+ + C 4mod8 0mod8 + C + C dest edi + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + + movq %mm1, %mm0 + psrlq %mm6, %mm1 + addl $32, %ecx C shift+32 + + movd %mm1, (%edi) + movq %mm0, %mm1 + addl $4, %edi C new dst + + movd %ecx, %mm6 +L(start_dst_aligned): + + + movq %mm1, %mm2 C copy of src low two limbs + negl %ecx + andl $-2, %eax C round size down to even + + movl %eax, %ebx + negl %eax + addl $64, %ecx + + andl $UNROLL_MASK, %eax + decl %ebx + + shll %eax + + movd %ecx, %mm7 C lshift = 64-rshift + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%eax,%eax,4), %esi + negl %eax +') + shrl $UNROLL_LOG2, %ebx C loop counter + + leal ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx + leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi + movl PARAM_SIZE, %eax C for use at end + + jmp *%esi + + +ifdef(`PIC',` +L(pic_calc): + C See mpn/x86/README about old gas bugs + leal (%eax,%eax,4), %esi + addl $L(entry)-L(here), %esi + addl (%esp), %esi + negl %eax + + ret_internal +') + + +C ----------------------------------------------------------------------------- + ALIGN(64) +L(top): + C eax size, for use at end + C ebx loop counter + C ecx lshift + C edx src + C esi was computed jump + C edi dst + C ebp + C + C mm0 scratch + C mm1 \ carry (alternating) + C mm2 / + C mm6 rshift + C mm7 lshift + C + C 10 code bytes/limb + C + C The two chunks differ in whether mm1 or mm2 hold the carry. + C The computed jump puts the initial carry in both mm1 and mm2. + +L(entry): +deflit(CHUNK_COUNT, 4) +forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 8)) + +Zdisp( movq, disp0,(%edx), %mm0) + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + por %mm2, %mm0 +Zdisp( movq, %mm0, disp0,(%edi)) + + +Zdisp( movq, disp1,(%edx), %mm0) + psrlq %mm6, %mm1 + + movq %mm0, %mm2 + psllq %mm7, %mm0 + + por %mm1, %mm0 +Zdisp( movq, %mm0, disp1,(%edi)) +') + + addl $UNROLL_BYTES, %edx + addl $UNROLL_BYTES, %edi + decl %ebx + + jns L(top) + + +deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) +deflit(`disp1', eval(disp0-0 + 8)) + + testb $1, %al + psrlq %mm6, %mm2 C wanted rshifted in all cases below + movl SAVE_ESI, %esi + + movd %mm5, %eax C return value + + movl SAVE_EBX, %ebx + jz L(end_even) + + + C Size odd, destination was aligned. 
+ C + C source + C edx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edi + C +-------+---------------+---------------+-- + C | | | written | + C +-------+---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size odd, destination was unaligned. + C + C source + C edx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edi + C +---------------+---------------+-- + C | | written | + C +---------------+---------------+-- + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword to store, and in the aligned case there's + C a further extra limb of dst to be formed. + + + movd disp0(%edx), %mm0 + movq %mm0, %mm1 + + psllq %mm7, %mm0 + testb $32, %cl + + por %mm2, %mm0 + psrlq %mm6, %mm1 + + movq %mm0, disp0(%edi) + jz L(finish_odd_unaligned) + + movd %mm1, disp1(%edi) +L(finish_odd_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +L(end_even): + + C Size even, destination was aligned. + C + C source + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edi + C +---------------+---------------+-- + C | | mm3 | + C +---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size even, destination was unaligned. + C + C source + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edi + C +-------+---------------+-- + C | | mm3 | + C +-------+---------------+-- + C + C mm6 = shift+32 + C mm7 = 64-(shift+32) + + + C The movd for the unaligned case is the same data as the movq for + C the aligned case, it's just a choice between whether one or two + C limbs should be written. + + + testb $32, %cl + movd %mm2, disp0(%edi) + + jz L(end_even_unaligned) + + movq %mm2, disp0(%edi) +L(end_even_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mod_1_1.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mod_1_1.asm new file mode 100644 index 0000000..1bbe6f9 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/mod_1_1.asm @@ -0,0 +1,221 @@ +dnl x86-32 mpn_mod_1_1p, requiring cmov. + +dnl Contributed to the GNU project by Niels Möller and Torbjorn Granlund. + +dnl Copyright 2010, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C P5 ? +C P6 model 0-8,10-12 ? +C P6 model 9 (Banias) ? 
+C P6 model 13 (Dothan) ? +C P4 model 0 (Willamette) ? +C P4 model 1 (?) ? +C P4 model 2 (Northwood) ? +C P4 model 3 (Prescott) ? +C P4 model 4 (Nocona) ? +C AMD K6 ? +C AMD K7 7 +C AMD K8 ? + +define(`B2mb', `%ebx') +define(`r0', `%esi') +define(`r2', `%ebp') +define(`t0', `%edi') +define(`ap', `%ecx') C Also shift count + +C Stack frame +C pre 36(%esp) +C b 32(%esp) +C n 28(%esp) +C ap 24(%esp) +C return 20(%esp) +C %ebp 16(%esp) +C %edi 12(%esp) +C %esi 8(%esp) +C %ebx 4(%esp) +C B2mod (%esp) + +define(`B2modb', `(%esp)') +define(`n', `28(%esp)') +define(`b', `32(%esp)') +define(`pre', `36(%esp)') + +C mp_limb_t +C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4]) +C +C The pre array contains bi, cnt, B1modb, B2modb +C Note: This implementation needs B1modb only when cnt > 0 + +ASM_START() + TEXT + ALIGN(8) +PROLOGUE(mpn_mod_1_1p) + push %ebp + push %edi + push %esi + push %ebx + mov 32(%esp), %ebp C pre[] + + mov 12(%ebp), %eax C B2modb + push %eax C Put it on stack + + mov n, %edx + mov 24(%esp), ap + + lea (ap, %edx, 4), ap + mov -4(ap), %eax + cmp $3, %edx + jnc L(first) + mov -8(ap), r0 + jmp L(reduce_two) + +L(first): + C First iteration, no r2 + mull B2modb + mov -12(ap), r0 + add %eax, r0 + mov -8(ap), %eax + adc %edx, %eax + sbb r2, r2 + subl $3, n + lea -16(ap), ap + jz L(reduce_three) + + mov B2modb, B2mb + sub b, B2mb + lea (B2mb, r0), t0 + jmp L(mid) + + ALIGN(16) +L(top): C Loopmixed to 7 c/l on k7 + add %eax, r0 + lea (B2mb, r0), t0 + mov r2, %eax + adc %edx, %eax + sbb r2, r2 +L(mid): mull B2modb + and B2modb, r2 + add r0, r2 + decl n + mov (ap), r0 + cmovc( t0, r2) + lea -4(ap), ap + jnz L(top) + + add %eax, r0 + mov r2, %eax + adc %edx, %eax + sbb r2, r2 + +L(reduce_three): + C Eliminate r2 + and b, r2 + sub r2, %eax + +L(reduce_two): + mov pre, %ebp + movb 4(%ebp), %cl + test %cl, %cl + jz L(normalized) + + C Unnormalized, use B1modb to reduce to size < B b + mull 8(%ebp) + xor t0, t0 + add %eax, r0 + adc %edx, t0 + mov t0, %eax + + C Left-shift to normalize + shld %cl, r0, %eax C Always use shld? + + shl %cl, r0 + jmp L(udiv) + +L(normalized): + mov %eax, t0 + sub b, t0 + cmovnc( t0, %eax) + +L(udiv): + lea 1(%eax), t0 + mull (%ebp) + mov b, %ebx C Needed in register for lea + add r0, %eax + adc t0, %edx + imul %ebx, %edx + sub %edx, r0 + cmp r0, %eax + lea (%ebx, r0), %eax + cmovnc( r0, %eax) + cmp %ebx, %eax + jnc L(fix) +L(ok): shr %cl, %eax + + add $4, %esp + pop %ebx + pop %esi + pop %edi + pop %ebp + + ret +L(fix): sub %ebx, %eax + jmp L(ok) +EPILOGUE() + +PROLOGUE(mpn_mod_1_1p_cps) + push %ebp + mov 12(%esp), %ebp + push %esi + bsr %ebp, %ecx + push %ebx + xor $31, %ecx + mov 16(%esp), %esi + sal %cl, %ebp + mov %ebp, %edx + not %edx + mov $-1, %eax + div %ebp C On K7, invert_limb would be a few cycles faster. + mov %eax, (%esi) C store bi + mov %ecx, 4(%esi) C store cnt + neg %ebp + mov $1, %edx + shld %cl, %eax, %edx + imul %ebp, %edx + shr %cl, %edx + imul %ebp, %eax + mov %edx, 8(%esi) C store B1modb + mov %eax, 12(%esi) C store B2modb + pop %ebx + pop %esi + pop %ebp + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mod_1_4.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mod_1_4.asm new file mode 100644 index 0000000..bb7597e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/mod_1_4.asm @@ -0,0 +1,260 @@ +dnl x86-32 mpn_mod_1s_4p, requiring cmov. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2009, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C P5 ? +C P6 model 0-8,10-12 ? +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) 6 +C P4 model 0 (Willamette) ? +C P4 model 1 (?) ? +C P4 model 2 (Northwood) 15.5 +C P4 model 3 (Prescott) ? +C P4 model 4 (Nocona) ? +C AMD K6 ? +C AMD K7 4.75 +C AMD K8 ? + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_1s_4p) + push %ebp + push %edi + push %esi + push %ebx + sub $28, %esp + mov 60(%esp), %edi C cps[] + mov 8(%edi), %eax + mov 12(%edi), %edx + mov 16(%edi), %ecx + mov 20(%edi), %esi + mov 24(%edi), %edi + mov %eax, 4(%esp) + mov %edx, 8(%esp) + mov %ecx, 12(%esp) + mov %esi, 16(%esp) + mov %edi, 20(%esp) + mov 52(%esp), %eax C n + xor %edi, %edi + mov 48(%esp), %esi C up + lea -12(%esi,%eax,4), %esi + and $3, %eax + je L(b0) + cmp $2, %eax + jc L(b1) + je L(b2) + +L(b3): mov 4(%esi), %eax + mull 4(%esp) + mov (%esi), %ebp + add %eax, %ebp + adc %edx, %edi + mov 8(%esi), %eax + mull 8(%esp) + lea -12(%esi), %esi + jmp L(m0) + +L(b0): mov (%esi), %eax + mull 4(%esp) + mov -4(%esi), %ebp + add %eax, %ebp + adc %edx, %edi + mov 4(%esi), %eax + mull 8(%esp) + add %eax, %ebp + adc %edx, %edi + mov 8(%esi), %eax + mull 12(%esp) + lea -16(%esi), %esi + jmp L(m0) + +L(b1): mov 8(%esi), %ebp + lea -4(%esi), %esi + jmp L(m1) + +L(b2): mov 8(%esi), %edi + mov 4(%esi), %ebp + lea -8(%esi), %esi + jmp L(m1) + + ALIGN(16) +L(top): mov (%esi), %eax + mull 4(%esp) + mov -4(%esi), %ebx + xor %ecx, %ecx + add %eax, %ebx + adc %edx, %ecx + mov 4(%esi), %eax + mull 8(%esp) + add %eax, %ebx + adc %edx, %ecx + mov 8(%esi), %eax + mull 12(%esp) + add %eax, %ebx + adc %edx, %ecx + lea -16(%esi), %esi + mov 16(%esp), %eax + mul %ebp + add %eax, %ebx + adc %edx, %ecx + mov 20(%esp), %eax + mul %edi + mov %ebx, %ebp + mov %ecx, %edi +L(m0): add %eax, %ebp + adc %edx, %edi +L(m1): subl $4, 52(%esp) + ja L(top) + +L(end): mov 4(%esp), %eax + mul %edi + mov 60(%esp), %edi + add %eax, %ebp + adc $0, %edx + mov 4(%edi), %ecx + mov %edx, %esi + mov %ebp, %eax + sal %cl, %esi + mov %ecx, %ebx + neg %ecx + shr %cl, %eax + or %esi, %eax + lea 1(%eax), %esi + mull (%edi) + mov %ebx, %ecx + mov %eax, %ebx + mov %ebp, %eax + mov 56(%esp), %ebp + sal %cl, %eax + add %eax, %ebx + adc %esi, %edx + imul %ebp, %edx + sub %edx, %eax + lea (%eax,%ebp), %edx + cmp %eax, %ebx + cmovc( %edx, %eax) + mov %eax, %edx + sub %ebp, %eax + cmovc( %edx, %eax) + add $28, %esp + pop %ebx + pop %esi + pop %edi + pop %ebp + shr %cl, %eax + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_mod_1s_4p_cps) +C CAUTION: This is the same code as in 
pentium4/sse2/mod_1_4.asm + push %ebp + push %edi + push %esi + push %ebx + mov 20(%esp), %ebp C FIXME: avoid bp for 0-idx + mov 24(%esp), %ebx + bsr %ebx, %ecx + xor $31, %ecx + sal %cl, %ebx C b << cnt + mov %ebx, %edx + not %edx + mov $-1, %eax + div %ebx + xor %edi, %edi + sub %ebx, %edi + mov $1, %esi + mov %eax, (%ebp) C store bi + mov %ecx, 4(%ebp) C store cnt + shld %cl, %eax, %esi + imul %edi, %esi + mov %eax, %edi + mul %esi + + add %esi, %edx + shr %cl, %esi + mov %esi, 8(%ebp) C store B1modb + + not %edx + imul %ebx, %edx + lea (%edx,%ebx), %esi + cmp %edx, %eax + cmovnc( %edx, %esi) + mov %edi, %eax + mul %esi + + add %esi, %edx + shr %cl, %esi + mov %esi, 12(%ebp) C store B2modb + + not %edx + imul %ebx, %edx + lea (%edx,%ebx), %esi + cmp %edx, %eax + cmovnc( %edx, %esi) + mov %edi, %eax + mul %esi + + add %esi, %edx + shr %cl, %esi + mov %esi, 16(%ebp) C store B3modb + + not %edx + imul %ebx, %edx + lea (%edx,%ebx), %esi + cmp %edx, %eax + cmovnc( %edx, %esi) + mov %edi, %eax + mul %esi + + add %esi, %edx + shr %cl, %esi + mov %esi, 20(%ebp) C store B4modb + + not %edx + imul %ebx, %edx + add %edx, %ebx + cmp %edx, %eax + cmovnc( %edx, %ebx) + + shr %cl, %ebx + mov %ebx, 24(%ebp) C store B5modb + + pop %ebx + pop %esi + pop %edi + pop %ebp + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mod_34lsub1.asm new file mode 100644 index 0000000..ee3ad04 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/mod_34lsub1.asm @@ -0,0 +1,188 @@ +dnl AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1. + +dnl Copyright 2000-2002, 2004, 2005, 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C Athlon: 1 +C Hammer: 1 + + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size) +C +C The loop form below and the 64 byte code alignment seem necessary for the +C claimed speed. This is a bit strange, since normally k7 isn't very +C sensitive to such things. Perhaps there has to be 6 instructions in the +C first 16 bytes for the BTB entry or something. 
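The arithmetic is easier to follow in scalar C: a limb at index i carries weight 2^(8*(i mod 3)) because 2^32 == 2^8 and 2^64 == 2^16 modulo 2^24-1, so the limbs are summed into three accumulators and each sum is then split so its upper part can be folded back in using 2^24 == 1. A hedged sketch follows; 64-bit accumulators stand in for the adcl carry chains, `ref_mod_34lsub1` is an illustrative name, and the folding at the end merely keeps the congruent result within 32 bits.

```c
#include <stddef.h>
#include <stdint.h>

/* Return a value congruent to {src,n} modulo 2^24 - 1 (not necessarily the
   exact bit pattern the assembler produces, but congruent to it). */
static uint32_t
ref_mod_34lsub1 (const uint32_t *src, size_t n)
{
  uint64_t c0 = 0, c1 = 0, c2 = 0;          /* weights 2^0, 2^8, 2^16 */
  for (size_t i = 0; i < n; i++)
    {
      if      (i % 3 == 0) c0 += src[i];
      else if (i % 3 == 1) c1 += src[i];
      else                 c2 += src[i];
    }
  /* Split each sum so that the upper part, times its weight, is a multiple
     of 2^24 and therefore counts with weight 1. */
  uint64_t r = (c0 & 0xFFFFFF)       + (c0 >> 24)
             + ((c1 & 0xFFFF) << 8)  + (c1 >> 16)
             + ((c2 & 0xFF)   << 16) + (c2 >> 8);
  while (r >> 24)                           /* fold below 2^24 */
    r = (r & 0xFFFFFF) + (r >> 24);
  return (uint32_t) r;
}
```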
+ +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +dnl re-use parameter space +define(SAVE_EDI, `PARAM_SIZE') + + TEXT + ALIGN(64) +PROLOGUE(mpn_mod_34lsub1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %edx + + subl $2, %ecx + ja L(three_or_more) + + movl (%edx), %eax + jb L(one) + + movl 4(%edx), %ecx + movl %eax, %edx + shrl $24, %eax C src[0] low + + andl $0xFFFFFF, %edx C src[0] high + addl %edx, %eax + movl %ecx, %edx + + andl $0xFFFF, %ecx + shrl $16, %edx C src[1] high + addl %edx, %eax + + shll $8, %ecx C src[1] low + addl %ecx, %eax + +L(one): + ret + + +L(three_or_more): + C eax + C ebx + C ecx size-2 + C edx src + C esi + C edi + + pushl %ebx FRAME_pushl() + xorl %eax, %eax + xorl %ebx, %ebx + + movl %edi, SAVE_EDI + pushl %esi FRAME_pushl() + xorl %esi, %esi C and clear carry flag + + + C code offset 0x40 at this point +L(top): + C eax acc 0mod3 + C ebx acc 1mod3 + C ecx counter, limbs + C edx src + C esi acc 2mod3 + C edi + + leal 24(%edx), %edx + leal -2(%ecx), %ecx + adcl -24(%edx), %eax + adcl -20(%edx), %ebx + adcl -16(%edx), %esi + + decl %ecx + jng L(done_loop) + + leal -2(%ecx), %ecx + adcl -12(%edx), %eax + adcl -8(%edx), %ebx + adcl -4(%edx), %esi + + decl %ecx + jg L(top) + + + leal 12(%edx), %edx + + +L(done_loop): + C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively + + incl %ecx + movl $0xFFFFFFFF, %edi + js L(combine) + + adcl -12(%edx), %eax + decl %ecx + movl $0xFFFFFF00, %edi + js L(combine) + + adcl -8(%edx), %ebx + movl $0xFFFF0000, %edi + + +L(combine): + C eax acc 0mod3 + C ebx acc 1mod3 + C ecx + C edx + C esi acc 2mod3 + C edi mask + + sbbl %ecx, %ecx C carry + movl %eax, %edx C 0mod3 + shrl $24, %eax C 0mod3 high + + andl %edi, %ecx C carry masked + andl $0x00FFFFFF, %edx C 0mod3 low + movl %ebx, %edi C 1mod3 + + subl %ecx, %eax C apply carry + shrl $16, %ebx C 1mod3 high + andl $0xFFFF, %edi + + addl %edx, %eax C apply 0mod3 low + movl %esi, %edx C 2mod3 + shll $8, %edi C 1mod3 low + + addl %ebx, %eax C apply 1mod3 high + shrl $8, %esi C 2mod3 high + movzbl %dl, %edx C 2mod3 low + + addl %edi, %eax C apply 1mod3 low + shll $16, %edx C 2mod3 low + + addl %esi, %eax C apply 2mod3 high + popl %esi FRAME_popl() + + movl SAVE_EDI, %edi + addl %edx, %eax C apply 2mod3 low + popl %ebx FRAME_popl() + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mode1o.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mode1o.asm new file mode 100644 index 0000000..2394033 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/mode1o.asm @@ -0,0 +1,181 @@ +dnl AMD K7 mpn_modexact_1_odd -- exact division style remainder. + +dnl Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C Athlon: 11.0 +C Hammer: 7.0 + + +C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C +C With the loop running at just 11 cycles it doesn't seem worth bothering to +C check for high<divisor to save one step. +C +C Using a divl for size==1 measures slower than the modexact method, which +C is not too surprising since for the latter it's only about 24 cycles to +C calculate the modular inverse. + +defframe(PARAM_CARRY, 16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +deflit(STACK_SPACE, 16) + + TEXT + + ALIGN(16) +PROLOGUE(mpn_modexact_1c_odd) +deflit(`FRAME',0) + + movl PARAM_CARRY, %ecx + jmp L(start_1c) + +EPILOGUE() + + + ALIGN(16) +PROLOGUE(mpn_modexact_1_odd) +deflit(`FRAME',0) + + xorl %ecx, %ecx +L(start_1c): + movl PARAM_DIVISOR, %eax + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_DIVISOR, %esi + + movl %edi, SAVE_EDI + + shrl %eax C d/2 + + andl $127, %eax + +ifdef(`PIC',` + LEA( binvert_limb_table, %edi) + movzbl (%eax,%edi), %edi C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %edi C inv 8 bits +') + + xorl %edx, %edx C initial extra carry + leal (%edi,%edi), %eax C 2*inv + + imull %edi, %edi C inv*inv + + movl %ebp, SAVE_EBP + movl PARAM_SIZE, %ebp + + movl %ebx, SAVE_EBX + movl PARAM_SRC, %ebx + + imull %esi, %edi C inv*inv*d + + subl %edi, %eax C inv = 2*inv - inv*inv*d + leal (%eax,%eax), %edi C 2*inv + + imull %eax, %eax C inv*inv + + imull %esi, %eax C inv*inv*d + + leal (%ebx,%ebp,4), %ebx C src end + negl %ebp C -size + + subl %eax, %edi C inv = 2*inv - inv*inv*d + + ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS + movl %esi, %eax + imull %edi, %eax + cmpl $1, %eax') + + +C The dependent chain here is +C +C cycles +C subl %edx, %eax 1 +C imull %edi, %eax 4 +C mull %esi 6 (high limb) +C ---- +C total 11 +C +C Out of order execution hides the load latency for the source data, so no +C special scheduling is required. + +L(top): + C eax src limb + C ebx src end ptr + C ecx next carry bit, 0 or 1 (or initial carry param) + C edx carry limb, high of last product + C esi divisor + C edi inverse + C ebp counter, limbs, negative + + movl (%ebx,%ebp,4), %eax + + subl %ecx, %eax C apply carry bit + movl $0, %ecx + + setc %cl C new carry bit + + subl %edx, %eax C apply carry limb + adcl $0, %ecx + + imull %edi, %eax + + mull %esi + + incl %ebp + jnz L(top) + + + movl SAVE_ESI, %esi + movl SAVE_EDI, %edi + leal (%ecx,%edx), %eax + + movl SAVE_EBX, %ebx + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mul_1.asm new file mode 100644 index 0000000..755cd2e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/mul_1.asm @@ -0,0 +1,237 @@ +dnl AMD K7 mpn_mul_1. + +dnl Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5 +C P6 model 0-8,10-12) +C P6 model 9 (Banias) +C P6 model 13 (Dothan) +C P4 model 0 (Willamette) +C P4 model 1 (?) +C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C AMD K6 +C AMD K7 3.25 +C AMD K8 + +C TODO +C * Improve feed-in and wind-down code. We beat the old code for all n != 1, +C but we might be able to do even better. +C * The feed-in code for mul_1c is crude. + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1c) + add $-16, %esp + mov %ebp, (%esp) + mov %ebx, 4(%esp) + mov %esi, 8(%esp) + mov %edi, 12(%esp) + + mov 20(%esp), %edi + mov 24(%esp), %esi + mov 28(%esp), %ebp + mov 32(%esp), %ecx + mov %ebp, %ebx + shr $2, %ebp + mov %ebp, 28(%esp) + mov (%esi), %eax + and $3, %ebx + jz L(c0) + cmp $2, %ebx + mov 36(%esp), %ebx + jz L(c2) + jg L(c3) + +L(c1): lea -4(%edi), %edi + mul %ecx + test %ebp, %ebp + jnz 1f + add %ebx, %eax + mov %eax, 4(%edi) + mov %edx, %eax + adc %ebp, %eax + jmp L(rt) +1: add %eax, %ebx + mov $0, %ebp + adc %edx, %ebp + mov 4(%esi), %eax + jmp L(1) + +L(c2): lea 4(%esi), %esi + mul %ecx + test %ebp, %ebp + mov %ebx, %ebp + jnz 2f + add %eax, %ebp + mov $0, %ebx + adc %edx, %ebx + mov (%esi), %eax + jmp L(cj2) +2: add %eax, %ebp + mov $0, %ebx + adc %edx, %ebx + mov (%esi), %eax + jmp L(2) + +L(c3): lea 8(%esi), %esi + lea -12(%edi), %edi + mul %ecx + add %eax, %ebx + mov $0, %ebp + adc %edx, %ebp + mov -4(%esi), %eax + incl 28(%esp) + jmp L(3) + +L(c0): mov 36(%esp), %ebx + lea -4(%esi), %esi + lea -8(%edi), %edi + mul %ecx + mov %ebx, %ebp + add %eax, %ebp + mov $0, %ebx + adc %edx, %ebx + mov 8(%esi), %eax + jmp L(0) + +EPILOGUE() + ALIGN(16) +PROLOGUE(mpn_mul_1) + add $-16, %esp + mov %ebp, (%esp) + mov %ebx, 4(%esp) + mov %esi, 8(%esp) + mov %edi, 12(%esp) + + mov 20(%esp), %edi + mov 24(%esp), %esi + mov 28(%esp), %ebp + mov 32(%esp), %ecx + mov %ebp, %ebx + shr $2, %ebp + mov %ebp, 28(%esp) + mov (%esi), %eax + and $3, %ebx + jz L(b0) + cmp $2, %ebx + jz L(b2) + jg L(b3) + +L(b1): lea -4(%edi), %edi + mul %ecx + test %ebp, %ebp + jnz L(gt1) + mov %eax, 4(%edi) + mov %edx, %eax + jmp L(rt) +L(gt1): mov %eax, %ebx + mov %edx, %ebp + mov 4(%esi), %eax + jmp L(1) + +L(b2): lea 4(%esi), %esi + mul %ecx + test %ebp, %ebp + mov %eax, %ebp + mov %edx, %ebx + mov (%esi), %eax + jnz L(2) + jmp L(cj2) + +L(b3): lea 8(%esi), %esi + lea -12(%edi), %edi + mul %ecx + mov %eax, %ebx + mov %edx, %ebp + mov -4(%esi), %eax + incl 28(%esp) + jmp L(3) + +L(b0): lea -4(%esi), %esi + lea -8(%edi), %edi + mul %ecx + mov %eax, 
%ebp + mov %edx, %ebx + mov 8(%esi), %eax + jmp L(0) + + ALIGN(16) +L(top): mov $0, %ebx + adc %edx, %ebx +L(2): mul %ecx + add %eax, %ebx + mov %ebp, 0(%edi) + mov 4(%esi), %eax + mov $0, %ebp + adc %edx, %ebp +L(1): mul %ecx + add %eax, %ebp + mov 8(%esi), %eax + mov %ebx, 4(%edi) + mov $0, %ebx + adc %edx, %ebx +L(0): mov %ebp, 8(%edi) + mul %ecx + add %eax, %ebx + mov 12(%esi), %eax + lea 16(%esi), %esi + mov $0, %ebp + adc %edx, %ebp +L(3): mov %ebx, 12(%edi) + mul %ecx + lea 16(%edi), %edi + add %eax, %ebp + decl 28(%esp) + mov 0(%esi), %eax + jnz L(top) + +L(end): mov $0, %ebx + adc %edx, %ebx +L(cj2): mul %ecx + add %eax, %ebx + mov %ebp, (%edi) +L(cj1): mov %ebx, 4(%edi) + adc $0, %edx + mov %edx, %eax + +L(rt): mov (%esp), %ebp + mov 4(%esp), %ebx + mov 8(%esp), %esi + mov 12(%esp), %edi + add $16, %esp + ret +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/k7/mul_basecase.asm new file mode 100644 index 0000000..4dfb500 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/mul_basecase.asm @@ -0,0 +1,602 @@ +dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K7: approx 4.42 cycles per cross product at around 20x20 limbs (16 +C limbs/loop unrolling). + + + +dnl K7 UNROLL_COUNT cycles/product (at around 20x20) +dnl 8 4.67 +dnl 16 4.59 +dnl 32 4.42 +dnl Maximum possible with the current code is 32. +dnl +dnl At 32 the typical 13-26 limb sizes from the karatsuba code will get +dnl done with a straight run through a block of code, no inner loop. Using +dnl 32 gives 1k of code, but the k7 has a 64k L1 code cache. + +deflit(UNROLL_COUNT, 32) + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); +C +C Calculate xp,xsize multiplied by yp,ysize, storing the result in +C wp,xsize+ysize. +C +C This routine is essentially the same as mpn/generic/mul_basecase.c, but +C it's faster because it does most of the mpn_addmul_1() startup +C calculations only once. The saving is 15-25% on typical sizes coming from +C the Karatsuba multiply code. 
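As noted above, the structure is the schoolbook one from mpn/generic/mul_basecase.c: one mpn_mul_1 pass for yp[0], then one mpn_addmul_1 pass per remaining yp limb. A rough sketch in terms of the public mpn entry points, assuming xsize >= ysize >= 1 (mul_basecase_ref is an illustrative name; the code below inlines and unrolls these passes instead of calling them):

    #include <gmp.h>

    /* wp[0 .. xn+yn-1] = {xp,xn} * {yp,yn} */
    static void
    mul_basecase_ref (mp_ptr wp, mp_srcptr xp, mp_size_t xn,
                      mp_srcptr yp, mp_size_t yn)
    {
      wp[xn] = mpn_mul_1 (wp, xp, xn, yp[0]);            /* first row stores  */
      for (mp_size_t i = 1; i < yn; i++)                 /* later rows add in */
        wp[xn + i] = mpn_addmul_1 (wp + i, xp, xn, yp[i]);
    }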
+ +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 5) +',` +deflit(UNROLL_THRESHOLD, 5) +') + +defframe(PARAM_YSIZE,20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE,12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_basecase) +deflit(`FRAME',0) + + movl PARAM_XSIZE, %ecx + movl PARAM_YP, %eax + + movl PARAM_XP, %edx + movl (%eax), %eax C yp low limb + + cmpl $2, %ecx + ja L(xsize_more_than_two) + je L(two_by_something) + + + C one limb by one limb + + mull (%edx) + + movl PARAM_WP, %ecx + movl %eax, (%ecx) + movl %edx, 4(%ecx) + ret + + +C ----------------------------------------------------------------------------- +L(two_by_something): +deflit(`FRAME',0) + decl PARAM_YSIZE + pushl %ebx defframe_pushl(`SAVE_EBX') + movl %eax, %ecx C yp low limb + + movl PARAM_WP, %ebx + pushl %esi defframe_pushl(`SAVE_ESI') + movl %edx, %esi C xp + + movl (%edx), %eax C xp low limb + jnz L(two_by_two) + + + C two limbs by one limb + + mull %ecx + + movl %eax, (%ebx) + movl 4(%esi), %eax + movl %edx, %esi C carry + + mull %ecx + + addl %eax, %esi + + movl %esi, 4(%ebx) + movl SAVE_ESI, %esi + + adcl $0, %edx + + movl %edx, 8(%ebx) + movl SAVE_EBX, %ebx + addl $FRAME, %esp + + ret + + + +C ----------------------------------------------------------------------------- +C Could load yp earlier into another register. + + ALIGN(16) +L(two_by_two): + C eax xp low limb + C ebx wp + C ecx yp low limb + C edx + C esi xp + C edi + C ebp + +dnl FRAME carries on from previous + + mull %ecx C xp[0] * yp[0] + + push %edi defframe_pushl(`SAVE_EDI') + movl %edx, %edi C carry, for wp[1] + + movl %eax, (%ebx) + movl 4(%esi), %eax + + mull %ecx C xp[1] * yp[0] + + addl %eax, %edi + movl PARAM_YP, %ecx + + adcl $0, %edx + movl 4(%ecx), %ecx C yp[1] + movl %edi, 4(%ebx) + + movl 4(%esi), %eax C xp[1] + movl %edx, %edi C carry, for wp[2] + + mull %ecx C xp[1] * yp[1] + + addl %eax, %edi + + adcl $0, %edx + movl (%esi), %eax C xp[0] + + movl %edx, %esi C carry, for wp[3] + + mull %ecx C xp[0] * yp[1] + + addl %eax, 4(%ebx) + adcl %edx, %edi + movl %edi, 8(%ebx) + + adcl $0, %esi + movl SAVE_EDI, %edi + movl %esi, 12(%ebx) + + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + addl $FRAME, %esp + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(xsize_more_than_two): + +C The first limb of yp is processed with a simple mpn_mul_1 style loop +C inline. Unrolling this doesn't seem worthwhile since it's only run once +C (whereas the addmul below is run ysize-1 many times). A call to the +C actual mpn_mul_1 will be slowed down by the call and parameter pushing and +C popping, and doesn't seem likely to be worthwhile on the typical 13-26 +C limb operations the Karatsuba code calls here with. 
+ + C eax yp[0] + C ebx + C ecx xsize + C edx xp + C esi + C edi + C ebp + +dnl FRAME doesn't carry on from previous, no pushes yet here +defframe(`SAVE_EBX',-4) +defframe(`SAVE_ESI',-8) +defframe(`SAVE_EDI',-12) +defframe(`SAVE_EBP',-16) +deflit(`FRAME',0) + + subl $16, %esp +deflit(`FRAME',16) + + movl %edi, SAVE_EDI + movl PARAM_WP, %edi + + movl %ebx, SAVE_EBX + movl %ebp, SAVE_EBP + movl %eax, %ebp + + movl %esi, SAVE_ESI + xorl %ebx, %ebx + leal (%edx,%ecx,4), %esi C xp end + + leal (%edi,%ecx,4), %edi C wp end of mul1 + negl %ecx + + +L(mul1): + C eax scratch + C ebx carry + C ecx counter, negative + C edx scratch + C esi xp end + C edi wp end of mul1 + C ebp multiplier + + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %ebx, %eax + movl %eax, (%edi,%ecx,4) + movl $0, %ebx + + adcl %edx, %ebx + incl %ecx + jnz L(mul1) + + + movl PARAM_YSIZE, %edx + movl PARAM_XSIZE, %ecx + + movl %ebx, (%edi) C final carry + decl %edx + + jnz L(ysize_more_than_one) + + + movl SAVE_EDI, %edi + movl SAVE_EBX, %ebx + + movl SAVE_EBP, %ebp + movl SAVE_ESI, %esi + addl $FRAME, %esp + + ret + + +L(ysize_more_than_one): + cmpl $UNROLL_THRESHOLD, %ecx + movl PARAM_YP, %eax + + jae L(unroll) + + +C ----------------------------------------------------------------------------- + C simple addmul looping + C + C eax yp + C ebx + C ecx xsize + C edx ysize-1 + C esi xp end + C edi wp end of mul1 + C ebp + + leal 4(%eax,%edx,4), %ebp C yp end + negl %ecx + negl %edx + + movl (%esi,%ecx,4), %eax C xp low limb + movl %edx, PARAM_YSIZE C -(ysize-1) + incl %ecx + + xorl %ebx, %ebx C initial carry + movl %ecx, PARAM_XSIZE C -(xsize-1) + movl %ebp, PARAM_YP + + movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier + jmp L(simple_outer_entry) + + + C this is offset 0x121 so close enough to aligned +L(simple_outer_top): + C ebp ysize counter, negative + + movl PARAM_YP, %edx + movl PARAM_XSIZE, %ecx C -(xsize-1) + xorl %ebx, %ebx C carry + + movl %ebp, PARAM_YSIZE + addl $4, %edi C next position in wp + + movl (%edx,%ebp,4), %ebp C yp limb - multiplier + movl -4(%esi,%ecx,4), %eax C xp low limb + + +L(simple_outer_entry): + +L(simple_inner): + C eax xp limb + C ebx carry limb + C ecx loop counter (negative) + C edx scratch + C esi xp end + C edi wp end + C ebp multiplier + + mull %ebp + + addl %eax, %ebx + adcl $0, %edx + + addl %ebx, (%edi,%ecx,4) + movl (%esi,%ecx,4), %eax + adcl $0, %edx + + incl %ecx + movl %edx, %ebx + jnz L(simple_inner) + + + mull %ebp + + movl PARAM_YSIZE, %ebp + addl %eax, %ebx + + adcl $0, %edx + addl %ebx, (%edi) + + adcl $0, %edx + incl %ebp + + movl %edx, 4(%edi) + jnz L(simple_outer_top) + + + movl SAVE_EBX, %ebx + movl SAVE_ESI, %esi + + movl SAVE_EDI, %edi + movl SAVE_EBP, %ebp + addl $FRAME, %esp + + ret + + + +C ----------------------------------------------------------------------------- +C +C The unrolled loop is the same as in mpn_addmul_1(), see that code for some +C comments. +C +C VAR_ADJUST is the negative of how many limbs the leals in the inner loop +C increment xp and wp. This is used to adjust back xp and wp, and rshifted +C to given an initial VAR_COUNTER at the top of the outer loop. +C +C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT +C up to -1, inclusive. +C +C VAR_JMP is the computed jump into the unrolled loop. +C +C VAR_XP_LOW is the least significant limb of xp, which is needed at the +C start of the unrolled loop. +C +C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1, +C inclusive. 
+C +C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be +C added to give the location of the next limb of yp, which is the multiplier +C in the unrolled loop. +C +C The trick with VAR_ADJUST means it's only necessary to do one fetch in the +C outer loop to take care of xp, wp and the inner loop counter. + +defframe(VAR_COUNTER, -20) +defframe(VAR_ADJUST, -24) +defframe(VAR_JMP, -28) +defframe(VAR_XP_LOW, -32) +deflit(VAR_EXTRA_SPACE, 16) + + +L(unroll): + C eax yp + C ebx + C ecx xsize + C edx ysize-1 + C esi xp end + C edi wp end of mul1 + C ebp + + movl PARAM_XP, %esi + movl 4(%eax), %ebp C multiplier (yp second limb) + leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing + + movl PARAM_WP, %edi + movl %eax, PARAM_YP + negl %edx + + movl %edx, PARAM_YSIZE + leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1 + decl %ecx C xsize-1 + + movl (%esi), %eax C xp low limb + andl $-UNROLL_MASK-1, %ebx + negl %ecx + + subl $VAR_EXTRA_SPACE, %esp +deflit(`FRAME',16+VAR_EXTRA_SPACE) + negl %ebx + andl $UNROLL_MASK, %ecx + + movl %ebx, VAR_ADJUST + movl %ecx, %edx + shll $4, %ecx + + sarl $UNROLL_LOG2, %ebx + + C 17 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(unroll_here): +',` + leal L(unroll_entry) (%ecx,%edx,1), %ecx +') + negl %edx + + movl %eax, VAR_XP_LOW + movl %ecx, VAR_JMP + leal 4(%edi,%edx,4), %edi C wp and xp, adjust for unrolling, + leal 4(%esi,%edx,4), %esi C and start at second limb + jmp L(unroll_outer_entry) + + +ifdef(`PIC',` +L(pic_calc): + C See mpn/x86/README about old gas bugs + leal (%ecx,%edx,1), %ecx + addl $L(unroll_entry)-L(unroll_here), %ecx + addl (%esp), %ecx + ret_internal +') + + +C -------------------------------------------------------------------------- + ALIGN(32) +L(unroll_outer_top): + C ebp ysize counter, negative + + movl VAR_ADJUST, %ebx + movl PARAM_YP, %edx + + movl VAR_XP_LOW, %eax + movl %ebp, PARAM_YSIZE C store incremented ysize counter + + leal 4(%edi,%ebx,4), %edi + leal (%esi,%ebx,4), %esi + sarl $UNROLL_LOG2, %ebx + + movl (%edx,%ebp,4), %ebp C yp next multiplier + movl VAR_JMP, %ecx + +L(unroll_outer_entry): + mull %ebp + + testb $1, %cl C and clear carry bit + movl %ebx, VAR_COUNTER + movl $0, %ebx + + movl $0, %ecx + cmovz( %eax, %ecx) C eax into low carry, zero into high carry limb + cmovnz( %eax, %ebx) + + C Extra fetch of VAR_JMP is bad, but registers are tight + jmp *VAR_JMP + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(unroll_top): + C eax xp limb + C ebx carry high + C ecx carry low + C edx scratch + C esi xp+8 + C edi wp + C ebp yp multiplier limb + C + C VAR_COUNTER loop counter, negative + C + C 17 bytes each limb + +L(unroll_entry): + +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%esi), %eax) + adcl %edx, %ebx + + mull %ebp + +Zdisp( addl, %ecx, disp0,(%edi)) + movl $0, %ecx + + adcl %eax, %ebx + + + movl disp1(%esi), %eax + adcl %edx, %ecx + + mull %ebp + + addl %ebx, disp1(%edi) + movl $0, %ebx + + adcl %eax, %ecx +') + + + incl VAR_COUNTER + leal UNROLL_BYTES(%esi), %esi + leal UNROLL_BYTES(%edi), %edi + + jnz L(unroll_top) + + + C eax + C ebx zero + C ecx low + C edx high + C esi + C edi wp, pointing at second last limb) + C ebp + C + C carry flag to be added to high + +deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) +deflit(`disp1', eval(disp0-0 + 4)) + + movl PARAM_YSIZE, %ebp + adcl 
$0, %edx + addl %ecx, disp0(%edi) + + adcl $0, %edx + incl %ebp + + movl %edx, disp1(%edi) + jnz L(unroll_outer_top) + + + movl SAVE_ESI, %esi + movl SAVE_EBP, %ebp + + movl SAVE_EDI, %edi + movl SAVE_EBX, %ebx + addl $FRAME, %esp + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/k7/sqr_basecase.asm new file mode 100644 index 0000000..7b6a97e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/sqr_basecase.asm @@ -0,0 +1,635 @@ +dnl AMD K7 mpn_sqr_basecase -- square an mpn number. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product +C (measured on the speed difference between 25 and 50 limbs, which is +C roughly the Karatsuba recursing range). + + +dnl These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for +dnl some comments. + +deflit(SQR_TOOM2_THRESHOLD_MAX, 66) + +ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE', +`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)') + +m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD') +deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3)) + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes, +C which is quite a bit, but is considered good value since squares big +C enough to use most of the code will be spending quite a few cycles in it. + + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %eax + cmpl $2, %ecx + + movl PARAM_DST, %edx + je L(two_limbs) + ja L(three_or_more) + + +C------------------------------------------------------------------------------ +C one limb only + C eax src + C ecx size + C edx dst + + movl (%eax), %eax + movl %edx, %ecx + + mull %eax + + movl %edx, 4(%ecx) + movl %eax, (%ecx) + ret + + +C------------------------------------------------------------------------------ +C +C Using the read/modify/write "add"s seems to be faster than saving and +C restoring registers. Perhaps the loads for the first set hide under the +C mul latency and the second gets store to load forwarding. 
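The identity behind the two limb case is (a + b*2^32)^2 = a^2 + b^2*2^64 + 2*a*b*2^32: store the two squares, then add the cross product into the middle twice, which is exactly what the read/modify/write adds below do. A small sketch with 32-bit limbs (sqr2_ref is an illustrative name, not a GMP entry point):

    #include <stdint.h>

    /* dst[0..3] = (src[1]*2^32 + src[0])^2 */
    void
    sqr2_ref (uint32_t dst[4], const uint32_t src[2])
    {
      uint64_t s0 = (uint64_t) src[0] * src[0];
      uint64_t s1 = (uint64_t) src[1] * src[1];
      uint64_t x  = (uint64_t) src[0] * src[1];          /* cross product a*b */

      dst[0] = (uint32_t) s0;  dst[1] = (uint32_t) (s0 >> 32);
      dst[2] = (uint32_t) s1;  dst[3] = (uint32_t) (s1 >> 32);

      for (int pass = 0; pass < 2; pass++)               /* add a*b twice     */
        {
          uint64_t t = (uint64_t) dst[1] + (uint32_t) x;
          dst[1] = (uint32_t) t;
          t = (uint64_t) dst[2] + (uint32_t) (x >> 32) + (t >> 32);
          dst[2] = (uint32_t) t;
          dst[3] += (uint32_t) (t >> 32);                /* result fits 4 limbs */
        }
    }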
+ + ALIGN(16) +L(two_limbs): + C eax src + C ebx + C ecx size + C edx dst +deflit(`FRAME',0) + + pushl %ebx FRAME_pushl() + movl %eax, %ebx C src + movl (%eax), %eax + + movl %edx, %ecx C dst + + mull %eax C src[0]^2 + + movl %eax, (%ecx) C dst[0] + movl 4(%ebx), %eax + + movl %edx, 4(%ecx) C dst[1] + + mull %eax C src[1]^2 + + movl %eax, 8(%ecx) C dst[2] + movl (%ebx), %eax + + movl %edx, 12(%ecx) C dst[3] + + mull 4(%ebx) C src[0]*src[1] + + popl %ebx + + addl %eax, 4(%ecx) + adcl %edx, 8(%ecx) + adcl $0, 12(%ecx) + ASSERT(nc) + + addl %eax, 4(%ecx) + adcl %edx, 8(%ecx) + adcl $0, 12(%ecx) + ASSERT(nc) + + ret + + +C------------------------------------------------------------------------------ +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +deflit(STACK_SPACE, 16) + +L(three_or_more): + subl $STACK_SPACE, %esp + cmpl $4, %ecx + jae L(four_or_more) +deflit(`FRAME',STACK_SPACE) + + +C------------------------------------------------------------------------------ +C Three limbs +C +C Writing out the loads and stores separately at the end of this code comes +C out about 10 cycles faster than using adcls to memory. + + C eax src + C ecx size + C edx dst + + movl %ebx, SAVE_EBX + movl %eax, %ebx C src + movl (%eax), %eax + + movl %edx, %ecx C dst + movl %esi, SAVE_ESI + movl %edi, SAVE_EDI + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl 4(%ebx), %eax + movl %edx, 4(%ecx) + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl 8(%ebx), %eax + movl %edx, 12(%ecx) + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl (%ebx), %eax + movl %edx, 20(%ecx) + + mull 4(%ebx) C src[0] * src[1] + + movl %eax, %esi + movl (%ebx), %eax + movl %edx, %edi + + mull 8(%ebx) C src[0] * src[2] + + addl %eax, %edi + movl %ebp, SAVE_EBP + movl $0, %ebp + + movl 4(%ebx), %eax + adcl %edx, %ebp + + mull 8(%ebx) C src[1] * src[2] + + xorl %ebx, %ebx + addl %eax, %ebp + + adcl $0, %edx + + C eax + C ebx zero, will be dst[5] + C ecx dst + C edx dst[4] + C esi dst[1] + C edi dst[2] + C ebp dst[3] + + adcl $0, %edx + addl %esi, %esi + + adcl %edi, %edi + movl 4(%ecx), %eax + + adcl %ebp, %ebp + + adcl %edx, %edx + + adcl $0, %ebx + addl %eax, %esi + movl 8(%ecx), %eax + + adcl %eax, %edi + movl 12(%ecx), %eax + movl %esi, 4(%ecx) + + adcl %eax, %ebp + movl 16(%ecx), %eax + movl %edi, 8(%ecx) + + movl SAVE_ESI, %esi + movl SAVE_EDI, %edi + + adcl %eax, %edx + movl 20(%ecx), %eax + movl %ebp, 12(%ecx) + + adcl %ebx, %eax + ASSERT(nc) + movl SAVE_EBX, %ebx + movl SAVE_EBP, %ebp + + movl %edx, 16(%ecx) + movl %eax, 20(%ecx) + addl $FRAME, %esp + + ret + + +C------------------------------------------------------------------------------ +L(four_or_more): + +C First multiply src[0]*src[1..size-1] and store at dst[1..size]. +C Further products are added in rather than stored. 
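The plan for sizes of four or more, spelled out here and at the shift and diagonal loops further down, is the classic one: accumulate the off-diagonal products src[i]*src[j] (i < j), double them with a one-bit left shift, then add the squares src[i]^2 on the diagonal. A compact sketch in terms of public mpn calls, assuming size >= 2 (sqr_basecase_ref is an illustrative name; the assembly inlines and unrolls all of this rather than calling out):

    #include <gmp.h>

    static void
    sqr_basecase_ref (mp_ptr dst, mp_srcptr src, mp_size_t n)   /* n >= 2 */
    {
      /* off-diagonal products src[i]*src[j], i < j, at dst[1 .. 2n-2] */
      dst[n] = mpn_mul_1 (dst + 1, src + 1, n - 1, src[0]);
      for (mp_size_t i = 1; i < n - 1; i++)
        dst[n + i] = mpn_addmul_1 (dst + 2*i + 1, src + i + 1, n - 1 - i, src[i]);

      /* double them; the bit shifted out of dst[2n-2] becomes dst[2n-1] */
      dst[2*n - 1] = mpn_lshift (dst + 1, dst + 1, 2*n - 2, 1);

      /* add the diagonal squares src[i]^2 at dst[2i], rippling the carries */
      dst[0] = 0;
      for (mp_size_t i = 0; i < n; i++)
        {
          mp_limb_t cy = mpn_addmul_1 (dst + 2*i, src + i, 1, src[i]);
          mpn_add_1 (dst + 2*i + 1, dst + 2*i + 1, 2*n - 2*i - 1, cy);
        }
    }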
+ + C eax src + C ebx + C ecx size + C edx dst + C esi + C edi + C ebp + +defframe(`VAR_COUNTER',-20) +defframe(`VAR_JMP', -24) +deflit(EXTRA_STACK_SPACE, 8) + + movl %ebx, SAVE_EBX + movl %edi, SAVE_EDI + leal (%edx,%ecx,4), %edi C &dst[size] + + movl %esi, SAVE_ESI + movl %ebp, SAVE_EBP + leal (%eax,%ecx,4), %esi C &src[size] + + movl (%eax), %ebp C multiplier + movl $0, %ebx + decl %ecx + + negl %ecx + subl $EXTRA_STACK_SPACE, %esp +FRAME_subl_esp(EXTRA_STACK_SPACE) + +L(mul_1): + C eax scratch + C ebx carry + C ecx counter + C edx scratch + C esi &src[size] + C edi &dst[size] + C ebp multiplier + + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %ebx, %eax + movl %eax, (%edi,%ecx,4) + movl $0, %ebx + + adcl %edx, %ebx + incl %ecx + jnz L(mul_1) + + +C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2. +C +C The last two products, which are the bottom right corner of the product +C triangle, are left to the end. These are src[size-3]*src[size-2,size-1] +C and src[size-2]*src[size-1]. If size is 4 then it's only these corner +C cases that need to be done. +C +C The unrolled code is the same as in mpn_addmul_1, see that routine for +C some comments. +C +C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive. +C +C VAR_JMP is the computed jump into the unrolled code, stepped by one code +C chunk each outer loop. +C +C K7 does branch prediction on indirect jumps, which is bad since it's a +C different target each time. There seems no way to avoid this. + +dnl This value also hard coded in some shifts and adds +deflit(CODE_BYTES_PER_LIMB, 17) + +dnl With the unmodified &src[size] and &dst[size] pointers, the +dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT +dnl values up to 31, but above that an offset must be added to them. + +deflit(OFFSET, +ifelse(eval(UNROLL_COUNT>31),1, +eval((UNROLL_COUNT-31)*4), +0)) + +dnl Because the last chunk of code is generated differently, a label placed +dnl at the end doesn't work. Instead calculate the implied end using the +dnl start and how many chunks of code there are. + +deflit(UNROLL_INNER_END, +`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)') + + C eax + C ebx carry + C ecx + C edx + C esi &src[size] + C edi &dst[size] + C ebp + + movl PARAM_SIZE, %ecx + movl %ebx, (%edi) + + subl $4, %ecx + jz L(corner) + + negl %ecx +ifelse(OFFSET,0,,`subl $OFFSET, %edi') +ifelse(OFFSET,0,,`subl $OFFSET, %esi') + + movl %ecx, %edx + shll $4, %ecx + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx +') + + + C The calculated jump mustn't come out to before the start of the + C code available. This is the limit UNROLL_COUNT puts on the src + C operand size, but checked here directly using the jump address. 
+ ASSERT(ae, + `movl_text_address(L(unroll_inner_start), %eax) + cmpl %eax, %ecx') + + +C------------------------------------------------------------------------------ + ALIGN(16) +L(unroll_outer_top): + C eax + C ebx high limb to store + C ecx VAR_JMP + C edx VAR_COUNTER, limbs, negative + C esi &src[size], constant + C edi dst ptr, high of last addmul + C ebp + + movl -12+OFFSET(%esi,%edx,4), %ebp C next multiplier + movl -8+OFFSET(%esi,%edx,4), %eax C first of multiplicand + + movl %edx, VAR_COUNTER + + mull %ebp + +define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')') + + testb $1, %cl + movl %edx, %ebx C high carry + movl %ecx, %edx C jump + + movl %eax, %ecx C low carry + cmovX( %ebx, %ecx) C high carry reverse + cmovX( %eax, %ebx) C low carry reverse + + leal CODE_BYTES_PER_LIMB(%edx), %eax + xorl %edx, %edx + leal 4(%edi), %edi + + movl %eax, VAR_JMP + + jmp *%eax + + +ifdef(`PIC',` +L(pic_calc): + addl (%esp), %ecx + addl $UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx + addl %edx, %ecx + ret_internal +') + + + C Must be an even address to preserve the significance of the low + C bit of the jump address indicating which way around ecx/ebx should + C start. + ALIGN(2) + +L(unroll_inner_start): + C eax next limb + C ebx carry high + C ecx carry low + C edx scratch + C esi src + C edi dst + C ebp multiplier + +forloop(`i', UNROLL_COUNT, 1, ` + deflit(`disp_src', eval(-i*4 + OFFSET)) + deflit(`disp_dst', eval(disp_src - 4)) + + m4_assert(`disp_src>=-128 && disp_src<128') + m4_assert(`disp_dst>=-128 && disp_dst<128') + +ifelse(eval(i%2),0,` +Zdisp( movl, disp_src,(%esi), %eax) + adcl %edx, %ebx + + mull %ebp + +Zdisp( addl, %ecx, disp_dst,(%edi)) + movl $0, %ecx + + adcl %eax, %ebx + +',` + dnl this bit comes out last +Zdisp( movl, disp_src,(%esi), %eax) + adcl %edx, %ecx + + mull %ebp + +Zdisp( addl, %ebx, disp_dst,(%edi)) + +ifelse(forloop_last,0, +` movl $0, %ebx') + + adcl %eax, %ecx +') +') + + C eax next limb + C ebx carry high + C ecx carry low + C edx scratch + C esi src + C edi dst + C ebp multiplier + + adcl $0, %edx + addl %ecx, -4+OFFSET(%edi) + movl VAR_JMP, %ecx + + adcl $0, %edx + + movl %edx, m4_empty_if_zero(OFFSET) (%edi) + movl VAR_COUNTER, %edx + + incl %edx + jnz L(unroll_outer_top) + + +ifelse(OFFSET,0,,` + addl $OFFSET, %esi + addl $OFFSET, %edi +') + + +C------------------------------------------------------------------------------ +L(corner): + C esi &src[size] + C edi &dst[2*size-5] + + movl -12(%esi), %ebp + movl -8(%esi), %eax + movl %eax, %ecx + + mull %ebp + + addl %eax, -4(%edi) + movl -4(%esi), %eax + + adcl $0, %edx + movl %edx, %ebx + movl %eax, %esi + + mull %ebp + + addl %ebx, %eax + + adcl $0, %edx + addl %eax, (%edi) + movl %esi, %eax + + adcl $0, %edx + movl %edx, %ebx + + mull %ecx + + addl %ebx, %eax + movl %eax, 4(%edi) + + adcl $0, %edx + movl %edx, 8(%edi) + + + +C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. 
+ +L(lshift_start): + movl PARAM_SIZE, %eax + movl PARAM_DST, %edi + xorl %ecx, %ecx C clear carry + + leal (%edi,%eax,8), %edi + notl %eax C -size-1, preserve carry + + leal 2(%eax), %eax C -(size-1) + +L(lshift): + C eax counter, negative + C ebx + C ecx + C edx + C esi + C edi dst, pointing just after last limb + C ebp + + rcll -4(%edi,%eax,8) + rcll (%edi,%eax,8) + incl %eax + jnz L(lshift) + + setc %al + + movl PARAM_SRC, %esi + movl %eax, -4(%edi) C dst most significant limb + + movl PARAM_SIZE, %ecx + + +C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the +C low limb of src[0]^2. + + movl (%esi), %eax C src[0] + + mull %eax + + leal (%esi,%ecx,4), %esi C src point just after last limb + negl %ecx + + movl %eax, (%edi,%ecx,8) C dst[0] + incl %ecx + +L(diag): + C eax scratch + C ebx scratch + C ecx counter, negative + C edx carry + C esi src just after last limb + C edi dst just after last limb + C ebp + + movl (%esi,%ecx,4), %eax + movl %edx, %ebx + + mull %eax + + addl %ebx, -4(%edi,%ecx,8) + adcl %eax, (%edi,%ecx,8) + adcl $0, %edx + + incl %ecx + jnz L(diag) + + + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + + addl %edx, -4(%edi) C dst most significant limb + movl SAVE_EDI, %edi + + movl SAVE_EBP, %ebp + addl $FRAME, %esp + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k7/sublsh1_n.asm b/vendor/gmp-6.3.0/mpn/x86/k7/sublsh1_n.asm new file mode 100644 index 0000000..8851683 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k7/sublsh1_n.asm @@ -0,0 +1,173 @@ +dnl AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1) + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C This is an attempt at a sublsh1_n for x86-32, not relying on sse2 insns. The +C innerloop is 2*3-way unrolled, which is best we can do with the available +C registers. It seems tricky to use the same structure for rsblsh1_n, since we +C cannot feed carry between operations there. + +C cycles/limb +C P5 +C P6 model 0-8,10-12 +C P6 model 9 (Banias) +C P6 model 13 (Dothan) +C P4 model 0 (Willamette) +C P4 model 1 (?) +C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C Intel Atom 6.75 +C AMD K6 +C AMD K7 +C AMD K8 + +C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32 +C processors. It uses 2*4-way unrolling, for good reasons. 
+C +C Breaking carry recurrency might be a good idea. We would then need separate +C registers for the shift carry and add/subtract carry, which in turn would +C force us to 2*2-way unrolling. + +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(VAR_COUNT,`PARAM_SIZE') +define(SAVE_EBX,`PARAM_SRC') +define(SAVE_EBP,`PARAM_DST') + +ASM_START() + TEXT + ALIGN(8) +PROLOGUE(mpn_sublsh1_n_ip1) +deflit(`FRAME',0) + +define(`rp', `%edi') +define(`up', `%esi') + + mov PARAM_SIZE, %eax C size + push up FRAME_pushl() + push rp FRAME_pushl() + xor %edx, %edx + mov PARAM_SRC, up + mov PARAM_DST, rp + mov %ebx, SAVE_EBX + mov %eax, %ebx + shr $3, %eax + + not %eax C count = -(size\8)-i + and $7, %ebx C size % 8 + jz L(exact) + +L(oop): +ifdef(`CPU_P6',` + shr %edx ') C restore 2nd saved carry bit + mov (up), %ecx + adc %ecx, %ecx + rcr %edx C restore 1st saved carry bit + lea 4(up), up + sbb %ecx, (rp) + lea 4(rp), rp + adc %edx, %edx C save a carry bit in edx +ifdef(`CPU_P6',` + adc %edx, %edx ') C save another carry bit in edx + dec %ebx + jnz L(oop) +L(exact): + inc %eax + jz L(end) + mov %eax, VAR_COUNT + mov %ebp, SAVE_EBP + + ALIGN(16) +L(top): +ifdef(`CPU_P6',` + shr %edx ') C restore 2nd saved carry bit + mov (up), %eax + adc %eax, %eax + mov 4(up), %ebx + adc %ebx, %ebx + mov 8(up), %ecx + adc %ecx, %ecx + mov 12(up), %ebp + adc %ebp, %ebp + + rcr %edx C restore 1st saved carry bit + + sbb %eax, (rp) + sbb %ebx, 4(rp) + sbb %ecx, 8(rp) + sbb %ebp, 12(rp) + + mov 16(up), %eax + adc %eax, %eax + mov 20(up), %ebx + adc %ebx, %ebx + mov 24(up), %ecx + adc %ecx, %ecx + mov 28(up), %ebp + adc %ebp, %ebp + + lea 32(up), up + adc %edx, %edx C save a carry bit in edx + + sbb %eax, 16(rp) + sbb %ebx, 20(rp) + sbb %ecx, 24(rp) + sbb %ebp, 28(rp) + +ifdef(`CPU_P6',` + adc %edx, %edx ') C save another carry bit in edx + incl VAR_COUNT + lea 32(rp), rp + jne L(top) + + mov SAVE_EBP, %ebp +L(end): + mov SAVE_EBX, %ebx + +ifdef(`CPU_P6',` + xor %eax, %eax + shr $1, %edx + adc %edx, %eax +',` + adc $0, %edx + mov %edx, %eax +') + pop rp FRAME_popl() + pop up FRAME_popl() + ret +EPILOGUE() +ASM_END() |
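For reference, the operation this last file implements, rp[] = rp[] - (up[] << 1) with the combined borrow returned, can be written with public mpn calls at the cost of a scratch area (sublsh1_n_ip1_ref and the tmp parameter are illustrative; the assembly fuses the shift and the subtract in place and needs no scratch, and my reading of its return value is the shifted-out bit plus the subtraction borrow, i.e. 0, 1 or 2):

    #include <gmp.h>

    /* rp[0..n-1] -= 2 * {up,n}; return the borrow out of the top limb */
    static mp_limb_t
    sublsh1_n_ip1_ref (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tmp)
    {
      mp_limb_t hi = mpn_lshift (tmp, up, n, 1);   /* tmp = 2*up, hi = top bit */
      return hi + mpn_sub_n (rp, rp, tmp, n);
    }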