| author | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:36:36 +0200 |
|---|---|---|
| committer | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:42:26 +0200 |
| commit | a89a14ef5da44684a16b204e7a70460cc8c4922a (patch) | |
| tree | b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/x86/k6 | |
| parent | 1db63fcedab0b288820d66e100b1877b1a5a8851 (diff) | |
Basic constant folding implementation
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/x86/k6')
21 files changed, 5438 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/README b/vendor/gmp-6.3.0/mpn/x86/k6/README new file mode 100644 index 0000000..1d65af3 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/README @@ -0,0 +1,251 @@ +Copyright 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + AMD K6 MPN SUBROUTINES + + + +This directory contains code optimized for AMD K6 CPUs, meaning K6, K6-2 and +K6-3. + +The mmx subdirectory has MMX code suiting plain K6, the k62mmx subdirectory +has MMX code suiting K6-2 and K6-3. All chips in the K6 family have MMX, +the separate directories are just so that ./configure can omit them if the +assembler doesn't support MMX. + + + + +STATUS + +Times for the loops, with all code and data in L1 cache, are as follows. + + cycles/limb + + mpn_add_n/sub_n 3.25 normal, 2.75 in-place + + mpn_mul_1 6.25 + mpn_add/submul_1 7.65-8.4 (varying with data values) + + mpn_mul_basecase 9.25 cycles/crossproduct (approx) + mpn_sqr_basecase 4.7 cycles/crossproduct (approx) + or 9.2 cycles/triangleproduct (approx) + + mpn_l/rshift 3.0 + + mpn_divrem_1 20.0 + mpn_mod_1 20.0 + mpn_divexact_by3 11.0 + + mpn_copyi 1.0 + mpn_copyd 1.0 + + +K6-2 and K6-3 have dual-issue MMX and get the following improvements. + + mpn_l/rshift 1.75 + + +Prefetching of sources hasn't yet given any joy. With the 3DNow "prefetch" +instruction, code seems to run slower, and with just "mov" loads it doesn't +seem faster. Results so far are inconsistent. The K6 does a hardware +prefetch of the second cache line in a sector, so the penalty for not +prefetching in software is reduced. + + + + +NOTES + +All K6 family chips have MMX, but only K6-2 and K6-3 have 3DNow. + +Plain K6 executes MMX instructions only in the X pipe, but K6-2 and K6-3 can +execute them in both X and Y (and in both together). + +Branch misprediction penalty is 1 to 4 cycles (Optimization Manual +chapter 6 table 12). + +Write-allocate L1 data cache means prefetching of destinations is unnecessary. +Store queue is 7 entries of 64 bits each. + +Floating point multiplications can be done in parallel with integer +multiplications, but there doesn't seem to be any way to make use of this. + + + +OPTIMIZATIONS + +Unrolled loops are used to reduce looping overhead. The unrolling is +configurable up to 32 limbs/loop for most routines, up to 64 for some. + +Sometimes computed jumps into the unrolling are used to handle sizes not a +multiple of the unrolling. 
An attractive feature of this is that times +smoothly increase with operand size, but an indirect jump is about 6 cycles +and the setups about another 6, so it depends on how much the unrolled code +is faster than a simple loop as to whether a computed jump ought to be used. + +Position independent code is implemented using a call to get eip for +computed jumps and a ret is always done, rather than an addl $4,%esp or a +popl, so the CPU return address branch prediction stack stays synchronised +with the actual stack in memory. Such a call however still costs 4 to 7 +cycles. + +Branch prediction, in absence of any history, will guess forward jumps are +not taken and backward jumps are taken. Where possible it's arranged that +the less likely or less important case is under a taken forward jump. + + + +MMX + +Putting emms or femms as late as possible in a routine seems to be fastest. +Perhaps an emms or femms stalls until all outstanding MMX instructions have +completed, so putting it later gives them a chance to complete on their own, +in parallel with other operations (like register popping). + +The Optimization Manual chapter 5 recommends using a femms on K6-2 and K6-3 +at the start of a routine, in case it's been preceded by x87 floating point +operations. This isn't done because in gmp programs it's expected that x87 +floating point won't be much used and that chances are an mpn routine won't +have been preceded by any x87 code. + + + +CODING + +Instructions in general code are shown paired if they can decode and execute +together, meaning two short decode instructions with the second not +depending on the first, only the first using the shifter, no more than one +load, and no more than one store. + +K6 does some out of order execution so the pairings aren't essential, they +just show what slots might be available. When decoding is the limiting +factor things can be scheduled that might not execute until later. + + + +NOTES + +Code alignment + +- if an opcode/modrm or 0Fh/opcode/modrm crosses a cache line boundary, + short decode is inhibited. The cross.pl script detects this. + +- loops and branch targets should be aligned to 16 bytes, or ensure at least + 2 instructions before a 32 byte boundary. This makes use of the 16 byte + cache in the BTB. + +Addressing modes + +- (%esi) degrades decoding from short to vector. 0(%esi) doesn't have this + problem, and can be used as an equivalent, or easier is just to use a + different register, like %ebx. + +- K6 and pre-CXT core K6-2 have the following problem. (K6-2 CXT and K6-3 + have it fixed, these being cpuid function 1 signatures 0x588 to 0x58F). + + If more than 3 bytes are needed to determine instruction length then + decoding degrades from direct to long, or from long to vector. This + happens with forms like "0F opcode mod/rm" with mod/rm=00-xxx-100 since + with mod=00 the sib determines whether there's a displacement. + + This affects all MMX and 3DNow instructions, and others with an 0F prefix, + like movzbl. The modes affected are anything with an index and no + displacement, or an index but no base, and this includes (%esp) which is + really (,%esp,1). + + The cross.pl script detects problem cases. The workaround is to always + use a displacement, and to do this with Zdisp if it's zero so the + assembler doesn't discard it. + + See Optimization Manual rev D page 67 and 3DNow Porting Guide rev B pages + 13-14 and 36-37. + +Calls + +- indirect jumps and calls are not branch predicted, they measure about 6 + cycles. 
+ +Various + +- adcl 2 cycles of decode, maybe 2 cycles executing in the X pipe +- bsf 12-27 cycles +- emms 5 cycles +- femms 3 cycles +- jecxz 2 cycles taken, 13 not taken (optimization manual says 7 not taken) +- divl 20 cycles back-to-back +- imull 2 decode, 3 execute +- mull 2 decode, 3 execute (optimization manual decoding sample) +- prefetch 2 cycles +- rcll/rcrl implicit by one bit: 2 cycles + immediate or %cl count: 11 + 2 per bit for dword + 13 + 4 per bit for byte +- setCC 2 cycles +- xchgl %eax,reg 1.5 cycles, back-to-back (strange) + reg,reg 2 cycles, back-to-back + + + + +REFERENCES + +"AMD-K6 Processor Code Optimization Application Note", AMD publication +number 21924, revision D amendment 0, January 2000. This describes K6-2 and +K6-3. Available on-line, + +http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21924.pdf + +"AMD-K6 MMX Enhanced Processor x86 Code Optimization Application Note", AMD +publication number 21828, revision A amendment 0, August 1997. This is an +older edition of the above document, describing plain K6. Available +on-line, + +http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21828.pdf + +"3DNow Technology Manual", AMD publication number 21928G/0-March 2000. +This describes the femms and prefetch instructions, but nothing else from +3DNow has been used. Available on-line, + +http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21928.pdf + +"3DNow Instruction Porting Guide", AMD publication number 22621, revision B, +August 1999. This has some notes on general K6 optimizations as well as +3DNow. Available on-line, + +http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22621.pdf + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/aors_n.asm b/vendor/gmp-6.3.0/mpn/x86/k6/aors_n.asm new file mode 100644 index 0000000..168f9b4 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/aors_n.asm @@ -0,0 +1,337 @@ +dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb. 
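The README's OPTIMIZATIONS section above mentions computed jumps into the unrolled code to handle sizes that aren't a multiple of the unrolling, a technique the add/sub and addmul files in this directory rely on. As a rough portable-C analogue of that idea, here is a Duff's-device style entry into a 4-way unrolled copy loop; the function name, the unroll factor and the copy operation are purely illustrative, not taken from the assembly.

```c
#include <stddef.h>
#include <stdint.h>

/* Illustrative only: copy n limbs with a 4-way unrolled loop, entering the
   unrolled body part-way through so any n is handled without a separate
   cleanup loop -- the C analogue of the "computed jump" in the K6 code. */
static void copy_unrolled(uint32_t *dst, const uint32_t *src, size_t n)
{
    if (n == 0)
        return;
    size_t rounds = (n + 3) / 4;        /* passes through the unrolled body */
    switch (n % 4) {                    /* computed entry point             */
    case 0: do { *dst++ = *src++;       /* each case falls through          */
    case 3:      *dst++ = *src++;
    case 2:      *dst++ = *src++;
    case 1:      *dst++ = *src++;
            } while (--rounds > 0);
    }
}
```

The switch plays the role of the computed jump: it enters the unrolled body at an offset chosen from size mod unrolling, so the leftover limbs need no separate fix-up loop.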
+ + +ifdef(`OPERATION_add_n', ` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + define(M4_description, add) +',`ifdef(`OPERATION_sub_n', ` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + define(M4_description, subtract) +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); +C +C Calculate src1,size M4_description src2,size, and store the result in +C dst,size. The return value is the carry bit from the top of the result +C (1 or 0). +C +C The _nc version accepts 1 or 0 for an initial carry into the low limb of +C the calculation. Note values other than 1 or 0 here will lead to garbage +C results. +C +C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and +C an in-place dst+=src to 2.5 c/l. The unrolled loops have 1 cycle/loop of +C loop control, which with 4 limbs/loop means an extra 0.25 c/l. + +define(PARAM_CARRY, `FRAME+20(%esp)') +define(PARAM_SIZE, `FRAME+16(%esp)') +define(PARAM_SRC2, `FRAME+12(%esp)') +define(PARAM_SRC1, `FRAME+8(%esp)') +define(PARAM_DST, `FRAME+4(%esp)') +deflit(`FRAME',0) + +dnl minimum 5 because the unrolled code can't handle less +deflit(UNROLL_THRESHOLD, 5) + + TEXT + ALIGN(32) + +PROLOGUE(M4_function_nc) + movl PARAM_CARRY, %eax + jmp L(start) +EPILOGUE() + + +PROLOGUE(M4_function_n) + xorl %eax, %eax +L(start): + movl PARAM_SIZE, %ecx + pushl %ebx +FRAME_pushl() + + movl PARAM_SRC1, %ebx + pushl %edi +FRAME_pushl() + + movl PARAM_SRC2, %edx + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_DST, %edi + jae L(unroll) + + + shrl %eax C initial carry flag + + C offset 0x21 here, close enough to aligned +L(simple): + C eax scratch + C ebx src1 + C ecx counter + C edx src2 + C esi + C edi dst + C ebp + C + C The store to (%edi) could be done with a stosl; it'd be smaller + C code, but there's no speed gain and a cld would have to be added + C (per mpn/x86/README). 
+ + movl (%ebx), %eax + leal 4(%ebx), %ebx + + M4_inst (%edx), %eax + + movl %eax, (%edi) + leal 4(%edi), %edi + + leal 4(%edx), %edx + loop L(simple) + + + movl $0, %eax + popl %edi + + setc %al + + popl %ebx + ret + + +C ----------------------------------------------------------------------------- +L(unroll): + C eax carry + C ebx src1 + C ecx counter + C edx src2 + C esi + C edi dst + C ebp + + cmpl %edi, %ebx + pushl %esi + + je L(inplace) + +ifdef(`OPERATION_add_n',` + cmpl %edi, %edx + + je L(inplace_reverse) +') + + movl %ecx, %esi + + andl $-4, %ecx + andl $3, %esi + + leal (%ebx,%ecx,4), %ebx + leal (%edx,%ecx,4), %edx + leal (%edi,%ecx,4), %edi + + negl %ecx + shrl %eax + + ALIGN(32) +L(normal_top): + C eax counter, qwords, negative + C ebx src1 + C ecx scratch + C edx src2 + C esi + C edi dst + C ebp + + movl (%ebx,%ecx,4), %eax + leal 5(%ecx), %ecx + M4_inst -20(%edx,%ecx,4), %eax + movl %eax, -20(%edi,%ecx,4) + + movl 4-20(%ebx,%ecx,4), %eax + M4_inst 4-20(%edx,%ecx,4), %eax + movl %eax, 4-20(%edi,%ecx,4) + + movl 8-20(%ebx,%ecx,4), %eax + M4_inst 8-20(%edx,%ecx,4), %eax + movl %eax, 8-20(%edi,%ecx,4) + + movl 12-20(%ebx,%ecx,4), %eax + M4_inst 12-20(%edx,%ecx,4), %eax + movl %eax, 12-20(%edi,%ecx,4) + + loop L(normal_top) + + + decl %esi + jz L(normal_finish_one) + js L(normal_done) + + C two or three more limbs + + movl (%ebx), %eax + M4_inst (%edx), %eax + movl %eax, (%edi) + + movl 4(%ebx), %eax + M4_inst 4(%edx), %eax + decl %esi + movl %eax, 4(%edi) + + jz L(normal_done) + movl $2, %ecx + +L(normal_finish_one): + movl (%ebx,%ecx,4), %eax + M4_inst (%edx,%ecx,4), %eax + movl %eax, (%edi,%ecx,4) + +L(normal_done): + popl %esi + popl %edi + + movl $0, %eax + popl %ebx + + setc %al + + ret + + +C ----------------------------------------------------------------------------- + +ifdef(`OPERATION_add_n',` +L(inplace_reverse): + C dst==src2 + + movl %ebx, %edx +') + +L(inplace): + C eax initial carry + C ebx + C ecx size + C edx src + C esi + C edi dst + C ebp + + leal -1(%ecx), %esi + decl %ecx + + andl $-4, %ecx + andl $3, %esi + + movl (%edx), %ebx C src low limb + leal (%edx,%ecx,4), %edx + + leal (%edi,%ecx,4), %edi + negl %ecx + + shrl %eax + + + ALIGN(32) +L(inplace_top): + C eax + C ebx next src limb + C ecx size + C edx src + C esi + C edi dst + C ebp + + M4_inst %ebx, (%edi,%ecx,4) + + movl 4(%edx,%ecx,4), %eax + leal 5(%ecx), %ecx + + M4_inst %eax, 4-20(%edi,%ecx,4) + + movl 8-20(%edx,%ecx,4), %eax + movl 12-20(%edx,%ecx,4), %ebx + + M4_inst %eax, 8-20(%edi,%ecx,4) + M4_inst %ebx, 12-20(%edi,%ecx,4) + + movl 16-20(%edx,%ecx,4), %ebx + loop L(inplace_top) + + + C now %esi is 0 to 3 representing respectively 1 to 4 limbs more + + M4_inst %ebx, (%edi) + + decl %esi + jz L(inplace_finish_one) + js L(inplace_done) + + C two or three more limbs + + movl 4(%edx), %eax + movl 8(%edx), %ebx + M4_inst %eax, 4(%edi) + M4_inst %ebx, 8(%edi) + + decl %esi + movl $2, %ecx + + jz L(normal_done) + +L(inplace_finish_one): + movl 4(%edx,%ecx,4), %eax + M4_inst %eax, 4(%edi,%ecx,4) + +L(inplace_done): + popl %esi + popl %edi + + movl $0, %eax + popl %ebx + + setc %al + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/aorsmul_1.asm b/vendor/gmp-6.3.0/mpn/x86/k6/aorsmul_1.asm new file mode 100644 index 0000000..eaa92eb --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/aorsmul_1.asm @@ -0,0 +1,391 @@ +dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. + +dnl Copyright 1999-2003, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5 +C P6 model 0-8,10-12 5.94 +C P6 model 9 (Banias) 5.51 +C P6 model 13 (Dothan) 5.57 +C P4 model 0 (Willamette) +C P4 model 1 (?) +C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C AMD K6 7.65-8.5 (data dependent) +C AMD K7 +C AMD K8 + + +dnl K6: large multipliers small multipliers +dnl UNROLL_COUNT cycles/limb cycles/limb +dnl 4 9.5 7.78 +dnl 8 9.0 7.78 +dnl 16 8.4 7.65 +dnl 32 8.4 8.2 +dnl +dnl Maximum possible unrolling with the current code is 32. +dnl +dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256 +dnl byte block, which might explain the good speed at that unrolling. + +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_addmul_1', ` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + define(M4_function_1c, mpn_addmul_1c) +',`ifdef(`OPERATION_submul_1', ` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + define(M4_function_1c, mpn_submul_1c) +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + + +C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C +C The jadcl0()s in the unrolled loop makes the speed data dependent. Small +C multipliers (most significant few bits clear) result in few carry bits and +C speeds up to 7.65 cycles/limb are attained. Large multipliers (most +C significant few bits set) make the carry bits 50/50 and lead to something +C more like 8.4 c/l. With adcl's both of these would be 9.3 c/l. +C +C It's important that the gains for jadcl0 on small multipliers don't come +C at the cost of slowing down other data. Tests on uniformly distributed +C random data, designed to confound branch prediction, show about a 7% +C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all +C overheads included). +C +C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus +C 11.0 cycles/limb), and hence isn't used. +C +C In the simple loop, note that running ecx from negative to zero and using +C it as an index in the two movs wouldn't help. 
It would save one +C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired +C that would be collapsed by this. +C +C Attempts at a simpler main loop, with less unrolling, haven't yielded much +C success, generally running over 9 c/l. +C +C +C jadcl0 +C ------ +C +C jadcl0() being faster than adcl $0 seems to be an artifact of two things, +C firstly the instruction decoding and secondly the fact that there's a +C carry bit for the jadcl0 only on average about 1/4 of the time. +C +C The code in the unrolled loop decodes something like the following. +C +C decode cycles +C mull %ebp 2 +C M4_inst %esi, disp(%edi) 1 +C adcl %eax, %ecx 2 +C movl %edx, %esi \ 1 +C jnc 1f / +C incl %esi \ 1 +C 1: movl disp(%ebx), %eax / +C --- +C 7 +C +C In a back-to-back style test this measures 7 with the jnc not taken, or 8 +C with it taken (both when correctly predicted). This is opposite to the +C measurements showing small multipliers running faster than large ones. +C Don't really know why. +C +C It's not clear how much branch misprediction might be costing. The K6 +C doco says it will be 1 to 4 cycles, but presumably it's near the low end +C of that range to get the measured results. +C +C +C In the code the two carries are more or less the preceding mul product and +C the calculation is roughly +C +C x*y + u*b+v +C +C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and +C v are the two limbs it's added to (being the low of the next mul, and a +C limb from the destination). +C +C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and +C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of +C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0 +C and b-1, then the total probability can be summed over x and y, +C +C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1) +C --- * sum sum --- = --- * ------- * ------- = 1/4 +C b^2 x=0 y=1 b^2 b^4 2 2 +C +C Actually it's a very tiny bit less than 1/4 of course. If y is fixed, +C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2. + + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 9) +',` +deflit(UNROLL_THRESHOLD, 6) +') + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(32) + +PROLOGUE(M4_function_1c) + pushl %esi +deflit(`FRAME',4) + movl PARAM_CARRY, %esi + jmp L(start_nc) +EPILOGUE() + +PROLOGUE(M4_function_1) + push %esi +deflit(`FRAME',4) + xorl %esi, %esi C initial carry + +L(start_nc): + movl PARAM_SIZE, %ecx + pushl %ebx +deflit(`FRAME',8) + + movl PARAM_SRC, %ebx + pushl %edi +deflit(`FRAME',12) + + cmpl $UNROLL_THRESHOLD, %ecx + movl PARAM_DST, %edi + + pushl %ebp +deflit(`FRAME',16) + jae L(unroll) + + + C simple loop + + movl PARAM_MULTIPLIER, %ebp + +L(simple): + C eax scratch + C ebx src + C ecx counter + C edx scratch + C esi carry + C edi dst + C ebp multiplier + + movl (%ebx), %eax + addl $4, %ebx + + mull %ebp + + addl $4, %edi + addl %esi, %eax + + adcl $0, %edx + + M4_inst %eax, -4(%edi) + + adcl $0, %edx + + movl %edx, %esi + loop L(simple) + + + popl %ebp + popl %edi + + popl %ebx + movl %esi, %eax + + popl %esi + ret + + + +C ----------------------------------------------------------------------------- +C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop +C the carries are ecx=lo, esi=hi, then they swap for each limb processed. +C For the computed jump an odd size means they start one way around, an even +C size the other. 
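For comparison with the unrolled code that follows, the operation itself is small when written naively in C: the destination accumulates the source scaled by the multiplier, and the limb that would propagate past the top is returned. The sketch below assumes 32-bit limbs, uses a single carry limb rather than the two-carry-limb scheme described above, and the name ref_addmul_1 is illustrative only (the submul variant subtracts the product instead of adding it).

```c
#include <stddef.h>
#include <stdint.h>

/* dst[0..size-1] += src[0..size-1] * mult; returns the carry limb that
   propagates out of the top, as mpn_addmul_1 is specified above. */
static uint32_t ref_addmul_1(uint32_t *dst, const uint32_t *src,
                             size_t size, uint32_t mult)
{
    uint32_t carry = 0;
    for (size_t i = 0; i < size; i++) {
        uint64_t p = (uint64_t)src[i] * mult + dst[i] + carry;
        dst[i] = (uint32_t)p;          /* low limb of the product sum   */
        carry  = (uint32_t)(p >> 32);  /* high limb carried onward      */
    }
    return carry;
}
```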
+C +C VAR_JUMP holds the computed jump temporarily because there's not enough +C registers at the point of doing the mul for the initial two carry limbs. +C +C The add/adc for the initial carry in %esi is necessary only for the +C mpn_addmul/submul_1c entry points. Duplicating the startup code to +C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good +C idea. + +dnl overlapping with parameters already fetched +define(VAR_COUNTER, `PARAM_SIZE') +define(VAR_JUMP, `PARAM_DST') + +L(unroll): + C eax + C ebx src + C ecx size + C edx + C esi initial carry + C edi dst + C ebp + + movl %ecx, %edx + decl %ecx + + subl $2, %edx + negl %ecx + + shrl $UNROLL_LOG2, %edx + andl $UNROLL_MASK, %ecx + + movl %edx, VAR_COUNTER + movl %ecx, %edx + + shll $4, %edx + negl %ecx + + C 15 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%edx,%ecx,1), %edx +') + movl (%ebx), %eax C src low limb + + movl PARAM_MULTIPLIER, %ebp + movl %edx, VAR_JUMP + + mull %ebp + + addl %esi, %eax C initial carry (from _1c) + jadcl0( %edx) + + + leal 4(%ebx,%ecx,4), %ebx + movl %edx, %esi C high carry + + movl VAR_JUMP, %edx + leal (%edi,%ecx,4), %edi + + testl $1, %ecx + movl %eax, %ecx C low carry + + jz L(noswap) + movl %esi, %ecx C high,low carry other way around + + movl %eax, %esi +L(noswap): + + jmp *%edx + + +ifdef(`PIC',` +L(pic_calc): + C See mpn/x86/README about old gas bugs + leal (%edx,%ecx,1), %edx + addl $L(entry)-L(here), %edx + addl (%esp), %edx + ret_internal +') + + +C ----------------------------------------------------------- + ALIGN(32) +L(top): +deflit(`FRAME',16) + C eax scratch + C ebx src + C ecx carry lo + C edx scratch + C esi carry hi + C edi dst + C ebp multiplier + C + C 15 code bytes per limb + + leal UNROLL_BYTES(%edi), %edi + +L(entry): +forloop(`i', 0, UNROLL_COUNT/2-1, ` + deflit(`disp0', eval(2*i*4)) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%ebx), %eax) + mull %ebp +Zdisp( M4_inst,%ecx, disp0,(%edi)) + adcl %eax, %esi + movl %edx, %ecx + jadcl0( %ecx) + + movl disp1(%ebx), %eax + mull %ebp + M4_inst %esi, disp1(%edi) + adcl %eax, %ecx + movl %edx, %esi + jadcl0( %esi) +') + + decl VAR_COUNTER + + leal UNROLL_BYTES(%ebx), %ebx + jns L(top) + + + popl %ebp + M4_inst %ecx, UNROLL_BYTES(%edi) + + popl %edi + movl %esi, %eax + + popl %ebx + jadcl0( %eax) + + popl %esi + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/cross.pl b/vendor/gmp-6.3.0/mpn/x86/k6/cross.pl new file mode 100755 index 0000000..fc921a5 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/cross.pl @@ -0,0 +1,182 @@ +#! /usr/bin/perl + +# Copyright 2000, 2001 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of either: +# +# * the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# or +# +# * the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# or both in parallel, as here. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. 
+# +# You should have received copies of the GNU General Public License and the +# GNU Lesser General Public License along with the GNU MP Library. If not, +# see https://www.gnu.org/licenses/. + + +# Usage: cross.pl [filename.o]... +# +# Produce an annotated disassembly of the given object files, indicating +# certain code alignment and addressing mode problems afflicting K6 chips. +# "ZZ" is used on all annotations, so this can be searched for. +# +# With no arguments, all .o files corresponding to .asm files are processed. +# This is good in the mpn object directory of a k6*-*-* build. +# +# Code alignments of 8 bytes or more are handled. When 32 is used, cache +# line boundaries will fall in at offsets 0x20,0x40,etc and problems are +# flagged at those locations. When 16 is used, the line boundaries can also +# fall at offsets 0x10,0x30,0x50,etc, depending where the file is loaded, so +# problems are identified there too. Likewise when 8 byte alignment is used +# problems are flagged additionally at 0x08,0x18,0x28,etc. +# +# Usually 32 byte alignment is used for k6 routines, but less is certainly +# possible if through good luck, or a little tweaking, cache line crossing +# problems can be avoided at the extra locations. +# +# Bugs: +# +# Instructions without mod/rm bytes or which are already vector decoded are +# unaffected by cache line boundary crossing, but not all of these have yet +# been put in as exceptions. All that occur in practice in GMP are present +# though. +# +# There's no messages for using the vector decoded addressing mode (%esi), +# but that's easy to avoid when coding. +# +# Future: +# +# Warn about jump targets that are poorly aligned (less than 2 instructions +# before a cache line boundary). + +use strict; + +sub disassemble { + my ($file) = @_; + my ($addr,$b1,$b2,$b3, $prefix,$opcode,$modrm); + my $align; + + open (IN, "objdump -Srfh $file |") + || die "Cannot open pipe from objdump\n"; + while (<IN>) { + print; + + if (/^[ \t]*[0-9]+[ \t]+\.text[ \t]/ && /2\*\*([0-9]+)$/) { + $align = 1 << $1; + if ($align < 8) { + print "ZZ cross.pl cannot handle alignment < 2**3\n"; + $align = 8 + } + } + + if (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)[ \t]+([0-9a-f]+)/) { + ($addr,$b1,$b2,$b3) = ($1,$2,$3,$4); + + } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)/) { + ($addr,$b1,$b2,$b3) = ($1,$2,$3,''); + + } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)/) { + ($addr,$b1,$b2,$b3) = ($1,$2,'',''); + + } else { + next; + } + + if ($b1 =~ /0f/) { + $prefix = $b1; + $opcode = $b2; + $modrm = $b3; + } else { + $prefix = ''; + $opcode = $b1; + $modrm = $b2; + } + + # modrm of the form 00-xxx-100 with an 0F prefix is the problem case + # for K6 and pre-CXT K6-2 + if ($prefix =~ /0f/ + && $opcode !~ /^8/ # jcond disp32 + && $modrm =~ /^[0-3][4c]/) { + print "ZZ ($file) >3 bytes to determine instruction length [K6]\n"; + } + + # with just an opcode, starting 1f mod 20h + if (($align==32 && $addr =~ /[13579bdf]f$/ + || $align==16 && $addr =~ /f$/ + || $align==8 && $addr =~ /[7f]$/) + && $prefix !~ /0f/ + && $opcode !~ /1[012345]/ # adc + && $opcode !~ /1[89abcd]/ # sbb + && $opcode !~ /^4/ # inc/dec reg + && $opcode !~ /^5/ # push/pop reg + && $opcode !~ /68/ # push $imm32 + && $opcode !~ /^7/ # jcond disp8 + && $opcode !~ /a[89]/ # test+imm + && $opcode !~ /a[a-f]/ # stos/lods/scas + && $opcode !~ /b8/ # movl $imm32,%eax + && $opcode !~ /d[0123]/ # rcl + && $opcode !~ /e[0123]/ # loop/loopz/loopnz/jcxz + && $opcode !~ /e8/ # call disp32 + && $opcode !~ 
/e[9b]/ # jmp disp32/disp8 + && $opcode !~ /f[89abcd]/ # clc,stc,cli,sti,cld,std + && !($opcode =~ /f[67]/ # grp 1 + && $modrm =~ /^[2367abef]/) # mul, imul, div, idiv + && $modrm !~ /^$/) { + print "ZZ ($file) opcode/modrm cross 32-byte boundary\n"; + } + + # with an 0F prefix, anything starting at 1f mod 20h + if (($align==32 && $addr =~ /[13579bdf][f]$/ + || $align==16 && $addr =~ /f$/ + || $align==8 && $addr =~ /[7f]$/) + && $prefix =~ /0f/ + && $opcode !~ /af/ # imul + && $opcode !~ /a[45]/ # shldl + && $opcode !~ /a[cd]/ # shrdl + ) { + print "ZZ ($file) prefix/opcode cross 32-byte boundary\n"; + } + + # with an 0F prefix, anything with mod/rm starting at 1e mod 20h + if (($align==32 && $addr =~ /[13579bdf][e]$/ + || $align==16 && $addr =~ /[e]$/ + || $align==8 && $addr =~ /[6e]$/) + && $prefix =~ /0f/ + && $opcode !~ /^8/ # jcond disp32 + && $opcode !~ /af/ # imull reg,reg + && $opcode !~ /a[45]/ # shldl + && $opcode !~ /a[cd]/ # shrdl + && $modrm !~ /^$/) { + print "ZZ ($file) prefix/opcode/modrm cross 32-byte boundary\n"; + } + } + close IN || die "Error from objdump (or objdump not available)\n"; +} + + +my @files; +if ($#ARGV >= 0) { + @files = @ARGV; +} else { + @files = glob "*.asm"; + map {s/.asm/.o/} @files; +} + +foreach (@files) { + disassemble($_); +} diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/divrem_1.asm b/vendor/gmp-6.3.0/mpn/x86/k6/divrem_1.asm new file mode 100644 index 0000000..b4cea4f --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/divrem_1.asm @@ -0,0 +1,203 @@ +dnl AMD K6 mpn_divrem_1 -- mpn by limb division. + +dnl Copyright 1999-2003, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K6: 20 cycles/limb + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t carry); +C +C The code here is basically the same as mpn/x86/divrem_1.asm, but uses loop +C instead of decl+jnz, since it comes out 2 cycles/limb faster. +C +C A test is done to see if the high limb is less than the divisor, and if so +C one less div is done. A div is 20 cycles, so assuming high<divisor about +C half the time, then this test saves half that amount. The branch +C misprediction penalty is less than that. +C +C Back-to-back div instructions run at 20 cycles, the same as the loop here, +C so it seems there's nothing to gain by rearranging the loop. 
Pairing the +C mov and loop instructions was found to gain nothing. +C +C Enhancements: +C +C The low-latency K6 multiply might be thought to suit a mul-by-inverse, but +C that algorithm has been found to suffer from the relatively poor carry +C handling on K6 and too many auxiliary instructions. The fractional part +C however could be done at about 13 c/l, if it mattered enough. + +defframe(PARAM_CARRY, 24) +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + + TEXT + + ALIGN(32) +PROLOGUE(mpn_divrem_1c) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %edi FRAME_pushl() + + movl PARAM_SRC, %edi + pushl %esi FRAME_pushl() + + movl PARAM_DIVISOR, %esi + pushl %ebx FRAME_pushl() + + movl PARAM_DST, %ebx + pushl %ebp FRAME_pushl() + + movl PARAM_XSIZE, %ebp + orl %ecx, %ecx C size + + movl PARAM_CARRY, %edx + jz L(fraction) C if size==0 + + leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part + jmp L(integer_top) + +EPILOGUE() + + + ALIGN(16) +PROLOGUE(mpn_divrem_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %edi FRAME_pushl() + + movl PARAM_SRC, %edi + pushl %esi FRAME_pushl() + + movl PARAM_DIVISOR, %esi + orl %ecx,%ecx C size + + jz L(size_zero) + pushl %ebx FRAME_pushl() + + movl -4(%edi,%ecx,4), %eax C src high limb + xorl %edx, %edx + + movl PARAM_DST, %ebx + pushl %ebp FRAME_pushl() + + movl PARAM_XSIZE, %ebp + cmpl %esi, %eax + + leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part + jae L(integer_entry) + + + C high<divisor, so high of dst is zero, and avoid one div + + movl %edx, (%ebx,%ecx,4) + decl %ecx + + movl %eax, %edx + jz L(fraction) + + +L(integer_top): + C eax scratch (quotient) + C ebx dst+4*xsize-4 + C ecx counter + C edx scratch (remainder) + C esi divisor + C edi src + C ebp xsize + + movl -4(%edi,%ecx,4), %eax +L(integer_entry): + + divl %esi + + movl %eax, (%ebx,%ecx,4) + loop L(integer_top) + + +L(fraction): + orl %ebp, %ecx + jz L(done) + + movl PARAM_DST, %ebx + + +L(fraction_top): + C eax scratch (quotient) + C ebx dst + C ecx counter + C edx scratch (remainder) + C esi divisor + C edi + C ebp + + xorl %eax, %eax + + divl %esi + + movl %eax, -4(%ebx,%ecx,4) + loop L(fraction_top) + + +L(done): + popl %ebp + movl %edx, %eax + popl %ebx + popl %esi + popl %edi + ret + + +L(size_zero): +deflit(`FRAME',8) + movl PARAM_XSIZE, %ecx + xorl %eax, %eax + + movl PARAM_DST, %edi + + cld C better safe than sorry, see mpn/x86/README + + rep + stosl + + popl %esi + popl %edi + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/k6/gmp-mparam.h new file mode 100644 index 0000000..f03f1b2 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/gmp-mparam.h @@ -0,0 +1,166 @@ +/* AMD K6 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000-2004, 2009, 2010 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
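Returning to divrem_1.asm above: each pass of its integer and fraction loops performs one 64/32-bit division with the running remainder as the high word. Here is a plain C restatement of that, assuming 32-bit limbs; ref_divrem_1 is an illustrative name, and this version makes no attempt at the high<divisor shortcut used in the assembly.

```c
#include <stddef.h>
#include <stdint.h>

/* Quotient integer part goes to dst[xsize..xsize+size-1], the xsize
   "fraction" limbs to dst[0..xsize-1]; the final remainder is returned.
   Each step divides (remainder:next_limb) by the divisor, just as the
   divl in the loops above does. */
static uint32_t ref_divrem_1(uint32_t *dst, size_t xsize,
                             const uint32_t *src, size_t size, uint32_t d)
{
    uint32_t r = 0;
    for (size_t i = size; i-- > 0; ) {          /* integer part, high to low */
        uint64_t n = ((uint64_t)r << 32) | src[i];
        dst[xsize + i] = (uint32_t)(n / d);
        r = (uint32_t)(n % d);
    }
    for (size_t i = xsize; i-- > 0; ) {         /* fraction: zero low limbs  */
        uint64_t n = (uint64_t)r << 32;
        dst[i] = (uint32_t)(n / d);
        r = (uint32_t)(n % d);
    }
    return r;
}
```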
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* 450MHz K6-2 */ + +#define MOD_1_NORM_THRESHOLD 12 +#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 41 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 32 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 3 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 128 +#define USE_PREINV_DIVREM_1 0 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 69 +#define MUL_TOOM44_THRESHOLD 106 +#define MUL_TOOM6H_THRESHOLD 157 +#define MUL_TOOM8H_THRESHOLD 199 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 69 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 65 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 64 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 97 +#define SQR_TOOM4_THRESHOLD 143 +#define SQR_TOOM6_THRESHOLD 222 +#define SQR_TOOM8_THRESHOLD 272 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 476, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 11, 5}, { 23, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 11, 6}, { 23, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 17, 6}, \ + { 35, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \ + { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 167,10}, { 95, 9}, { 191,10}, \ + { 111,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \ + { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287,11}, { 159,10}, { 351,11}, { 191,10}, \ + { 415, 9}, { 831,11}, { 223,12}, { 127,11}, \ + { 255,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 415,10}, \ + { 831,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 575,12}, { 319,11}, { 703,12}, \ + { 383,11}, { 831,12}, { 447,11}, { 895,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 703,13}, { 383,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1215,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 106 +#define MUL_FFT_THRESHOLD 7424 + +#define SQR_FFT_MODF_THRESHOLD 432 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 432, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 24, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 7}, { 93, 8}, { 47, 7}, \ + { 95, 8}, { 51,10}, { 15, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 71, 8}, \ + { 143, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 
135,10}, { 79, 9}, \ + { 167,10}, { 95, 9}, { 191,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287, 8}, \ + { 575,10}, { 159, 9}, { 319,11}, { 95,10}, \ + { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 351, 9}, \ + { 703,11}, { 191,10}, { 415,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 543,11}, { 287,10}, \ + { 607,11}, { 319,10}, { 639,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 415,10}, { 831,13}, \ + { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \ + { 607,12}, { 319,11}, { 703,12}, { 383,11}, \ + { 831,12}, { 447,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,12}, { 703,13}, \ + { 383,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1215,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 112 +#define SQR_FFT_THRESHOLD 7040 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 60 +#define MULLO_MUL_N_THRESHOLD 13463 + +#define DC_DIV_QR_THRESHOLD 78 +#define DC_DIVAPPR_Q_THRESHOLD 252 +#define DC_BDIV_QR_THRESHOLD 84 +#define DC_BDIV_Q_THRESHOLD 171 + +#define INV_MULMOD_BNM1_THRESHOLD 55 +#define INV_NEWTON_THRESHOLD 234 +#define INV_APPR_THRESHOLD 236 + +#define BINV_NEWTON_THRESHOLD 268 +#define REDC_1_TO_REDC_N_THRESHOLD 67 + +#define MU_DIV_QR_THRESHOLD 1308 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 134 +#define MU_BDIV_QR_THRESHOLD 1164 +#define MU_BDIV_Q_THRESHOLD 1164 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD_THRESHOLD 182 +#define GCD_DC_THRESHOLD 591 +#define GCDEXT_DC_THRESHOLD 472 +#define JACOBI_BASE_METHOD 2 + +#define GET_STR_DC_THRESHOLD 24 +#define GET_STR_PRECOMPUTE_THRESHOLD 40 +#define SET_STR_DC_THRESHOLD 834 +#define SET_STR_PRECOMPUTE_THRESHOLD 2042 diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/copyd.asm b/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/copyd.asm new file mode 100644 index 0000000..f80a5a1 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/copyd.asm @@ -0,0 +1,118 @@ +dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K6-2: 1.0 cycles/limb + + +C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30 +C cycle startup time, which amounts for instance to a 2x speedup at 15 +C limbs. 
+C +C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by +C processing one limb separately to make it aligned. This and a final odd +C limb are handled in a branch-free fashion, ending up re-copying if the +C special case isn't needed. +C +C Alternatives: +C +C There used to be a big unrolled version of this, running at 0.56 c/l if +C the destination was aligned, but that seemed rather excessive for the +C relative importance of copyd. +C +C If the destination alignment is ignored and just left to run at 1.17 c/l +C some code size and a fixed few cycles can be saved. Considering how few +C uses copyd finds perhaps that should be favoured. The current code has +C the attraction of being no slower than a basic rep movsl though. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-using parameter space +define(SAVE_EBX,`PARAM_SIZE') + + TEXT + ALIGN(16) + +PROLOGUE(mpn_copyd) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl %ebx, SAVE_EBX + + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + + subl $1, %ecx C better code alignment than decl + jb L(zero) + + jz L(one_more) + leal 4(%edx,%ecx,4), %ebx + +Zdisp( movd, 0,(%eax,%ecx,4), %mm0) C high limb +Zdisp( movd, %mm0, 0,(%edx,%ecx,4)) C Zdisp for good code alignment + + cmpl $1, %ecx + je L(one_more) + + shrl $2, %ebx + andl $1, %ebx C 1 if dst[size-2] unaligned + + subl %ebx, %ecx + nop C code alignment + +L(top): + C eax src + C ebx + C ecx counter + C edx dst + + movq -4(%eax,%ecx,4), %mm0 + subl $2, %ecx + + movq %mm0, 4(%edx,%ecx,4) + ja L(top) + + +L(one_more): + movd (%eax), %mm0 + movd %mm0, (%edx) + + movl SAVE_EBX, %ebx + emms_or_femms +L(zero): + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/lshift.asm new file mode 100644 index 0000000..c86575f --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/lshift.asm @@ -0,0 +1,294 @@ +dnl AMD K6-2 mpn_lshift -- mpn left shift. + +dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
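The copyd comments above describe peeling one limb so the destination is 8-byte aligned and then moving two limbs per iteration from the top down. A simplified C sketch of that shape, assuming 32-bit limbs and omitting the branch-free re-copy trick; ref_copyd is an illustrative name, and memcpy stands in for the aligned 8-byte movq store.

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Decrementing copy (high limbs first). One limb is peeled off when the
   highest 8-byte store would be misaligned (the "dst 4mod8" case above),
   so the main loop can move two limbs at a time. */
static void ref_copyd(uint32_t *dst, const uint32_t *src, size_t size)
{
    size_t i = size;
    if (i >= 2 && ((uintptr_t)&dst[i - 2] & 7) != 0) {
        i--;                           /* peel the top limb to align dst */
        dst[i] = src[i];
    }
    while (i >= 2) {                   /* two limbs (8 bytes) per round  */
        i -= 2;
        memcpy(&dst[i], &src[i], 8);
    }
    if (i == 1)                        /* possible leftover low limb     */
        dst[0] = src[0];
}
```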
+ +include(`../config.m4') + + +C K6-2: 1.75 cycles/limb + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl used after src has been fetched +define(VAR_RETVAL,`PARAM_SRC') + +dnl minimum 9, because unrolled loop can't handle less +deflit(UNROLL_THRESHOLD, 9) + + TEXT + ALIGN(32) + +PROLOGUE(mpn_lshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. + + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shldl( %cl, %edx, %eax) C return value + + shll %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) C avoid offset 0x1f +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx,%eax,4), %edx C src high limb + negl %ecx + + movd PARAM_SHIFT, %mm6 + addl $32, %ecx C 32-shift + + shrl %cl, %edx + cmpl $UNROLL_THRESHOLD-1, %eax + + movl %edx, VAR_RETVAL + jae L(unroll) + + + movd %ecx, %mm7 + movl %eax, %ecx + + movl PARAM_DST, %eax + +L(simple): + C eax dst + C ebx src + C ecx counter, size-1 to 1 + C edx retval + C + C mm0 scratch + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%ecx,4), %mm0 + + psrlq %mm7, %mm0 + +Zdisp( movd, %mm0, 0,(%eax,%ecx,4)) + loop L(simple) + + + movd (%ebx), %mm0 + popl %ebx + + psllq %mm6, %mm0 + + movd %mm0, (%eax) + movl %edx, %eax + + femms + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx src + C ecx 32-shift + C edx retval (but instead VAR_RETVAL is used) + C + C mm6 shift + + addl $32, %ecx + movl PARAM_DST, %edx + + movd %ecx, %mm7 + subl $7, %eax C size-8 + + leal (%edx,%eax,4), %ecx C alignment of dst + + movq 32-8(%ebx,%eax,4), %mm2 C src high qword + testb $4, %cl + + jz L(dst_aligned) + psllq %mm6, %mm2 + + psrlq $32, %mm2 + decl %eax + + movd %mm2, 32(%edx,%eax,4) C dst high limb + movq 32-8(%ebx,%eax,4), %mm2 C new src high qword +L(dst_aligned): + + movq 32-16(%ebx,%eax,4), %mm0 C src second highest qword + + + C This loop is the important bit, the rest is just support for it. + C Four src limbs are held at the start, and four more will be read. + C Four dst limbs will be written. This schedule seems necessary for + C full speed. + C + C The use of size-8 lets the loop stop when %eax goes negative and + C leaves -4 to -1 which can be tested with test $1 and $2. + +L(top): + C eax counter, size-8 step by -4 until <0 + C ebx src + C ecx + C edx dst + C + C mm0 src next qword + C mm1 scratch + C mm2 src prev qword + C mm6 shift + C mm7 64-shift + + psllq %mm6, %mm2 + subl $4, %eax + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm0, %mm2 + movq 24(%ebx,%eax,4), %mm0 + + psllq %mm6, %mm1 + movq %mm2, 40(%edx,%eax,4) + + movq %mm0, %mm2 + psrlq %mm7, %mm0 + + por %mm0, %mm1 + movq 16(%ebx,%eax,4), %mm0 + + movq %mm1, 32(%edx,%eax,4) + jnc L(top) + + + C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4. + C + C 8(%ebx) is the next source, and 24(%edx) is the next destination. + C %eax is between -4 and -1, representing respectively 0 to 3 extra + C limbs that must be read. 
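As a cross-check on what the shift loops produce, the mpn_lshift contract looks like this in portable C, assuming 32-bit limbs and a shift count of 1 to 31 (a count of 0 or 32 would need special handling in C); ref_lshift is an illustrative name, and iterating from the top limb down keeps it valid when dst overlaps src from above.

```c
#include <stddef.h>
#include <stdint.h>

/* dst[0..size-1] = {src,size} << shift; the return value is the bits shifted
   out of the top limb, placed in the low bits -- the same value the shldl
   in the one-limb case above produces. Assumes size >= 1, 1 <= shift <= 31. */
static uint32_t ref_lshift(uint32_t *dst, const uint32_t *src,
                           size_t size, unsigned shift)
{
    uint32_t retval = src[size - 1] >> (32 - shift);
    for (size_t i = size - 1; i > 0; i--)          /* high limbs first */
        dst[i] = (src[i] << shift) | (src[i - 1] >> (32 - shift));
    dst[0] = src[0] << shift;
    return retval;
}
```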
+ + + testl $2, %eax C testl to avoid bad cache line crossing + jz L(finish_nottwo) + + C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes + C new mm2 and a new mm0 is loaded. + + psllq %mm6, %mm2 + movq %mm0, %mm1 + + psrlq %mm7, %mm0 + subl $2, %eax + + por %mm0, %mm2 + movq 16(%ebx,%eax,4), %mm0 + + movq %mm2, 32(%edx,%eax,4) + movq %mm1, %mm2 +L(finish_nottwo): + + + C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0 + + testb $1, %al + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm0, %mm2 + psllq %mm6, %mm1 + + movq %mm2, 24(%edx,%eax,4) + jz L(finish_even) + + + C Size is odd, so mm1 and one extra limb to process. + + movd (%ebx), %mm0 C src[0] + popl %ebx +deflit(`FRAME',0) + + movq %mm0, %mm2 + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + psllq %mm6, %mm2 + por %mm0, %mm1 + + movq %mm1, 4(%edx) C dst[1,2] + movd %mm2, (%edx) C dst[0] + + movl VAR_RETVAL, %eax + + femms + ret + + + nop C avoid bad cache line crossing +L(finish_even): +deflit(`FRAME',4) + C Size is even, so only mm1 left to process. + + movq %mm1, (%edx) C dst[0,1] + movl VAR_RETVAL, %eax + + popl %ebx + femms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/rshift.asm new file mode 100644 index 0000000..f604a7b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/k62mmx/rshift.asm @@ -0,0 +1,293 @@ +dnl AMD K6-2 mpn_rshift -- mpn right shift. + +dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K6-2: 1.75 cycles/limb + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl Minimum 9, because the unrolled loop can't handle less. +dnl +deflit(UNROLL_THRESHOLD, 9) + + TEXT + ALIGN(32) + +PROLOGUE(mpn_rshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. 
+ + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shrdl( %cl, %edx, %eax) C return value + + shrl %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) C avoid offset 0x1f +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx), %edx C src low limb + negl %ecx + + addl $32, %ecx + movd PARAM_SHIFT, %mm6 + + shll %cl, %edx + cmpl $UNROLL_THRESHOLD-1, %eax + + jae L(unroll) + + + C eax size-1 + C ebx src + C ecx 32-shift + C edx retval + C + C mm6 shift + + movl PARAM_DST, %ecx + leal (%ebx,%eax,4), %ebx + + leal -4(%ecx,%eax,4), %ecx + negl %eax + + C This loop runs at about 3 cycles/limb, which is the amount of + C decoding, and this is despite every second access being unaligned. + +L(simple): + C eax counter, -(size-1) to -1 + C ebx &src[size-1] + C ecx &dst[size-1] + C edx retval + C + C mm0 scratch + C mm6 shift + +Zdisp( movq, 0,(%ebx,%eax,4), %mm0) + incl %eax + + psrlq %mm6, %mm0 + +Zdisp( movd, %mm0, 0,(%ecx,%eax,4)) + jnz L(simple) + + + movq %mm0, (%ecx) + movl %edx, %eax + + popl %ebx + + femms + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx src + C ecx 32-shift + C edx retval + C + C mm6 shift + + addl $32, %ecx + subl $7, %eax C size-8 + + movd %ecx, %mm7 + movl PARAM_DST, %ecx + + movq (%ebx), %mm2 C src low qword + leal (%ebx,%eax,4), %ebx C src end - 32 + + testb $4, %cl + leal (%ecx,%eax,4), %ecx C dst end - 32 + + notl %eax C -(size-7) + jz L(dst_aligned) + + psrlq %mm6, %mm2 + incl %eax + +Zdisp( movd, %mm2, 0,(%ecx,%eax,4)) C dst low limb + movq 4(%ebx,%eax,4), %mm2 C new src low qword +L(dst_aligned): + + movq 12(%ebx,%eax,4), %mm0 C src second lowest qword + nop C avoid bad cache line crossing + + + C This loop is the important bit, the rest is just support for it. + C Four src limbs are held at the start, and four more will be read. + C Four dst limbs will be written. This schedule seems necessary for + C full speed. + C + C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and + C and leaves 0 to 3 which can be tested with test $1 and $2. + +L(top): + C eax counter, -(size-7) step by +4 until >=0 + C ebx src end - 32 + C ecx dst end - 32 + C edx retval + C + C mm0 src next qword + C mm1 scratch + C mm2 src prev qword + C mm6 shift + C mm7 64-shift + + psrlq %mm6, %mm2 + addl $4, %eax + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + por %mm0, %mm2 + movq 4(%ebx,%eax,4), %mm0 + + psrlq %mm6, %mm1 + movq %mm2, -12(%ecx,%eax,4) + + movq %mm0, %mm2 + psllq %mm7, %mm0 + + por %mm0, %mm1 + movq 12(%ebx,%eax,4), %mm0 + + movq %mm1, -4(%ecx,%eax,4) + ja L(top) C jump if no carry and not zero + + + + C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0 + C to 3 representing respectively 3 to 0 further limbs. + + testl $2, %eax C testl to avoid bad cache line crossings + jnz L(finish_nottwo) + + C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0 + C becomes new mm2 and a new mm0 is loaded. 
+ + psrlq %mm6, %mm2 + movq %mm0, %mm1 + + psllq %mm7, %mm0 + addl $2, %eax + + por %mm0, %mm2 + movq 12(%ebx,%eax,4), %mm0 + + movq %mm2, -4(%ecx,%eax,4) + movq %mm1, %mm2 +L(finish_nottwo): + + + testb $1, %al + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + por %mm0, %mm2 + psrlq %mm6, %mm1 + + movq %mm2, 4(%ecx,%eax,4) + jnz L(finish_even) + + + C one further extra limb to process + + movd 32-4(%ebx), %mm0 C src[size-1], most significant limb + popl %ebx + + movq %mm0, %mm2 + psllq %mm7, %mm0 + + por %mm0, %mm1 + psrlq %mm6, %mm2 + + movq %mm1, 32-12(%ecx) C dst[size-3,size-2] + movd %mm2, 32-4(%ecx) C dst[size-1] + + movl %edx, %eax C retval + + femms + ret + + + nop C avoid bad cache line crossing +L(finish_even): + C no further extra limbs + + movq %mm1, 32-8(%ecx) C dst[size-2,size-1] + movl %edx, %eax C retval + + popl %ebx + + femms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/com.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/com.asm new file mode 100644 index 0000000..b747454 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/com.asm @@ -0,0 +1,103 @@ +dnl AMD K6-2 mpn_com -- mpn bitwise one's complement. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +NAILS_SUPPORT(0-31) + + +C alignment dst/src, A=0mod8 N=4mod8 +C A/A A/N N/A N/N +C K6-2 1.0 1.18 1.18 1.18 cycles/limb +C K6 1.5 1.85 1.75 1.85 + + +C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Take the bitwise ones-complement of src,size and write it to dst,size. 
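As a reference for what the routine computes, here is a minimal C sketch (an editor's illustration, not part of the GMP source), assuming 32-bit limbs and no nails; with nails the complement would additionally be masked, as the notl_or_xorl_GMP_NUMB_MASK macro does below. The name com_ref is illustrative only.

    typedef unsigned int mp_limb_t;   /* assuming 32-bit limbs, no nails */

    void
    com_ref (mp_limb_t *dst, const mp_limb_t *src, long size)
    {
      long i;
      for (i = 0; i < size; i++)
        dst[i] = ~src[i];             /* with nails: (~src[i]) & GMP_NUMB_MASK */
    }

The MMX loop below does the same thing two limbs per movq, using a pxor against an all-ones (nail-masked) register.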
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(16) +PROLOGUE(mpn_com) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + shrl %ecx + jnz L(two_or_more) + + movl (%eax), %eax + notl_or_xorl_GMP_NUMB_MASK( %eax) + movl %eax, (%edx) + ret + + +L(two_or_more): + pushl %ebx FRAME_pushl() + pcmpeqd %mm7, %mm7 C all ones + + movl %ecx, %ebx +ifelse(GMP_NAIL_BITS,0,, +` psrld $GMP_NAIL_BITS, %mm7') C clear nails + + + + ALIGN(8) +L(top): + C eax src + C ebx floor(size/2) + C ecx counter + C edx dst + C + C mm0 scratch + C mm7 mask + + movq -8(%eax,%ecx,8), %mm0 + pxor %mm7, %mm0 + movq %mm0, -8(%edx,%ecx,8) + loop L(top) + + + jnc L(no_extra) + movl (%eax,%ebx,8), %eax + notl_or_xorl_GMP_NUMB_MASK( %eax) + movl %eax, (%edx,%ebx,8) +L(no_extra): + + popl %ebx + emms_or_femms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm new file mode 100644 index 0000000..1bbad3a --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm @@ -0,0 +1,282 @@ +dnl AMD K6 mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C divisor +C odd even +C K6: 10.0 12.0 cycles/limb +C K6-2: 10.0 11.5 + + +C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C +C A simple divl is used for size==1. This is about 10 cycles faster for an +C odd divisor or 20 cycles for an even divisor. +C +C The loops are quite sensitive to code alignment, speeds should be +C rechecked (odd and even divisor, pic and non-pic) if contemplating +C changing anything. 
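For orientation, a minimal C sketch of the exact-division technique this routine implements, assuming 32-bit limbs and an odd divisor that divides src,size exactly; the assembly additionally handles even divisors by stripping the trailing zero bits first. The helper names binvert_ref and divexact_1_ref are illustrative, not GMP entry points.

    typedef unsigned int mp_limb_t;          /* assuming 32-bit limbs */

    /* inverse of odd d modulo 2^32; the same inv = 2*inv - inv*inv*d
       refinement the assembly applies to an 8-bit table value */
    static mp_limb_t
    binvert_ref (mp_limb_t d)
    {
      mp_limb_t inv = d;                     /* correct to 3 bits for odd d */
      int i;
      for (i = 0; i < 4; i++)
        inv *= 2 - inv * d;                  /* each step doubles the good bits */
      return inv;
    }

    /* dst = src / d, where odd d divides src,size exactly */
    void
    divexact_1_ref (mp_limb_t *dst, const mp_limb_t *src, long size, mp_limb_t d)
    {
      mp_limb_t inv = binvert_ref (d);
      mp_limb_t borrow = 0;
      long i;
      for (i = 0; i < size; i++)
        {
          mp_limb_t s = src[i];
          mp_limb_t l = s - borrow;
          mp_limb_t q;
          borrow = s < borrow;
          q = l * inv;                       /* next quotient limb, mod 2^32 */
          dst[i] = q;
          borrow += (mp_limb_t) (((unsigned long long) q * d) >> 32);
        }
    }

Each quotient limb comes from one multiply by the inverse, and the high half of q*d feeds the borrow into the next limb, which is the dependent chain the cycle counts above are describing.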
+ +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(VAR_INVERSE,`PARAM_DST') + + TEXT + + ALIGN(32) +PROLOGUE(mpn_divexact_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + + movl PARAM_SRC, %eax + xorl %edx, %edx + + cmpl $1, %ecx + jnz L(two_or_more) + + movl (%eax), %eax + + divl PARAM_DIVISOR + + movl PARAM_DST, %ecx + movl %eax, (%ecx) + + ret + + +L(two_or_more): + movl PARAM_DIVISOR, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + pushl %ebp FRAME_pushl() + +L(strip_twos): + shrl %eax + incl %edx C will get shift+1 + + jnc L(strip_twos) + pushl %esi FRAME_pushl() + + leal 1(%eax,%eax), %esi C d without twos + andl $127, %eax C d/2, 7 bits + +ifdef(`PIC',` + LEA( binvert_limb_table, %ebp) +Zdisp( movzbl, 0,(%eax,%ebp), %eax) +',` + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + pushl %edi FRAME_pushl() + + leal (%eax,%eax), %ebp C 2*inv + + imull %eax, %eax C inv*inv + + movl PARAM_DST, %edi + + imull %esi, %eax C inv*inv*d + + subl %eax, %ebp C inv = 2*inv - inv*inv*d + leal (%ebp,%ebp), %eax C 2*inv + + imull %ebp, %ebp C inv*inv + + movl %esi, PARAM_DIVISOR C d without twos + leal (%ebx,%ecx,4), %ebx C src end + + imull %esi, %ebp C inv*inv*d + + leal (%edi,%ecx,4), %edi C dst end + negl %ecx C -size + + subl %ebp, %eax C inv = 2*inv - inv*inv*d + subl $1, %edx C shift amount, and clear carry + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + imull PARAM_DIVISOR, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + movl %eax, VAR_INVERSE + jnz L(even) + + movl (%ebx,%ecx,4), %esi C src low limb + jmp L(odd_entry) + + + ALIGN(16) + nop C code alignment +L(odd_top): + C eax scratch + C ebx src end + C ecx counter, limbs, negative + C edx inverse + C esi next limb, adjusted for carry + C edi dst end + C ebp carry bit, 0 or -1 + + imull %edx, %esi + + movl PARAM_DIVISOR, %eax + movl %esi, -4(%edi,%ecx,4) + + mull %esi C carry limb in edx + + subl %ebp, %edx C apply carry bit + movl (%ebx,%ecx,4), %esi + +L(odd_entry): + subl %edx, %esi C apply carry limb + movl VAR_INVERSE, %edx + + sbbl %ebp, %ebp C 0 or -1 + + incl %ecx + jnz L(odd_top) + + + imull %edx, %esi + + movl %esi, -4(%edi,%ecx,4) + + popl %edi + popl %esi + + popl %ebp + popl %ebx + + ret + + +L(even): + C eax + C ebx src end + C ecx -size + C edx twos + C esi + C edi dst end + C ebp + + xorl %ebp, %ebp +Zdisp( movq, 0,(%ebx,%ecx,4), %mm0) C src[0,1] + + movd %edx, %mm7 + movl VAR_INVERSE, %edx + + addl $2, %ecx + psrlq %mm7, %mm0 + + movd %mm0, %esi + jz L(even_two) C if only two limbs + + +C Out-of-order execution is good enough to hide the load/rshift/movd +C latency. Having imul at the top of the loop gives 11.5 c/l instead of 12, +C on K6-2. In fact there's only 11 of decode, but nothing running at 11 has +C been found. Maybe the fact every second movq is unaligned costs the extra +C 0.5. 
+ +L(even_top): + C eax scratch + C ebx src end + C ecx counter, limbs, negative + C edx inverse + C esi next limb, adjusted for carry + C edi dst end + C ebp carry bit, 0 or -1 + C + C mm0 scratch, source limbs + C mm7 twos + + imull %edx, %esi + + movl %esi, -8(%edi,%ecx,4) + movl PARAM_DIVISOR, %eax + + mull %esi C carry limb in edx + + movq -4(%ebx,%ecx,4), %mm0 + psrlq %mm7, %mm0 + + movd %mm0, %esi + subl %ebp, %edx C apply carry bit + + subl %edx, %esi C apply carry limb + movl VAR_INVERSE, %edx + + sbbl %ebp, %ebp C 0 or -1 + + incl %ecx + jnz L(even_top) + + +L(even_two): + movd -4(%ebx), %mm0 C src high limb + psrlq %mm7, %mm0 + + imull %edx, %esi + + movl %esi, -8(%edi) + movl PARAM_DIVISOR, %eax + + mull %esi C carry limb in edx + + movd %mm0, %esi + subl %ebp, %edx C apply carry bit + + movl VAR_INVERSE, %eax + subl %edx, %esi C apply carry limb + + imull %eax, %esi + + movl %esi, -4(%edi) + + popl %edi + popl %esi + + popl %ebp + popl %ebx + + emms_or_femms + + ret + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm new file mode 100644 index 0000000..e17930b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm @@ -0,0 +1,226 @@ +dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n, +dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +NAILS_SUPPORT(0-31) + + +C alignment dst/src1/src2, A=0mod8, N=4mod8 +C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N +C +C K6-2 1.2 1.5 1.5 1.2 1.2 1.5 1.5 1.2 and,andn,ior,xor +C K6-2 1.5 1.75 2.0 1.75 1.75 2.0 1.75 1.5 iorn,xnor +C K6-2 1.75 2.0 2.0 2.0 2.0 2.0 2.0 1.75 nand,nior +C +C K6 1.5 1.68 1.75 1.2 1.75 1.75 1.68 1.5 and,andn,ior,xor +C K6 2.0 2.0 2.25 2.25 2.25 2.25 2.0 2.0 iorn,xnor +C K6 2.0 2.25 2.25 2.25 2.25 2.25 2.25 2.0 nand,nior + + +dnl M4_p and M4_i are the MMX and integer instructions +dnl M4_*_neg_dst means whether to negate the final result before writing +dnl M4_*_neg_src2 means whether to negate the src2 values before using them + +define(M4_choose_op, +m4_assert_numargs(7) +`ifdef(`OPERATION_$1',` +define(`M4_function', `mpn_$1') +define(`M4_operation', `$1') +define(`M4_p', `$2') +define(`M4_p_neg_dst', `$3') +define(`M4_p_neg_src2',`$4') +define(`M4_i', `$5') +define(`M4_i_neg_dst', `$6') +define(`M4_i_neg_src2',`$7') +')') + +dnl xnor is done in "iorn" style because it's a touch faster than "nior" +dnl style (the two are equivalent for xor). +dnl +dnl pandn can't be used with nails. + +M4_choose_op( and_n, pand,0,0, andl,0,0) +ifelse(GMP_NAIL_BITS,0, +`M4_choose_op(andn_n, pandn,0,0, andl,0,1)', +`M4_choose_op(andn_n, pand,0,1, andl,0,1)') +M4_choose_op( nand_n, pand,1,0, andl,1,0) +M4_choose_op( ior_n, por,0,0, orl,0,0) +M4_choose_op( iorn_n, por,0,1, orl,0,1) +M4_choose_op( nior_n, por,1,0, orl,1,0) +M4_choose_op( xor_n, pxor,0,0, xorl,0,0) +M4_choose_op( xnor_n, pxor,0,1, xorl,0,1) + +ifdef(`M4_function',, +`m4_error(`Unrecognised or undefined OPERATION symbol +')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + + +C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C +C Do src1,size M4_operation src2,size, storing the result in dst,size. +C +C Unaligned movq loads and stores are a bit slower than aligned ones. The +C test at the start of the routine checks the alignment of src1 and if +C necessary processes one limb separately at the low end to make it aligned. +C +C The raw speeds without this alignment switch are as follows. +C +C alignment dst/src1/src2, A=0mod8, N=4mod8 +C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N +C +C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor +C K6 1.75 2.2 2.0 2.28 iorn,xnor +C K6 2.0 2.25 2.35 2.28 nand,nior +C +C +C Future: +C +C K6 can do one 64-bit load per cycle so each of these routines should be +C able to approach 1.0 c/l, if aligned. The basic and/andn/ior/xor might be +C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs. +C The others are 4 instructions per 2 limbs, and so can only approach 1.0 +C because there's nowhere to hide some loop control. 
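As a plain-C reference for how the M4_*_neg_src2 and M4_*_neg_dst choices above translate, two of the eight variants are sketched below (an editor's illustration, assuming 32-bit limbs and no nails; the *_ref names are not GMP entry points).

    typedef unsigned int mp_limb_t;   /* assuming 32-bit limbs, no nails */

    void
    iorn_n_ref (mp_limb_t *dst, const mp_limb_t *s1, const mp_limb_t *s2, long n)
    {
      long i;
      for (i = 0; i < n; i++)
        dst[i] = s1[i] | ~s2[i];      /* "por" with src2 negated */
    }

    void
    nand_n_ref (mp_limb_t *dst, const mp_limb_t *s1, const mp_limb_t *s2, long n)
    {
      long i;
      for (i = 0; i < n; i++)
        dst[i] = ~(s1[i] & s2[i]);    /* "pand" with the result negated */
    }

The remaining variants differ only in which negations are applied, which is exactly what the pxor/notl against the all-ones mask does in the loop below.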
+ +defframe(PARAM_SIZE,16) +defframe(PARAM_SRC2,12) +defframe(PARAM_SRC1,8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + TEXT + ALIGN(32) +PROLOGUE(M4_function) + movl PARAM_SIZE, %ecx + pushl %ebx FRAME_pushl() + + movl PARAM_SRC1, %eax + + movl PARAM_SRC2, %ebx + cmpl $1, %ecx + + movl PARAM_DST, %edx + ja L(two_or_more) + + + movl (%ebx), %ecx + popl %ebx +ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)') + M4_i (%eax), %ecx +ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ecx)') + movl %ecx, (%edx) + + ret + + +L(two_or_more): + C eax src1 + C ebx src2 + C ecx size + C edx dst + C esi + C edi + C ebp + + pushl %esi FRAME_pushl() + testl $4, %eax + jz L(alignment_ok) + + movl (%ebx), %esi + addl $4, %ebx +ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %esi)') + M4_i (%eax), %esi + addl $4, %eax +ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %esi)') + movl %esi, (%edx) + addl $4, %edx + decl %ecx + +L(alignment_ok): + movl %ecx, %esi + shrl %ecx + jnz L(still_two_or_more) + + movl (%ebx), %ecx + popl %esi +ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)') + M4_i (%eax), %ecx +ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ecx)') + popl %ebx + movl %ecx, (%edx) + ret + + +L(still_two_or_more): +ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,` + pcmpeqd %mm7, %mm7 C all ones +ifelse(GMP_NAIL_BITS,0,,`psrld $GMP_NAIL_BITS, %mm7') C clear nails +') + + ALIGN(16) +L(top): + C eax src1 + C ebx src2 + C ecx counter + C edx dst + C esi + C edi + C ebp + C + C carry bit is low of size + + movq -8(%ebx,%ecx,8), %mm0 +ifelse(M4_p_neg_src2,1,`pxor %mm7, %mm0') + M4_p -8(%eax,%ecx,8), %mm0 +ifelse(M4_p_neg_dst,1,` pxor %mm7, %mm0') + movq %mm0, -8(%edx,%ecx,8) + + loop L(top) + + + jnc L(no_extra) + + movl -4(%ebx,%esi,4), %ebx +ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ebx)') + M4_i -4(%eax,%esi,4), %ebx +ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ebx)') + movl %ebx, -4(%edx,%esi,4) +L(no_extra): + + popl %esi + popl %ebx + emms_or_femms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm new file mode 100644 index 0000000..45be582 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm @@ -0,0 +1,130 @@ +dnl AMD K6 mpn_lshift -- mpn left shift. + +dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C K6: 3.0 cycles/limb + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx +C instructions. This is despite every second fetch being unaligned. + + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(32) + +PROLOGUE(mpn_lshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. + + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shldl( %cl, %edx, %eax) C return value + + shll %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + + ALIGN(16) C avoid offset 0x1f + nop C avoid bad cache line crossing +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx,%eax,4), %edx C src high limb + negl %ecx + + movd PARAM_SHIFT, %mm6 + addl $32, %ecx C 32-shift + + shrl %cl, %edx + + movd %ecx, %mm7 + movl PARAM_DST, %ecx + +L(top): + C eax counter, size-1 to 1 + C ebx src + C ecx dst + C edx retval + C + C mm0 scratch + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + movd %mm0, 4(%ecx,%eax,4) + jnz L(top) + + + movd (%ebx), %mm0 + popl %ebx + + psllq %mm6, %mm0 + movl %edx, %eax + + movd %mm0, (%ecx) + + emms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm new file mode 100644 index 0000000..2b19d0b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm @@ -0,0 +1,236 @@ +dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and +dnl hamming distance. + +dnl Copyright 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C popcount hamdist +C K6-2: 9.0 11.5 cycles/limb +C K6: 12.5 13.0 + + +C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); +C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); +C +C The code here isn't optimal, but it's already a 2x speedup over the plain +C integer mpn/generic/popcount.c,hamdist.c. 
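The reduction the MMX loop performs is easier to read in scalar form. Below is an editor's sketch of the same mask cascade on one 32-bit limb (the loop below runs it on 64-bit quantities, two limbs at a time, and mpn_hamdist simply applies it to src[i] ^ src2[i]); popcount_limb and popcount_ref are illustrative names, not GMP entry points.

    static unsigned
    popcount_limb (unsigned x)
    {
      x -= (x & 0xAAAAAAAA) >> 1;                      /* 2-bit sums */
      x = (x & 0x33333333) + ((x >> 2) & 0x33333333);  /* 4-bit sums */
      x = (x & 0x0F0F0F0F) + ((x >> 4) & 0x0F0F0F0F);  /* byte sums  */
      x += x >> 8;                                     /* word sums  */
      x += x >> 16;                                    /* dword sum  */
      return x & 0xFF;
    }

    unsigned long
    popcount_ref (const unsigned *src, long size)
    {
      unsigned long total = 0;
      long i;
      for (i = 0; i < size; i++)
        total += popcount_limb (src[i]);
      return total;
    }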
+ + +ifdef(`OPERATION_popcount',, +`ifdef(`OPERATION_hamdist',, +`m4_error(`Need OPERATION_popcount or OPERATION_hamdist +')m4exit(1)')') + +define(HAM, +m4_assert_numargs(1) +`ifdef(`OPERATION_hamdist',`$1')') + +define(POP, +m4_assert_numargs(1) +`ifdef(`OPERATION_popcount',`$1')') + +HAM(` +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC2, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_hamdist) +') +POP(` +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_popcount) +') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + + +ifdef(`PIC',,` + dnl non-PIC + + RODATA + ALIGN(8) + +L(rodata_AAAAAAAAAAAAAAAA): + .long 0xAAAAAAAA + .long 0xAAAAAAAA + +L(rodata_3333333333333333): + .long 0x33333333 + .long 0x33333333 + +L(rodata_0F0F0F0F0F0F0F0F): + .long 0x0F0F0F0F + .long 0x0F0F0F0F + +L(rodata_000000FF000000FF): + .long 0x000000FF + .long 0x000000FF +') + + TEXT + ALIGN(32) + +POP(`ifdef(`PIC', ` + C avoid shrl crossing a 32-byte boundary + nop')') + +PROLOGUE(M4_function) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + +ifdef(`PIC',` + movl $0xAAAAAAAA, %eax + movl $0x33333333, %edx + + movd %eax, %mm7 + movd %edx, %mm6 + + movl $0x0F0F0F0F, %eax + movl $0x000000FF, %edx + + punpckldq %mm7, %mm7 + punpckldq %mm6, %mm6 + + movd %eax, %mm5 + movd %edx, %mm4 + + punpckldq %mm5, %mm5 + punpckldq %mm4, %mm4 +',` + + movq L(rodata_AAAAAAAAAAAAAAAA), %mm7 + movq L(rodata_3333333333333333), %mm6 + movq L(rodata_0F0F0F0F0F0F0F0F), %mm5 + movq L(rodata_000000FF000000FF), %mm4 +') + +define(REG_AAAAAAAAAAAAAAAA, %mm7) +define(REG_3333333333333333, %mm6) +define(REG_0F0F0F0F0F0F0F0F, %mm5) +define(REG_000000FF000000FF, %mm4) + + + movl PARAM_SRC, %eax +HAM(` movl PARAM_SRC2, %edx') + + pxor %mm2, %mm2 C total + + shrl %ecx + jnc L(top) + +Zdisp( movd, 0,(%eax,%ecx,8), %mm1) + +HAM(` +Zdisp( movd, 0,(%edx,%ecx,8), %mm0) + pxor %mm0, %mm1 +') + + incl %ecx + jmp L(loaded) + + + ALIGN(16) +POP(` nop C alignment to avoid crossing 32-byte boundaries') + +L(top): + C eax src + C ebx + C ecx counter, qwords, decrementing + C edx [hamdist] src2 + C + C mm0 (scratch) + C mm1 (scratch) + C mm2 total (low dword) + C mm3 + C mm4 \ + C mm5 | special constants + C mm6 | + C mm7 / + + movq -8(%eax,%ecx,8), %mm1 +HAM(` pxor -8(%edx,%ecx,8), %mm1') + +L(loaded): + movq %mm1, %mm0 + pand REG_AAAAAAAAAAAAAAAA, %mm1 + + psrlq $1, %mm1 +HAM(` nop C code alignment') + + psubd %mm1, %mm0 C bit pairs +HAM(` nop C code alignment') + + + movq %mm0, %mm1 + psrlq $2, %mm0 + + pand REG_3333333333333333, %mm0 + pand REG_3333333333333333, %mm1 + + paddd %mm1, %mm0 C nibbles + + + movq %mm0, %mm1 + psrlq $4, %mm0 + + pand REG_0F0F0F0F0F0F0F0F, %mm0 + pand REG_0F0F0F0F0F0F0F0F, %mm1 + + paddd %mm1, %mm0 C bytes + + movq %mm0, %mm1 + psrlq $8, %mm0 + + + paddb %mm1, %mm0 C words + + + movq %mm0, %mm1 + psrlq $16, %mm0 + + paddd %mm1, %mm0 C dwords + + pand REG_000000FF000000FF, %mm0 + + paddd %mm0, %mm2 C low to total + psrlq $32, %mm0 + + paddd %mm0, %mm2 C high to total + loop L(top) + + + + movd %mm2, %eax + emms_or_femms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm new file mode 100644 index 0000000..cd0382f --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm @@ -0,0 +1,130 @@ +dnl AMD K6 mpn_rshift -- mpn right shift. + +dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K6: 3.0 cycles/limb + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx +C instructions. This is despite every second fetch being unaligned. + + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + TEXT + ALIGN(32) + +PROLOGUE(mpn_rshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. + + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shrdl( %cl, %edx, %eax) C return value + + shrl %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + + ALIGN(16) C avoid offset 0x1f +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx), %edx C src low limb + negl %ecx + + addl $32, %ecx C 32-shift + movd PARAM_SHIFT, %mm6 + + shll %cl, %edx C retval + movl PARAM_DST, %ecx + + leal (%ebx,%eax,4), %ebx + + leal -4(%ecx,%eax,4), %ecx + negl %eax + + +L(simple): + C eax counter (negative) + C ebx &src[size-1] + C ecx &dst[size-1] + C edx retval + C + C mm0 scratch + C mm6 shift + +Zdisp( movq, 0,(%ebx,%eax,4), %mm0) + incl %eax + + psrlq %mm6, %mm0 + +Zdisp( movd, %mm0, 0,(%ecx,%eax,4)) + jnz L(simple) + + + movq %mm0, (%ecx) + movl %edx, %eax + + popl %ebx + + emms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mod_34lsub1.asm new file mode 100644 index 0000000..7e30503 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/mod_34lsub1.asm @@ -0,0 +1,190 @@ +dnl AMD K6 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1. + +dnl Copyright 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K6: 2.66 cycles/limb + + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size) +C +C An attempt was made to use a loop like +C +C L(top): +C adcl (%edx), %eax +C adcl 4(%edx), %ebx +C adcl 8(%edx), %esi +C leal 12(%edx), %edx +C loop L(top) +C +C with %ecx starting from floor(size/3), but it still measured 2.66 c/l. +C The form used instead can save about 6 cycles by not dividing by 3. +C +C In the code used, putting the "leal"s at the top of the loop is necessary +C for the claimed speed, anywhere else costs an extra cycle per loop. +C Perhaps a tight loop like this needs short decode instructions at the +C branch target, which would explain the leal/loop form above taking 8 +C cycles instead of 7 too. + +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +dnl re-use parameter space +define(SAVE_EBX, `PARAM_SIZE') +define(SAVE_ESI, `PARAM_SRC') + + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_34lsub1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %eax + movl PARAM_SRC, %edx + + subl $2, %eax + ja L(three_or_more) + +Zdisp( movl, 0,(%edx), %eax) C avoid code cache line boundary + jne L(one) + + movl %eax, %ecx + movl 4(%edx), %edx + + shrl $24, %eax C src[0] high + andl $0x00FFFFFF, %ecx C src[0] low + + addl %ecx, %eax + movl %edx, %ecx + + shll $8, %edx + andl $0x00FFFF00, %edx C src[1] high + + shrl $16, %ecx C src[1] low + addl %ecx, %eax + + addl %edx, %eax + +L(one): + ret + + +L(three_or_more): + C eax size-2 + C ebx + C ecx + C edx src + + movl %ebx, SAVE_EBX + xorl %ebx, %ebx + + movl %esi, SAVE_ESI + pushl %edi FRAME_pushl() + + xorl %esi, %esi + xorl %edi, %edi C and clear carry flag + +L(top): + C eax counter, limbs + C ebx acc 0mod3 + C ecx + C edx src, incrementing + C esi acc 1mod3 + C edi acc 2mod3 + C ebp + + leal -2(%eax), %eax + leal 12(%edx), %edx + + adcl -12(%edx), %ebx + adcl -8(%edx), %esi + adcl -4(%edx), %edi + + decl %eax + jg L(top) + + + C ecx is -3, -2 or -1 representing 0, 1 or 2 more limbs, respectively + + movb $0, %cl + incl %eax + + js L(combine) C 0 more + +Zdisp( adcl, 0,(%edx), %ebx) C avoid code cache line crossings + + movb $8, %cl + decl %eax + + js L(combine) C 1 more + + adcl 4(%edx), %esi + + movb $16, %cl + + +L(combine): + sbbl %edx, %edx + + shll %cl, %edx C carry + movl %ebx, %eax C 0mod3 + + shrl $24, %eax C 0mod3 high + andl $0x00FFFFFF, %ebx C 0mod3 low + + subl %edx, %eax C apply carry + movl %esi, %ecx C 1mod3 + + shrl $16, %esi C 1mod3 high + addl %ebx, %eax C apply 0mod3 low + + andl $0x0000FFFF, %ecx + addl %esi, %eax C apply 1mod3 high + + shll $8, %ecx C 1mod3 low + movl %edi, %edx C 2mod3 + + shrl $8, %edx C 2mod3 high + addl %ecx, %eax C apply 1mod3 low + + addl %edx, %eax C apply 2mod3 high + andl $0x000000FF, %edi + + shll $16, %edi C 2mod3 low + movl SAVE_EBX, %ebx + + addl %edi, %eax C apply 2mod3 low + movl SAVE_ESI, %esi + + popl %edi + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mode1o.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mode1o.asm new file mode 100644 index 0000000..4a338bd --- /dev/null +++ 
b/vendor/gmp-6.3.0/mpn/x86/k6/mode1o.asm @@ -0,0 +1,176 @@ +dnl AMD K6 mpn_modexact_1_odd -- exact division style remainder. + +dnl Copyright 2000-2003, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K6: 10.0 cycles/limb + + +C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C +C A special case for high<divisor at the end measured only about 4 cycles +C faster, and so isn't used. +C +C A special case for size==1 using a divl rather than the inverse measured +C only about 5 cycles faster, and so isn't used. When size==1 and +C high<divisor it can skip a division and be a full 24 cycles faster, but +C this isn't an important case. + +defframe(PARAM_CARRY, 16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + + TEXT + + ALIGN(32) +PROLOGUE(mpn_modexact_1c_odd) +deflit(`FRAME',0) + + movl PARAM_DIVISOR, %ecx + pushl %esi FRAME_pushl() + + movl PARAM_CARRY, %edx + jmp L(start_1c) + +EPILOGUE() + + + ALIGN(16) +PROLOGUE(mpn_modexact_1_odd) +deflit(`FRAME',0) + + movl PARAM_DIVISOR, %ecx + pushl %esi FRAME_pushl() + + xorl %edx, %edx +L(start_1c): + pushl %edi FRAME_pushl() + + shrl %ecx C d/2 + movl PARAM_DIVISOR, %esi + + andl $127, %ecx C d/2, 7 bits + pushl %ebp FRAME_pushl() + +ifdef(`PIC',` + LEA( binvert_limb_table, %edi) +Zdisp( movzbl, 0,(%ecx,%edi), %edi) C inv 8 bits +',` + movzbl binvert_limb_table(%ecx), %edi C inv 8 bits +') + leal (%edi,%edi), %ecx C 2*inv + + imull %edi, %edi C inv*inv + + movl PARAM_SRC, %eax + movl PARAM_SIZE, %ebp + + imull %esi, %edi C inv*inv*d + + pushl %ebx FRAME_pushl() + leal (%eax,%ebp,4), %ebx C src end + + subl %edi, %ecx C inv = 2*inv - inv*inv*d + leal (%ecx,%ecx), %edi C 2*inv + + imull %ecx, %ecx C inv*inv + + movl (%eax), %eax C src low limb + negl %ebp C -size + + imull %esi, %ecx C inv*inv*d + + subl %ecx, %edi C inv = 2*inv - inv*inv*d + + ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax + movl %esi, %eax + imull %edi, %eax + cmpl $1, %eax + popl %eax') + + jmp L(entry) + + +C Rotating the mul to the top of the loop saves 1 cycle, presumably by +C hiding the loop control under the imul latency. +C +C The run time is 10 cycles, but decoding is only 9 (and the dependent chain +C only 8). It's not clear how to get down to 9 cycles. 
+C +C The xor and rcl to handle the carry bit could be an sbb instead, with the +C the carry bit add becoming a sub, but that doesn't save anything. + +L(top): + C eax (low product) + C ebx src end + C ecx carry bit, 0 or 1 + C edx (high product, being carry limb) + C esi divisor + C edi inverse + C ebp counter, limbs, negative + + mull %esi + + movl (%ebx,%ebp,4), %eax + addl %ecx, %edx C apply carry bit to carry limb + +L(entry): + xorl %ecx, %ecx + subl %edx, %eax C apply carry limb + + rcll %ecx + + imull %edi, %eax + + incl %ebp + jnz L(top) + + + + popl %ebx + popl %ebp + + mull %esi + + popl %edi + popl %esi + + leal (%ecx,%edx), %eax + + ret + +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mul_1.asm new file mode 100644 index 0000000..3ef7ec2 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/mul_1.asm @@ -0,0 +1,292 @@ +dnl AMD K6 mpn_mul_1 -- mpn by limb multiply. + +dnl Copyright 1999, 2000, 2002, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5 +C P6 model 0-8,10-12 5.5 +C P6 model 9 (Banias) +C P6 model 13 (Dothan) 4.87 +C P4 model 0 (Willamette) +C P4 model 1 (?) +C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C AMD K6 6.25 +C AMD K7 +C AMD K8 + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); +C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier, mp_limb_t carry); +C +C Multiply src,size by mult and store the result in dst,size. +C Return the carry limb from the top of the result. +C +C mpn_mul_1c() accepts an initial carry for the calculation, it's added into +C the low limb of the result. 
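For reference, the carry handling described above in a minimal C sketch (an editor's illustration, assuming 32-bit limbs); mul_1c_ref is an illustrative name, and mpn_mul_1 is simply the carry==0 case.

    typedef unsigned int mp_limb_t;   /* assuming 32-bit limbs */

    mp_limb_t
    mul_1c_ref (mp_limb_t *dst, const mp_limb_t *src, long size,
                mp_limb_t multiplier, mp_limb_t carry)
    {
      long i;
      for (i = 0; i < size; i++)
        {
          unsigned long long t = (unsigned long long) src[i] * multiplier + carry;
          dst[i] = (mp_limb_t) t;     /* low limb of this column */
          carry = (mp_limb_t) (t >> 32);
        }
      return carry;                   /* carry out of the top limb */
    }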
+ +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl minimum 5 because the unrolled code can't handle less +deflit(UNROLL_THRESHOLD, 5) + + TEXT + ALIGN(32) + +PROLOGUE(mpn_mul_1c) + pushl %esi +deflit(`FRAME',4) + movl PARAM_CARRY, %esi + jmp L(start_nc) +EPILOGUE() + + +PROLOGUE(mpn_mul_1) + push %esi +deflit(`FRAME',4) + xorl %esi, %esi C initial carry + +L(start_nc): + mov PARAM_SIZE, %ecx + push %ebx +FRAME_pushl() + + movl PARAM_SRC, %ebx + push %edi +FRAME_pushl() + + movl PARAM_DST, %edi + pushl %ebp +FRAME_pushl() + + cmpl $UNROLL_THRESHOLD, %ecx + movl PARAM_MULTIPLIER, %ebp + + jae L(unroll) + + + C code offset 0x22 here, close enough to aligned +L(simple): + C eax scratch + C ebx src + C ecx counter + C edx scratch + C esi carry + C edi dst + C ebp multiplier + C + C this loop 8 cycles/limb + + movl (%ebx), %eax + addl $4, %ebx + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi) + addl $4, %edi + + loop L(simple) + + + popl %ebp + + popl %edi + popl %ebx + + movl %esi, %eax + popl %esi + + ret + + +C ----------------------------------------------------------------------------- +C The code for each limb is 6 cycles, with instruction decoding being the +C limiting factor. At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25 +C cycles/limb in total. +C +C The secret ingredient to get 6.25 is to start the loop with the mul and +C have the load/store pair at the end. Rotating the load/store to the top +C is an 0.5 c/l slowdown. (Some address generation effect probably.) +C +C The whole unrolled loop fits nicely in exactly 80 bytes. + + + ALIGN(16) C already aligned to 16 here actually +L(unroll): + movl (%ebx), %eax + leal -16(%ebx,%ecx,4), %ebx + + leal -16(%edi,%ecx,4), %edi + subl $4, %ecx + + negl %ecx + + + ALIGN(16) C one byte nop for this alignment +L(top): + C eax scratch + C ebx &src[size-4] + C ecx counter + C edx scratch + C esi carry + C edi &dst[size-4] + C ebp multiplier + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi,%ecx,4) + movl 4(%ebx,%ecx,4), %eax + + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 4(%edi,%ecx,4) + movl 8(%ebx,%ecx,4), %eax + + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 8(%edi,%ecx,4) + movl 12(%ebx,%ecx,4), %eax + + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 12(%edi,%ecx,4) + movl 16(%ebx,%ecx,4), %eax + + + addl $4, %ecx + js L(top) + + + + C eax next src limb + C ebx &src[size-4] + C ecx 0 to 3 representing respectively 4 to 1 further limbs + C edx + C esi carry + C edi &dst[size-4] + + testb $2, %cl + jnz L(finish_not_two) + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi,%ecx,4) + movl 4(%ebx,%ecx,4), %eax + + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 4(%edi,%ecx,4) + movl 8(%ebx,%ecx,4), %eax + + addl $2, %ecx +L(finish_not_two): + + + testb $1, %cl + jnz L(finish_not_one) + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 8(%edi) + movl 12(%ebx), %eax +L(finish_not_one): + + + mull %ebp + + addl %esi, %eax + popl %ebp + + adcl $0, %edx + + movl %eax, 12(%edi) + popl %edi + + popl %ebx + movl %edx, %eax + + popl %esi + + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mul_basecase.asm 
new file mode 100644 index 0000000..7030001 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/mul_basecase.asm @@ -0,0 +1,612 @@ +dnl AMD K6 mpn_mul_basecase -- multiply two mpn numbers. + +dnl Copyright 1999-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop +C unrolling). + + + +dnl K6: UNROLL_COUNT cycles/product (approx) +dnl 8 9.75 +dnl 16 9.3 +dnl 32 9.3 +dnl Maximum possible with the current code is 32. +dnl +dnl With 16 the inner unrolled loop fits exactly in a 256 byte block, which +dnl might explain it's good performance. + +deflit(UNROLL_COUNT, 16) + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); +C +C Calculate xp,xsize multiplied by yp,ysize, storing the result in +C wp,xsize+ysize. +C +C This routine is essentially the same as mpn/generic/mul_basecase.c, but +C it's faster because it does most of the mpn_addmul_1() entry code only +C once. The saving is about 10-20% on typical sizes coming from the +C Karatsuba multiply code. +C +C Enhancements: +C +C The mul_1 loop is about 8.5 c/l, which is slower than mpn_mul_1 at 6.25 +C c/l. Could call mpn_mul_1 when ysize is big enough to make it worthwhile. +C +C The main unrolled addmul loop could be shared by mpn_addmul_1, using some +C extra stack setups and maybe 2 or 3 wasted cycles at the end. Code saving +C would be 256 bytes. 
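The overall structure described above (one mul_1-style pass for yp[0], then one addmul_1-style pass per remaining yp limb) looks like this in plain C, assuming 32-bit limbs; the *_ref helpers are illustrative stand-ins sketched by the editor, not the real mpn_mul_1/mpn_addmul_1.

    typedef unsigned int mp_limb_t;                  /* assuming 32-bit limbs */

    static mp_limb_t
    mul_1_ref (mp_limb_t *dst, const mp_limb_t *src, long n, mp_limb_t v)
    {
      mp_limb_t carry = 0;
      long i;
      for (i = 0; i < n; i++)
        {
          unsigned long long t = (unsigned long long) src[i] * v + carry;
          dst[i] = (mp_limb_t) t;
          carry = (mp_limb_t) (t >> 32);
        }
      return carry;
    }

    static mp_limb_t
    addmul_1_ref (mp_limb_t *dst, const mp_limb_t *src, long n, mp_limb_t v)
    {
      mp_limb_t carry = 0;
      long i;
      for (i = 0; i < n; i++)
        {
          unsigned long long t = (unsigned long long) src[i] * v + dst[i] + carry;
          dst[i] = (mp_limb_t) t;
          carry = (mp_limb_t) (t >> 32);
        }
      return carry;
    }

    /* wp[0 .. xsize+ysize-1] = xp[0..xsize-1] * yp[0..ysize-1] */
    void
    mul_basecase_ref (mp_limb_t *wp, const mp_limb_t *xp, long xsize,
                      const mp_limb_t *yp, long ysize)
    {
      long i;
      wp[xsize] = mul_1_ref (wp, xp, xsize, yp[0]);
      for (i = 1; i < ysize; i++)
        wp[xsize + i] = addmul_1_ref (wp + i, xp, xsize, yp[i]);
    }

The assembly below is this same row-by-row schoolbook product, with the addmul setup hoisted out of the outer loop and the inner loop unrolled.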
+ +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 8) +',` +deflit(UNROLL_THRESHOLD, 8) +') + +defframe(PARAM_YSIZE,20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE,12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_basecase) +deflit(`FRAME',0) + + movl PARAM_XSIZE, %ecx + movl PARAM_YP, %eax + + movl PARAM_XP, %edx + movl (%eax), %eax C yp low limb + + cmpl $2, %ecx + ja L(xsize_more_than_two_limbs) + je L(two_by_something) + + + C one limb by one limb + + movl (%edx), %edx C xp low limb + movl PARAM_WP, %ecx + + mull %edx + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + ret + + +C ----------------------------------------------------------------------------- +L(two_by_something): + decl PARAM_YSIZE + pushl %ebx +deflit(`FRAME',4) + + movl PARAM_WP, %ebx + pushl %esi +deflit(`FRAME',8) + + movl %eax, %ecx C yp low limb + movl (%edx), %eax C xp low limb + + movl %edx, %esi C xp + jnz L(two_by_two) + + + C two limbs by one limb + + mull %ecx + + movl %eax, (%ebx) + movl 4(%esi), %eax + + movl %edx, %esi C carry + + mull %ecx + + addl %eax, %esi + movl %esi, 4(%ebx) + + adcl $0, %edx + + movl %edx, 8(%ebx) + popl %esi + + popl %ebx + ret + + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(two_by_two): + C eax xp low limb + C ebx wp + C ecx yp low limb + C edx + C esi xp + C edi + C ebp +deflit(`FRAME',8) + + mull %ecx C xp[0] * yp[0] + + push %edi +deflit(`FRAME',12) + movl %eax, (%ebx) + + movl 4(%esi), %eax + movl %edx, %edi C carry, for wp[1] + + mull %ecx C xp[1] * yp[0] + + addl %eax, %edi + movl PARAM_YP, %ecx + + adcl $0, %edx + + movl %edi, 4(%ebx) + movl 4(%ecx), %ecx C yp[1] + + movl 4(%esi), %eax C xp[1] + movl %edx, %edi C carry, for wp[2] + + mull %ecx C xp[1] * yp[1] + + addl %eax, %edi + + adcl $0, %edx + + movl (%esi), %eax C xp[0] + movl %edx, %esi C carry, for wp[3] + + mull %ecx C xp[0] * yp[1] + + addl %eax, 4(%ebx) + adcl %edx, %edi + adcl $0, %esi + + movl %edi, 8(%ebx) + popl %edi + + movl %esi, 12(%ebx) + popl %esi + + popl %ebx + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(xsize_more_than_two_limbs): + +C The first limb of yp is processed with a simple mpn_mul_1 style loop +C inline. Unrolling this doesn't seem worthwhile since it's only run once +C (whereas the addmul below is run ysize-1 many times). A call to the +C actual mpn_mul_1 will be slowed down by the call and parameter pushing and +C popping, and doesn't seem likely to be worthwhile on the typical 10-20 +C limb operations the Karatsuba code calls here with. 
+ + C eax yp[0] + C ebx + C ecx xsize + C edx xp + C esi + C edi + C ebp +deflit(`FRAME',0) + + pushl %edi defframe_pushl(SAVE_EDI) + pushl %ebp defframe_pushl(SAVE_EBP) + + movl PARAM_WP, %edi + pushl %esi defframe_pushl(SAVE_ESI) + + movl %eax, %ebp + pushl %ebx defframe_pushl(SAVE_EBX) + + leal (%edx,%ecx,4), %ebx C xp end + xorl %esi, %esi + + leal (%edi,%ecx,4), %edi C wp end of mul1 + negl %ecx + + +L(mul1): + C eax scratch + C ebx xp end + C ecx counter, negative + C edx scratch + C esi carry + C edi wp end of mul1 + C ebp multiplier + + movl (%ebx,%ecx,4), %eax + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi,%ecx,4) + incl %ecx + + jnz L(mul1) + + + movl PARAM_YSIZE, %edx + movl %esi, (%edi) C final carry + + movl PARAM_XSIZE, %ecx + decl %edx + + jnz L(ysize_more_than_one_limb) + + popl %ebx + popl %esi + popl %ebp + popl %edi + ret + + +L(ysize_more_than_one_limb): + cmpl $UNROLL_THRESHOLD, %ecx + movl PARAM_YP, %eax + + jae L(unroll) + + +C ----------------------------------------------------------------------------- +C Simple addmul loop. +C +C Using ebx and edi pointing at the ends of their respective locations saves +C a couple of instructions in the outer loop. The inner loop is still 11 +C cycles, the same as the simple loop in aorsmul_1.asm. + + C eax yp + C ebx xp end + C ecx xsize + C edx ysize-1 + C esi + C edi wp end of mul1 + C ebp + + movl 4(%eax), %ebp C multiplier + negl %ecx + + movl %ecx, PARAM_XSIZE C -xsize + xorl %esi, %esi C initial carry + + leal 4(%eax,%edx,4), %eax C yp end + negl %edx + + movl %eax, PARAM_YP + movl %edx, PARAM_YSIZE + + jmp L(simple_outer_entry) + + + C aligning here saves a couple of cycles + ALIGN(16) +L(simple_outer_top): + C edx ysize counter, negative + + movl PARAM_YP, %eax C yp end + xorl %esi, %esi C carry + + movl PARAM_XSIZE, %ecx C -xsize + movl %edx, PARAM_YSIZE + + movl (%eax,%edx,4), %ebp C yp limb multiplier +L(simple_outer_entry): + addl $4, %edi + + +L(simple_inner): + C eax scratch + C ebx xp end + C ecx counter, negative + C edx scratch + C esi carry + C edi wp end of this addmul + C ebp multiplier + + movl (%ebx,%ecx,4), %eax + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl $0, %edx + addl %eax, (%edi,%ecx,4) + adcl %edx, %esi + + incl %ecx + jnz L(simple_inner) + + + movl PARAM_YSIZE, %edx + movl %esi, (%edi) + + incl %edx + jnz L(simple_outer_top) + + + popl %ebx + popl %esi + popl %ebp + popl %edi + ret + + +C ----------------------------------------------------------------------------- +C Unrolled loop. +C +C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for +C some comments. +C +C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to +C 0, inclusive. +C +C VAR_JMP is the computed jump into the unrolled loop. +C +C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop +C is entered. +C +C VAR_XP_LOW is the least significant limb of xp, which is needed at the +C start of the unrolled loop. This can't just be fetched through the xp +C pointer because of the offset applied to it. +C +C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1, +C inclusive. +C +C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be +C added to give the location of the next limb of yp, which is the multiplier +C in the unrolled loop. 
+C +C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added +C to give the starting point in the destination for each unrolled loop (this +C point is one limb upwards for each limb of yp processed). +C +C Having PARAM_YSIZE count negative to zero means it's not necessary to +C store new values of PARAM_YP and PARAM_WP on each loop. Those values on +C the stack remain constant and on each loop an leal adjusts them with the +C PARAM_YSIZE counter value. + + +defframe(VAR_COUNTER, -20) +defframe(VAR_COUNTER_INIT, -24) +defframe(VAR_JMP, -28) +defframe(VAR_XP_LOW, -32) +deflit(VAR_STACK_SPACE, 16) + +dnl For some strange reason using (%esp) instead of 0(%esp) is a touch +dnl slower in this code, hence the defframe empty-if-zero feature is +dnl disabled. +dnl +dnl If VAR_COUNTER is at (%esp), the effect is worse. In this case the +dnl unrolled loop is 255 instead of 256 bytes, but quite how this affects +dnl anything isn't clear. +dnl +define(`defframe_empty_if_zero_disabled',1) + +L(unroll): + C eax yp (not used) + C ebx xp end (not used) + C ecx xsize + C edx ysize-1 + C esi + C edi wp end of mul1 (not used) + C ebp +deflit(`FRAME', 16) + + leal -2(%ecx), %ebp C one limb processed at start, + decl %ecx C and ebp is one less + + shrl $UNROLL_LOG2, %ebp + negl %ecx + + subl $VAR_STACK_SPACE, %esp +deflit(`FRAME', 16+VAR_STACK_SPACE) + andl $UNROLL_MASK, %ecx + + movl %ecx, %esi + shll $4, %ecx + + movl %ebp, VAR_COUNTER_INIT + negl %esi + + C 15 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(unroll_here): +',` + leal L(unroll_entry) (%ecx,%esi,1), %ecx +') + + movl PARAM_XP, %ebx + movl %ebp, VAR_COUNTER + + movl PARAM_WP, %edi + movl %ecx, VAR_JMP + + movl (%ebx), %eax + leal 4(%edi,%esi,4), %edi C wp adjust for unrolling and mul1 + + leal (%ebx,%esi,4), %ebx C xp adjust for unrolling + + movl %eax, VAR_XP_LOW + + movl %ebx, PARAM_XP + movl PARAM_YP, %ebx + + leal (%edi,%edx,4), %ecx C wp adjust for ysize indexing + movl 4(%ebx), %ebp C multiplier (yp second limb) + + leal 4(%ebx,%edx,4), %ebx C yp adjust for ysize indexing + + movl %ecx, PARAM_WP + + leal 1(%esi), %ecx C adjust parity for decl %ecx above + + movl %ebx, PARAM_YP + negl %edx + + movl %edx, PARAM_YSIZE + jmp L(unroll_outer_entry) + + +ifdef(`PIC',` +L(pic_calc): + C See mpn/x86/README about old gas bugs + leal (%ecx,%esi,1), %ecx + addl $L(unroll_entry)-L(unroll_here), %ecx + addl (%esp), %ecx + ret_internal +') + + +C ----------------------------------------------------------------------------- + C Aligning here saves a couple of cycles per loop. Using 32 doesn't + C cost any extra space, since the inner unrolled loop below is + C aligned to 32. 
+ ALIGN(32) +L(unroll_outer_top): + C edx ysize + + movl PARAM_YP, %eax + movl %edx, PARAM_YSIZE C incremented ysize counter + + movl PARAM_WP, %edi + + movl VAR_COUNTER_INIT, %ebx + movl (%eax,%edx,4), %ebp C next multiplier + + movl PARAM_XSIZE, %ecx + leal (%edi,%edx,4), %edi C adjust wp for where we are in yp + + movl VAR_XP_LOW, %eax + movl %ebx, VAR_COUNTER + +L(unroll_outer_entry): + mull %ebp + + C using testb is a tiny bit faster than testl + testb $1, %cl + + movl %eax, %ecx C low carry + movl VAR_JMP, %eax + + movl %edx, %esi C high carry + movl PARAM_XP, %ebx + + jnz L(unroll_noswap) + movl %ecx, %esi C high,low carry other way around + + movl %edx, %ecx +L(unroll_noswap): + + jmp *%eax + + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(unroll_top): + C eax scratch + C ebx xp + C ecx carry low + C edx scratch + C esi carry high + C edi wp + C ebp multiplier + C VAR_COUNTER loop counter + C + C 15 code bytes each limb + + leal UNROLL_BYTES(%edi), %edi + +L(unroll_entry): +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4)) + deflit(`disp1', eval(disp0 + 4)) + deflit(`disp2', eval(disp1 + 4)) + + movl disp1(%ebx), %eax + mull %ebp +Zdisp( addl, %ecx, disp0,(%edi)) + adcl %eax, %esi + movl %edx, %ecx + jadcl0( %ecx) + + movl disp2(%ebx), %eax + mull %ebp + addl %esi, disp1(%edi) + adcl %eax, %ecx + movl %edx, %esi + jadcl0( %esi) +') + + decl VAR_COUNTER + leal UNROLL_BYTES(%ebx), %ebx + + jns L(unroll_top) + + + movl PARAM_YSIZE, %edx + addl %ecx, UNROLL_BYTES(%edi) + + adcl $0, %esi + + incl %edx + movl %esi, UNROLL_BYTES+4(%edi) + + jnz L(unroll_outer_top) + + + movl SAVE_ESI, %esi + movl SAVE_EBP, %ebp + movl SAVE_EDI, %edi + movl SAVE_EBX, %ebx + + addl $FRAME, %esp + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/pre_mod_1.asm b/vendor/gmp-6.3.0/mpn/x86/k6/pre_mod_1.asm new file mode 100644 index 0000000..34db20d --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/pre_mod_1.asm @@ -0,0 +1,146 @@ +dnl AMD K6 mpn_preinv_mod_1 -- mpn by 1 remainder, with pre-inverted divisor. + +dnl Copyright 2000, 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K6: 18.0 cycles/limb + + +C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t inverse); +C +C This code is only 2 c/l faster than a simple divl, but that's 10% so it's +C considered worthwhile (just). 
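For the semantics only (not the pre-inverted technique itself), a naive C equivalent is sketched below, assuming 32-bit limbs; note the routine above additionally requires a normalized divisor (high bit set) and size >= 1, per the ASSERTs. mod_1_ref is an illustrative name, not a GMP entry point.

    typedef unsigned int mp_limb_t;   /* assuming 32-bit limbs */

    mp_limb_t
    mod_1_ref (const mp_limb_t *src, long size, mp_limb_t divisor)
    {
      unsigned long long r = 0;       /* remainder, always < divisor */
      long i;
      for (i = size - 1; i >= 0; i--)
        r = ((r << 32) | src[i]) % divisor;
      return (mp_limb_t) r;
    }

The code above computes the same remainder limb by limb from the top, replacing each division with multiplications by the supplied inverse.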
+ +defframe(PARAM_INVERSE,16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + + TEXT + ALIGN(32) +PROLOGUE(mpn_preinv_mod_1) +deflit(`FRAME',0) + + ASSERT(ae,`cmpl $1, PARAM_SIZE') + ASSERT(nz,`testl $0x80000000, PARAM_DIVISOR') + + movl PARAM_SIZE, %ecx + pushl %ebp FRAME_pushl() + + movl PARAM_SRC, %ebp + pushl %edi FRAME_pushl() + + movl PARAM_DIVISOR, %eax + pushl %esi FRAME_pushl() + + movl -4(%ebp,%ecx,4), %esi C src high limb + pushl %ebx FRAME_pushl() + + movl %edx, %edi C first n2 to cancel + subl %eax, %esi C first n1 = high-divisor + + decl %ecx + jz L(done_sbbl) + +L(top): + C eax scratch + C ebx n10, nadj, q1 + C ecx counter, size to 1 + C edx scratch + C esi n2 + C edi old high, for underflow test + C ebp src + + sbbl %edx, %edi C high n-(q1+1)*d, 0 or -1 + +L(entry): + andl PARAM_DIVISOR, %edi +L(q1_ff_top): + movl -4(%ebp,%ecx,4), %ebx + + addl %esi, %edi C possible addback + movl %ebx, %esi C n10 + + sarl $31, %ebx C -n1 = 0 or -1 + movl %edi, %eax C n2 + + movl PARAM_INVERSE, %edx + subl %ebx, %eax C n2+n1 + + mull %edx C m*(n2+n1) + + andl PARAM_DIVISOR, %ebx C -n1 & d + addl %esi, %ebx C nadj = n10 + (-n1&d), ignoring overflow + + addl %ebx, %eax C low m*(n2+n1) + nadj, giving carry flag + leal 1(%edi), %ebx C n2+1 + + adcl %ebx, %edx C 1+high(n2<<32+m*(n2+n1)+nadj) = q1+1 + + movl PARAM_DIVISOR, %eax C d + jz L(q1_ff) + + mull %edx C (q1+1)*d + + subl %eax, %esi C low n-(q1+1)*d + loop L(top) + + + +L(done_sbbl): + sbbl %edx, %edi C high n-(q1+1)*d, 0 or -1 + + andl PARAM_DIVISOR, %edi +L(done_esi_edi): + popl %ebx + + leal (%esi,%edi), %eax + popl %esi + + popl %edi + popl %ebp + + ret + + +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d. This is rarely +C reached. + +L(q1_ff): + movl PARAM_DIVISOR, %edi + loop L(q1_ff_top) + + jmp L(done_esi_edi) + + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/k6/sqr_basecase.asm new file mode 100644 index 0000000..b7ecb5c --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/sqr_basecase.asm @@ -0,0 +1,680 @@ +dnl AMD K6 mpn_sqr_basecase -- square an mpn number. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular +C product (measured on the speed difference between 17 and 33 limbs, +C which is roughly the Karatsuba recursing range). 
+ + +dnl SQR_TOOM2_THRESHOLD_MAX is the maximum SQR_TOOM2_THRESHOLD this +dnl code supports. This value is used only by the tune program to know +dnl what it can go up to. (An attempt to compile with a bigger value will +dnl trigger some m4_assert()s in the code, making the build fail.) +dnl +dnl The value is determined by requiring the displacements in the unrolled +dnl addmul to fit in single bytes. This means a maximum UNROLL_COUNT of +dnl 63, giving a maximum SQR_TOOM2_THRESHOLD of 66. + +deflit(SQR_TOOM2_THRESHOLD_MAX, 66) + + +dnl Allow a value from the tune program to override config.m4. + +ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE', +`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)') + + +dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul. The +dnl number required is determined by SQR_TOOM2_THRESHOLD, since +dnl mpn_sqr_basecase only needs to handle sizes < SQR_TOOM2_THRESHOLD. +dnl +dnl The first addmul is the biggest, and this takes the second least +dnl significant limb and multiplies it by the third least significant and +dnl up. Hence for a maximum operand size of SQR_TOOM2_THRESHOLD-1 +dnl limbs, UNROLL_COUNT needs to be SQR_TOOM2_THRESHOLD-3. + +m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD') +deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3)) + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a +C lot of function call overheads are avoided, especially when the given size +C is small. +C +C The code size might look a bit excessive, but not all of it is executed +C and so won't fill up the code cache. The 1x1, 2x2 and 3x3 special cases +C clearly apply only to those sizes; mid sizes like 10x10 only need part of +C the unrolled addmul; and big sizes like 35x35 that do need all of it will +C at least be getting value for money, because 35x35 spends something like +C 5780 cycles here. +C +C Different values of UNROLL_COUNT give slightly different speeds, between +C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs. +C This isn't a big difference, but it's presumably some alignment effect +C which if understood could give a simple speedup. 
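Since the comment above refers to mpn/generic/sqr_basecase.c, a compact C model of that triangle / double / add-the-diagonal scheme may help when reading the unrolled code further down.  This is an illustrative sketch only, not the generic code itself: the typedef and function names are invented here and a 32-bit limb is assumed.

    typedef unsigned int       limb_t;   /* 32-bit limb (illustrative name) */
    typedef unsigned long long dlimb_t;

    /* dst must hold 2*size limbs, size >= 1, and must not overlap src. */
    static void
    sqr_basecase_sketch (limb_t *dst, const limb_t *src, long size)
    {
      long i, j;

      /* Pass 1: each cross product src[i]*src[j], i < j, accumulated once
         into dst[1 .. 2*size-2] (the assembler does this as a mul_1 pass
         followed by progressively shorter addmul passes).  */
      for (i = 0; i < 2 * size; i++)
        dst[i] = 0;
      for (i = 0; i < size - 1; i++)
        {
          limb_t carry = 0;
          for (j = i + 1; j < size; j++)
            {
              dlimb_t t = (dlimb_t) src[i] * src[j] + dst[i + j] + carry;
              dst[i + j] = (limb_t) t;
              carry = (limb_t) (t >> 32);
            }
          dst[i + size] = carry;
        }

      /* Pass 2: double the triangle; the bit shifted out of the top limb
         becomes dst[2*size-1], as in the lshift loop below.  */
      limb_t bit = 0;
      for (i = 1; i <= 2 * size - 2; i++)
        {
          limb_t next = dst[i] >> 31;
          dst[i] = (dst[i] << 1) | bit;
          bit = next;
        }
      dst[2 * size - 1] = bit;

      /* Pass 3: add the squares src[i]^2 along the diagonal.  */
      limb_t carry = 0;
      for (i = 0; i < size; i++)
        {
          dlimb_t sq = (dlimb_t) src[i] * src[i];
          dlimb_t lo = (dlimb_t) dst[2 * i] + (limb_t) sq + carry;
          dst[2 * i] = (limb_t) lo;
          dlimb_t hi = (dlimb_t) dst[2 * i + 1] + (limb_t) (sq >> 32)
                       + (limb_t) (lo >> 32);
          dst[2 * i + 1] = (limb_t) hi;
          carry = (limb_t) (hi >> 32);
        }
    }

Sizes 1, 2 and 3 bypass all of this and are handled by the dedicated straight-line cases below.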
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %eax + + cmpl $2, %ecx + je L(two_limbs) + + movl PARAM_DST, %edx + ja L(three_or_more) + + +C ----------------------------------------------------------------------------- +C one limb only + C eax src + C ebx + C ecx size + C edx dst + + movl (%eax), %eax + movl %edx, %ecx + + mull %eax + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(two_limbs): + C eax src + C ebx + C ecx size + C edx dst + + pushl %ebx + movl %eax, %ebx C src +deflit(`FRAME',4) + + movl (%ebx), %eax + movl PARAM_DST, %ecx + + mull %eax C src[0]^2 + + movl %eax, (%ecx) + movl 4(%ebx), %eax + + movl %edx, 4(%ecx) + + mull %eax C src[1]^2 + + movl %eax, 8(%ecx) + movl (%ebx), %eax + + movl %edx, 12(%ecx) + movl 4(%ebx), %edx + + mull %edx C src[0]*src[1] + + addl %eax, 4(%ecx) + + adcl %edx, 8(%ecx) + adcl $0, 12(%ecx) + + popl %ebx + addl %eax, 4(%ecx) + + adcl %edx, 8(%ecx) + adcl $0, 12(%ecx) + + ret + + +C ----------------------------------------------------------------------------- +L(three_or_more): +deflit(`FRAME',0) + cmpl $4, %ecx + jae L(four_or_more) + + +C ----------------------------------------------------------------------------- +C three limbs + C eax src + C ecx size + C edx dst + + pushl %ebx + movl %eax, %ebx C src + + movl (%ebx), %eax + movl %edx, %ecx C dst + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl 4(%ebx), %eax + + movl %edx, 4(%ecx) + pushl %esi + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl 8(%ebx), %eax + + movl %edx, 12(%ecx) + pushl %edi + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl (%ebx), %eax + + movl %edx, 20(%ecx) + movl 4(%ebx), %edx + + mull %edx C src[0] * src[1] + + movl %eax, %esi + movl (%ebx), %eax + + movl %edx, %edi + movl 8(%ebx), %edx + + pushl %ebp + xorl %ebp, %ebp + + mull %edx C src[0] * src[2] + + addl %eax, %edi + movl 4(%ebx), %eax + + adcl %edx, %ebp + + movl 8(%ebx), %edx + + mull %edx C src[1] * src[2] + + addl %eax, %ebp + + adcl $0, %edx + + + C eax will be dst[5] + C ebx + C ecx dst + C edx dst[4] + C esi dst[1] + C edi dst[2] + C ebp dst[3] + + xorl %eax, %eax + addl %esi, %esi + adcl %edi, %edi + adcl %ebp, %ebp + adcl %edx, %edx + adcl $0, %eax + + addl %esi, 4(%ecx) + adcl %edi, 8(%ecx) + adcl %ebp, 12(%ecx) + + popl %ebp + popl %edi + + adcl %edx, 16(%ecx) + + popl %esi + popl %ebx + + adcl %eax, 20(%ecx) + ASSERT(nc) + + ret + + +C ----------------------------------------------------------------------------- + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +defframe(VAR_COUNTER,-20) +defframe(VAR_JMP, -24) +deflit(STACK_SPACE, 24) + + ALIGN(16) +L(four_or_more): + + C eax src + C ebx + C ecx size + C edx dst + C esi + C edi + C ebp + +C First multiply src[0]*src[1..size-1] and store at dst[1..size]. +C +C A test was done calling mpn_mul_1 here to get the benefit of its unrolled +C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off +C a 5780 cycle operation, which is not surprising since the loop here is 8 +C c/l and mpn_mul_1 is 6.25 c/l. 
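In C terms the loop that follows is just a scalar multiply-by-one-limb pass.  A sketch, reusing the illustrative limb_t/dlimb_t typedefs from the squaring sketch above:

    /* dst[0..n-1] = src[0..n-1] * m, returning the final carry limb. */
    static limb_t
    mul_1_sketch (limb_t *dst, const limb_t *src, long n, limb_t m)
    {
      limb_t carry = 0;
      for (long i = 0; i < n; i++)
        {
          dlimb_t t = (dlimb_t) src[i] * m + carry;   /* cannot overflow 64 bits */
          dst[i] = (limb_t) t;
          carry = (limb_t) (t >> 32);
        }
      return carry;
    }

For the squaring this amounts to mul_1_sketch(dst+1, src+1, size-1, src[0]), with the returned carry stored to dst[size].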
+ + subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) + + movl %edi, SAVE_EDI + leal 4(%edx), %edi + + movl %ebx, SAVE_EBX + leal 4(%eax), %ebx + + movl %esi, SAVE_ESI + xorl %esi, %esi + + movl %ebp, SAVE_EBP + + C eax + C ebx src+4 + C ecx size + C edx + C esi + C edi dst+4 + C ebp + + movl (%eax), %ebp C multiplier + leal -1(%ecx), %ecx C size-1, and pad to a 16 byte boundary + + + ALIGN(16) +L(mul_1): + C eax scratch + C ebx src ptr + C ecx counter + C edx scratch + C esi carry + C edi dst ptr + C ebp multiplier + + movl (%ebx), %eax + addl $4, %ebx + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi) + addl $4, %edi + + loop L(mul_1) + + +C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2. +C +C The last two addmuls, which are the bottom right corner of the product +C triangle, are left to the end. These are src[size-3]*src[size-2,size-1] +C and src[size-2]*src[size-1]. If size is 4 then it's only these corner +C cases that need to be done. +C +C The unrolled code is the same as mpn_addmul_1(), see that routine for some +C comments. +C +C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive. +C +C VAR_JMP is the computed jump into the unrolled code, stepped by one code +C chunk each outer loop. +C +C K6 doesn't do any branch prediction on indirect jumps, which is good +C actually because it's a different target each time. The unrolled addmul +C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of +C the indirect jump is quickly recovered. + + +dnl This value is also implicitly encoded in a shift and add. +dnl +deflit(CODE_BYTES_PER_LIMB, 15) + +dnl With the unmodified &src[size] and &dst[size] pointers, the +dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT +dnl values up to 31. Above that an offset must be added to them. +dnl +deflit(OFFSET, +ifelse(eval(UNROLL_COUNT>31),1, +eval((UNROLL_COUNT-31)*4), +0)) + + C eax + C ebx &src[size] + C ecx + C edx + C esi carry + C edi &dst[size] + C ebp + + movl PARAM_SIZE, %ecx + movl %esi, (%edi) + + subl $4, %ecx + jz L(corner) + + movl %ecx, %edx +ifelse(OFFSET,0,, +` subl $OFFSET, %ebx') + + shll $4, %ecx +ifelse(OFFSET,0,, +` subl $OFFSET, %edi') + + negl %ecx + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx +') + negl %edx + + + C The calculated jump mustn't be before the start of the available + C code. This is the limitation UNROLL_COUNT puts on the src operand + C size, but checked here using the jump address directly. + C + ASSERT(ae,` + movl_text_address( L(unroll_inner_start), %eax) + cmpl %eax, %ecx + ') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll_outer_top): + C eax + C ebx &src[size], constant + C ecx VAR_JMP + C edx VAR_COUNTER, limbs, negative + C esi high limb to store + C edi dst ptr, high of last addmul + C ebp + + movl -12+OFFSET(%ebx,%edx,4), %ebp C multiplier + movl %edx, VAR_COUNTER + + movl -8+OFFSET(%ebx,%edx,4), %eax C first limb of multiplicand + + mull %ebp + + testb $1, %cl + + movl %edx, %esi C high carry + movl %ecx, %edx C jump + + movl %eax, %ecx C low carry + leal CODE_BYTES_PER_LIMB(%edx), %edx + + movl %edx, VAR_JMP + leal 4(%edi), %edi + + C A branch-free version of this using some xors was found to be a + C touch slower than just a conditional jump, despite the jump + C switching between taken and not taken on every loop. 
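The computed jump into the unrolled code is the assembler counterpart of entering an unrolled loop part-way through, which in C is usually written as a Duff's-device style switch.  The sketch below illustrates only that control-flow idea, using a plain limb copy with an unroll factor of 4; the names are invented and limb_t is the illustrative typedef from the sketches above.

    /* Copy n limbs, n > 0.  The switch plays the role of the "jmp *%edx"
       above: it enters the 4-way unrolled body at the right chunk, so sizes
       that are not a multiple of the unrolling need no separate fix-up loop. */
    static void
    copy_duff_sketch (limb_t *dst, const limb_t *src, long n)
    {
      long trips = (n + 3) / 4;
      switch (n % 4)
        {
        case 0: do {  *dst++ = *src++;
        case 3:       *dst++ = *src++;
        case 2:       *dst++ = *src++;
        case 1:       *dst++ = *src++;
                   } while (--trips > 0);
        }
    }

Here the entry point additionally moves by CODE_BYTES_PER_LIMB each outer pass, so each successive addmul enters one chunk later and therefore covers one limb less, matching the shrinking triangle.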
+ +ifelse(eval(UNROLL_COUNT%2),0, + jz,jnz) L(unroll_noswap) + movl %esi, %eax C high,low carry other way around + + movl %ecx, %esi + movl %eax, %ecx +L(unroll_noswap): + + jmp *%edx + + + C Must be on an even address here so the low bit of the jump address + C will indicate which way around ecx/esi should start. + C + C An attempt was made at padding here to get the end of the unrolled + C code to come out on a good alignment, to save padding before + C L(corner). This worked, but turned out to run slower than just an + C ALIGN(2). The reason for this is not clear, it might be related + C to the different speeds on different UNROLL_COUNTs noted above. + + ALIGN(2) + +L(unroll_inner_start): + C eax scratch + C ebx src + C ecx carry low + C edx scratch + C esi carry high + C edi dst + C ebp multiplier + C + C 15 code bytes each limb + C ecx/esi swapped on each chunk + +forloop(`i', UNROLL_COUNT, 1, ` + deflit(`disp_src', eval(-i*4 + OFFSET)) + deflit(`disp_dst', eval(disp_src - 4)) + + m4_assert(`disp_src>=-128 && disp_src<128') + m4_assert(`disp_dst>=-128 && disp_dst<128') + +ifelse(eval(i%2),0,` +Zdisp( movl, disp_src,(%ebx), %eax) + mull %ebp +Zdisp( addl, %esi, disp_dst,(%edi)) + adcl %eax, %ecx + movl %edx, %esi + jadcl0( %esi) +',` + dnl this one comes out last +Zdisp( movl, disp_src,(%ebx), %eax) + mull %ebp +Zdisp( addl, %ecx, disp_dst,(%edi)) + adcl %eax, %esi + movl %edx, %ecx + jadcl0( %ecx) +') +') +L(unroll_inner_end): + + addl %esi, -4+OFFSET(%edi) + + movl VAR_COUNTER, %edx + jadcl0( %ecx) + + movl %ecx, m4_empty_if_zero(OFFSET)(%edi) + movl VAR_JMP, %ecx + + incl %edx + jnz L(unroll_outer_top) + + +ifelse(OFFSET,0,,` + addl $OFFSET, %ebx + addl $OFFSET, %edi +') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(corner): + C ebx &src[size] + C edi &dst[2*size-5] + + movl -12(%ebx), %ebp + + movl -8(%ebx), %eax + movl %eax, %ecx + + mull %ebp + + addl %eax, -4(%edi) + adcl $0, %edx + + movl -4(%ebx), %eax + movl %edx, %esi + movl %eax, %ebx + + mull %ebp + + addl %esi, %eax + adcl $0, %edx + + addl %eax, (%edi) + adcl $0, %edx + + movl %edx, %esi + movl %ebx, %eax + + mull %ecx + + addl %esi, %eax + movl %eax, 4(%edi) + + adcl $0, %edx + + movl %edx, 8(%edi) + + +C ----------------------------------------------------------------------------- +C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1]. +C The loop measures about 6 cycles/iteration, though it looks like it should +C decode in 5. + +L(lshift_start): + movl PARAM_SIZE, %ecx + + movl PARAM_DST, %edi + subl $1, %ecx C size-1 and clear carry + + movl PARAM_SRC, %ebx + movl %ecx, %edx + + xorl %eax, %eax C ready for adcl + + + ALIGN(16) +L(lshift): + C eax + C ebx src (for later use) + C ecx counter, decrementing + C edx size-1 (for later use) + C esi + C edi dst, incrementing + C ebp + + rcll 4(%edi) + rcll 8(%edi) + leal 8(%edi), %edi + loop L(lshift) + + + adcl %eax, %eax + + movl %eax, 4(%edi) C dst most significant limb + movl (%ebx), %eax C src[0] + + leal 4(%ebx,%edx,4), %ebx C &src[size] + subl %edx, %ecx C -(size-1) + + +C ----------------------------------------------------------------------------- +C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the +C low limb of src[0]^2. 
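The shift-then-add-squares arrangement is just the binomial expansion applied limb-wise; for two limbs, with B = 2^32,

    (a1*B + a0)^2 = a1^2*B^2 + 2*(a1*a0)*B + a0^2

so every cross product accumulated in the triangle appears twice in the square (hence the left shift above), while the squares src[i]^2 appear once each on the diagonal.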
+
+
+	mull	%eax
+
+	movl	%eax, (%edi,%ecx,8)	C dst[0]
+
+
+	ALIGN(16)
+L(diag):
+	C eax	scratch
+	C ebx	&src[size]
+	C ecx	counter, negative
+	C edx	carry
+	C esi	scratch
+	C edi	dst[2*size-2]
+	C ebp
+
+	movl	(%ebx,%ecx,4), %eax
+	movl	%edx, %esi
+
+	mull	%eax
+
+	addl	%esi, 4(%edi,%ecx,8)
+	adcl	%eax, 8(%edi,%ecx,8)
+	adcl	$0, %edx
+
+	incl	%ecx
+	jnz	L(diag)
+
+
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_ESI, %esi
+
+	addl	%edx, 4(%edi)	C dst most significant limb
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBP, %ebp
+	addl	$FRAME, %esp
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	addl	(%esp), %ecx
+	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
+	addl	%edx, %ecx
+	ret_internal
+')
+
+
+EPILOGUE()