From a89a14ef5da44684a16b204e7a70460cc8c4922a Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Fri, 21 Jun 2024 23:36:36 +0200 Subject: Basic constant folding implementation --- vendor/gmp-6.3.0/mpn/x86/k6/mmx/com.asm | 103 ++++++++++ vendor/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm | 282 +++++++++++++++++++++++++++ vendor/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm | 226 +++++++++++++++++++++ vendor/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm | 130 ++++++++++++ vendor/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm | 236 ++++++++++++++++++++++ vendor/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm | 130 ++++++++++++ 6 files changed, 1107 insertions(+) create mode 100644 vendor/gmp-6.3.0/mpn/x86/k6/mmx/com.asm create mode 100644 vendor/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm create mode 100644 vendor/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm create mode 100644 vendor/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm create mode 100644 vendor/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm create mode 100644 vendor/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm (limited to 'vendor/gmp-6.3.0/mpn/x86/k6/mmx') diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/com.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/com.asm new file mode 100644 index 0000000..b747454 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/com.asm @@ -0,0 +1,103 @@ +dnl AMD K6-2 mpn_com -- mpn bitwise one's complement. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +NAILS_SUPPORT(0-31) + + +C alignment dst/src, A=0mod8 N=4mod8 +C A/A A/N N/A N/N +C K6-2 1.0 1.18 1.18 1.18 cycles/limb +C K6 1.5 1.85 1.75 1.85 + + +C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Take the bitwise ones-complement of src,size and write it to dst,size. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(16) +PROLOGUE(mpn_com) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + shrl %ecx C ecx = size/2, carry = low bit of size + jnz L(two_or_more) C fall through when size/2 is zero + + movl (%eax), %eax + notl_or_xorl_GMP_NUMB_MASK( %eax) + movl %eax, (%edx) + ret + + +L(two_or_more): + pushl %ebx FRAME_pushl() + pcmpeqd %mm7, %mm7 C all ones + + movl %ecx, %ebx C size/2 kept, ecx gets counted down by loop +ifelse(GMP_NAIL_BITS,0,, +` psrld $GMP_NAIL_BITS, %mm7') C clear nails + + + + ALIGN(8) +L(top): + C eax src + C ebx floor(size/2) + C ecx counter + C edx dst + C + C mm0 scratch + C mm7 mask + + movq -8(%eax,%ecx,8), %mm0 + pxor %mm7, %mm0 + movq %mm0, -8(%edx,%ecx,8) + loop L(top) C loop does not change flags, carry is still from shrl + + + jnc L(no_extra) C carry clear, size was even + movl (%eax,%ebx,8), %eax C odd limb at index 2*floor(size/2) + notl_or_xorl_GMP_NUMB_MASK( %eax) + movl %eax, (%edx,%ebx,8) +L(no_extra): + + popl %ebx + emms_or_femms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm new file mode 100644 index 0000000..1bbad3a --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm @@ -0,0 +1,282 @@ +dnl AMD K6 mpn_divexact_1 -- mpn by limb exact division. 
+ +dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C divisor +C odd even +C K6: 10.0 12.0 cycles/limb +C K6-2: 10.0 11.5 + + +C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C +C A simple divl is used for size==1. This is about 10 cycles faster for an +C odd divisor or 20 cycles for an even divisor. +C +C The loops are quite sensitive to code alignment, speeds should be +C rechecked (odd and even divisor, pic and non-pic) if contemplating +C changing anything. 
+ +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(VAR_INVERSE,`PARAM_DST') + + TEXT + + ALIGN(32) +PROLOGUE(mpn_divexact_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + + movl PARAM_SRC, %eax + xorl %edx, %edx C high half for the size==1 divl, and twos count from 0 + + cmpl $1, %ecx + jnz L(two_or_more) + + movl (%eax), %eax + + divl PARAM_DIVISOR + + movl PARAM_DST, %ecx + movl %eax, (%ecx) + + ret + + +L(two_or_more): + movl PARAM_DIVISOR, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + pushl %ebp FRAME_pushl() + +L(strip_twos): + shrl %eax + incl %edx C will get shift+1 + + jnc L(strip_twos) C carry is from shrl, incl leaves it alone + pushl %esi FRAME_pushl() + + leal 1(%eax,%eax), %esi C d without twos + andl $127, %eax C d/2, 7 bits + +ifdef(`PIC',` + LEA( binvert_limb_table, %ebp) +Zdisp( movzbl, 0,(%eax,%ebp), %eax) +',` + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + pushl %edi FRAME_pushl() + + leal (%eax,%eax), %ebp C 2*inv + + imull %eax, %eax C inv*inv + + movl PARAM_DST, %edi + + imull %esi, %eax C inv*inv*d + + subl %eax, %ebp C inv = 2*inv - inv*inv*d + leal (%ebp,%ebp), %eax C 2*inv + + imull %ebp, %ebp C inv*inv + + movl %esi, PARAM_DIVISOR C d without twos + leal (%ebx,%ecx,4), %ebx C src end + + imull %esi, %ebp C inv*inv*d + + leal (%edi,%ecx,4), %edi C dst end + negl %ecx C -size + + subl %ebp, %eax C inv = 2*inv - inv*inv*d + subl $1, %edx C shift amount, and clear carry + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + imull PARAM_DIVISOR, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + movl %eax, VAR_INVERSE + jnz L(even) C nonzero shift, the divisor was even + + movl (%ebx,%ecx,4), %esi C src low limb + jmp L(odd_entry) + + + ALIGN(16) + nop C code alignment +L(odd_top): + C eax scratch + C ebx src end + C ecx counter, limbs, negative + C edx inverse + C esi next limb, adjusted for carry + C edi dst end + C ebp carry bit, 0 or -1 + + imull %edx, %esi + + movl PARAM_DIVISOR, %eax + movl %esi, -4(%edi,%ecx,4) + + mull %esi C 
carry limb in edx + + subl %ebp, %edx C apply carry bit + movl (%ebx,%ecx,4), %esi + +L(odd_entry): + subl %edx, %esi C apply carry limb + movl VAR_INVERSE, %edx + + sbbl %ebp, %ebp C 0 or -1 + + incl %ecx + jnz L(odd_top) + + + imull %edx, %esi + + movl %esi, -4(%edi,%ecx,4) C ecx is 0 here, top dst limb + + popl %edi + popl %esi + + popl %ebp + popl %ebx + + ret + + +L(even): + C eax + C ebx src end + C ecx -size + C edx twos + C esi + C edi dst end + C ebp + + xorl %ebp, %ebp +Zdisp( movq, 0,(%ebx,%ecx,4), %mm0) C src[0,1] + + movd %edx, %mm7 + movl VAR_INVERSE, %edx + + addl $2, %ecx + psrlq %mm7, %mm0 + + movd %mm0, %esi + jz L(even_two) C if only two limbs + + +C Out-of-order execution is good enough to hide the load/rshift/movd +C latency. Having imul at the top of the loop gives 11.5 c/l instead of 12, +C on K6-2. In fact there's only 11 of decode, but nothing running at 11 has +C been found. Maybe the fact every second movq is unaligned costs the extra +C 0.5. + +L(even_top): + C eax scratch + C ebx src end + C ecx counter, limbs, negative + C edx inverse + C esi next limb, adjusted for carry + C edi dst end + C ebp carry bit, 0 or -1 + C + C mm0 scratch, source limbs + C mm7 twos + + imull %edx, %esi + + movl %esi, -8(%edi,%ecx,4) + movl PARAM_DIVISOR, %eax + + mull %esi C carry limb in edx + + movq -4(%ebx,%ecx,4), %mm0 + psrlq %mm7, %mm0 + + movd %mm0, %esi + subl %ebp, %edx C apply carry bit + + subl %edx, %esi C apply carry limb + movl VAR_INVERSE, %edx + + sbbl %ebp, %ebp C 0 or -1 + + incl %ecx + jnz L(even_top) + + +L(even_two): + movd -4(%ebx), %mm0 C src high limb + psrlq %mm7, %mm0 + + imull %edx, %esi + + movl %esi, -8(%edi) + movl PARAM_DIVISOR, %eax + + mull %esi C carry limb in edx + + movd %mm0, %esi + subl %ebp, %edx C apply carry bit + + movl VAR_INVERSE, %eax + subl %edx, %esi C apply carry limb + + imull %eax, %esi + + movl %esi, -4(%edi) + + popl %edi + popl %esi + + popl %ebp + popl %ebx + + emms_or_femms + + ret + +EPILOGUE() +ASM_END() diff --git 
a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm new file mode 100644 index 0000000..e17930b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm @@ -0,0 +1,226 @@ +dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n, +dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +NAILS_SUPPORT(0-31) + + +C alignment dst/src1/src2, A=0mod8, N=4mod8 +C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N +C +C K6-2 1.2 1.5 1.5 1.2 1.2 1.5 1.5 1.2 and,andn,ior,xor +C K6-2 1.5 1.75 2.0 1.75 1.75 2.0 1.75 1.5 iorn,xnor +C K6-2 1.75 2.0 2.0 2.0 2.0 2.0 2.0 1.75 nand,nior +C +C K6 1.5 1.68 1.75 1.2 1.75 1.75 1.68 1.5 and,andn,ior,xor +C K6 2.0 2.0 2.25 2.25 2.25 2.25 2.0 2.0 iorn,xnor +C K6 2.0 2.25 2.25 2.25 2.25 2.25 2.25 2.0 nand,nior + + +dnl M4_p and M4_i are the MMX and integer instructions +dnl M4_*_neg_dst means whether to negate the final result before writing +dnl M4_*_neg_src2 means whether to negate the src2 values before using them + +define(M4_choose_op, +m4_assert_numargs(7) +`ifdef(`OPERATION_$1',` +define(`M4_function', `mpn_$1') +define(`M4_operation', `$1') +define(`M4_p', `$2') +define(`M4_p_neg_dst', `$3') +define(`M4_p_neg_src2',`$4') +define(`M4_i', `$5') +define(`M4_i_neg_dst', `$6') +define(`M4_i_neg_src2',`$7') +')') + +dnl xnor is done in "iorn" style because it's a touch faster than "nior" +dnl style (the two are equivalent for xor). +dnl +dnl pandn can't be used with nails. + +M4_choose_op( and_n, pand,0,0, andl,0,0) +ifelse(GMP_NAIL_BITS,0, +`M4_choose_op(andn_n, pandn,0,0, andl,0,1)', +`M4_choose_op(andn_n, pand,0,1, andl,0,1)') +M4_choose_op( nand_n, pand,1,0, andl,1,0) +M4_choose_op( ior_n, por,0,0, orl,0,0) +M4_choose_op( iorn_n, por,0,1, orl,0,1) +M4_choose_op( nior_n, por,1,0, orl,1,0) +M4_choose_op( xor_n, pxor,0,0, xorl,0,0) +M4_choose_op( xnor_n, pxor,0,1, xorl,0,1) + +ifdef(`M4_function',, +`m4_error(`Unrecognised or undefined OPERATION symbol +')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + + +C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C +C Do src1,size M4_operation src2,size, storing the result in dst,size. 
+C +C Unaligned movq loads and stores are a bit slower than aligned ones. The +C test at the start of the routine checks the alignment of src1 and if +C necessary processes one limb separately at the low end to make it aligned. +C +C The raw speeds without this alignment switch are as follows. +C +C alignment dst/src1/src2, A=0mod8, N=4mod8 +C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N +C +C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor +C K6 1.75 2.2 2.0 2.28 iorn,xnor +C K6 2.0 2.25 2.35 2.28 nand,nior +C +C +C Future: +C +C K6 can do one 64-bit load per cycle so each of these routines should be +C able to approach 1.0 c/l, if aligned. The basic and/andn/ior/xor might be +C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs. +C The others are 4 instructions per 2 limbs, and so can only approach 1.0 +C because there's nowhere to hide some loop control. + +defframe(PARAM_SIZE,16) +defframe(PARAM_SRC2,12) +defframe(PARAM_SRC1,8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + TEXT + ALIGN(32) +PROLOGUE(M4_function) + movl PARAM_SIZE, %ecx + pushl %ebx FRAME_pushl() + + movl PARAM_SRC1, %eax + + movl PARAM_SRC2, %ebx + cmpl $1, %ecx + + movl PARAM_DST, %edx + ja L(two_or_more) C size above 1, flags from the cmpl + + + movl (%ebx), %ecx + popl %ebx +ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)') + M4_i (%eax), %ecx +ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ecx)') + movl %ecx, (%edx) + + ret + + +L(two_or_more): + C eax src1 + C ebx src2 + C ecx size + C edx dst + C esi + C edi + C ebp + + pushl %esi FRAME_pushl() + testl $4, %eax C src1 at 4 mod 8 + jz L(alignment_ok) + + movl (%ebx), %esi + addl $4, %ebx +ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %esi)') + M4_i (%eax), %esi + addl $4, %eax +ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %esi)') + movl %esi, (%edx) + addl $4, %edx + decl %ecx C one limb done + +L(alignment_ok): + movl %ecx, %esi + shrl %ecx C ecx = size/2, carry = low bit of size + jnz L(still_two_or_more) + + movl (%ebx), %ecx + popl %esi +ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)') + 
M4_i (%eax), %ecx +ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ecx)') + popl %ebx + movl %ecx, (%edx) + ret + + +L(still_two_or_more): +ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,` + pcmpeqd %mm7, %mm7 C all ones +ifelse(GMP_NAIL_BITS,0,,`psrld $GMP_NAIL_BITS, %mm7') C clear nails +') + + ALIGN(16) +L(top): + C eax src1 + C ebx src2 + C ecx counter + C edx dst + C esi limb count saved at alignment_ok + C edi + C ebp + C + C carry bit is low of size + + movq -8(%ebx,%ecx,8), %mm0 +ifelse(M4_p_neg_src2,1,`pxor %mm7, %mm0') + M4_p -8(%eax,%ecx,8), %mm0 +ifelse(M4_p_neg_dst,1,` pxor %mm7, %mm0') + movq %mm0, -8(%edx,%ecx,8) + + loop L(top) + + + jnc L(no_extra) C carry clear, even count, no limb left + + movl -4(%ebx,%esi,4), %ebx +ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ebx)') + M4_i -4(%eax,%esi,4), %ebx +ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ebx)') + movl %ebx, -4(%edx,%esi,4) +L(no_extra): + + popl %esi + popl %ebx + emms_or_femms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm new file mode 100644 index 0000000..45be582 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm @@ -0,0 +1,130 @@ +dnl AMD K6 mpn_lshift -- mpn left shift. + +dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K6: 3.0 cycles/limb + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx +C instructions. This is despite every second fetch being unaligned. + + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(32) + +PROLOGUE(mpn_lshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. 
+ + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax C size-1, zero when there is one limb + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) C flags still from the decl + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shldl( %cl, %edx, %eax) C return value + + shll %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + + ALIGN(16) C avoid offset 0x1f + nop C avoid bad cache line crossing +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx,%eax,4), %edx C src high limb + negl %ecx + + movd PARAM_SHIFT, %mm6 + addl $32, %ecx C 32-shift + + shrl %cl, %edx C retval, bits shifted out of the high limb + + movd %ecx, %mm7 + movl PARAM_DST, %ecx + +L(top): + C eax counter, size-1 to 1 + C ebx src + C ecx dst + C edx retval + C + C mm0 scratch + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + movd %mm0, 4(%ecx,%eax,4) + jnz L(top) + + + movd (%ebx), %mm0 C src low limb + popl %ebx + + psllq %mm6, %mm0 + movl %edx, %eax + + movd %mm0, (%ecx) C dst low limb + + emms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm new file mode 100644 index 0000000..2b19d0b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm @@ -0,0 +1,236 @@ +dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and +dnl hamming distance. + +dnl Copyright 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C popcount hamdist +C K6-2: 9.0 11.5 cycles/limb +C K6: 12.5 13.0 + + +C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); +C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); +C +C The code here isn't optimal, but it's already a 2x speedup over the plain +C integer mpn/generic/popcount.c,hamdist.c. + + +ifdef(`OPERATION_popcount',, +`ifdef(`OPERATION_hamdist',, +`m4_error(`Need OPERATION_popcount or OPERATION_hamdist +')m4exit(1)')') + +define(HAM, +m4_assert_numargs(1) +`ifdef(`OPERATION_hamdist',`$1')') + +define(POP, +m4_assert_numargs(1) +`ifdef(`OPERATION_popcount',`$1')') + +HAM(` +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC2, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_hamdist) +') +POP(` +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_popcount) +') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + + +ifdef(`PIC',,` + dnl non-PIC + + RODATA + ALIGN(8) + +L(rodata_AAAAAAAAAAAAAAAA): + .long 0xAAAAAAAA + .long 0xAAAAAAAA + +L(rodata_3333333333333333): + .long 0x33333333 + .long 0x33333333 + +L(rodata_0F0F0F0F0F0F0F0F): + .long 0x0F0F0F0F + .long 0x0F0F0F0F + +L(rodata_000000FF000000FF): + .long 0x000000FF + .long 0x000000FF +') + + TEXT + ALIGN(32) + +POP(`ifdef(`PIC', ` + C avoid shrl crossing a 32-byte boundary + nop')') + +PROLOGUE(M4_function) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + +ifdef(`PIC',` + movl $0xAAAAAAAA, %eax + movl $0x33333333, %edx + + movd %eax, %mm7 + movd %edx, %mm6 + + movl 
$0x0F0F0F0F, %eax + movl $0x000000FF, %edx + + punpckldq %mm7, %mm7 + punpckldq %mm6, %mm6 + + movd %eax, %mm5 + movd %edx, %mm4 + + punpckldq %mm5, %mm5 + punpckldq %mm4, %mm4 +',` + + movq L(rodata_AAAAAAAAAAAAAAAA), %mm7 + movq L(rodata_3333333333333333), %mm6 + movq L(rodata_0F0F0F0F0F0F0F0F), %mm5 + movq L(rodata_000000FF000000FF), %mm4 +') + +define(REG_AAAAAAAAAAAAAAAA, %mm7) +define(REG_3333333333333333, %mm6) +define(REG_0F0F0F0F0F0F0F0F, %mm5) +define(REG_000000FF000000FF, %mm4) + + + movl PARAM_SRC, %eax +HAM(` movl PARAM_SRC2, %edx') + + pxor %mm2, %mm2 C total + + shrl %ecx C ecx = size/2 qwords, carry = low bit of size + jnc L(top) C even size, no odd limb to deal with + +Zdisp( movd, 0,(%eax,%ecx,8), %mm1) C odd limb at the top, movd zeros the high dword + +HAM(` +Zdisp( movd, 0,(%edx,%ecx,8), %mm0) + pxor %mm0, %mm1 +') + + incl %ecx C one extra pass through the loop tail for the odd limb + jmp L(loaded) + + + ALIGN(16) +POP(` nop C alignment to avoid crossing 32-byte boundaries') + +L(top): + C eax src + C ebx + C ecx counter, qwords, decrementing + C edx [hamdist] src2 + C + C mm0 (scratch) + C mm1 (scratch) + C mm2 total (low dword) + C mm3 + C mm4 \ + C mm5 | special constants + C mm6 | + C mm7 / + + movq -8(%eax,%ecx,8), %mm1 +HAM(` pxor -8(%edx,%ecx,8), %mm1') + +L(loaded): + movq %mm1, %mm0 + pand REG_AAAAAAAAAAAAAAAA, %mm1 + + psrlq $1, %mm1 +HAM(` nop C code alignment') + + psubd %mm1, %mm0 C bit pairs +HAM(` nop C code alignment') + + + movq %mm0, %mm1 + psrlq $2, %mm0 + + pand REG_3333333333333333, %mm0 + pand REG_3333333333333333, %mm1 + + paddd %mm1, %mm0 C nibbles + + + movq %mm0, %mm1 + psrlq $4, %mm0 + + pand REG_0F0F0F0F0F0F0F0F, %mm0 + pand REG_0F0F0F0F0F0F0F0F, %mm1 + + paddd %mm1, %mm0 C bytes + + movq %mm0, %mm1 + psrlq $8, %mm0 + + + paddb %mm1, %mm0 C words + + + movq %mm0, %mm1 + psrlq $16, %mm0 + + paddd %mm1, %mm0 C dwords + + pand REG_000000FF000000FF, %mm0 + + paddd %mm0, %mm2 C low to total + psrlq $32, %mm0 + + paddd %mm0, %mm2 C high to total + loop L(top) + + + + movd %mm2, %eax + emms_or_femms + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm 
b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm new file mode 100644 index 0000000..cd0382f --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm @@ -0,0 +1,130 @@ +dnl AMD K6 mpn_rshift -- mpn right shift. + +dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K6: 3.0 cycles/limb + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx +C instructions. This is despite every second fetch being unaligned. + + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + TEXT + ALIGN(32) + +PROLOGUE(mpn_rshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. 
+ + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax C size-1, zero when there is one limb + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) C flags still from the decl + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shrdl( %cl, %edx, %eax) C return value + + shrl %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + + ALIGN(16) C avoid offset 0x1f +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx), %edx C src low limb + negl %ecx + + addl $32, %ecx C 32-shift + movd PARAM_SHIFT, %mm6 + + shll %cl, %edx C retval + movl PARAM_DST, %ecx + + leal (%ebx,%eax,4), %ebx C &src[size-1] + + leal -4(%ecx,%eax,4), %ecx C &dst[size-2], eax is size-1 here + negl %eax + + +L(simple): + C eax counter (negative) + C ebx &src[size-1] + C ecx &dst[size-2] + C edx retval + C + C mm0 scratch + C mm6 shift + +Zdisp( movq, 0,(%ebx,%eax,4), %mm0) + incl %eax + + psrlq %mm6, %mm0 + +Zdisp( movd, %mm0, 0,(%ecx,%eax,4)) + jnz L(simple) + + + movq %mm0, (%ecx) C dst[size-2] again, plus the top limb dst[size-1] + movl %edx, %eax + + popl %ebx + + emms + ret + +EPILOGUE() -- cgit v1.2.3