From a89a14ef5da44684a16b204e7a70460cc8c4922a Mon Sep 17 00:00:00 2001
From: Thomas Voss <mail@thomasvoss.com>
Date: Fri, 21 Jun 2024 23:36:36 +0200
Subject: Basic constant folding implementation

---
 vendor/gmp-6.3.0/mpn/arm64/bdiv_q_1.asm | 122 ++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)
 create mode 100644 vendor/gmp-6.3.0/mpn/arm64/bdiv_q_1.asm

(limited to 'vendor/gmp-6.3.0/mpn/arm64/bdiv_q_1.asm')

diff --git a/vendor/gmp-6.3.0/mpn/arm64/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/arm64/bdiv_q_1.asm
new file mode 100644
index 0000000..7fffc93
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/arm64/bdiv_q_1.asm
@@ -0,0 +1,122 @@
+dnl  ARM64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C               cycles/limb
+C               norm   unorm
+C Cortex-A53	12	15
+C Cortex-A57	12	12
+C Cortex-A72
+C Cortex-A73
+C X-Gene	11	11
+
+C TODO
+C  * Scheduling of umulh later in the unorm loop brings A53 time to 12 c/l.
+C    Unfortunately, that requires software pipelining.
+
+define(`rp',  `x0')		C destination pointer; quotient limbs are stored through it
+define(`up',  `x1')		C source pointer; limbs are loaded through it
+define(`n',   `x2')		C limb count
+define(`d',   `x3')		C divisor limb
+define(`di',  `x4')		C	argument of mpn_pi1_bdiv_q_1 only; mpn_bdiv_q_1 computes it
+define(`cnt', `x5')		C	argument of mpn_pi1_bdiv_q_1 only; mpn_bdiv_q_1 computes it
+
+define(`cy',  `r7')		C not referenced anywhere in this file
+define(`tnc', `x8')		C scratch: negated cnt, the left-shift amount in the unorm loop
+
+ASM_START()
+PROLOGUE(mpn_bdiv_q_1)
+C Entry point without a precomputed inverse: derives cnt and di from d, then jumps to mpn_pi1_bdiv_q_1.
+	rbit	x6, d			C bit-reverse d so its trailing zeros become leading zeros
+	clz	cnt, x6			C cnt = number of trailing zero bits in d
+	lsr	d, d, cnt		C shift those zero bits out of d
+C Fetch a starting inverse from binvert_limb_table (LEA_HI and LEA_LO are defined outside this file), then refine it three times.
+	LEA_HI(	x7, binvert_limb_table)
+	ubfx	x6, d, 1, 7		C x6 = bits 1..7 of d, used as the table index
+	LEA_LO(	x7, binvert_limb_table)
+	ldrb	w6, [x7, x6]		C x6 = byte table entry; presumably the inverse of d mod 2^8, TODO confirm
+	ubfiz	x7, x6, 1, 8		C x7 = 2*inv
+	umull	x6, w6, w6		C x6 = inv^2
+	msub	x6, x6, d, x7		C inv = 2*inv - inv^2*d, one Newton step
+	lsl	x7, x6, 1		C second step, same formula
+	mul	x6, x6, x6
+	msub	x6, x6, d, x7
+	lsl	x7, x6, 1		C third step; the result lands in di
+	mul	x6, x6, x6
+	msub	di, x6, d, x7
+
+	b	GSYM_PREFIX`'mpn_pi1_bdiv_q_1	C tail jump; rp, up and n are untouched, d, di and cnt were set above
+EPILOGUE()
+
+PROLOGUE(mpn_pi1_bdiv_q_1)	C Hensel division of the n limbs at up by d, given di and cnt; quotient limbs go to rp
+	sub	n, n, #1		C n = limbs remaining after the first
+	subs	x6, x6, x6		C x6 = 0 and C flag = 1, which on AArch64 means no borrow pending
+	ldr	x9, [up],#8		C x9 = first source limb
+	cbz	cnt, L(norm)		C no shifting needed when cnt is zero
+C cnt != 0: each working limb is built from two adjacent source limbs shifted right by cnt.
+L(unorm):
+	lsr	x12, x9, cnt		C x12 = low part of the current shifted limb
+	cbz	n, L(eu1)		C single limb: only the final step is needed
+	sub	tnc, xzr, cnt		C tnc = -cnt; register shifts use the count mod 64, so this acts as 64-cnt
+
+L(tpu):	ldr	x9, [up],#8		C x9 = next source limb
+	lsl	x7, x9, tnc		C high part contributed by the next limb
+	orr	x7, x7, x12		C x7 = current limb of the shifted source
+	sbcs	x6, x7, x6		C subtract the previous high product and borrow; updates C
+	mul	x7, x6, di		C quotient limb = low half of x6 * di
+	str	x7, [rp],#8
+	lsr	x12, x9, cnt		C low part for the next iteration
+	umulh	x6, x7, d		C x6 = high half of quotient limb * d
+	sub	n, n, #1		C sub and cbnz do not write the flags, so C survives to the next sbcs
+	cbnz	n, L(tpu)
+
+L(eu1):	sbcs	x6, x12, x6		C last limb has no higher neighbour to merge in
+	mul	x6, x6, di
+	str	x6, [rp]
+	ret
+C cnt == 0: source limbs are used exactly as loaded; x5 (cnt, zero here) is reused for the quotient limb.
+L(norm):
+	mul	x5, x9, di		C first quotient limb = low half of first source limb * di
+	str	x5, [rp],#8
+	cbz	n, L(en1)		C single limb: done
+
+L(tpn):	ldr	x9, [up],#8		C x9 = next source limb
+	umulh	x5, x5, d		C x5 = high half of previous quotient limb * d
+	sbcs	x5, x9, x5		C subtract it and the borrow; updates C
+	mul	x5, x5, di		C next quotient limb
+	str	x5, [rp],#8
+	sub	n, n, #1
+	cbnz	n, L(tpn)
+
+L(en1):	ret
+EPILOGUE()
-- 
cgit v1.2.3