dnl PPC-32 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
dnl Copyright 2007, 2008, 2012, 2017 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl * the GNU Lesser General Public License as published by the Free
dnl Software Foundation; either version 3 of the License, or (at your
dnl option) any later version.
dnl
dnl or
dnl
dnl * the GNU General Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
dnl see https://www.gnu.org/licenses/.
include(`../config.m4')
C                cycles/limb
C                norm   frac
C 7410           ~36.5  ~36.5
C 744x, 745x      29     29
C INPUT PARAMETERS
C qp = r3
C fn = r4
C up = r5
C un = r6
C d = r7
C TODO
C  * Decrease register usage.
C  * Make sure mul operands are optimal for early-out.
C  * Check that things work well for a shared library build.
C  * Write an invert_limb, perhaps inline, perhaps as a private call.  Or at
C    least vastly improve the current __udiv_qrnnd_c based code.
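C Interface sketch (orientation only; parameter names are the ones above,
C types assumed to be the usual GMP ones from gmp.h):
C
C   mp_limb_t mpn_divrem_2 (mp_ptr qp, mp_size_t fn,
C                           mp_ptr up, mp_size_t un, mp_srcptr d)
C
C Divide {up,un}, extended by fn zero "fraction" limbs below the least
C significant limb, by the normalized 2-limb divisor {d[1],d[0]}.  The low
C un + fn - 2 quotient limbs go to {qp, un+fn-2}, the most significant
C quotient limb (0 or 1) is the return value, and the 2-limb remainder is
C left at up[0] and up[1].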
ASM_START()
PROLOGUE(mpn_divrem_2)
stwu r1, -32(r1)
slwi r0, r6, 2
add r5, r5, r0
stmw r28, 8(r1)
addi r29, r5, -8 C up = up_param + un - 2
lwz r10, 4(r7)
lwz r12, 4(r29)
addi r8, r3, -12
lwz r7, 0(r7)
cmplw cr7, r12, r10
lwz r28, 0(r29)
blt- cr7, L(2)
bgt+ cr7, L(4)
cmplw cr7, r28, r7
blt- cr7, L(2)
L(4): subfc r28, r7, r28
subfe r12, r10, r12
li r3, 1
b L(6)
L(2): li r3, 0
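C r3 now holds the most significant quotient limb (0 or 1): the top two
C numerator limbs {up[un-1],up[un-2]} were compared against {d1,d0} and
C reduced once if not smaller.  r3 is not written again and becomes the
C return value.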
L(6): add r0, r4, r6
addic. r30, r0, -2
ble- cr0, L(ret)
slwi r9, r0, 2
add r8, r8, r9 C rp += un + fn
mtctr r30
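C r30 (also copied to CTR) counts the un + fn - 2 quotient limbs still to
C be generated; r8 points at the most significant of them, and the loop
C below stores quotient limbs from high addresses downwards.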
C Compute di from d1
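C This is the usual reciprocal of the normalized high limb, di =
C floor((B^2-1)/d1) - B with B = 2^32, computed with two 16-bit divwu
C steps in the style of __udiv_qrnnd_c from longlong.h (the code the TODO
C above wants replaced).  In effect (sketch only):
C
C   udiv_qrnnd (di, dummy, ~d1, ~(mp_limb_t) 0, d1);
C
C The halves q1 (r31) and q0 (r6) are joined into di at L(13).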
srwi r11, r10, 16
nor r0, r10, r10
divwu r31, r0, r11
rlwinm r5, r10, 0, 16, 31
mullw r9, r11, r31
mullw r6, r5, r31
subf r0, r9, r0
slwi r0, r0, 16
ori r0, r0, 65535
cmplw cr7, r0, r6
bge- cr7, L(9)
add r0, r0, r10
cmplw cr7, r0, r10
cmplw cr6, r6, r0
addi r31, r31, -1 C q1--
crorc 28, 28, 25
blt+ cr7, L(9)
addi r31, r31, -1 C q1--
add r0, r0, r10
L(9): subf r0, r6, r0
divwu r6, r0, r11
mullw r9, r11, r6
mullw r11, r5, r6
subf r0, r9, r0
slwi r0, r0, 16
ori r0, r0, 65535
cmplw cr7, r0, r11
bge- cr7, L(13)
add r0, r0, r10
cmplw cr7, r0, r10
cmplw cr6, r11, r0
addi r6, r6, -1 C q0--
crorc 28, 28, 25
blt+ cr7, L(13)
C add r0, r0, r10 C final remainder
addi r6, r6, -1 C q0--
L(13): rlwimi r6, r31, 16, 0, 15 C assemble final quotient
C Adjust di by including d0
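C Refine di from a reciprocal of d1 alone into an approximate reciprocal
C of the whole divisor {d1,d0}, in the spirit of GMP's generic 3/2
C reciprocal fixup (invert_pi1): while LO(di*d1) + d0 + HI(di*d0) still
C carries out of one word, decrement di.  The L(18) loop below subtracts
C d1 from the running sum per decrement and watches the carry count in r0.
C Sketch of the intent only; the real code tracks the carries exactly:
C
C   while (LO (di * d1) + d0 + HI (di * d0) >= B)
C     di--;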
mullw r9, r10, r6 C t0 = LO(di * d1)
addc r11, r9, r7
subfe r0, r1, r1
mulhwu r9, r6, r7 C s1 = HI(di * d0)
addc r9, r11, r9
addze. r0, r0
blt cr0, L(17)
L(18): subfc r9, r10, r9
addi r6, r6, -1
addme. r0, r0
bge+ cr0, L(18)
L(17):
C r0  r3  r4  r5  r6  r7  r8  r9 r10 r11 r12 r28 r29 r30 r31
C     msl fn      di  d0  qp     d1              up  un
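C Main loop: one 3/2 division step per quotient limb, essentially
C udiv_qr_3by2 with the precomputed di (cf. Moller/Granlund, "Improved
C Division by Invariant Integers").  With {n2,n1} the current remainder
C in r12/r28 and u the next numerator limb (or 0 once only fraction limbs
C remain), each iteration does roughly the following; the code interleaves
C the multiplies and keeps the quotient limb in r5 and q0 in r31:
C
C   q0 = LO (n2 * di) + n1;                    /* carry feeds into q */
C   q  = HI (n2 * di) + n2 + carry;
C   {n1,n0} = {n1,u} - q * {d1,d0} - {d1,d0};  /* speculative, as if q+1 */
C   qlimb = q + 1;
C   if (n1 >= q0)            { qlimb--; {n1,n0} += {d1,d0}; }  /* add back */
C   if ({n1,n0} >= {d1,d0})  { qlimb++; {n1,n0} -= {d1,d0}; }  /* L(fix)   */
C   *qp-- = qlimb;           /* and {n1,n0} is the next {n2,n1} */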
L(loop):
mullw r0, r12, r6 C q0 = LO(n2 * di)
cmpw cr7, r30, r4
addc r31, r0, r28 C q0 += n1
mulhwu r9, r12, r6 C q = HI(n2 * di)
adde r12, r9, r12 C q += n2
addi r30, r30, -1
mullw r0, r10, r12 C d1 * q
li r9, 0
subf r0, r0, r28 C n1 -= d1 * q
addi r5, r12, 1
ble- cr7, L(23)
lwzu r9, -4(r29)
L(23): mullw r11, r12, r7 C t0 = LO(d0 * q)
subfc r28, r7, r9 C n0 -= d0
subfe r0, r10, r0 C n1 -= d1
mulhwu r12, r12, r7 C t1 = HI(d0 * q)
subfc r28, r11, r28 C n0 -= t0
subfe r12, r12, r0 C n1 -= t1
cmplw cr7, r12, r31
blt+ cr7, L(24)
addc r28, r28, r7
adde r12, r12, r10
addi r5, r5, -1
L(24): cmplw cr7, r12, r10
bge- cr7, L(fix)
L(bck): stw r5, 0(r8)
addi r8, r8, -4
bdnz L(loop)
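C All quotient limbs stored; r29 has walked down to &up[0], so the 2-limb
C remainder is written back to up[0] (low, r28) and up[1] (high, r12),
C the callee-saved registers are restored, and the 0/1 top quotient limb
C in r3 is returned.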
L(ret): stw r28, 0(r29)
stw r12, 4(r29)
lmw r28, 8(r1)
addi r1, r1, 32
blr
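C Possible fixup: n1 >= d1 here, so the remainder may still be >= {d1,d0}
C (cr7 still holds n1 vs d1 from L(24)); if it is, subtract the divisor
C once more, increment the quotient limb, and rejoin the store at L(bck).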
L(fix): cmplw cr6, r28, r7
bgt+ cr7, L(28)
blt- cr6, L(bck)
L(28): subfc r28, r7, r28
subfe r12, r10, r12
addi r5, r5, 1
b L(bck)
EPILOGUE()