1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
|
dnl PowerPC-64 mpn_mod_1_1p
dnl Copyright 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl * the GNU Lesser General Public License as published by the Free
dnl Software Foundation; either version 3 of the License, or (at your
dnl option) any later version.
dnl
dnl or
dnl
dnl * the GNU General Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
dnl see https://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C POWER3/PPC630 ?
C POWER4/PPC970 17
C POWER5 16
C POWER6 30
C POWER7 10.2
C TODO
C * Optimise, in particular the cps function. This was compiler-generated and
C then hand optimised.
C INPUT PARAMETERS
C Symbolic names for the four argument registers of mpn_mod_1_1p.  The
C function bodies in this file spell the registers out directly (r3..r6)
C instead of going through these names.
C ap:  base address of the limbs to reduce (r3 is used as a load base)
define(`ap', `r3')
C n:   number of limbs (r4 is scaled by 8 and also loaded into CTR)
define(`n', `r4')
C d:   divisor limb (r5 is compared with and subtracted from the remainder)
define(`d', `r5')
C cps: address of the precomputed table (r6 is the base for loads at
C      offsets 0, 8 or 12, 16 and 24)
define(`cps', `r6')
ASM_START()
C mpn_invert_limb is defined elsewhere; it is called from mpn_mod_1_1p_cps.
EXTERN_FUNC(mpn_invert_limb)
C mpn_mod_1_1p -- reduce an n-limb operand to a single-limb remainder using
C the precomputed table pointed to by r6.
C
C Registers on entry, as consumed by the code below:
C   r3  address of the limb array; the limb at the highest address is
C       treated as the most significant one
C   r4  limb count; the two top limbs are loaded unconditionally, so the
C       code presumes a count of at least 2
C   r5  divisor limb.  NOTE(review): presumably already shifted left by the
C       count stored in the table -- confirm against the caller
C   r6  table base: 0 = inverse used by the final division step,
C       8 = shift count, 16 = B1modb, 24 = B2modb
C Result in r3: the remainder, shifted right by the stored count.
C
C Throughout the folding phase r9 (high) and r11 (low) hold a two-limb
C running value.
PROLOGUE(mpn_mod_1_1p)
C Point r3 one limb past the end of the array and set CTR to count - 1.
sldi r10, r4, 3
addi r4, r4, -1
add r3, r3, r10
ld r0, 16(r6) C B1modb
ld r12, 24(r6) C B2modb
C r9 = top limb, r10 = the limb below it.
ld r9, -8(r3)
ld r10, -16(r3)
mtctr r4
C r9:r11 = top limb * B1modb + next limb (128-bit product plus carry).
mulhdu r8, r9, r0
mulld r7, r9, r0
addc r11, r7, r10
addze r9, r8
C bdz decrements CTR first; with exactly two limbs nothing is left to fold.
bdz L(end)
C Main loop, executed count - 2 times.  Each pass computes
C   r9:r11 = next limb + r11 * B1modb + r9 * B2modb
C as a two-limb sum.  A carry out of the high limb is not captured;
C presumably the table values guarantee that it cannot occur.
ALIGN(16)
L(top): ld r4, -24(r3)
addi r3, r3, -8
nop
mulld r10, r11, r0
mulld r8, r9, r12
mulhdu r11, r11, r0
mulhdu r9, r9, r12
C r10:r7 = low * B1modb + next limb
addc r7, r10, r4
addze r10, r11
C r9:r11 = high * B2modb + r10:r7
addc r11, r8, r7
adde r9, r9, r10
bdnz L(top)
L(end):
C r0 = shift count.  Only 32 bits of the 64-bit table entry at offset 8 are
C read; the word offset (8 or 12) depends on the limb byte order.
ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
` lwz r0, 8(r6)',
` lwz r0, 12(r6)')
C r3 = table entry 0, the multiplier for the quotient estimate below.
ld r3, 0(r6)
cmpdi cr7, r0, 0
C With a zero count the high limb needs no bits from the low limb; skipping
C also avoids a shift by 64.
beq- cr7, L(4)
C r9 = (r9 << count) | (r11 >> (64 - count))
subfic r10, r0, 64
sld r9, r9, r0
srd r10, r11, r10
or r9, r10, r9
C Branch-free conditional subtract: the carry from subfc is set when
C r9 >= r5, subfe turns it into 0 or -1, and nand inverts that so r10 is
C all ones exactly when r9 >= r5.
L(4): subfc r10, r5, r9
subfe r10, r10, r10
nand r10, r10, r10
C r11 = low limb << count
sld r11, r11, r0
and r10, r10, r5
C r9 is now below the divisor.
subf r9, r10, r9
C Divide r9:r11 by r5 using the multiplier in r3:
C   r10:r3 = r9 * multiplier
mulhdu r10, r9, r3
mulld r3, r9, r3
addi r9, r9, 1
C   r3:r8 = r10:r3 + (r9 + 1):r11, with r3 the quotient estimate
addc r8, r3, r11
adde r3, r10, r9
C   r3 = r11 - estimate * divisor, the candidate remainder (mod 2^64)
mulld r3, r3, r5
subf r3, r3, r11
C   First correction: add the divisor back when the candidate exceeds r8.
cmpld cr7, r8, r3
bge cr7, L(5) C FIXME: Make branch-less
add r3, r3, r5
C   Second correction (marked unlikely): subtract the divisor if the
C   candidate is still not below it.
L(5): cmpld cr7, r3, r5
bge- cr7, L(10)
C Undo the shift and return.
srd r3, r3, r0
blr
L(10): subf r3, r5, r3
srd r3, r3, r0
blr
EPILOGUE()
C mpn_mod_1_1p_cps -- fill in the four-limb table that mpn_mod_1_1p reads.
C
C Registers on entry:
C   r3  address of the four-limb output table
C   r4  divisor limb
C Stores, by byte offset from the table base:
C    0  the value left in r3 by mpn_invert_limb when called on the
C       divisor shifted left by its leading-zero count
C    8  that leading-zero count
C   16  B1modb >> count
C   24  B2modb >> count
C
C r29 (table address), r30 (shifted divisor) and r31 (count) are saved and
C restored here because their values are needed after the call.
PROLOGUE(mpn_mod_1_1p_cps,toc)
C Save LR and r29..r31 relative to the incoming stack pointer, then open a
C 144-byte frame with stdu.
mflr r0
std r29, -24(r1)
std r30, -16(r1)
std r31, -8(r1)
C r31 = number of leading zero bits in the divisor
cntlzd r31, r4
std r0, 16(r1)
extsw r31, r31
mr r29, r3
stdu r1, -144(r1)
C r30 = divisor shifted so that its top bit is set (for a nonzero divisor)
sld r30, r4, r31
mr r3, r30
C NOTE(review): r3 after the call is presumably the limb returned by
C mpn_invert_limb; it is used as the inverse everywhere below.
CALL( mpn_invert_limb)
cmpdi cr7, r31, 0
C B1modb for a zero count: the negated shifted divisor
neg r0, r30
beq- cr7, L(13)
C Nonzero count:
C   B1modb = -divisor * ((inverse >> (64 - count)) | (1 << count))
subfic r11, r31, 64
li r0, 1
neg r9, r30
srd r11, r3, r11
sld r0, r0, r31
or r0, r11, r0
mulld r0, r0, r9
C B2modb: reduce the two-limb value B1modb:0 by the shifted divisor.
C   r9:r11 = B1modb * inverse
L(13): mulhdu r9, r0, r3
mulld r11, r0, r3
C   r9 = complement of (high product + B1modb), times the divisor
add r9, r0, r9
nor r9, r9, r9
mulld r9, r9, r30
C   Add the divisor back when the candidate exceeds the low product.
cmpld cr7, r11, r9
bge cr7, L(14)
add r9, r9, r30
C Pop the frame and store the table; the stores are interleaved with the
C register restores.
L(14): addi r1, r1, 144
srd r0, r0, r31
std r31, 8(r29)
std r3, 0(r29)
std r0, 16(r29)
C r0 is reused here for the saved return address.
ld r0, 16(r1)
srd r9, r9, r31
ld r30, -16(r1)
ld r31, -8(r1)
std r9, 24(r29)
C r29 is restored last because it was still needed as the store base.
ld r29, -24(r1)
mtlr r0
blr
EPILOGUE()
|