1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
|
; mc88100 __gmpn_mul_1 -- Multiply a limb vector with a single limb and
; store the product in a second limb vector.
; Copyright 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of either:
;
; * the GNU Lesser General Public License as published by the Free
; Software Foundation; either version 3 of the License, or (at your
; option) any later version.
;
; or
;
; * the GNU General Public License as published by the Free Software
; Foundation; either version 2 of the License, or (at your option) any
; later version.
;
; or both in parallel, as here.
;
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
; for more details.
;
; You should have received copies of the GNU General Public License and the
; GNU Lesser General Public License along with the GNU MP Library. If not,
; see https://www.gnu.org/licenses/.
; INPUT PARAMETERS
; res_ptr r2
; s1_ptr r3
; size r4
; s2_limb r5
; Common overhead is about 11 cycles/invocation.
; The speed for S2_LIMB >= 0x10000 is approximately 21 cycles/limb. (The
; pipeline stalls 2 cycles due to WB contention.)
; The speed for S2_LIMB < 0x10000 is approximately 16 cycles/limb. (The
; pipeline stalls 2 cycles due to WB contention and 1 cycle due to latency.)
; To enhance speed:
; 1. Unroll main loop 4-8 times.
; 2. Schedule code to avoid WB contention. It might be tempting to move the
; ld instruction in the loops down to save 2 cycles (less WB contention),
; but that looses because the ultimate value will be read from outside
; the allocated space. But if we handle the ultimate multiplication in
; the tail, we can do this.
; 3. Make the multiplication with less instructions. I think the code for
; (S2_LIMB >= 0x10000) is not minimal.
; With these techniques the (S2_LIMB >= 0x10000) case would run in 17 or
; less cycles/limb; the (S2_LIMB < 0x10000) case would run in 11
; cycles/limb. (Assuming infinite unrolling.)
text
align 16
global ___gmpn_mul_1
___gmpn_mul_1:
; Make S1_PTR and RES_PTR point at the end of their blocks
; and negate SIZE.
lda r3,r3[r4]
lda r6,r2[r4] ; RES_PTR in r6 since r2 is retval
subu r4,r0,r4
addu.co r2,r0,r0 ; r2 = cy = 0
ld r9,r3[r4]
mask r7,r5,0xffff ; r7 = lo(S2_LIMB)
extu r8,r5,16 ; r8 = hi(S2_LIMB)
bcnd.n eq0,r8,Lsmall ; jump if (hi(S2_LIMB) == 0)
subu r6,r6,4
; General code for any value of S2_LIMB.
; Make a stack frame and save r25 and r26
subu r31,r31,16
st.d r25,r31,8
; Enter the loop in the middle
br.n L1
addu r4,r4,1
Loop: ld r9,r3[r4]
st r26,r6[r4]
; bcnd ne0,r0,0 ; bubble
addu r4,r4,1
L1: mul r26,r9,r5 ; low word of product mul_1 WB ld
mask r12,r9,0xffff ; r12 = lo(s1_limb) mask_1
mul r11,r12,r7 ; r11 = prod_0 mul_2 WB mask_1
mul r10,r12,r8 ; r10 = prod_1a mul_3
extu r13,r9,16 ; r13 = hi(s1_limb) extu_1 WB mul_1
mul r12,r13,r7 ; r12 = prod_1b mul_4 WB extu_1
mul r25,r13,r8 ; r25 = prod_2 mul_5 WB mul_2
extu r11,r11,16 ; r11 = hi(prod_0) extu_2 WB mul_3
addu r10,r10,r11 ; addu_1 WB extu_2
; bcnd ne0,r0,0 ; bubble WB addu_1
addu.co r10,r10,r12 ; WB mul_4
mask.u r10,r10,0xffff ; move the 16 most significant bits...
addu.ci r10,r10,r0 ; ...to the low half of the word...
rot r10,r10,16 ; ...and put carry in pos 16.
addu.co r26,r26,r2 ; add old carry limb
bcnd.n ne0,r4,Loop
addu.ci r2,r25,r10 ; compute new carry limb
st r26,r6[r4]
ld.d r25,r31,8
jmp.n r1
addu r31,r31,16
; Fast code for S2_LIMB < 0x10000
Lsmall:
; Enter the loop in the middle
br.n SL1
addu r4,r4,1
SLoop: ld r9,r3[r4] ;
st r8,r6[r4] ;
addu r4,r4,1 ;
SL1: mul r8,r9,r5 ; low word of product
mask r12,r9,0xffff ; r12 = lo(s1_limb)
extu r13,r9,16 ; r13 = hi(s1_limb)
mul r11,r12,r7 ; r11 = prod_0
mul r12,r13,r7 ; r12 = prod_1b
addu.cio r8,r8,r2 ; add old carry limb
extu r10,r11,16 ; r11 = hi(prod_0)
addu r10,r10,r12 ;
bcnd.n ne0,r4,SLoop
extu r2,r10,16 ; r2 = new carry limb
jmp.n r1
st r8,r6[r4]
|