Black Lives Matter. Support the Equal Justice Initiative.

# Text file src/math/big/arith_ppc64x.s

## Documentation: math/big

```     1  // Copyright 2013 The Go Authors. All rights reserved.
2  // Use of this source code is governed by a BSD-style
3  // license that can be found in the LICENSE file.
4
5  //go:build !math_big_pure_go && (ppc64 || ppc64le)
6  // +build !math_big_pure_go
7  // +build ppc64 ppc64le
8
9  #include "textflag.h"
10
11  // This file provides fast assembly versions for the elementary
12  // arithmetic operations on vectors implemented in arith.go.
13
14  // func mulWW(x, y Word) (z1, z0 Word)
15  TEXT ·mulWW(SB), NOSPLIT, \$0
16  	MOVD   y+8(FP), R4     // R4 = y
17  	MOVD   x+0(FP), R3     // R3 = x
18  	MULLD  R3, R4, R5      // R5 = low-order word of x*y
19  	MULHDU R3, R4, R6      // R6 = high-order word of x*y (unsigned)
20  	MOVD   R5, z0+24(FP)   // z0 = low-order word
21  	MOVD   R6, z1+16(FP)   // z1 = high-order word
22  	RET
23
24  // func addVV(z, x, y []Word) (c Word)
25  // z[i] = x[i] + y[i] for all i, carrying; c is the carry out of the last word (0 if z is empty)
26  TEXT ·addVV(SB), NOSPLIT, \$0
27  	MOVD  z_len+8(FP), R7   // R7 = z_len
28  	MOVD  x+24(FP), R8      // R8 = x[]
29  	MOVD  y+48(FP), R9      // R9 = y[]
30  	MOVD  z+0(FP), R10      // R10 = z[]
31
32  	// If z_len = 0, we are done
33  	CMP   R0, R7
34  	MOVD  R0, R4            // R4 = c = 0
35  	BEQ   done
36
37  	// Process the first iteration out of the loop so we can
38  	// use MOVDU and avoid 3 index registers updates.
39  	MOVD  0(R8), R11      // R11 = x[i]
40  	MOVD  0(R9), R12      // R12 = y[i]
41  	ADD   \$-1, R7         // R7 = z_len - 1
42  	ADDC  R12, R11, R15   // R15 = x[i] + y[i], set CA
43  	CMP   R0, R7
44  	MOVD  R15, 0(R10)     // z[i]
45  	BEQ   final           // If z_len was 1, we are done
46
47  	SRD   \$2, R7, R5      // R5 = (z_len-1)/4 = number of 4-word iterations
48  	CMP   R0, R5
49  	MOVD  R5, CTR         // Set up loop counter
50  	BEQ   tail            // If R5 = 0, we can't use the loop
51
52  	// Process 4 elements per iteration. Unrolling this loop
53  	// means a performance trade-off: we will lose performance
54  	// for small values of z_len (0.90x in the worst case), but
55  	// gain significant performance as z_len increases (up to
56  	// 1.45x).
57  loop:
58  	MOVD  8(R8), R11      // R11 = x[i]
59  	MOVD  16(R8), R12     // R12 = x[i+1]
60  	MOVD  24(R8), R14     // R14 = x[i+2]
61  	MOVDU 32(R8), R15     // R15 = x[i+3], R8 += 32
62  	MOVD  8(R9), R16      // R16 = y[i]
63  	MOVD  16(R9), R17     // R17 = y[i+1]
64  	MOVD  24(R9), R18     // R18 = y[i+2]
65  	MOVDU 32(R9), R19     // R19 = y[i+3], R9 += 32
66  	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
67  	ADDE  R12, R17, R21   // R21 = x[i+1] + y[i+1] + CA
68  	ADDE  R14, R18, R22   // R22 = x[i+2] + y[i+2] + CA
69  	ADDE  R15, R19, R23   // R23 = x[i+3] + y[i+3] + CA
70  	MOVD  R20, 8(R10)     // z[i]
71  	MOVD  R21, 16(R10)    // z[i+1]
72  	MOVD  R22, 24(R10)    // z[i+2]
73  	MOVDU R23, 32(R10)    // z[i+3], R10 += 32
74  	ADD   \$-4, R7         // R7 = remaining elements (the carry chain relies on ADD leaving CA intact)
75  	BC  16, 0, loop       // bdnz
76
77  	// We may have more elements to read
78  	CMP   R0, R7
79  	BEQ   final
80
81  	// Process the remaining elements (at most 3), one at a time
82  tail:
83  	MOVDU 8(R8), R11      // R11 = x[i]
84  	MOVDU 8(R9), R16      // R16 = y[i]
85  	ADD   \$-1, R7         // R7 = remaining - 1
86  	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
87  	CMP   R0, R7
88  	MOVDU R20, 8(R10)     // z[i]
89  	BEQ   final           // If R7 = 0, we are done
90
91  	MOVDU 8(R8), R11      // R11 = x[i]
92  	MOVDU 8(R9), R16      // R16 = y[i]
93  	ADD   \$-1, R7         // R7 = remaining - 1
94  	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
95  	CMP   R0, R7
96  	MOVDU R20, 8(R10)     // z[i]
97  	BEQ   final           // If R7 = 0, we are done
98
99  	MOVD  8(R8), R11      // Last possible element: no pointer update or count check needed
100  	MOVD  8(R9), R16
101  	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
102  	MOVD  R20, 8(R10)     // z[i]
103
104  final:
105  	ADDZE R4              // Capture CA
106
107  done:
108  	MOVD  R4, c+72(FP)
109  	RET
110
111  // func subVV(z, x, y []Word) (c Word)
112  // z[i] = x[i] - y[i] for all i, carrying; c is the borrow out of the last word (0 if z is empty)
113  TEXT ·subVV(SB), NOSPLIT, \$0
114  	MOVD  z_len+8(FP), R7 // R7 = z_len
115  	MOVD  x+24(FP), R8    // R8 = x[]
116  	MOVD  y+48(FP), R9    // R9 = y[]
117  	MOVD  z+0(FP), R10    // R10 = z[]
118
119  	// If z_len = 0, we are done
120  	CMP   R0, R7
121  	MOVD  R0, R4          // R4 = c = 0
122  	BEQ   done
123
124  	// Process the first iteration out of the loop so we can
125  	// use MOVDU and avoid 3 index registers updates.
126  	MOVD  0(R8), R11      // R11 = x[i]
127  	MOVD  0(R9), R12      // R12 = y[i]
128  	ADD   \$-1, R7         // R7 = z_len - 1
129  	SUBC  R12, R11, R15   // R15 = x[i] - y[i], set CA
130  	CMP   R0, R7
131  	MOVD  R15, 0(R10)     // z[i]
132  	BEQ   final           // If z_len was 1, we are done
133
134  	SRD   \$2, R7, R5      // R5 = (z_len-1)/4 = number of 4-word iterations
135  	CMP   R0, R5
136  	MOVD  R5, CTR         // Set up loop counter
137  	BEQ   tail            // If R5 = 0, we can't use the loop
138
139  	// Process 4 elements per iteration. Unrolling this loop
140  	// means a performance trade-off: we will lose performance
141  	// for small values of z_len (0.92x in the worst case), but
142  	// gain significant performance as z_len increases (up to
143  	// 1.45x).
144  loop:
145  	MOVD  8(R8), R11      // R11 = x[i]
146  	MOVD  16(R8), R12     // R12 = x[i+1]
147  	MOVD  24(R8), R14     // R14 = x[i+2]
148  	MOVDU 32(R8), R15     // R15 = x[i+3], R8 += 32
149  	MOVD  8(R9), R16      // R16 = y[i]
150  	MOVD  16(R9), R17     // R17 = y[i+1]
151  	MOVD  24(R9), R18     // R18 = y[i+2]
152  	MOVDU 32(R9), R19     // R19 = y[i+3], R9 += 32
153  	SUBE  R16, R11, R20   // R20 = x[i] - y[i] + CA
154  	SUBE  R17, R12, R21   // R21 = x[i+1] - y[i+1] + CA
155  	SUBE  R18, R14, R22   // R22 = x[i+2] - y[i+2] + CA
156  	SUBE  R19, R15, R23   // R23 = x[i+3] - y[i+3] + CA
157  	MOVD  R20, 8(R10)     // z[i]
158  	MOVD  R21, 16(R10)    // z[i+1]
159  	MOVD  R22, 24(R10)    // z[i+2]
160  	MOVDU R23, 32(R10)    // z[i+3], R10 += 32
161  	ADD   \$-4, R7         // R7 = remaining elements
162  	BC  16, 0, loop       // bdnz
163
164  	// We may have more elements to read
165  	CMP   R0, R7
166  	BEQ   final
167
168  	// Process the remaining elements (at most 3), one at a time
169  tail:
170  	MOVDU 8(R8), R11      // R11 = x[i]
171  	MOVDU 8(R9), R16      // R16 = y[i]
172  	ADD   \$-1, R7         // R7 = remaining - 1
173  	SUBE  R16, R11, R20   // R20 = x[i] - y[i] + CA
174  	CMP   R0, R7
175  	MOVDU R20, 8(R10)     // z[i]
176  	BEQ   final           // If R7 = 0, we are done
177
178  	MOVDU 8(R8), R11      // R11 = x[i]
179  	MOVDU 8(R9), R16      // R16 = y[i]
180  	ADD   \$-1, R7         // R7 = remaining - 1
181  	SUBE  R16, R11, R20   // R20 = x[i] - y[i] + CA
182  	CMP   R0, R7
183  	MOVDU R20, 8(R10)     // z[i]
184  	BEQ   final           // If R7 = 0, we are done
185
186  	MOVD  8(R8), R11      // Last possible element: no pointer update or count check needed
187  	MOVD  8(R9), R16
188  	SUBE  R16, R11, R20   // R20 = x[i] - y[i] + CA
189  	MOVD  R20, 8(R10)     // z[i]
190
191  final:
192  	ADDZE R4              // Capture CA
193  	XOR   \$1, R4          // c = 1 - CA: CA is the inverted borrow after a subtract
194
195  done:
196  	MOVD  R4, c+72(FP)
197  	RET
198
199  // func addVW(z, x []Word, y Word) (c Word): z = x + y, c = carry out (c = y if z is empty)
200  TEXT ·addVW(SB), NOSPLIT, \$0
201  	MOVD z+0(FP), R10	// R10 = z[]
202  	MOVD x+24(FP), R8	// R8 = x[]
203  	MOVD y+48(FP), R4	// R4 = y = c
204  	MOVD z_len+8(FP), R11	// R11 = z_len
205
206  	CMP   R0, R11		// If z_len is zero, return
207  	BEQ   done
208
209  	// We will process the first iteration out of the loop so we capture
210  	// the value of c. In the subsequent iterations, we will rely on the
211  	// value of CA set here.
212  	MOVD  0(R8), R20	// R20 = x[i]
213  	ADD   \$-1, R11		// R11 = z_len - 1
214  	ADDC  R20, R4, R6	// R6 = x[i] + c
215  	CMP   R0, R11		// If z_len was 1, we are done
216  	MOVD  R6, 0(R10)	// z[i]
217  	BEQ   final
218
219  	// We will read 4 elements per iteration
220  	SRD   \$2, R11, R9	// R9 = (z_len-1)/4
221  	DCBT  (R8)		// Cache-touch hint for x
222  	CMP   R0, R9
223  	MOVD  R9, CTR		// Set up the loop counter
224  	BEQ   tail		// If R9 = 0, we can't use the loop
225
226  loop:
227  	MOVD  8(R8), R20	// R20 = x[i]
228  	MOVD  16(R8), R21	// R21 = x[i+1]
229  	MOVD  24(R8), R22	// R22 = x[i+2]
230  	MOVDU 32(R8), R23	// R23 = x[i+3]
231  	ADDZE R20, R24		// R24 = x[i] + CA
232  	ADDZE R21, R25		// R25 = x[i+1] + CA
233  	ADDZE R22, R26		// R26 = x[i+2] + CA
234  	ADDZE R23, R27		// R27 = x[i+3] + CA
235  	MOVD  R24, 8(R10)	// z[i]
236  	MOVD  R25, 16(R10)	// z[i+1]
237  	MOVD  R26, 24(R10)	// z[i+2]
238  	MOVDU R27, 32(R10)	// z[i+3]
239  	ADD   \$-4, R11		// R11 = remaining elements
240  	BC    16, 0, loop	// bdnz
241
242  	// We may have some elements to read
243  	CMP R0, R11
244  	BEQ final
245
246  tail:
247  	MOVDU 8(R8), R20	// R20 = x[i]
248  	ADDZE R20, R24		// R24 = x[i] + CA
249  	ADD \$-1, R11		// R11 = remaining - 1
250  	MOVDU R24, 8(R10)	// z[i]
251  	CMP R0, R11
252  	BEQ final
253
254  	MOVDU 8(R8), R20	// R20 = x[i]
255  	ADDZE R20, R24		// R24 = x[i] + CA
256  	ADD \$-1, R11		// R11 = remaining - 1
257  	MOVDU R24, 8(R10)	// z[i]
258  	CMP R0, R11
259  	BEQ final
260
261  	MOVD 8(R8), R20		// Last possible element: no count check needed
262  	ADDZE R20, R24		// R24 = x[i] + CA
263  	MOVD R24, 8(R10)	// z[i]
264
265  final:
266  	ADDZE R0, R4		// c = CA
267  done:
268  	MOVD  R4, c+56(FP)
269  	RET
270
271  // func subVW(z, x []Word, y Word) (c Word): z = x - y, c = borrow out (c = y if z is empty)
272  TEXT ·subVW(SB), NOSPLIT, \$0
273  	MOVD  z+0(FP), R10	// R10 = z[]
274  	MOVD  x+24(FP), R8	// R8 = x[]
275  	MOVD  y+48(FP), R4	// R4 = y = c
276  	MOVD  z_len+8(FP), R11	// R11 = z_len
277
278  	CMP   R0, R11		// If z_len is zero, return
279  	BEQ   done
280
281  	// We will process the first iteration out of the loop so we capture
282  	// the value of c. In the subsequent iterations, we will rely on the
283  	// value of CA set here.
284  	MOVD  0(R8), R20	// R20 = x[i]
285  	ADD   \$-1, R11		// R11 = z_len - 1
286  	SUBC  R4, R20, R6	// R6 = x[i] - c
287  	CMP   R0, R11		// If z_len was 1, we are done
288  	MOVD  R6, 0(R10)	// z[i]
289  	BEQ   final
290
291  	// We will read 4 elements per iteration
292  	SRD   \$2, R11, R9	// R9 = (z_len-1)/4
293  	DCBT  (R8)		// Cache-touch hint for x
294  	CMP   R0, R9
295  	MOVD  R9, CTR		// Set up the loop counter
296  	BEQ   tail		// If R9 = 0, we can't use the loop
297
298  	// The loop here is almost the same as the one used in s390x, but
299  	// we don't need to capture CA every iteration because we've already
300  	// done that above.
301  loop:
302  	MOVD  8(R8), R20	// R20 = x[i]
303  	MOVD  16(R8), R21	// R21 = x[i+1]
304  	MOVD  24(R8), R22	// R22 = x[i+2]
305  	MOVDU 32(R8), R23	// R23 = x[i+3]
306  	SUBE  R0, R20		// R20 = x[i] - 0 with borrow (CA)
307  	SUBE  R0, R21		// R21 = x[i+1] - 0 with borrow
308  	SUBE  R0, R22		// R22 = x[i+2] - 0 with borrow
309  	SUBE  R0, R23		// R23 = x[i+3] - 0 with borrow
310  	MOVD  R20, 8(R10)	// z[i]
311  	MOVD  R21, 16(R10)	// z[i+1]
312  	MOVD  R22, 24(R10)	// z[i+2]
313  	MOVDU R23, 32(R10)	// z[i+3]
314  	ADD   \$-4, R11		// R11 = remaining elements
315  	BC    16, 0, loop	// bdnz
316
317  	// We may have some elements to read
318  	CMP   R0, R11
319  	BEQ   final
320
321  tail:
322  	MOVDU 8(R8), R20	// R20 = x[i]
323  	SUBE  R0, R20		// R20 = x[i] - 0 with borrow
324  	ADD   \$-1, R11		// R11 = remaining - 1
325  	MOVDU R20, 8(R10)	// z[i]
326  	CMP   R0, R11
327  	BEQ   final
328
329  	MOVDU 8(R8), R20	// R20 = x[i]
330  	SUBE  R0, R20		// R20 = x[i] - 0 with borrow
331  	ADD   \$-1, R11		// R11 = remaining - 1
332  	MOVDU R20, 8(R10)	// z[i]
333  	CMP   R0, R11
334  	BEQ   final
335
336  	MOVD  8(R8), R20	// Last possible element: no count check needed
337  	SUBE  R0, R20		// R20 = x[i] - 0 with borrow
338  	MOVD  R20, 8(R10)	// z[i]
339
340  final:
341  	// Capture CA
342  	SUBE  R4, R4		// R4 = CA - 1: 0 if no borrow, -1 if borrow
343  	NEG   R4, R4		// c = 0 or 1
344
345  done:
346  	MOVD  R4, c+56(FP)
347  	RET
348
349  TEXT ·shlVU(SB), NOSPLIT, \$0	// No work is done here: zero-size frame, single branch
350  	BR ·shlVU_g(SB)	// Branch to shlVU_g (presumably the generic Go implementation; confirm in arith.go)
351
352  TEXT ·shrVU(SB), NOSPLIT, \$0	// No work is done here: zero-size frame, single branch
353  	BR ·shrVU_g(SB)	// Branch to shrVU_g (presumably the generic Go implementation; confirm in arith.go)
354
355  // func mulAddVWW(z, x []Word, y, r Word) (c Word): z = x*y + r, c = final high-order carry word (c = r if z is empty)
356  TEXT ·mulAddVWW(SB), NOSPLIT, \$0
357  	MOVD    z+0(FP), R10      // R10 = z[]
358  	MOVD    x+24(FP), R8      // R8 = x[]
359  	MOVD    y+48(FP), R9      // R9 = y
360  	MOVD    r+56(FP), R4      // R4 = r = c
361  	MOVD    z_len+8(FP), R11  // R11 = z_len
362
363  	CMP     R0, R11           // If z_len is zero, return c = r
364  	BEQ     done
365
366  	MOVD    0(R8), R20        // R20 = x[i]
367  	ADD     \$-1, R11          // R11 = z_len - 1
368  	MULLD   R9, R20, R6       // R6 = z0 = Low-order(x[i]*y)
369  	MULHDU  R9, R20, R7       // R7 = z1 = High-order(x[i]*y)
370  	ADDC    R4, R6            // R6 = z0 + r
371  	ADDZE   R7                // R7 = z1 + CA
372  	CMP     R0, R11
373  	MOVD    R7, R4            // R4 = c
374  	MOVD    R6, 0(R10)        // z[i]
375  	BEQ     done              // If z_len was 1, we are done
376
377  	// We will read 4 elements per iteration
378  	SRD     \$2, R11, R14      // R14 = (z_len-1)/4
379  	DCBT    (R8)              // Cache-touch hint for x
380  	CMP     R0, R14
381  	MOVD    R14, CTR          // Set up the loop counter
382  	BEQ     tail              // If R14 = 0, we can't use the loop
383
384  loop:
385  	MOVD    8(R8), R20        // R20 = x[i]
386  	MOVD    16(R8), R21       // R21 = x[i+1]
387  	MOVD    24(R8), R22       // R22 = x[i+2]
388  	MOVDU   32(R8), R23       // R23 = x[i+3]
389  	MULLD   R9, R20, R24      // R24 = z0[i]
390  	MULHDU  R9, R20, R20      // R20 = z1[i]
391  	ADDC    R4, R24           // R24 = z0[i] + c
392  	ADDZE   R20               // R20 = z1[i] + CA
393  	MULLD   R9, R21, R25      // R25 = z0[i+1]
394  	MULHDU  R9, R21, R21      // R21 = z1[i+1]
395  	ADDC    R20, R25          // R25 = z0[i+1] + z1[i]
396  	ADDZE   R21               // R21 = z1[i+1] + CA
397  	MULLD   R9, R22, R26      // R26 = z0[i+2]
398  	MULHDU  R9, R22, R22      // R22 = z1[i+2]
399  	MULLD   R9, R23, R27      // R27 = z0[i+3]
400  	MULHDU  R9, R23, R23      // R23 = z1[i+3]
401  	ADDC    R21, R26          // R26 = z0[i+2] + z1[i+1]
402  	ADDZE   R22               // R22 = z1[i+2] + CA
403  	MOVD    R24, 8(R10)       // z[i]
404  	MOVD    R25, 16(R10)      // z[i+1]
405  	ADDC    R22, R27          // R27 = z0[i+3] + z1[i+2]
406  	ADDZE   R23,R4		  // update carry
407  	MOVD    R26, 24(R10)      // z[i+2]
408  	MOVDU   R27, 32(R10)      // z[i+3]
409  	ADD     \$-4, R11          // R11 = remaining elements
410  	BC      16, 0, loop       // bdnz
411
412  	// We may have some elements to read
413  	CMP   R0, R11
414  	BEQ   done
415
416  	// Process the remaining elements (at most 3), one at a time
417  tail:
418  	MOVDU   8(R8), R20        // R20 = x[i]
419  	MULLD   R9, R20, R24      // R24 = z0[i]
420  	MULHDU  R9, R20, R25      // R25 = z1[i]
421  	ADD     \$-1, R11          // R11 = remaining - 1
422  	ADDC    R4, R24           // R24 = z0[i] + c
423  	ADDZE   R25               // R25 = z1[i] + CA
424  	MOVDU   R24, 8(R10)       // z[i]
425  	CMP     R0, R11
426  	MOVD    R25, R4           // R4 = c
427  	BEQ     done              // If R11 = 0, we are done
428
429  	MOVDU   8(R8), R20        // R20 = x[i]
430  	MULLD   R9, R20, R24      // R24 = z0[i]
431  	MULHDU  R9, R20, R25      // R25 = z1[i]
432  	ADD     \$-1, R11          // R11 = remaining - 1
433  	ADDC    R4, R24           // R24 = z0[i] + c
434  	ADDZE   R25               // R25 = z1[i] + CA
435  	MOVDU   R24, 8(R10)       // z[i]
436  	CMP     R0, R11
437  	MOVD    R25, R4           // R4 = c
438  	BEQ     done              // If R11 = 0, we are done
439
440  	MOVD    8(R8), R20        // Last possible element: no count check needed
441  	MULLD   R9, R20, R24      // R24 = z0[i]
442  	MULHDU  R9, R20, R25      // R25 = z1[i]
443  	ADD     \$-1, R11          // R11 = remaining - 1
444  	ADDC    R4, R24           // R24 = z0[i] + c
445  	ADDZE   R25               // R25 = z1[i] + CA
446  	MOVD    R24, 8(R10)       // z[i]
447  	MOVD    R25, R4           // R4 = c
448
449  done:
450  	MOVD    R4, c+64(FP)
451  	RET
452
453  // func addMulVVW(z, x []Word, y Word) (c Word)
455  	MOVD z+0(FP), R10	// R10 = z[]
456  	MOVD x+24(FP), R8	// R8 = x[]
457  	MOVD y+48(FP), R9	// R9 = y
458  	MOVD z_len+8(FP), R22	// R22 = z_len
459
460  	MOVD R0, R3		// R3 will be the index register
461  	CMP  R0, R22
462  	MOVD R0, R4		// R4 = c = 0
463  	MOVD R22, CTR		// Initialize loop counter
464  	BEQ  done
465
466  loop:
467  	MOVD  (R8)(R3), R20	// Load x[i]
468  	MOVD  (R10)(R3), R21	// Load z[i]
469  	MULLD  R9, R20, R6	// R6 = Low-order(x[i]*y)
470  	MULHDU R9, R20, R7	// R7 = High-order(x[i]*y)
471  	ADDC   R21, R6		// R6 = z0
472  	ADDZE  R7		// R7 = z1
473  	ADDC   R4, R6		// R6 = z0 + c + 0
474  	ADDZE  R7, R4           // c += z1
475  	MOVD   R6, (R10)(R3)	// Store z[i]