1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build ppc64 || ppc64le
6 // +build ppc64 ppc64le
7
8 #include "go_asm.h"
9 #include "textflag.h"
10
11 TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
12 MOVD b_base+0(FP), R3 // R3 = byte array pointer
13 MOVD b_len+8(FP), R4 // R4 = length
14 MOVBZ c+24(FP), R5 // R5 = byte
15 MOVD $ret+32(FP), R14 // R14 = &ret
16 BR indexbytebody<>(SB)
17
18 TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
19 MOVD s_base+0(FP), R3 // R3 = string
20 MOVD s_len+8(FP), R4 // R4 = length
21 MOVBZ c+16(FP), R5 // R5 = byte
22 MOVD $ret+24(FP), R14 // R14 = &ret
23 BR indexbytebody<>(SB)
24
25 TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
26 MOVD R3,R17 // Save base address for calculating the index later.
27 RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8.
28 RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
29 ADD R4,R3,R7 // Last acceptable address in R7.
30 DCBT (R8) // Prepare cache line.
31
32 RLDIMI $16,R5,$32,R5
33 CMPU R4,$32 // Check if it's a small string (≤32 bytes). Those will be processed differently.
34 MOVD $-1,R9
35 WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28).
36 RLDIMI $32,R5,$0,R5
37 MOVD R7,R10 // Save last acceptable address in R10 for later.
38 ADD $-1,R7,R7
39 #ifdef GOARCH_ppc64le
40 SLD R6,R9,R9 // Prepare mask for Little Endian
41 #else
42 SRD R6,R9,R9 // Same for Big Endian
43 #endif
44 BLE small_string // Jump to the small string case if it's ≤32 bytes.
45
46 // If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
47 // in V0, V1 and V10, then branch to the preloop.
48 ANDCC $63,R3,R11
49 BEQ CR0,qw_align
50 RLDICL $0,R3,$61,R11
51
52 MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
53 CMPB R12,R5,R3 // Check for a match.
54 AND R9,R3,R3 // Mask bytes below s_base
55 RLDICL $0,R7,$61,R6 // length-1
56 RLDICR $0,R7,$60,R7 // Last doubleword in R7
57 CMPU R3,$0,CR7 // If we have a match, jump to the final computation
58 BNE CR7,done
59 ADD $8,R8,R8
60 ADD $-8,R4,R4
61 ADD R4,R11,R4
62
63 // Check for quadword alignment
64 ANDCC $15,R8,R11
65 BEQ CR0,qw_align
66
67 // Not aligned, so handle the next doubleword
68 MOVD 0(R8),R12
69 CMPB R12,R5,R3
70 CMPU R3,$0,CR7
71 BNE CR7,done
72 ADD $8,R8,R8
73 ADD $-8,R4,R4
74
75 // Either quadword aligned or 64-byte at this point. We can use LVX.
76 qw_align:
77
78 // Set up auxiliary data for the vectorized algorithm.
79 VSPLTISB $0,V0 // Replicate 0 across V0
80 VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
81 MTVRD R5,V1
82 LVSL (R0+R0),V11
83 VSLB V11,V10,V10
84 VSPLTB $7,V1,V1 // Replicate byte across V1
85 CMPU R4, $64 // If len ≤ 64, don't use the vectorized loop
86 BLE tail
87
88 // We will load 4 quardwords per iteration in the loop, so check for
89 // 64-byte alignment. If 64-byte aligned, then branch to the preloop.
90 ANDCC $63,R8,R11
91 BEQ CR0,preloop
92
93 // Not 64-byte aligned. Load one quadword at a time until aligned.
94 LVX (R8+R0),V4
95 VCMPEQUBCC V1,V4,V6 // Check for byte in V4
96 BNE CR6,found_qw_align
97 ADD $16,R8,R8
98 ADD $-16,R4,R4
99
100 ANDCC $63,R8,R11
101 BEQ CR0,preloop
102 LVX (R8+R0),V4
103 VCMPEQUBCC V1,V4,V6 // Check for byte in V4
104 BNE CR6,found_qw_align
105 ADD $16,R8,R8
106 ADD $-16,R4,R4
107
108 ANDCC $63,R8,R11
109 BEQ CR0,preloop
110 LVX (R8+R0),V4
111 VCMPEQUBCC V1,V4,V6 // Check for byte in V4
112 BNE CR6,found_qw_align
113 ADD $-16,R4,R4
114 ADD $16,R8,R8
115
116 // 64-byte aligned. Prepare for the main loop.
117 preloop:
118 CMPU R4,$64
119 BLE tail // If len ≤ 64, don't use the vectorized loop
120
121 // We are now aligned to a 64-byte boundary. We will load 4 quadwords
122 // per loop iteration. The last doubleword is in R10, so our loop counter
123 // starts at (R10-R8)/64.
124 SUB R8,R10,R6
125 SRD $6,R6,R9 // Loop counter in R9
126 MOVD R9,CTR
127
128 ADD $-64,R8,R8 // Adjust index for loop entry
129 MOVD $16,R11 // Load offsets for the vector loads
130 MOVD $32,R9
131 MOVD $48,R7
132
133 // Main loop we will load 64 bytes per iteration
134 loop:
135 ADD $64,R8,R8 // Fuse addi+lvx for performance
136 LVX (R8+R0),V2 // Load 4 16-byte vectors
137 LVX (R8+R11),V3
138 VCMPEQUB V1,V2,V6 // Look for byte in each vector
139 VCMPEQUB V1,V3,V7
140
141 LVX (R8+R9),V4
142 LVX (R8+R7),V5
143 VCMPEQUB V1,V4,V8
144 VCMPEQUB V1,V5,V9
145
146 VOR V6,V7,V11 // Compress the result in a single vector
147 VOR V8,V9,V12
148 VOR V11,V12,V13
149 VCMPEQUBCC V0,V13,V14 // Check for byte
150 BGE CR6,found
151 BC 16,0,loop // bdnz loop
152
153 // Handle the tailing bytes or R4 ≤ 64
154 RLDICL $0,R6,$58,R4
155 ADD $64,R8,R8
156 tail:
157 CMPU R4,$0
158 BEQ notfound
159 LVX (R8+R0),V4
160 VCMPEQUBCC V1,V4,V6
161 BNE CR6,found_qw_align
162 ADD $16,R8,R8
163 CMPU R4,$16,CR6
164 BLE CR6,notfound
165 ADD $-16,R4,R4
166
167 LVX (R8+R0),V4
168 VCMPEQUBCC V1,V4,V6
169 BNE CR6,found_qw_align
170 ADD $16,R8,R8
171 CMPU R4,$16,CR6
172 BLE CR6,notfound
173 ADD $-16,R4,R4
174
175 LVX (R8+R0),V4
176 VCMPEQUBCC V1,V4,V6
177 BNE CR6,found_qw_align
178 ADD $16,R8,R8
179 CMPU R4,$16,CR6
180 BLE CR6,notfound
181 ADD $-16,R4,R4
182
183 LVX (R8+R0),V4
184 VCMPEQUBCC V1,V4,V6
185 BNE CR6,found_qw_align
186
187 notfound:
188 MOVD $-1,R3
189 MOVD R3,(R14)
190 RET
191
192 found:
193 // We will now compress the results into a single doubleword,
194 // so it can be moved to a GPR for the final index calculation.
195
196 // The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
197 // first bit of each byte into bits 48-63.
198 VBPERMQ V6,V10,V6
199 VBPERMQ V7,V10,V7
200 VBPERMQ V8,V10,V8
201 VBPERMQ V9,V10,V9
202
203 // Shift each 16-bit component into its correct position for
204 // merging into a single doubleword.
205 #ifdef GOARCH_ppc64le
206 VSLDOI $2,V7,V7,V7
207 VSLDOI $4,V8,V8,V8
208 VSLDOI $6,V9,V9,V9
209 #else
210 VSLDOI $6,V6,V6,V6
211 VSLDOI $4,V7,V7,V7
212 VSLDOI $2,V8,V8,V8
213 #endif
214
215 // Merge V6-V9 into a single doubleword and move to a GPR.
216 VOR V6,V7,V11
217 VOR V8,V9,V4
218 VOR V4,V11,V4
219 MFVRD V4,R3
220
221 #ifdef GOARCH_ppc64le
222 ADD $-1,R3,R11
223 ANDN R3,R11,R11
224 POPCNTD R11,R11 // Count trailing zeros (Little Endian).
225 #else
226 CNTLZD R3,R11 // Count leading zeros (Big Endian).
227 #endif
228 ADD R8,R11,R3 // Calculate byte address
229
230 return:
231 SUB R17,R3
232 MOVD R3,(R14)
233 RET
234
235 found_qw_align:
236 // Use the same algorithm as above. Compress the result into
237 // a single doubleword and move it to a GPR for the final
238 // calculation.
239 VBPERMQ V6,V10,V6
240
241 #ifdef GOARCH_ppc64le
242 MFVRD V6,R3
243 ADD $-1,R3,R11
244 ANDN R3,R11,R11
245 POPCNTD R11,R11
246 #else
247 VSLDOI $6,V6,V6,V6
248 MFVRD V6,R3
249 CNTLZD R3,R11
250 #endif
251 ADD R8,R11,R3
252 CMPU R11,R4
253 BLT return
254 BR notfound
255
256 done:
257 // At this point, R3 has 0xFF in the same position as the byte we are
258 // looking for in the doubleword. Use that to calculate the exact index
259 // of the byte.
260 #ifdef GOARCH_ppc64le
261 ADD $-1,R3,R11
262 ANDN R3,R11,R11
263 POPCNTD R11,R11 // Count trailing zeros (Little Endian).
264 #else
265 CNTLZD R3,R11 // Count leading zeros (Big Endian).
266 #endif
267 CMPU R8,R7 // Check if we are at the last doubleword.
268 SRD $3,R11 // Convert trailing zeros to bytes.
269 ADD R11,R8,R3
270 CMPU R11,R6,CR7 // If at the last doubleword, check the byte offset.
271 BNE return
272 BLE CR7,return
273 BR notfound
274
275 small_string:
276 // We unroll this loop for better performance.
277 CMPU R4,$0 // Check for length=0
278 BEQ notfound
279
280 MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
281 CMPB R12,R5,R3 // Check for a match.
282 AND R9,R3,R3 // Mask bytes below s_base.
283 CMPU R3,$0,CR7 // If we have a match, jump to the final computation.
284 RLDICL $0,R7,$61,R6 // length-1
285 RLDICR $0,R7,$60,R7 // Last doubleword in R7.
286 CMPU R8,R7
287 BNE CR7,done
288 BEQ notfound // Hit length.
289
290 MOVDU 8(R8),R12
291 CMPB R12,R5,R3
292 CMPU R3,$0,CR6
293 CMPU R8,R7
294 BNE CR6,done
295 BEQ notfound
296
297 MOVDU 8(R8),R12
298 CMPB R12,R5,R3
299 CMPU R3,$0,CR6
300 CMPU R8,R7
301 BNE CR6,done
302 BEQ notfound
303
304 MOVDU 8(R8),R12
305 CMPB R12,R5,R3
306 CMPU R3,$0,CR6
307 CMPU R8,R7
308 BNE CR6,done
309 BEQ notfound
310
311 MOVDU 8(R8),R12
312 CMPB R12,R5,R3
313 CMPU R3,$0,CR6
314 BNE CR6,done
315 BR notfound
316
317
View as plain text