// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
5 #include "go_asm.h"
6 #include "textflag.h"
7
// Compare is the entry point for comparing two operands a and b that are each
// passed as a (base, len, cap) triple. It only marshals arguments: the base
// pointers and lengths are placed in the registers cmpbody expects
// (SI = a, BX = alen, DI = b, DX = blen) and control is transferred to cmpbody
// with a tail jump, so cmpbody's RET returns directly to Compare's caller.
//
// With GOEXPERIMENT_regabiargs the arguments arrive in registers and are
// shuffled into place. Otherwise they are loaded from the argument frame and
// R9 is pointed at the result slot so that cmpbody can store the result there.
TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
#ifdef GOEXPERIMENT_regabiargs
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = a_cap  (unused)
	// DI = b_base (want in DI)
	// SI = b_len  (want in DX)
	// R8 = b_cap  (unused)
	MOVQ	SI, DX	// save b_len first: SI is overwritten by the next move
	MOVQ	AX, SI
#else
	MOVQ	a_base+0(FP), SI
	MOVQ	a_len+8(FP), BX
	MOVQ	b_base+24(FP), DI
	MOVQ	b_len+32(FP), DX
	LEAQ	ret+48(FP), R9	// cmpbody writes the result through R9
#endif
	JMP	cmpbody<>(SB)
26
// cmpstring is the entry point for comparing two operands a and b that are
// each passed as a (base, len) pair. Like Compare, it only marshals arguments
// into the registers cmpbody expects (SI = a, BX = alen, DI = b, DX = blen)
// and then tail-jumps to cmpbody.
//
// With GOEXPERIMENT_regabiargs the arguments arrive in registers and are
// shuffled into place. Otherwise they are loaded from the argument frame and
// R9 is pointed at the result slot so that cmpbody can store the result there.
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
#ifdef GOEXPERIMENT_regabiargs
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = b_base (want in DI)
	// DI = b_len  (want in DX)
	MOVQ	AX, SI
	MOVQ	DI, DX	// save b_len first: DI is overwritten by the next move
	MOVQ	CX, DI
#else
	MOVQ	a_base+0(FP), SI
	MOVQ	a_len+8(FP), BX
	MOVQ	b_base+16(FP), DI
	MOVQ	b_len+24(FP), DX
	LEAQ	ret+32(FP), R9	// cmpbody writes the result through R9
#endif
	JMP	cmpbody<>(SB)
44
// cmpbody compares the first min(alen, blen) bytes of a and b. The first
// differing byte (compared as unsigned) decides the result; if there is no
// difference the result is decided by comparing alen with blen.
//
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word; only used when GOEXPERIMENT_regabiargs is
//        not defined, in which case -1/0/1 is also stored there
// output:
//   AX = output (-1/0/1)
TEXT cmpbody<>(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame	// identical base pointers: only the lengths can differ
	CMPQ	BX, DX
	MOVQ	DX, R8	// MOVQ leaves the flags from CMPQ intact for CMOVQLT
	CMOVQLT	BX, R8	// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop

	// Compare 16 bytes per iteration while more than 16 bytes remain.
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// diff64/diff48/diff32 advance SI and DI by 48/32/16 bytes so that they
	// point at the start of the data covered by the mask in AX, then
	// continue in diff16.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs (alen in BX is no longer needed)
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX	// 1 if a's byte is above b's byte (unsigned), else 0
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
#ifndef GOEXPERIMENT_regabiargs
	MOVQ	AX, (R9)
#endif
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// Compare the 8 bytes that end at offset R8. When R8 < 8 this window
	// starts before SI/DI and re-reads bytes preceding the current position.
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
#ifndef GOEXPERIMENT_regabiargs
	MOVQ	AX, (R9)
#endif
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX	// - bits left (== 64 - bits left mod 64)
	JEQ	allsame	// R8 == 0: nothing to compare, lengths decide

	// load bytes of a into high bytes of SI
	// The low byte of the address selects how the 8-byte load is done: if it
	// is above 0xf8 the load is taken from the 8 bytes ending at a+R8 instead
	// of the 8 bytes starting at a.
	// NOTE(review): presumably this keeps the load from running past the end
	// of a page; confirm against the upstream rationale.
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI	// drop the bytes that precede a
si_finish:
	SHLQ	CX, SI	// wanted bytes end up in the high bytes, low bytes zero

	// load bytes of b into high bytes of DI
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX	// 1/0 => +1/-1
#ifndef GOEXPERIMENT_regabiargs
	MOVQ	AX, (R9)
#endif
	RET

	// No difference found in the compared bytes: the lengths decide.
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
#ifndef GOEXPERIMENT_regabiargs
	MOVQ	AX, (R9)
#endif
	RET

	// this works for >= 64 bytes of data.
	// Each iteration checks four 16-byte chunks; on a mismatch the matching
	// diffNN label advances SI/DI to the chunk described by the mask in AX.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop	// 64 or fewer bytes left: finish in the 16-byte loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB	Y2, Y3, Y0
	VPMOVMSKB	Y0, AX
	XORL	$0xffffffff, AX	// convert EQ to NE over a 32-bit mask
	JNE	diff32_avx2
	VPCMPEQB	Y4, Y5, Y6
	VPMOVMSKB	Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
	// AX holds a 32-bit mask here, so diff16 may index up to byte 31 from SI/DI.
diff32_avx2:
	VZEROUPPER
	JMP	diff16

	// Same as diff32_avx2, but for last 32 bytes.
	// diff48 advances SI/DI by 32 before the mask in AX is used.
diff64_avx2:
	VZEROUPPER
	JMP	diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP	loop
View as plain text