// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// Index loads its arguments into the registers indexbody<> expects and
// tail-jumps to it. Argument layout as read here: a_base at 0(FP), a_len at
// 8(FP), b_base at 24(FP), b_len at 32(FP), result slot at 48(FP).
// NOTE(review): the three-word spacing is consistent with a and b being byte
// slices; the Go-side declaration is not visible in this file - confirm there.
TEXT ·Index(SB),NOSPLIT,$0-56
	MOVQ a_base+0(FP), DI	// DI = start of the text being searched
	MOVQ a_len+8(FP), DX	// DX = length of the text being searched
	MOVQ b_base+24(FP), R8	// R8 = start of the pattern
	MOVQ b_len+32(FP), AX	// AX = length of the pattern
	MOVQ DI, R10	// keep the original start; DI is advanced during the search
	LEAQ ret+48(FP), R11	// R11 = address of the result slot
	JMP indexbody<>(SB)

// IndexString is the same wrapper as Index for a two-word argument layout:
// a_base at 0(FP), a_len at 8(FP), b_base at 16(FP), b_len at 24(FP), result
// slot at 32(FP).
// NOTE(review): consistent with a and b being strings; confirm against the
// Go-side declaration.
TEXT ·IndexString(SB),NOSPLIT,$0-40
	MOVQ a_base+0(FP), DI	// DI = start of the text being searched
	MOVQ a_len+8(FP), DX	// DX = length of the text being searched
	MOVQ b_base+16(FP), R8	// R8 = start of the pattern
	MOVQ b_len+24(FP), AX	// AX = length of the pattern
	MOVQ DI, R10	// keep the original start; DI is advanced during the search
	LEAQ ret+32(FP), R11	// R11 = address of the result slot
	JMP indexbody<>(SB)

// indexbody<> is the shared search routine. It stores the offset of the first
// match (relative to R10) or -1 through R11 and returns.
//
// Inputs:
// AX: length of string, that we are searching for
// DX: length of string, in which we are searching
// DI: pointer to string, in which we are searching
// R8: pointer to string, that we are searching for
// R10: copy of the initial DI; subtracted from the match address on success
// R11: address, where to put return value
// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
//
// Registers written here: BX, CX, DX, SI, DI, R8, R9, X0-X3, Y0-Y4.
//
// Shape of the code: the pattern length selects one of several loops. Each
// loop compares the first k bytes of the pattern (k = 2, 4, 8, 16 or 32) at
// every candidate position; when the pattern is longer than k, a second
// k-byte compare of the pattern's tail (overlapping the head) confirms the
// match. In every loop DX is rewritten to be the exclusive upper bound for
// the candidate start address, and the first candidate is tested before the
// bound is checked.
TEXT indexbody<>(SB),NOSPLIT,$0
	CMPQ AX, DX
	JA fail	// pattern longer than text: no match possible
	CMPQ DX, $16
	JAE sse42	// text is at least 16 bytes: consider the PCMPESTRI path
no_sse42:
	// Pattern length <= 2. The loop below always loads two pattern bytes.
	// NOTE(review): lengths 0 and 1 are not handled separately - presumably
	// callers guarantee a length of at least 2; confirm.
	CMPQ AX, $2
	JA _3_or_more
	MOVW (R8), R8	// R8 = the two pattern bytes
	LEAQ -1(DI)(DX*1), DX	// DX = text end - 1 (exclusive bound for DI)
loop2:
	MOVW (DI), SI
	CMPW SI,R8
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop2
	JMP fail
_3_or_more:
	// Pattern length 3: compare bytes 0-1, then bytes 1-2.
	CMPQ AX, $3
	JA _4_or_more
	MOVW 1(R8), BX	// BX = pattern bytes 1-2 (must be loaded before R8 is overwritten)
	MOVW (R8), R8	// R8 = pattern bytes 0-1
	LEAQ -2(DI)(DX*1), DX	// DX = text end - 2 (exclusive bound for DI)
loop3:
	MOVW (DI), SI
	CMPW SI,R8
	JZ partial_success3
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
partial_success3:
	// First two bytes matched; check the overlapping last two.
	MOVW 1(DI), SI
	CMPW SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
_4_or_more:
	// Pattern length 4: single 32-bit compare.
	CMPQ AX, $4
	JA _5_or_more
	MOVL (R8), R8
	LEAQ -3(DI)(DX*1), DX	// DX = text end - 3 (exclusive bound for DI)
loop4:
	MOVL (DI), SI
	CMPL SI,R8
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop4
	JMP fail
_5_or_more:
	// Pattern length 5-7: compare the first 4 bytes, then the last 4.
	CMPQ AX, $7
	JA _8_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX	// DX = text end + 1 - pattern length (exclusive bound for DI)
	MOVL -4(R8)(AX*1), BX	// BX = last 4 pattern bytes
	MOVL (R8), R8	// R8 = first 4 pattern bytes
loop5to7:
	MOVL (DI), SI
	CMPL SI,R8
	JZ partial_success5to7
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
partial_success5to7:
	MOVL -4(AX)(DI*1), SI	// last 4 bytes of the candidate: DI + AX - 4
	CMPL SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
_8_or_more:
	// Pattern length 8: single 64-bit compare.
	CMPQ AX, $8
	JA _9_or_more
	MOVQ (R8), R8
	LEAQ -7(DI)(DX*1), DX	// DX = text end - 7 (exclusive bound for DI)
loop8:
	MOVQ (DI), SI
	CMPQ SI,R8
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop8
	JMP fail
_9_or_more:
	// Pattern length 9-15: compare the first 8 bytes, then the last 8.
	// Also entered from the sse42 check when the pattern length is >= 12.
	CMPQ AX, $15
	JA _16_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX	// DX = text end + 1 - pattern length (exclusive bound for DI)
	MOVQ -8(R8)(AX*1), BX	// BX = last 8 pattern bytes
	MOVQ (R8), R8	// R8 = first 8 pattern bytes
loop9to15:
	MOVQ (DI), SI
	CMPQ SI,R8
	JZ partial_success9to15
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
partial_success9to15:
	MOVQ -8(AX)(DI*1), SI	// last 8 bytes of the candidate: DI + AX - 8
	CMPQ SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
_16_or_more:
	// Pattern length 16: one 16-byte vector compare. A byte mask of 0xffff
	// from PMOVMSKB means all 16 bytes compared equal.
	CMPQ AX, $16
	JA _17_or_more
	MOVOU (R8), X1
	LEAQ -15(DI)(DX*1), DX	// DX = text end - 15 (exclusive bound for DI)
loop16:
	MOVOU (DI), X2
	PCMPEQB X1, X2
	PMOVMSKB X2, SI
	CMPQ SI, $0xffff
	JE success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop16
	JMP fail
_17_or_more:
	// Pattern length 17-31: compare the first 16 bytes, then the last 16.
	CMPQ AX, $31
	JA _32_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX	// DX = text end + 1 - pattern length (exclusive bound for DI)
	MOVOU -16(R8)(AX*1), X0	// X0 = last 16 pattern bytes
	MOVOU (R8), X1	// X1 = first 16 pattern bytes
loop17to31:
	MOVOU (DI), X2
	PCMPEQB X1,X2
	PMOVMSKB X2, SI
	CMPQ SI, $0xffff
	JE partial_success17to31
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
partial_success17to31:
	MOVOU -16(AX)(DI*1), X3	// last 16 bytes of the candidate: DI + AX - 16
	PCMPEQB X0, X3
	PMOVMSKB X3, SI
	CMPQ SI, $0xffff
	JE success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
// So no need to check cpuid
// The AVX2 paths leave through fail_avx2 / success_avx2, which execute
// VZEROUPPER before joining the common exits.
_32_or_more:
	// Pattern length 32: one 32-byte vector compare. A byte mask of
	// 0xffffffff from VPMOVMSKB means all 32 bytes compared equal.
	CMPQ AX, $32
	JA _33_to_63
	VMOVDQU (R8), Y1
	LEAQ -31(DI)(DX*1), DX	// DX = text end - 31 (exclusive bound for DI)
loop32:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL SI, $0xffffffff
	JE success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop32
	JMP fail_avx2
_33_to_63:
	// Pattern length > 32: compare the first 32 bytes, then the last 32.
	// No upper bound on AX is checked here; the two 32-byte windows cover at
	// most 64 pattern bytes, so the limit must be enforced by the caller.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX	// DX = text end + 1 - pattern length (exclusive bound for DI)
	VMOVDQU -32(R8)(AX*1), Y0	// Y0 = last 32 pattern bytes
	VMOVDQU (R8), Y1	// Y1 = first 32 pattern bytes
loop33to63:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL SI, $0xffffffff
	JE partial_success33to63
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
	JMP fail_avx2
partial_success33to63:
	VMOVDQU -32(AX)(DI*1), Y3	// last 32 bytes of the candidate: DI + AX - 32
	VPCMPEQB Y0, Y3, Y4
	VPMOVMSKB Y4, SI
	CMPL SI, $0xffffffff
	JE success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
	// Loop exhausted: falls through into fail_avx2.
fail_avx2:
	VZEROUPPER	// zero the upper halves of the YMM registers, then fall through to fail
fail:
	MOVQ $-1, (R11)	// report "not found"
	RET
success_avx2:
	VZEROUPPER
	JMP success
sse42:
	// Reached only when the text is at least 16 bytes long, so every
	// 16-byte window read from the text below stays inside it.
	CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
	JNE no_sse42	// CPU feature flag not set: use the generic loops
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	CMPQ AX, $12
	JAE _9_or_more
	// Fall back when bits 4-11 of (pattern address + 16) are all zero.
	// NOTE(review): presumably this guards the 16-byte MOVOU below, which
	// reads past the end of the (shorter than 12-byte) pattern, against
	// crossing into the next 4 KiB page - confirm.
	LEAQ 16(R8), SI
	TESTW $0xff0, SI
	JEQ no_sse42
	MOVOU (R8), X1	// X1 = pattern; PCMPESTRI takes its valid length from AX
	LEAQ -15(DI)(DX*1), SI	// SI = text end - 15 (exclusive bound for a 16-byte window start)
	MOVQ $16, R9
	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	PCMPESTRI $0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the string,
	// otherwise sep is at offset CX from the start of the 16-byte window at (DI)
	CMPQ CX, R9
	JBE sse42_success
	ADDQ R9, DI
	CMPQ DI, SI
	JB loop_sse42
	// Final window: the last 16 bytes of the text (SI - 1 = text end - 16).
	PCMPESTRI $0x0c, -1(SI), X1
	CMPQ CX, R9
	JA fail
	LEAQ -1(SI), DI	// DI = start of the final window, so CX is relative to it
sse42_success:
	ADDQ CX, DI	// DI = address of the match inside the window
success:
	SUBQ R10, DI	// convert match address to an offset from the original start
	MOVQ DI, (R11)
	RET