// Copyright 2018 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #include "textflag.h" TEXT ·IndexByte(SB),NOSPLIT,$0-40 MOVD b_base+0(FP), R0 MOVD b_len+8(FP), R2 MOVBU c+24(FP), R1 MOVD $ret+32(FP), R8 B indexbytebody<>(SB) TEXT ·IndexByteString(SB),NOSPLIT,$0-32 MOVD s_base+0(FP), R0 MOVD s_len+8(FP), R2 MOVBU c+16(FP), R1 MOVD $ret+24(FP), R8 B indexbytebody<>(SB) // input: // R0: data // R1: byte to search // R2: data len // R8: address to put result TEXT indexbytebody<>(SB),NOSPLIT,$0 // Core algorithm: // For each 32-byte chunk we calculate a 64-bit syndrome value, // with two bits per byte. For each tuple, bit 0 is set if the // relevant byte matched the requested character and bit 1 is // not used (faster than using a 32bit syndrome). Since the bits // in the syndrome reflect exactly the order in which things occur // in the original string, counting trailing zeros allows to // identify exactly which byte has matched. CBZ R2, fail MOVD R0, R11 // Magic constant 0x40100401 allows us to identify // which lane matches the requested byte. // 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24)) // Different bytes have different bit masks (i.e: 1, 4, 16, 64) MOVD $0x40100401, R5 VMOV R1, V0.B16 // Work with aligned 32-byte chunks BIC $0x1f, R0, R3 VMOV R5, V5.S4 ANDS $0x1f, R0, R9 AND $0x1f, R2, R10 BEQ loop // Input string is not 32-byte aligned. We calculate the // syndrome value for the aligned 32 bytes block containing // the first bytes and mask off the irrelevant part. VLD1.P (R3), [V1.B16, V2.B16] SUB $0x20, R9, R4 ADDS R4, R2, R2 VCMEQ V0.B16, V1.B16, V3.B16 VCMEQ V0.B16, V2.B16, V4.B16 VAND V5.B16, V3.B16, V3.B16 VAND V5.B16, V4.B16, V4.B16 VADDP V4.B16, V3.B16, V6.B16 // 256->128 VADDP V6.B16, V6.B16, V6.B16 // 128->64 VMOV V6.D[0], R6 // Clear the irrelevant lower bits LSL $1, R9, R4 LSR R4, R6, R6 LSL R4, R6, R6 // The first block can also be the last BLS masklast // Have we found something already? CBNZ R6, tail loop: VLD1.P (R3), [V1.B16, V2.B16] SUBS $0x20, R2, R2 VCMEQ V0.B16, V1.B16, V3.B16 VCMEQ V0.B16, V2.B16, V4.B16 // If we're out of data we finish regardless of the result BLS end // Use a fast check for the termination condition VORR V4.B16, V3.B16, V6.B16 VADDP V6.D2, V6.D2, V6.D2 VMOV V6.D[0], R6 // We're not out of data, loop if we haven't found the character CBZ R6, loop end: // Termination condition found, let's calculate the syndrome value VAND V5.B16, V3.B16, V3.B16 VAND V5.B16, V4.B16, V4.B16 VADDP V4.B16, V3.B16, V6.B16 VADDP V6.B16, V6.B16, V6.B16 VMOV V6.D[0], R6 // Only do the clear for the last possible block with less than 32 bytes // Condition flags come from SUBS in the loop BHS tail masklast: // Clear the irrelevant upper bits ADD R9, R10, R4 AND $0x1f, R4, R4 SUB $0x20, R4, R4 NEG R4<<1, R4 LSL R4, R6, R6 LSR R4, R6, R6 tail: // Check that we have found a character CBZ R6, fail // Count the trailing zeros using bit reversing RBIT R6, R6 // Compensate the last post-increment SUB $0x20, R3, R3 // And count the leading zeros CLZ R6, R6 // R6 is twice the offset into the fragment ADD R6>>1, R3, R0 // Compute the offset result SUB R11, R0, R0 MOVD R0, (R8) RET fail: MOVD $-1, R0 MOVD R0, (R8) RET