Black Lives Matter. Support the Equal Justice Initiative.

Source file src/index/suffixarray/sais2.go

Documentation: index/suffixarray

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Code generated by go generate; DO NOT EDIT.
     6  
     7  package suffixarray
     8  
     9  func text_64(text []byte, sa []int64) {
    10  	if int(int64(len(text))) != len(text) || len(text) != len(sa) {
    11  		panic("suffixarray: misuse of text_64")
    12  	}
    13  	sais_8_64(text, 256, sa, make([]int64, 2*256))
    14  }
    15  
    16  func sais_8_64(text []byte, textMax int, sa, tmp []int64) {
    17  	if len(sa) != len(text) || len(tmp) < int(textMax) {
    18  		panic("suffixarray: misuse of sais_8_64")
    19  	}
    20  
    21  	// Trivial base cases. Sorting 0 or 1 things is easy.
    22  	if len(text) == 0 {
    23  		return
    24  	}
    25  	if len(text) == 1 {
    26  		sa[0] = 0
    27  		return
    28  	}
    29  
    30  	// Establish slices indexed by text character
    31  	// holding character frequency and bucket-sort offsets.
    32  	// If there's only enough tmp for one slice,
    33  	// we make it the bucket offsets and recompute
    34  	// the character frequency each time we need it.
    35  	var freq, bucket []int64
    36  	if len(tmp) >= 2*textMax {
    37  		freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
    38  		freq[0] = -1 // mark as uninitialized
    39  	} else {
    40  		freq, bucket = nil, tmp[:textMax]
    41  	}
    42  
    43  	// The SAIS algorithm.
    44  	// Each of these calls makes one scan through sa.
    45  	// See the individual functions for documentation
    46  	// about each's role in the algorithm.
    47  	numLMS := placeLMS_8_64(text, sa, freq, bucket)
    48  	if numLMS <= 1 {
    49  		// 0 or 1 items are already sorted. Do nothing.
    50  	} else {
    51  		induceSubL_8_64(text, sa, freq, bucket)
    52  		induceSubS_8_64(text, sa, freq, bucket)
    53  		length_8_64(text, sa, numLMS)
    54  		maxID := assignID_8_64(text, sa, numLMS)
    55  		if maxID < numLMS {
    56  			map_64(sa, numLMS)
    57  			recurse_64(sa, tmp, numLMS, maxID)
    58  			unmap_8_64(text, sa, numLMS)
    59  		} else {
    60  			// If maxID == numLMS, then each LMS-substring
    61  			// is unique, so the relative ordering of two LMS-suffixes
    62  			// is determined by just the leading LMS-substring.
    63  			// That is, the LMS-suffix sort order matches the
    64  			// (simpler) LMS-substring sort order.
    65  			// Copy the original LMS-substring order into the
    66  			// suffix array destination.
    67  			copy(sa, sa[len(sa)-numLMS:])
    68  		}
    69  		expand_8_64(text, freq, bucket, sa, numLMS)
    70  	}
    71  	induceL_8_64(text, sa, freq, bucket)
    72  	induceS_8_64(text, sa, freq, bucket)
    73  
    74  	// Mark for caller that we overwrote tmp.
    75  	tmp[0] = -1
    76  }
    77  
    78  func sais_32(text []int32, textMax int, sa, tmp []int32) {
    79  	if len(sa) != len(text) || len(tmp) < int(textMax) {
    80  		panic("suffixarray: misuse of sais_32")
    81  	}
    82  
    83  	// Trivial base cases. Sorting 0 or 1 things is easy.
    84  	if len(text) == 0 {
    85  		return
    86  	}
    87  	if len(text) == 1 {
    88  		sa[0] = 0
    89  		return
    90  	}
    91  
    92  	// Establish slices indexed by text character
    93  	// holding character frequency and bucket-sort offsets.
    94  	// If there's only enough tmp for one slice,
    95  	// we make it the bucket offsets and recompute
    96  	// the character frequency each time we need it.
    97  	var freq, bucket []int32
    98  	if len(tmp) >= 2*textMax {
    99  		freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
   100  		freq[0] = -1 // mark as uninitialized
   101  	} else {
   102  		freq, bucket = nil, tmp[:textMax]
   103  	}
   104  
   105  	// The SAIS algorithm.
   106  	// Each of these calls makes one scan through sa.
   107  	// See the individual functions for documentation
   108  	// about each's role in the algorithm.
   109  	numLMS := placeLMS_32(text, sa, freq, bucket)
   110  	if numLMS <= 1 {
   111  		// 0 or 1 items are already sorted. Do nothing.
   112  	} else {
   113  		induceSubL_32(text, sa, freq, bucket)
   114  		induceSubS_32(text, sa, freq, bucket)
   115  		length_32(text, sa, numLMS)
   116  		maxID := assignID_32(text, sa, numLMS)
   117  		if maxID < numLMS {
   118  			map_32(sa, numLMS)
   119  			recurse_32(sa, tmp, numLMS, maxID)
   120  			unmap_32(text, sa, numLMS)
   121  		} else {
   122  			// If maxID == numLMS, then each LMS-substring
   123  			// is unique, so the relative ordering of two LMS-suffixes
   124  			// is determined by just the leading LMS-substring.
   125  			// That is, the LMS-suffix sort order matches the
   126  			// (simpler) LMS-substring sort order.
   127  			// Copy the original LMS-substring order into the
   128  			// suffix array destination.
   129  			copy(sa, sa[len(sa)-numLMS:])
   130  		}
   131  		expand_32(text, freq, bucket, sa, numLMS)
   132  	}
   133  	induceL_32(text, sa, freq, bucket)
   134  	induceS_32(text, sa, freq, bucket)
   135  
   136  	// Mark for caller that we overwrote tmp.
   137  	tmp[0] = -1
   138  }
   139  
   140  func sais_64(text []int64, textMax int, sa, tmp []int64) {
   141  	if len(sa) != len(text) || len(tmp) < int(textMax) {
   142  		panic("suffixarray: misuse of sais_64")
   143  	}
   144  
   145  	// Trivial base cases. Sorting 0 or 1 things is easy.
   146  	if len(text) == 0 {
   147  		return
   148  	}
   149  	if len(text) == 1 {
   150  		sa[0] = 0
   151  		return
   152  	}
   153  
   154  	// Establish slices indexed by text character
   155  	// holding character frequency and bucket-sort offsets.
   156  	// If there's only enough tmp for one slice,
   157  	// we make it the bucket offsets and recompute
   158  	// the character frequency each time we need it.
   159  	var freq, bucket []int64
   160  	if len(tmp) >= 2*textMax {
   161  		freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
   162  		freq[0] = -1 // mark as uninitialized
   163  	} else {
   164  		freq, bucket = nil, tmp[:textMax]
   165  	}
   166  
   167  	// The SAIS algorithm.
   168  	// Each of these calls makes one scan through sa.
   169  	// See the individual functions for documentation
   170  	// about each's role in the algorithm.
   171  	numLMS := placeLMS_64(text, sa, freq, bucket)
   172  	if numLMS <= 1 {
   173  		// 0 or 1 items are already sorted. Do nothing.
   174  	} else {
   175  		induceSubL_64(text, sa, freq, bucket)
   176  		induceSubS_64(text, sa, freq, bucket)
   177  		length_64(text, sa, numLMS)
   178  		maxID := assignID_64(text, sa, numLMS)
   179  		if maxID < numLMS {
   180  			map_64(sa, numLMS)
   181  			recurse_64(sa, tmp, numLMS, maxID)
   182  			unmap_64(text, sa, numLMS)
   183  		} else {
   184  			// If maxID == numLMS, then each LMS-substring
   185  			// is unique, so the relative ordering of two LMS-suffixes
   186  			// is determined by just the leading LMS-substring.
   187  			// That is, the LMS-suffix sort order matches the
   188  			// (simpler) LMS-substring sort order.
   189  			// Copy the original LMS-substring order into the
   190  			// suffix array destination.
   191  			copy(sa, sa[len(sa)-numLMS:])
   192  		}
   193  		expand_64(text, freq, bucket, sa, numLMS)
   194  	}
   195  	induceL_64(text, sa, freq, bucket)
   196  	induceS_64(text, sa, freq, bucket)
   197  
   198  	// Mark for caller that we overwrote tmp.
   199  	tmp[0] = -1
   200  }
   201  
   202  func freq_8_64(text []byte, freq, bucket []int64) []int64 {
   203  	if freq != nil && freq[0] >= 0 {
   204  		return freq // already computed
   205  	}
   206  	if freq == nil {
   207  		freq = bucket
   208  	}
   209  
   210  	freq = freq[:256] // eliminate bounds check for freq[c] below
   211  	for i := range freq {
   212  		freq[i] = 0
   213  	}
   214  	for _, c := range text {
   215  		freq[c]++
   216  	}
   217  	return freq
   218  }
   219  
   220  func freq_32(text []int32, freq, bucket []int32) []int32 {
   221  	if freq != nil && freq[0] >= 0 {
   222  		return freq // already computed
   223  	}
   224  	if freq == nil {
   225  		freq = bucket
   226  	}
   227  
   228  	for i := range freq {
   229  		freq[i] = 0
   230  	}
   231  	for _, c := range text {
   232  		freq[c]++
   233  	}
   234  	return freq
   235  }
   236  
   237  func freq_64(text []int64, freq, bucket []int64) []int64 {
   238  	if freq != nil && freq[0] >= 0 {
   239  		return freq // already computed
   240  	}
   241  	if freq == nil {
   242  		freq = bucket
   243  	}
   244  
   245  	for i := range freq {
   246  		freq[i] = 0
   247  	}
   248  	for _, c := range text {
   249  		freq[c]++
   250  	}
   251  	return freq
   252  }
   253  
   254  func bucketMin_8_64(text []byte, freq, bucket []int64) {
   255  	freq = freq_8_64(text, freq, bucket)
   256  	freq = freq[:256]     // establish len(freq) = 256, so 0 ≤ i < 256 below
   257  	bucket = bucket[:256] // eliminate bounds check for bucket[i] below
   258  	total := int64(0)
   259  	for i, n := range freq {
   260  		bucket[i] = total
   261  		total += n
   262  	}
   263  }
   264  
   265  func bucketMin_32(text []int32, freq, bucket []int32) {
   266  	freq = freq_32(text, freq, bucket)
   267  	total := int32(0)
   268  	for i, n := range freq {
   269  		bucket[i] = total
   270  		total += n
   271  	}
   272  }
   273  
   274  func bucketMin_64(text []int64, freq, bucket []int64) {
   275  	freq = freq_64(text, freq, bucket)
   276  	total := int64(0)
   277  	for i, n := range freq {
   278  		bucket[i] = total
   279  		total += n
   280  	}
   281  }
   282  
   283  func bucketMax_8_64(text []byte, freq, bucket []int64) {
   284  	freq = freq_8_64(text, freq, bucket)
   285  	freq = freq[:256]     // establish len(freq) = 256, so 0 ≤ i < 256 below
   286  	bucket = bucket[:256] // eliminate bounds check for bucket[i] below
   287  	total := int64(0)
   288  	for i, n := range freq {
   289  		total += n
   290  		bucket[i] = total
   291  	}
   292  }
   293  
   294  func bucketMax_32(text []int32, freq, bucket []int32) {
   295  	freq = freq_32(text, freq, bucket)
   296  	total := int32(0)
   297  	for i, n := range freq {
   298  		total += n
   299  		bucket[i] = total
   300  	}
   301  }
   302  
   303  func bucketMax_64(text []int64, freq, bucket []int64) {
   304  	freq = freq_64(text, freq, bucket)
   305  	total := int64(0)
   306  	for i, n := range freq {
   307  		total += n
   308  		bucket[i] = total
   309  	}
   310  }
   311  
   312  func placeLMS_8_64(text []byte, sa, freq, bucket []int64) int {
   313  	bucketMax_8_64(text, freq, bucket)
   314  
   315  	numLMS := 0
   316  	lastB := int64(-1)
   317  	bucket = bucket[:256] // eliminate bounds check for bucket[c1] below
   318  
   319  	// The next stanza of code (until the blank line) loop backward
   320  	// over text, stopping to execute a code body at each position i
   321  	// such that text[i] is an L-character and text[i+1] is an S-character.
   322  	// That is, i+1 is the position of the start of an LMS-substring.
   323  	// These could be hoisted out into a function with a callback,
   324  	// but at a significant speed cost. Instead, we just write these
   325  	// seven lines a few times in this source file. The copies below
   326  	// refer back to the pattern established by this original as the
   327  	// "LMS-substring iterator".
   328  	//
   329  	// In every scan through the text, c0, c1 are successive characters of text.
   330  	// In this backward scan, c0 == text[i] and c1 == text[i+1].
   331  	// By scanning backward, we can keep track of whether the current
   332  	// position is type-S or type-L according to the usual definition:
   333  	//
   334  	//	- position len(text) is type S with text[len(text)] == -1 (the sentinel)
   335  	//	- position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
   336  	//	- position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
   337  	//
   338  	// The backward scan lets us maintain the current type,
   339  	// update it when we see c0 != c1, and otherwise leave it alone.
   340  	// We want to identify all S positions with a preceding L.
   341  	// Position len(text) is one such position by definition, but we have
   342  	// nowhere to write it down, so we eliminate it by untruthfully
   343  	// setting isTypeS = false at the start of the loop.
   344  	c0, c1, isTypeS := byte(0), byte(0), false
   345  	for i := len(text) - 1; i >= 0; i-- {
   346  		c0, c1 = text[i], c0
   347  		if c0 < c1 {
   348  			isTypeS = true
   349  		} else if c0 > c1 && isTypeS {
   350  			isTypeS = false
   351  
   352  			// Bucket the index i+1 for the start of an LMS-substring.
   353  			b := bucket[c1] - 1
   354  			bucket[c1] = b
   355  			sa[b] = int64(i + 1)
   356  			lastB = b
   357  			numLMS++
   358  		}
   359  	}
   360  
   361  	// We recorded the LMS-substring starts but really want the ends.
   362  	// Luckily, with two differences, the start indexes and the end indexes are the same.
   363  	// The first difference is that the rightmost LMS-substring's end index is len(text),
   364  	// so the caller must pretend that sa[-1] == len(text), as noted above.
   365  	// The second difference is that the first leftmost LMS-substring start index
   366  	// does not end an earlier LMS-substring, so as an optimization we can omit
   367  	// that leftmost LMS-substring start index (the last one we wrote).
   368  	//
   369  	// Exception: if numLMS <= 1, the caller is not going to bother with
   370  	// the recursion at all and will treat the result as containing LMS-substring starts.
   371  	// In that case, we don't remove the final entry.
   372  	if numLMS > 1 {
   373  		sa[lastB] = 0
   374  	}
   375  	return numLMS
   376  }
   377  
   378  func placeLMS_32(text []int32, sa, freq, bucket []int32) int {
   379  	bucketMax_32(text, freq, bucket)
   380  
   381  	numLMS := 0
   382  	lastB := int32(-1)
   383  
   384  	// The next stanza of code (until the blank line) loop backward
   385  	// over text, stopping to execute a code body at each position i
   386  	// such that text[i] is an L-character and text[i+1] is an S-character.
   387  	// That is, i+1 is the position of the start of an LMS-substring.
   388  	// These could be hoisted out into a function with a callback,
   389  	// but at a significant speed cost. Instead, we just write these
   390  	// seven lines a few times in this source file. The copies below
   391  	// refer back to the pattern established by this original as the
   392  	// "LMS-substring iterator".
   393  	//
   394  	// In every scan through the text, c0, c1 are successive characters of text.
   395  	// In this backward scan, c0 == text[i] and c1 == text[i+1].
   396  	// By scanning backward, we can keep track of whether the current
   397  	// position is type-S or type-L according to the usual definition:
   398  	//
   399  	//	- position len(text) is type S with text[len(text)] == -1 (the sentinel)
   400  	//	- position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
   401  	//	- position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
   402  	//
   403  	// The backward scan lets us maintain the current type,
   404  	// update it when we see c0 != c1, and otherwise leave it alone.
   405  	// We want to identify all S positions with a preceding L.
   406  	// Position len(text) is one such position by definition, but we have
   407  	// nowhere to write it down, so we eliminate it by untruthfully
   408  	// setting isTypeS = false at the start of the loop.
   409  	c0, c1, isTypeS := int32(0), int32(0), false
   410  	for i := len(text) - 1; i >= 0; i-- {
   411  		c0, c1 = text[i], c0
   412  		if c0 < c1 {
   413  			isTypeS = true
   414  		} else if c0 > c1 && isTypeS {
   415  			isTypeS = false
   416  
   417  			// Bucket the index i+1 for the start of an LMS-substring.
   418  			b := bucket[c1] - 1
   419  			bucket[c1] = b
   420  			sa[b] = int32(i + 1)
   421  			lastB = b
   422  			numLMS++
   423  		}
   424  	}
   425  
   426  	// We recorded the LMS-substring starts but really want the ends.
   427  	// Luckily, with two differences, the start indexes and the end indexes are the same.
   428  	// The first difference is that the rightmost LMS-substring's end index is len(text),
   429  	// so the caller must pretend that sa[-1] == len(text), as noted above.
   430  	// The second difference is that the first leftmost LMS-substring start index
   431  	// does not end an earlier LMS-substring, so as an optimization we can omit
   432  	// that leftmost LMS-substring start index (the last one we wrote).
   433  	//
   434  	// Exception: if numLMS <= 1, the caller is not going to bother with
   435  	// the recursion at all and will treat the result as containing LMS-substring starts.
   436  	// In that case, we don't remove the final entry.
   437  	if numLMS > 1 {
   438  		sa[lastB] = 0
   439  	}
   440  	return numLMS
   441  }
   442  
   443  func placeLMS_64(text []int64, sa, freq, bucket []int64) int {
   444  	bucketMax_64(text, freq, bucket)
   445  
   446  	numLMS := 0
   447  	lastB := int64(-1)
   448  
   449  	// The next stanza of code (until the blank line) loop backward
   450  	// over text, stopping to execute a code body at each position i
   451  	// such that text[i] is an L-character and text[i+1] is an S-character.
   452  	// That is, i+1 is the position of the start of an LMS-substring.
   453  	// These could be hoisted out into a function with a callback,
   454  	// but at a significant speed cost. Instead, we just write these
   455  	// seven lines a few times in this source file. The copies below
   456  	// refer back to the pattern established by this original as the
   457  	// "LMS-substring iterator".
   458  	//
   459  	// In every scan through the text, c0, c1 are successive characters of text.
   460  	// In this backward scan, c0 == text[i] and c1 == text[i+1].
   461  	// By scanning backward, we can keep track of whether the current
   462  	// position is type-S or type-L according to the usual definition:
   463  	//
   464  	//	- position len(text) is type S with text[len(text)] == -1 (the sentinel)
   465  	//	- position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
   466  	//	- position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
   467  	//
   468  	// The backward scan lets us maintain the current type,
   469  	// update it when we see c0 != c1, and otherwise leave it alone.
   470  	// We want to identify all S positions with a preceding L.
   471  	// Position len(text) is one such position by definition, but we have
   472  	// nowhere to write it down, so we eliminate it by untruthfully
   473  	// setting isTypeS = false at the start of the loop.
   474  	c0, c1, isTypeS := int64(0), int64(0), false
   475  	for i := len(text) - 1; i >= 0; i-- {
   476  		c0, c1 = text[i], c0
   477  		if c0 < c1 {
   478  			isTypeS = true
   479  		} else if c0 > c1 && isTypeS {
   480  			isTypeS = false
   481  
   482  			// Bucket the index i+1 for the start of an LMS-substring.
   483  			b := bucket[c1] - 1
   484  			bucket[c1] = b
   485  			sa[b] = int64(i + 1)
   486  			lastB = b
   487  			numLMS++
   488  		}
   489  	}
   490  
   491  	// We recorded the LMS-substring starts but really want the ends.
   492  	// Luckily, with two differences, the start indexes and the end indexes are the same.
   493  	// The first difference is that the rightmost LMS-substring's end index is len(text),
   494  	// so the caller must pretend that sa[-1] == len(text), as noted above.
   495  	// The second difference is that the first leftmost LMS-substring start index
   496  	// does not end an earlier LMS-substring, so as an optimization we can omit
   497  	// that leftmost LMS-substring start index (the last one we wrote).
   498  	//
   499  	// Exception: if numLMS <= 1, the caller is not going to bother with
   500  	// the recursion at all and will treat the result as containing LMS-substring starts.
   501  	// In that case, we don't remove the final entry.
   502  	if numLMS > 1 {
   503  		sa[lastB] = 0
   504  	}
   505  	return numLMS
   506  }
   507  
   508  func induceSubL_8_64(text []byte, sa, freq, bucket []int64) {
   509  	// Initialize positions for left side of character buckets.
   510  	bucketMin_8_64(text, freq, bucket)
   511  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
   512  
   513  	// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
   514  	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
   515  	// Because j-1 is type L, inserting it into sa now will sort it correctly.
   516  	// But we want to distinguish a j-1 with j-2 of type L from type S.
   517  	// We can process the former but want to leave the latter for the caller.
   518  	// We record the difference by negating j-1 if it is preceded by type S.
   519  	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   520  	// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
   521  	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   522  	// and so on, in sorted but not necessarily adjacent order, until it finds
   523  	// one preceded by an index of type S, at which point it must stop.
   524  	//
   525  	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   526  	// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
   527  	// only the indexes of the leftmost L-type indexes for each LMS-substring.
   528  	//
   529  	// The suffix array sa therefore serves simultaneously as input, output,
   530  	// and a miraculously well-tailored work queue.
   531  
   532  	// placeLMS_8_64 left out the implicit entry sa[-1] == len(text),
   533  	// corresponding to the identified type-L index len(text)-1.
   534  	// Process it before the left-to-right scan of sa proper.
   535  	// See body in loop for commentary.
   536  	k := len(text) - 1
   537  	c0, c1 := text[k-1], text[k]
   538  	if c0 < c1 {
   539  		k = -k
   540  	}
   541  
   542  	// Cache recently used bucket index:
   543  	// we're processing suffixes in sorted order
   544  	// and accessing buckets indexed by the
   545  	// byte before the sorted order, which still
   546  	// has very good locality.
   547  	// Invariant: b is cached, possibly dirty copy of bucket[cB].
   548  	cB := c1
   549  	b := bucket[cB]
   550  	sa[b] = int64(k)
   551  	b++
   552  
   553  	for i := 0; i < len(sa); i++ {
   554  		j := int(sa[i])
   555  		if j == 0 {
   556  			// Skip empty entry.
   557  			continue
   558  		}
   559  		if j < 0 {
   560  			// Leave discovered type-S index for caller.
   561  			sa[i] = int64(-j)
   562  			continue
   563  		}
   564  		sa[i] = 0
   565  
   566  		// Index j was on work queue, meaning k := j-1 is L-type,
   567  		// so we can now place k correctly into sa.
   568  		// If k-1 is L-type, queue k for processing later in this loop.
   569  		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
   570  		k := j - 1
   571  		c0, c1 := text[k-1], text[k]
   572  		if c0 < c1 {
   573  			k = -k
   574  		}
   575  
   576  		if cB != c1 {
   577  			bucket[cB] = b
   578  			cB = c1
   579  			b = bucket[cB]
   580  		}
   581  		sa[b] = int64(k)
   582  		b++
   583  	}
   584  }
   585  
   586  func induceSubL_32(text []int32, sa, freq, bucket []int32) {
   587  	// Initialize positions for left side of character buckets.
   588  	bucketMin_32(text, freq, bucket)
   589  
   590  	// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
   591  	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
   592  	// Because j-1 is type L, inserting it into sa now will sort it correctly.
   593  	// But we want to distinguish a j-1 with j-2 of type L from type S.
   594  	// We can process the former but want to leave the latter for the caller.
   595  	// We record the difference by negating j-1 if it is preceded by type S.
   596  	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   597  	// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
   598  	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   599  	// and so on, in sorted but not necessarily adjacent order, until it finds
   600  	// one preceded by an index of type S, at which point it must stop.
   601  	//
   602  	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   603  	// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
   604  	// only the indexes of the leftmost L-type indexes for each LMS-substring.
   605  	//
   606  	// The suffix array sa therefore serves simultaneously as input, output,
   607  	// and a miraculously well-tailored work queue.
   608  
   609  	// placeLMS_32 left out the implicit entry sa[-1] == len(text),
   610  	// corresponding to the identified type-L index len(text)-1.
   611  	// Process it before the left-to-right scan of sa proper.
   612  	// See body in loop for commentary.
   613  	k := len(text) - 1
   614  	c0, c1 := text[k-1], text[k]
   615  	if c0 < c1 {
   616  		k = -k
   617  	}
   618  
   619  	// Cache recently used bucket index:
   620  	// we're processing suffixes in sorted order
   621  	// and accessing buckets indexed by the
   622  	// int32 before the sorted order, which still
   623  	// has very good locality.
   624  	// Invariant: b is cached, possibly dirty copy of bucket[cB].
   625  	cB := c1
   626  	b := bucket[cB]
   627  	sa[b] = int32(k)
   628  	b++
   629  
   630  	for i := 0; i < len(sa); i++ {
   631  		j := int(sa[i])
   632  		if j == 0 {
   633  			// Skip empty entry.
   634  			continue
   635  		}
   636  		if j < 0 {
   637  			// Leave discovered type-S index for caller.
   638  			sa[i] = int32(-j)
   639  			continue
   640  		}
   641  		sa[i] = 0
   642  
   643  		// Index j was on work queue, meaning k := j-1 is L-type,
   644  		// so we can now place k correctly into sa.
   645  		// If k-1 is L-type, queue k for processing later in this loop.
   646  		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
   647  		k := j - 1
   648  		c0, c1 := text[k-1], text[k]
   649  		if c0 < c1 {
   650  			k = -k
   651  		}
   652  
   653  		if cB != c1 {
   654  			bucket[cB] = b
   655  			cB = c1
   656  			b = bucket[cB]
   657  		}
   658  		sa[b] = int32(k)
   659  		b++
   660  	}
   661  }
   662  
   663  func induceSubL_64(text []int64, sa, freq, bucket []int64) {
   664  	// Initialize positions for left side of character buckets.
   665  	bucketMin_64(text, freq, bucket)
   666  
   667  	// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
   668  	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
   669  	// Because j-1 is type L, inserting it into sa now will sort it correctly.
   670  	// But we want to distinguish a j-1 with j-2 of type L from type S.
   671  	// We can process the former but want to leave the latter for the caller.
   672  	// We record the difference by negating j-1 if it is preceded by type S.
   673  	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   674  	// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
   675  	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   676  	// and so on, in sorted but not necessarily adjacent order, until it finds
   677  	// one preceded by an index of type S, at which point it must stop.
   678  	//
   679  	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   680  	// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
   681  	// only the indexes of the leftmost L-type indexes for each LMS-substring.
   682  	//
   683  	// The suffix array sa therefore serves simultaneously as input, output,
   684  	// and a miraculously well-tailored work queue.
   685  
   686  	// placeLMS_64 left out the implicit entry sa[-1] == len(text),
   687  	// corresponding to the identified type-L index len(text)-1.
   688  	// Process it before the left-to-right scan of sa proper.
   689  	// See body in loop for commentary.
   690  	k := len(text) - 1
   691  	c0, c1 := text[k-1], text[k]
   692  	if c0 < c1 {
   693  		k = -k
   694  	}
   695  
   696  	// Cache recently used bucket index:
   697  	// we're processing suffixes in sorted order
   698  	// and accessing buckets indexed by the
   699  	// int64 before the sorted order, which still
   700  	// has very good locality.
   701  	// Invariant: b is cached, possibly dirty copy of bucket[cB].
   702  	cB := c1
   703  	b := bucket[cB]
   704  	sa[b] = int64(k)
   705  	b++
   706  
   707  	for i := 0; i < len(sa); i++ {
   708  		j := int(sa[i])
   709  		if j == 0 {
   710  			// Skip empty entry.
   711  			continue
   712  		}
   713  		if j < 0 {
   714  			// Leave discovered type-S index for caller.
   715  			sa[i] = int64(-j)
   716  			continue
   717  		}
   718  		sa[i] = 0
   719  
   720  		// Index j was on work queue, meaning k := j-1 is L-type,
   721  		// so we can now place k correctly into sa.
   722  		// If k-1 is L-type, queue k for processing later in this loop.
   723  		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
   724  		k := j - 1
   725  		c0, c1 := text[k-1], text[k]
   726  		if c0 < c1 {
   727  			k = -k
   728  		}
   729  
   730  		if cB != c1 {
   731  			bucket[cB] = b
   732  			cB = c1
   733  			b = bucket[cB]
   734  		}
   735  		sa[b] = int64(k)
   736  		b++
   737  	}
   738  }
   739  
   740  func induceSubS_8_64(text []byte, sa, freq, bucket []int64) {
   741  	// Initialize positions for right side of character buckets.
   742  	bucketMax_8_64(text, freq, bucket)
   743  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
   744  
   745  	// Analogous to induceSubL_8_64 above,
   746  	// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
   747  	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
   748  	// Because j-1 is type S, inserting it into sa now will sort it correctly.
   749  	// But we want to distinguish a j-1 with j-2 of type S from type L.
   750  	// We can process the former but want to leave the latter for the caller.
   751  	// We record the difference by negating j-1 if it is preceded by type L.
   752  	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   753  	// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
   754  	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   755  	// and so on, in sorted but not necessarily adjacent order, until it finds
   756  	// one preceded by an index of type L, at which point it must stop.
   757  	// That index (preceded by one of type L) is an LMS-substring start.
   758  	//
   759  	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   760  	// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
   761  	// so that the loop finishes with the top of sa containing exactly
   762  	// the LMS-substring start indexes, sorted by LMS-substring.
   763  
   764  	// Cache recently used bucket index:
   765  	cB := byte(0)
   766  	b := bucket[cB]
   767  
   768  	top := len(sa)
   769  	for i := len(sa) - 1; i >= 0; i-- {
   770  		j := int(sa[i])
   771  		if j == 0 {
   772  			// Skip empty entry.
   773  			continue
   774  		}
   775  		sa[i] = 0
   776  		if j < 0 {
   777  			// Leave discovered LMS-substring start index for caller.
   778  			top--
   779  			sa[top] = int64(-j)
   780  			continue
   781  		}
   782  
   783  		// Index j was on work queue, meaning k := j-1 is S-type,
   784  		// so we can now place k correctly into sa.
   785  		// If k-1 is S-type, queue k for processing later in this loop.
   786  		// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
   787  		k := j - 1
   788  		c1 := text[k]
   789  		c0 := text[k-1]
   790  		if c0 > c1 {
   791  			k = -k
   792  		}
   793  
   794  		if cB != c1 {
   795  			bucket[cB] = b
   796  			cB = c1
   797  			b = bucket[cB]
   798  		}
   799  		b--
   800  		sa[b] = int64(k)
   801  	}
   802  }
   803  
   804  func induceSubS_32(text []int32, sa, freq, bucket []int32) {
   805  	// Initialize positions for right side of character buckets.
   806  	bucketMax_32(text, freq, bucket)
   807  
   808  	// Analogous to induceSubL_32 above,
   809  	// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
   810  	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
   811  	// Because j-1 is type S, inserting it into sa now will sort it correctly.
   812  	// But we want to distinguish a j-1 with j-2 of type S from type L.
   813  	// We can process the former but want to leave the latter for the caller.
   814  	// We record the difference by negating j-1 if it is preceded by type L.
   815  	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   816  	// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
   817  	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   818  	// and so on, in sorted but not necessarily adjacent order, until it finds
   819  	// one preceded by an index of type L, at which point it must stop.
   820  	// That index (preceded by one of type L) is an LMS-substring start.
   821  	//
   822  	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   823  	// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
   824  	// so that the loop finishes with the top of sa containing exactly
   825  	// the LMS-substring start indexes, sorted by LMS-substring.
   826  
   827  	// Cache recently used bucket index:
   828  	cB := int32(0)
   829  	b := bucket[cB]
   830  
   831  	top := len(sa)
   832  	for i := len(sa) - 1; i >= 0; i-- {
   833  		j := int(sa[i])
   834  		if j == 0 {
   835  			// Skip empty entry.
   836  			continue
   837  		}
   838  		sa[i] = 0
   839  		if j < 0 {
   840  			// Leave discovered LMS-substring start index for caller.
   841  			top--
   842  			sa[top] = int32(-j)
   843  			continue
   844  		}
   845  
   846  		// Index j was on work queue, meaning k := j-1 is S-type,
   847  		// so we can now place k correctly into sa.
   848  		// If k-1 is S-type, queue k for processing later in this loop.
   849  		// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
   850  		k := j - 1
   851  		c1 := text[k]
   852  		c0 := text[k-1]
   853  		if c0 > c1 {
   854  			k = -k
   855  		}
   856  
   857  		if cB != c1 {
   858  			bucket[cB] = b
   859  			cB = c1
   860  			b = bucket[cB]
   861  		}
   862  		b--
   863  		sa[b] = int32(k)
   864  	}
   865  }
   866  
   867  func induceSubS_64(text []int64, sa, freq, bucket []int64) {
   868  	// Initialize positions for right side of character buckets.
   869  	bucketMax_64(text, freq, bucket)
   870  
   871  	// Analogous to induceSubL_64 above,
   872  	// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
   873  	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
   874  	// Because j-1 is type S, inserting it into sa now will sort it correctly.
   875  	// But we want to distinguish a j-1 with j-2 of type S from type L.
   876  	// We can process the former but want to leave the latter for the caller.
   877  	// We record the difference by negating j-1 if it is preceded by type L.
   878  	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   879  	// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
   880  	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   881  	// and so on, in sorted but not necessarily adjacent order, until it finds
   882  	// one preceded by an index of type L, at which point it must stop.
   883  	// That index (preceded by one of type L) is an LMS-substring start.
   884  	//
   885  	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   886  	// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
   887  	// so that the loop finishes with the top of sa containing exactly
   888  	// the LMS-substring start indexes, sorted by LMS-substring.
   889  
   890  	// Cache recently used bucket index:
   891  	cB := int64(0)
   892  	b := bucket[cB]
   893  
   894  	top := len(sa)
   895  	for i := len(sa) - 1; i >= 0; i-- {
   896  		j := int(sa[i])
   897  		if j == 0 {
   898  			// Skip empty entry.
   899  			continue
   900  		}
   901  		sa[i] = 0
   902  		if j < 0 {
   903  			// Leave discovered LMS-substring start index for caller.
   904  			top--
   905  			sa[top] = int64(-j)
   906  			continue
   907  		}
   908  
   909  		// Index j was on work queue, meaning k := j-1 is S-type,
   910  		// so we can now place k correctly into sa.
   911  		// If k-1 is S-type, queue k for processing later in this loop.
   912  		// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
   913  		k := j - 1
   914  		c1 := text[k]
   915  		c0 := text[k-1]
   916  		if c0 > c1 {
   917  			k = -k
   918  		}
   919  
   920  		if cB != c1 {
   921  			bucket[cB] = b
   922  			cB = c1
   923  			b = bucket[cB]
   924  		}
   925  		b--
   926  		sa[b] = int64(k)
   927  	}
   928  }
   929  
   930  func length_8_64(text []byte, sa []int64, numLMS int) {
   931  	end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)
   932  
   933  	// The encoding of N text bytes into a “length” word
   934  	// adds 1 to each byte, packs them into the bottom
   935  	// N*8 bits of a word, and then bitwise inverts the result.
   936  	// That is, the text sequence A B C (hex 41 42 43)
   937  	// encodes as ^uint64(0x42_43_44).
   938  	// LMS-substrings can never start or end with 0xFF.
   939  	// Adding 1 ensures the encoded byte sequence never
   940  	// starts or ends with 0x00, so that present bytes can be
   941  	// distinguished from zero-padding in the top bits,
   942  	// so the length need not be separately encoded.
   943  	// Inverting the bytes increases the chance that a
   944  	// 4-byte encoding will still be ≥ len(text).
   945  	// In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F)
   946  	// then the high bit of the inversion will be set,
   947  	// making it clearly not a valid length (it would be a negative one).
   948  	//
   949  	// cx holds the pre-inverted encoding (the packed incremented bytes).
   950  	cx := uint64(0) // byte-only
   951  
   952  	// This stanza (until the blank line) is the "LMS-substring iterator",
   953  	// described in placeLMS_8_64 above, with one line added to maintain cx.
   954  	c0, c1, isTypeS := byte(0), byte(0), false
   955  	for i := len(text) - 1; i >= 0; i-- {
   956  		c0, c1 = text[i], c0
   957  		cx = cx<<8 | uint64(c1+1) // byte-only
   958  		if c0 < c1 {
   959  			isTypeS = true
   960  		} else if c0 > c1 && isTypeS {
   961  			isTypeS = false
   962  
   963  			// Index j = i+1 is the start of an LMS-substring.
   964  			// Compute length or encoded text to store in sa[j/2].
   965  			j := i + 1
   966  			var code int64
   967  			if end == 0 {
   968  				code = 0
   969  			} else {
   970  				code = int64(end - j)
   971  				if code <= 64/8 && ^cx >= uint64(len(text)) { // byte-only
   972  					code = int64(^cx) // byte-only
   973  				} // byte-only
   974  			}
   975  			sa[j>>1] = code
   976  			end = j + 1
   977  			cx = uint64(c1 + 1) // byte-only
   978  		}
   979  	}
   980  }
   981  
   982  func length_32(text []int32, sa []int32, numLMS int) {
   983  	end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)
   984  
   985  	// The encoding of N text int32s into a “length” word
   986  	// adds 1 to each int32, packs them into the bottom
   987  	// N*8 bits of a word, and then bitwise inverts the result.
   988  	// That is, the text sequence A B C (hex 41 42 43)
   989  	// encodes as ^uint32(0x42_43_44).
   990  	// LMS-substrings can never start or end with 0xFF.
   991  	// Adding 1 ensures the encoded int32 sequence never
   992  	// starts or ends with 0x00, so that present int32s can be
   993  	// distinguished from zero-padding in the top bits,
   994  	// so the length need not be separately encoded.
   995  	// Inverting the int32s increases the chance that a
   996  	// 4-int32 encoding will still be ≥ len(text).
   997  	// In particular, if the first int32 is ASCII (<= 0x7E, so +1 <= 0x7F)
   998  	// then the high bit of the inversion will be set,
   999  	// making it clearly not a valid length (it would be a negative one).
  1000  	//
  1001  	// cx holds the pre-inverted encoding (the packed incremented int32s).
  1002  
  1003  	// This stanza (until the blank line) is the "LMS-substring iterator",
  1004  	// described in placeLMS_32 above, with one line added to maintain cx.
  1005  	c0, c1, isTypeS := int32(0), int32(0), false
  1006  	for i := len(text) - 1; i >= 0; i-- {
  1007  		c0, c1 = text[i], c0
  1008  		if c0 < c1 {
  1009  			isTypeS = true
  1010  		} else if c0 > c1 && isTypeS {
  1011  			isTypeS = false
  1012  
  1013  			// Index j = i+1 is the start of an LMS-substring.
  1014  			// Compute length or encoded text to store in sa[j/2].
  1015  			j := i + 1
  1016  			var code int32
  1017  			if end == 0 {
  1018  				code = 0
  1019  			} else {
  1020  				code = int32(end - j)
  1021  			}
  1022  			sa[j>>1] = code
  1023  			end = j + 1
  1024  		}
  1025  	}
  1026  }
  1027  
  1028  func length_64(text []int64, sa []int64, numLMS int) {
  1029  	end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)
  1030  
  1031  	// The encoding of N text int64s into a “length” word
  1032  	// adds 1 to each int64, packs them into the bottom
  1033  	// N*8 bits of a word, and then bitwise inverts the result.
  1034  	// That is, the text sequence A B C (hex 41 42 43)
  1035  	// encodes as ^uint64(0x42_43_44).
  1036  	// LMS-substrings can never start or end with 0xFF.
  1037  	// Adding 1 ensures the encoded int64 sequence never
  1038  	// starts or ends with 0x00, so that present int64s can be
  1039  	// distinguished from zero-padding in the top bits,
  1040  	// so the length need not be separately encoded.
  1041  	// Inverting the int64s increases the chance that a
  1042  	// 4-int64 encoding will still be ≥ len(text).
  1043  	// In particular, if the first int64 is ASCII (<= 0x7E, so +1 <= 0x7F)
  1044  	// then the high bit of the inversion will be set,
  1045  	// making it clearly not a valid length (it would be a negative one).
  1046  	//
  1047  	// cx holds the pre-inverted encoding (the packed incremented int64s).
  1048  
  1049  	// This stanza (until the blank line) is the "LMS-substring iterator",
  1050  	// described in placeLMS_64 above, with one line added to maintain cx.
  1051  	c0, c1, isTypeS := int64(0), int64(0), false
  1052  	for i := len(text) - 1; i >= 0; i-- {
  1053  		c0, c1 = text[i], c0
  1054  		if c0 < c1 {
  1055  			isTypeS = true
  1056  		} else if c0 > c1 && isTypeS {
  1057  			isTypeS = false
  1058  
  1059  			// Index j = i+1 is the start of an LMS-substring.
  1060  			// Compute length or encoded text to store in sa[j/2].
  1061  			j := i + 1
  1062  			var code int64
  1063  			if end == 0 {
  1064  				code = 0
  1065  			} else {
  1066  				code = int64(end - j)
  1067  			}
  1068  			sa[j>>1] = code
  1069  			end = j + 1
  1070  		}
  1071  	}
  1072  }
  1073  
  1074  func assignID_8_64(text []byte, sa []int64, numLMS int) int {
  1075  	id := 0
  1076  	lastLen := int64(-1) // impossible
  1077  	lastPos := int64(0)
  1078  	for _, j := range sa[len(sa)-numLMS:] {
  1079  		// Is the LMS-substring at index j new, or is it the same as the last one we saw?
  1080  		n := sa[j/2]
  1081  		if n != lastLen {
  1082  			goto New
  1083  		}
  1084  		if uint64(n) >= uint64(len(text)) {
  1085  			// “Length” is really encoded full text, and they match.
  1086  			goto Same
  1087  		}
  1088  		{
  1089  			// Compare actual texts.
  1090  			n := int(n)
  1091  			this := text[j:][:n]
  1092  			last := text[lastPos:][:n]
  1093  			for i := 0; i < n; i++ {
  1094  				if this[i] != last[i] {
  1095  					goto New
  1096  				}
  1097  			}
  1098  			goto Same
  1099  		}
  1100  	New:
  1101  		id++
  1102  		lastPos = j
  1103  		lastLen = n
  1104  	Same:
  1105  		sa[j/2] = int64(id)
  1106  	}
  1107  	return id
  1108  }
  1109  
  1110  func assignID_32(text []int32, sa []int32, numLMS int) int {
  1111  	id := 0
  1112  	lastLen := int32(-1) // impossible
  1113  	lastPos := int32(0)
  1114  	for _, j := range sa[len(sa)-numLMS:] {
  1115  		// Is the LMS-substring at index j new, or is it the same as the last one we saw?
  1116  		n := sa[j/2]
  1117  		if n != lastLen {
  1118  			goto New
  1119  		}
  1120  		if uint32(n) >= uint32(len(text)) {
  1121  			// “Length” is really encoded full text, and they match.
  1122  			goto Same
  1123  		}
  1124  		{
  1125  			// Compare actual texts.
  1126  			n := int(n)
  1127  			this := text[j:][:n]
  1128  			last := text[lastPos:][:n]
  1129  			for i := 0; i < n; i++ {
  1130  				if this[i] != last[i] {
  1131  					goto New
  1132  				}
  1133  			}
  1134  			goto Same
  1135  		}
  1136  	New:
  1137  		id++
  1138  		lastPos = j
  1139  		lastLen = n
  1140  	Same:
  1141  		sa[j/2] = int32(id)
  1142  	}
  1143  	return id
  1144  }
  1145  
  1146  func assignID_64(text []int64, sa []int64, numLMS int) int {
  1147  	id := 0
  1148  	lastLen := int64(-1) // impossible
  1149  	lastPos := int64(0)
  1150  	for _, j := range sa[len(sa)-numLMS:] {
  1151  		// Is the LMS-substring at index j new, or is it the same as the last one we saw?
  1152  		n := sa[j/2]
  1153  		if n != lastLen {
  1154  			goto New
  1155  		}
  1156  		if uint64(n) >= uint64(len(text)) {
  1157  			// “Length” is really encoded full text, and they match.
  1158  			goto Same
  1159  		}
  1160  		{
  1161  			// Compare actual texts.
  1162  			n := int(n)
  1163  			this := text[j:][:n]
  1164  			last := text[lastPos:][:n]
  1165  			for i := 0; i < n; i++ {
  1166  				if this[i] != last[i] {
  1167  					goto New
  1168  				}
  1169  			}
  1170  			goto Same
  1171  		}
  1172  	New:
  1173  		id++
  1174  		lastPos = j
  1175  		lastLen = n
  1176  	Same:
  1177  		sa[j/2] = int64(id)
  1178  	}
  1179  	return id
  1180  }
  1181  
  1182  func map_64(sa []int64, numLMS int) {
  1183  	w := len(sa)
  1184  	for i := len(sa) / 2; i >= 0; i-- {
  1185  		j := sa[i]
  1186  		if j > 0 {
  1187  			w--
  1188  			sa[w] = j - 1
  1189  		}
  1190  	}
  1191  }
  1192  
  1193  func recurse_64(sa, oldTmp []int64, numLMS, maxID int) {
  1194  	dst, saTmp, text := sa[:numLMS], sa[numLMS:len(sa)-numLMS], sa[len(sa)-numLMS:]
  1195  
  1196  	// Set up temporary space for recursive call.
  1197  	// We must pass sais_64 a tmp buffer wiith at least maxID entries.
  1198  	//
  1199  	// The subproblem is guaranteed to have length at most len(sa)/2,
  1200  	// so that sa can hold both the subproblem and its suffix array.
  1201  	// Nearly all the time, however, the subproblem has length < len(sa)/3,
  1202  	// in which case there is a subproblem-sized middle of sa that
  1203  	// we can reuse for temporary space (saTmp).
  1204  	// When recurse_64 is called from sais_8_64, oldTmp is length 512
  1205  	// (from text_64), and saTmp will typically be much larger, so we'll use saTmp.
  1206  	// When deeper recursions come back to recurse_64, now oldTmp is
  1207  	// the saTmp from the top-most recursion, it is typically larger than
  1208  	// the current saTmp (because the current sa gets smaller and smaller
  1209  	// as the recursion gets deeper), and we keep reusing that top-most
  1210  	// large saTmp instead of the offered smaller ones.
  1211  	//
  1212  	// Why is the subproblem length so often just under len(sa)/3?
  1213  	// See Nong, Zhang, and Chen, section 3.6 for a plausible explanation.
  1214  	// In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern
  1215  	// in the input, perfect alternation of larger and smaller input bytes.
  1216  	// Real text doesn't do that. If each L-type index is randomly followed
  1217  	// by either an L-type or S-type index, then half the substrings will
  1218  	// be of the form SLS, but the other half will be longer. Of that half,
  1219  	// half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on.
  1220  	// Not counting the final S in each (which overlaps the first S in the next),
  1221  	// This works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3.
  1222  	// The space we need is further reduced by the fact that many of the
  1223  	// short patterns like SLS will often be the same character sequences
  1224  	// repeated throughout the text, reducing maxID relative to numLMS.
  1225  	//
  1226  	// For short inputs, the averages may not run in our favor, but then we
  1227  	// can often fall back to using the length-512 tmp available in the
  1228  	// top-most call. (Also a short allocation would not be a big deal.)
  1229  	//
  1230  	// For pathological inputs, we fall back to allocating a new tmp of length
  1231  	// max(maxID, numLMS/2). This level of the recursion needs maxID,
  1232  	// and all deeper levels of the recursion will need no more than numLMS/2,
  1233  	// so this one allocation is guaranteed to suffice for the entire stack
  1234  	// of recursive calls.
  1235  	tmp := oldTmp
  1236  	if len(tmp) < len(saTmp) {
  1237  		tmp = saTmp
  1238  	}
  1239  	if len(tmp) < numLMS {
  1240  		// TestSAIS/forcealloc reaches this code.
  1241  		n := maxID
  1242  		if n < numLMS/2 {
  1243  			n = numLMS / 2
  1244  		}
  1245  		tmp = make([]int64, n)
  1246  	}
  1247  
  1248  	// sais_64 requires that the caller arrange to clear dst,
  1249  	// because in general the caller may know dst is
  1250  	// freshly-allocated and already cleared. But this one is not.
  1251  	for i := range dst {
  1252  		dst[i] = 0
  1253  	}
  1254  	sais_64(text, maxID, dst, tmp)
  1255  }
  1256  
  1257  func unmap_8_64(text []byte, sa []int64, numLMS int) {
  1258  	unmap := sa[len(sa)-numLMS:]
  1259  	j := len(unmap)
  1260  
  1261  	// "LMS-substring iterator" (see placeLMS_8_64 above).
  1262  	c0, c1, isTypeS := byte(0), byte(0), false
  1263  	for i := len(text) - 1; i >= 0; i-- {
  1264  		c0, c1 = text[i], c0
  1265  		if c0 < c1 {
  1266  			isTypeS = true
  1267  		} else if c0 > c1 && isTypeS {
  1268  			isTypeS = false
  1269  
  1270  			// Populate inverse map.
  1271  			j--
  1272  			unmap[j] = int64(i + 1)
  1273  		}
  1274  	}
  1275  
  1276  	// Apply inverse map to subproblem suffix array.
  1277  	sa = sa[:numLMS]
  1278  	for i := 0; i < len(sa); i++ {
  1279  		sa[i] = unmap[sa[i]]
  1280  	}
  1281  }
  1282  
  1283  func unmap_32(text []int32, sa []int32, numLMS int) {
  1284  	unmap := sa[len(sa)-numLMS:]
  1285  	j := len(unmap)
  1286  
  1287  	// "LMS-substring iterator" (see placeLMS_32 above).
  1288  	c0, c1, isTypeS := int32(0), int32(0), false
  1289  	for i := len(text) - 1; i >= 0; i-- {
  1290  		c0, c1 = text[i], c0
  1291  		if c0 < c1 {
  1292  			isTypeS = true
  1293  		} else if c0 > c1 && isTypeS {
  1294  			isTypeS = false
  1295  
  1296  			// Populate inverse map.
  1297  			j--
  1298  			unmap[j] = int32(i + 1)
  1299  		}
  1300  	}
  1301  
  1302  	// Apply inverse map to subproblem suffix array.
  1303  	sa = sa[:numLMS]
  1304  	for i := 0; i < len(sa); i++ {
  1305  		sa[i] = unmap[sa[i]]
  1306  	}
  1307  }
  1308  
  1309  func unmap_64(text []int64, sa []int64, numLMS int) {
  1310  	unmap := sa[len(sa)-numLMS:]
  1311  	j := len(unmap)
  1312  
  1313  	// "LMS-substring iterator" (see placeLMS_64 above).
  1314  	c0, c1, isTypeS := int64(0), int64(0), false
  1315  	for i := len(text) - 1; i >= 0; i-- {
  1316  		c0, c1 = text[i], c0
  1317  		if c0 < c1 {
  1318  			isTypeS = true
  1319  		} else if c0 > c1 && isTypeS {
  1320  			isTypeS = false
  1321  
  1322  			// Populate inverse map.
  1323  			j--
  1324  			unmap[j] = int64(i + 1)
  1325  		}
  1326  	}
  1327  
  1328  	// Apply inverse map to subproblem suffix array.
  1329  	sa = sa[:numLMS]
  1330  	for i := 0; i < len(sa); i++ {
  1331  		sa[i] = unmap[sa[i]]
  1332  	}
  1333  }
  1334  
  1335  func expand_8_64(text []byte, freq, bucket, sa []int64, numLMS int) {
  1336  	bucketMax_8_64(text, freq, bucket)
  1337  	bucket = bucket[:256] // eliminate bound check for bucket[c] below
  1338  
  1339  	// Loop backward through sa, always tracking
  1340  	// the next index to populate from sa[:numLMS].
  1341  	// When we get to one, populate it.
  1342  	// Zero the rest of the slots; they have dead values in them.
  1343  	x := numLMS - 1
  1344  	saX := sa[x]
  1345  	c := text[saX]
  1346  	b := bucket[c] - 1
  1347  	bucket[c] = b
  1348  
  1349  	for i := len(sa) - 1; i >= 0; i-- {
  1350  		if i != int(b) {
  1351  			sa[i] = 0
  1352  			continue
  1353  		}
  1354  		sa[i] = saX
  1355  
  1356  		// Load next entry to put down (if any).
  1357  		if x > 0 {
  1358  			x--
  1359  			saX = sa[x] // TODO bounds check
  1360  			c = text[saX]
  1361  			b = bucket[c] - 1
  1362  			bucket[c] = b
  1363  		}
  1364  	}
  1365  }
  1366  
  1367  func expand_32(text []int32, freq, bucket, sa []int32, numLMS int) {
  1368  	bucketMax_32(text, freq, bucket)
  1369  
  1370  	// Loop backward through sa, always tracking
  1371  	// the next index to populate from sa[:numLMS].
  1372  	// When we get to one, populate it.
  1373  	// Zero the rest of the slots; they have dead values in them.
  1374  	x := numLMS - 1
  1375  	saX := sa[x]
  1376  	c := text[saX]
  1377  	b := bucket[c] - 1
  1378  	bucket[c] = b
  1379  
  1380  	for i := len(sa) - 1; i >= 0; i-- {
  1381  		if i != int(b) {
  1382  			sa[i] = 0
  1383  			continue
  1384  		}
  1385  		sa[i] = saX
  1386  
  1387  		// Load next entry to put down (if any).
  1388  		if x > 0 {
  1389  			x--
  1390  			saX = sa[x] // TODO bounds check
  1391  			c = text[saX]
  1392  			b = bucket[c] - 1
  1393  			bucket[c] = b
  1394  		}
  1395  	}
  1396  }
  1397  
  1398  func expand_64(text []int64, freq, bucket, sa []int64, numLMS int) {
  1399  	bucketMax_64(text, freq, bucket)
  1400  
  1401  	// Loop backward through sa, always tracking
  1402  	// the next index to populate from sa[:numLMS].
  1403  	// When we get to one, populate it.
  1404  	// Zero the rest of the slots; they have dead values in them.
  1405  	x := numLMS - 1
  1406  	saX := sa[x]
  1407  	c := text[saX]
  1408  	b := bucket[c] - 1
  1409  	bucket[c] = b
  1410  
  1411  	for i := len(sa) - 1; i >= 0; i-- {
  1412  		if i != int(b) {
  1413  			sa[i] = 0
  1414  			continue
  1415  		}
  1416  		sa[i] = saX
  1417  
  1418  		// Load next entry to put down (if any).
  1419  		if x > 0 {
  1420  			x--
  1421  			saX = sa[x] // TODO bounds check
  1422  			c = text[saX]
  1423  			b = bucket[c] - 1
  1424  			bucket[c] = b
  1425  		}
  1426  	}
  1427  }
  1428  
  1429  func induceL_8_64(text []byte, sa, freq, bucket []int64) {
  1430  	// Initialize positions for left side of character buckets.
  1431  	bucketMin_8_64(text, freq, bucket)
  1432  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
  1433  
  1434  	// This scan is similar to the one in induceSubL_8_64 above.
  1435  	// That one arranges to clear all but the leftmost L-type indexes.
  1436  	// This scan leaves all the L-type indexes and the original S-type
  1437  	// indexes, but it negates the positive leftmost L-type indexes
  1438  	// (the ones that induceS_8_64 needs to process).
  1439  
  1440  	// expand_8_64 left out the implicit entry sa[-1] == len(text),
  1441  	// corresponding to the identified type-L index len(text)-1.
  1442  	// Process it before the left-to-right scan of sa proper.
  1443  	// See body in loop for commentary.
  1444  	k := len(text) - 1
  1445  	c0, c1 := text[k-1], text[k]
  1446  	if c0 < c1 {
  1447  		k = -k
  1448  	}
  1449  
  1450  	// Cache recently used bucket index.
  1451  	cB := c1
  1452  	b := bucket[cB]
  1453  	sa[b] = int64(k)
  1454  	b++
  1455  
  1456  	for i := 0; i < len(sa); i++ {
  1457  		j := int(sa[i])
  1458  		if j <= 0 {
  1459  			// Skip empty or negated entry (including negated zero).
  1460  			continue
  1461  		}
  1462  
  1463  		// Index j was on work queue, meaning k := j-1 is L-type,
  1464  		// so we can now place k correctly into sa.
  1465  		// If k-1 is L-type, queue k for processing later in this loop.
  1466  		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
  1467  		// If k is zero, k-1 doesn't exist, so we only need to leave it
  1468  		// for the caller. The caller can't tell the difference between
  1469  		// an empty slot and a non-empty zero, but there's no need
  1470  		// to distinguish them anyway: the final suffix array will end up
  1471  		// with one zero somewhere, and that will be a real zero.
  1472  		k := j - 1
  1473  		c1 := text[k]
  1474  		if k > 0 {
  1475  			if c0 := text[k-1]; c0 < c1 {
  1476  				k = -k
  1477  			}
  1478  		}
  1479  
  1480  		if cB != c1 {
  1481  			bucket[cB] = b
  1482  			cB = c1
  1483  			b = bucket[cB]
  1484  		}
  1485  		sa[b] = int64(k)
  1486  		b++
  1487  	}
  1488  }
  1489  
  1490  func induceL_32(text []int32, sa, freq, bucket []int32) {
  1491  	// Initialize positions for left side of character buckets.
  1492  	bucketMin_32(text, freq, bucket)
  1493  
  1494  	// This scan is similar to the one in induceSubL_32 above.
  1495  	// That one arranges to clear all but the leftmost L-type indexes.
  1496  	// This scan leaves all the L-type indexes and the original S-type
  1497  	// indexes, but it negates the positive leftmost L-type indexes
  1498  	// (the ones that induceS_32 needs to process).
  1499  
  1500  	// expand_32 left out the implicit entry sa[-1] == len(text),
  1501  	// corresponding to the identified type-L index len(text)-1.
  1502  	// Process it before the left-to-right scan of sa proper.
  1503  	// See body in loop for commentary.
  1504  	k := len(text) - 1
  1505  	c0, c1 := text[k-1], text[k]
  1506  	if c0 < c1 {
  1507  		k = -k
  1508  	}
  1509  
  1510  	// Cache recently used bucket index.
  1511  	cB := c1
  1512  	b := bucket[cB]
  1513  	sa[b] = int32(k)
  1514  	b++
  1515  
  1516  	for i := 0; i < len(sa); i++ {
  1517  		j := int(sa[i])
  1518  		if j <= 0 {
  1519  			// Skip empty or negated entry (including negated zero).
  1520  			continue
  1521  		}
  1522  
  1523  		// Index j was on work queue, meaning k := j-1 is L-type,
  1524  		// so we can now place k correctly into sa.
  1525  		// If k-1 is L-type, queue k for processing later in this loop.
  1526  		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
  1527  		// If k is zero, k-1 doesn't exist, so we only need to leave it
  1528  		// for the caller. The caller can't tell the difference between
  1529  		// an empty slot and a non-empty zero, but there's no need
  1530  		// to distinguish them anyway: the final suffix array will end up
  1531  		// with one zero somewhere, and that will be a real zero.
  1532  		k := j - 1
  1533  		c1 := text[k]
  1534  		if k > 0 {
  1535  			if c0 := text[k-1]; c0 < c1 {
  1536  				k = -k
  1537  			}
  1538  		}
  1539  
  1540  		if cB != c1 {
  1541  			bucket[cB] = b
  1542  			cB = c1
  1543  			b = bucket[cB]
  1544  		}
  1545  		sa[b] = int32(k)
  1546  		b++
  1547  	}
  1548  }
  1549  
  1550  func induceL_64(text []int64, sa, freq, bucket []int64) {
  1551  	// Initialize positions for left side of character buckets.
  1552  	bucketMin_64(text, freq, bucket)
  1553  
  1554  	// This scan is similar to the one in induceSubL_64 above.
  1555  	// That one arranges to clear all but the leftmost L-type indexes.
  1556  	// This scan leaves all the L-type indexes and the original S-type
  1557  	// indexes, but it negates the positive leftmost L-type indexes
  1558  	// (the ones that induceS_64 needs to process).
  1559  
  1560  	// expand_64 left out the implicit entry sa[-1] == len(text),
  1561  	// corresponding to the identified type-L index len(text)-1.
  1562  	// Process it before the left-to-right scan of sa proper.
  1563  	// See body in loop for commentary.
  1564  	k := len(text) - 1
  1565  	c0, c1 := text[k-1], text[k]
  1566  	if c0 < c1 {
  1567  		k = -k
  1568  	}
  1569  
  1570  	// Cache recently used bucket index.
  1571  	cB := c1
  1572  	b := bucket[cB]
  1573  	sa[b] = int64(k)
  1574  	b++
  1575  
  1576  	for i := 0; i < len(sa); i++ {
  1577  		j := int(sa[i])
  1578  		if j <= 0 {
  1579  			// Skip empty or negated entry (including negated zero).
  1580  			continue
  1581  		}
  1582  
  1583  		// Index j was on work queue, meaning k := j-1 is L-type,
  1584  		// so we can now place k correctly into sa.
  1585  		// If k-1 is L-type, queue k for processing later in this loop.
  1586  		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
  1587  		// If k is zero, k-1 doesn't exist, so we only need to leave it
  1588  		// for the caller. The caller can't tell the difference between
  1589  		// an empty slot and a non-empty zero, but there's no need
  1590  		// to distinguish them anyway: the final suffix array will end up
  1591  		// with one zero somewhere, and that will be a real zero.
  1592  		k := j - 1
  1593  		c1 := text[k]
  1594  		if k > 0 {
  1595  			if c0 := text[k-1]; c0 < c1 {
  1596  				k = -k
  1597  			}
  1598  		}
  1599  
  1600  		if cB != c1 {
  1601  			bucket[cB] = b
  1602  			cB = c1
  1603  			b = bucket[cB]
  1604  		}
  1605  		sa[b] = int64(k)
  1606  		b++
  1607  	}
  1608  }
  1609  
  1610  func induceS_8_64(text []byte, sa, freq, bucket []int64) {
  1611  	// Initialize positions for right side of character buckets.
  1612  	bucketMax_8_64(text, freq, bucket)
  1613  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
  1614  
  1615  	cB := byte(0)
  1616  	b := bucket[cB]
  1617  
  1618  	for i := len(sa) - 1; i >= 0; i-- {
  1619  		j := int(sa[i])
  1620  		if j >= 0 {
  1621  			// Skip non-flagged entry.
  1622  			// (This loop can't see an empty entry; 0 means the real zero index.)
  1623  			continue
  1624  		}
  1625  
  1626  		// Negative j is a work queue entry; rewrite to positive j for final suffix array.
  1627  		j = -j
  1628  		sa[i] = int64(j)
  1629  
  1630  		// Index j was on work queue (encoded as -j but now decoded),
  1631  		// meaning k := j-1 is L-type,
  1632  		// so we can now place k correctly into sa.
  1633  		// If k-1 is S-type, queue -k for processing later in this loop.
  1634  		// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
  1635  		// If k is zero, k-1 doesn't exist, so we only need to leave it
  1636  		// for the caller.
  1637  		k := j - 1
  1638  		c1 := text[k]
  1639  		if k > 0 {
  1640  			if c0 := text[k-1]; c0 <= c1 {
  1641  				k = -k
  1642  			}
  1643  		}
  1644  
  1645  		if cB != c1 {
  1646  			bucket[cB] = b
  1647  			cB = c1
  1648  			b = bucket[cB]
  1649  		}
  1650  		b--
  1651  		sa[b] = int64(k)
  1652  	}
  1653  }
  1654  
  1655  func induceS_32(text []int32, sa, freq, bucket []int32) {
  1656  	// Initialize positions for right side of character buckets.
  1657  	bucketMax_32(text, freq, bucket)
  1658  
  1659  	cB := int32(0)
  1660  	b := bucket[cB]
  1661  
  1662  	for i := len(sa) - 1; i >= 0; i-- {
  1663  		j := int(sa[i])
  1664  		if j >= 0 {
  1665  			// Skip non-flagged entry.
  1666  			// (This loop can't see an empty entry; 0 means the real zero index.)
  1667  			continue
  1668  		}
  1669  
  1670  		// Negative j is a work queue entry; rewrite to positive j for final suffix array.
  1671  		j = -j
  1672  		sa[i] = int32(j)
  1673  
  1674  		// Index j was on work queue (encoded as -j but now decoded),
  1675  		// meaning k := j-1 is L-type,
  1676  		// so we can now place k correctly into sa.
  1677  		// If k-1 is S-type, queue -k for processing later in this loop.
  1678  		// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
  1679  		// If k is zero, k-1 doesn't exist, so we only need to leave it
  1680  		// for the caller.
  1681  		k := j - 1
  1682  		c1 := text[k]
  1683  		if k > 0 {
  1684  			if c0 := text[k-1]; c0 <= c1 {
  1685  				k = -k
  1686  			}
  1687  		}
  1688  
  1689  		if cB != c1 {
  1690  			bucket[cB] = b
  1691  			cB = c1
  1692  			b = bucket[cB]
  1693  		}
  1694  		b--
  1695  		sa[b] = int32(k)
  1696  	}
  1697  }
  1698  
  1699  func induceS_64(text []int64, sa, freq, bucket []int64) {
  1700  	// Initialize positions for right side of character buckets.
  1701  	bucketMax_64(text, freq, bucket)
  1702  
  1703  	cB := int64(0)
  1704  	b := bucket[cB]
  1705  
  1706  	for i := len(sa) - 1; i >= 0; i-- {
  1707  		j := int(sa[i])
  1708  		if j >= 0 {
  1709  			// Skip non-flagged entry.
  1710  			// (This loop can't see an empty entry; 0 means the real zero index.)
  1711  			continue
  1712  		}
  1713  
  1714  		// Negative j is a work queue entry; rewrite to positive j for final suffix array.
  1715  		j = -j
  1716  		sa[i] = int64(j)
  1717  
  1718  		// Index j was on work queue (encoded as -j but now decoded),
  1719  		// meaning k := j-1 is L-type,
  1720  		// so we can now place k correctly into sa.
  1721  		// If k-1 is S-type, queue -k for processing later in this loop.
  1722  		// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
  1723  		// If k is zero, k-1 doesn't exist, so we only need to leave it
  1724  		// for the caller.
  1725  		k := j - 1
  1726  		c1 := text[k]
  1727  		if k > 0 {
  1728  			if c0 := text[k-1]; c0 <= c1 {
  1729  				k = -k
  1730  			}
  1731  		}
  1732  
  1733  		if cB != c1 {
  1734  			bucket[cB] = b
  1735  			cB = c1
  1736  			b = bucket[cB]
  1737  		}
  1738  		b--
  1739  		sa[b] = int64(k)
  1740  	}
  1741  }
  1742  

View as plain text