OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2017 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #ifndef CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_ | |
6 #define CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_ | |
7 | |
8 #include <algorithm> | |
9 #include <cassert> | |
10 #include <iterator> | |
11 #include <numeric> | |
12 #include <vector> | |
13 | |
14 #include "base/logging.h" | |
15 | |
16 namespace zucchini { | |
17 | |
18 // A functor class that implements the naive suffix sorting algorithm that uses | |
19 // std::sort with lexicographical compare. This is only meant as reference of | |
20 // the interface. | |
21 class NaiveSuffixSort { | |
22 public: | |
23 // Type requirements: | |
24 // |InputRng| is an input random access range. | |
25 // |KeyType| is an unsigned integer type. | |
26 // |SAIt| is a random access iterator with mutable references. | |
27 template <class InputRng, class KeyType, class SAIt> | |
28 // |str| is the input string on which suffix sort is applied. | |
29 // Characters found in |str| must be in the range [0, |key_bound|) | |
30 // |suffix_array| is the beginning of the destination range, which is at least | |
31 // as large as |str|. | |
32 void operator()(const InputRng& str, | |
33 KeyType key_bound, | |
34 SAIt suffix_array) const { | |
35 using size_type = typename SAIt::value_type; | |
36 | |
37 size_type n = static_cast<size_type>(std::end(str) - std::begin(str)); | |
38 | |
39 // |suffix_array| is first filled with ordered indices of |str|. | |
40 // Those indices are then sorted with lexicographical comparisons in |str|. | |
41 std::iota(suffix_array, suffix_array + n, 0); | |
42 std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) { | |
43 return std::lexicographical_compare(std::begin(str) + i, std::end(str), | |
44 std::begin(str) + j, std::end(str)); | |
45 }); | |
46 } | |
47 }; | |
48 | |
49 // A functor class that implements suffix array induced sorting (SA-IS) | |
50 // algorithm with linear time and memory complexity, | |
51 // see http://ieeexplore.ieee.org/abstract/document/5582081/ | |
52 class Sais { | |
grt (UTC plus 2)
2017/07/24 09:09:42
"Names should be descriptive; avoid abbreviation."
etiennep1
2017/07/27 17:26:02
Done.
| |
53 public: | |
54 // Type requirements: | |
55 // |InputRng| is an input random access range. | |
56 // |KeyType| is an unsigned integer type. | |
57 // |SAIt| is a random access iterator with mutable values. | |
58 template <class InputRng, class KeyType, class SAIt> | |
59 // |str| is the input string on which suffix sort is applied. | |
60 // Characters found in |str| must be in the range [0, |key_bound|) | |
61 // |suffix_array| is the beginning of the destination range, which is at least | |
62 // as large as |str|. | |
63 void operator()(const InputRng& str, | |
64 KeyType key_bound, | |
65 SAIt suffix_array) const { | |
66 using value_type = typename InputRng::value_type; | |
67 using size_type = typename SAIt::value_type; | |
68 | |
69 static_assert(std::is_unsigned<value_type>::value, | |
70 "Sais only supports input string with unsigned values"); | |
71 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned"); | |
72 | |
73 size_type n = static_cast<size_type>(std::end(str) - std::begin(str)); | |
74 | |
75 Implementation<size_type, KeyType>::SuffixSort(std::begin(str), n, | |
76 key_bound, suffix_array); | |
77 } | |
78 | |
79 // Given string S of length n. We assume S is terminated by a unique sentinel | |
80 // $, which is considered as the smallest character. This sentinel does not | |
81 // exist in memory and is only treated implicitly, hence |n| does not count | |
82 // the sentinel in this implementation. We denote suf(S,i) the suffix formed | |
83 // by S[i..n). | |
84 | |
85 // A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1) | |
86 // or suf(S,i) > suf(S,i+1), respectively. | |
87 enum SLType : bool { SType, LType }; | |
88 | |
89 // A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is | |
90 // S-type or L-type, respectively. | |
91 | |
92 // A character S[i] is called LMS (leftmost S-type), if S[i] is S-type and | |
93 // S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS | |
94 // character. | |
95 | |
96 // A substring S[i..j) is an LMS-substring if | |
97 // (1) S[i] is LMS, S[j] is LMS or the sentinel $, and S[i..j) has no other | |
98 // LMS characters, or | |
99 // (2) S[i..j) is the sentinel $. | |
100 | |
101 template <class SizeType, class KeyType> | |
102 struct Implementation { | |
grt (UTC plus 2)
2017/07/24 09:09:42
should this be in a private: section of Sais?
etiennep1
2017/07/27 17:26:02
It would make sense to have this private, but it i
| |
103 static_assert(std::is_unsigned<SizeType>::value, | |
104 "SizeType must be unsigned"); | |
105 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned"); | |
106 using size_type = SizeType; | |
107 using key_type = KeyType; | |
108 | |
109 using iterator = typename std::vector<size_type>::iterator; | |
110 using const_iterator = typename std::vector<size_type>::const_iterator; | |
111 | |
112 // Partition every suffix based on SL-type. Returns the number of LMS | |
113 // suffixes. | |
114 template <class StrIt> | |
115 static size_type BuildSLPartition( | |
116 StrIt str, | |
117 size_type length, | |
118 key_type key_bound, | |
119 std::vector<SLType>::reverse_iterator sl_partition_it) { | |
120 // We will count LMS suffixes (S to L-type or last S-type). | |
121 size_type lms_count = 0; | |
122 | |
123 // |previous_type| is initialized to L-type to avoid counting an extra | |
124 // LMS suffix at the end | |
125 SLType previous_type = LType; | |
126 | |
127 // Initialized to dummy, impossible key. | |
128 key_type previous_key = key_bound; | |
129 | |
130 // We're travelling backward to determine the partition, | |
131 // as if we prepend one character at a time to the string, ex: | |
132 // b$ is L-type because b > $. | |
133 // ab$ is S-type because a < b, implying ab$ < b$. | |
134 // bab$ is L-type because b > a, implying bab$ > ab$. | |
135 // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$. | |
136 for (auto str_it = std::reverse_iterator<StrIt>(str + length); | |
137 str_it != std::reverse_iterator<StrIt>(str); | |
138 ++str_it, ++sl_partition_it) { | |
139 key_type current_key = *str_it; | |
140 | |
141 if (current_key > previous_key || previous_key == key_bound) { | |
142 // S[i] > S[i + 1] or S[i] is last character. | |
143 if (previous_type == SType) | |
144 // suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore, | |
145 // suf(S,i+1) was a LMS suffix. | |
146 ++lms_count; | |
147 | |
148 previous_type = LType; // For next round. | |
149 } else if (current_key < previous_key) { | |
150 // S[i] < S[i + 1] | |
151 previous_type = SType; // For next round. | |
152 } | |
153 // Else, S[i] == S[i + 1]: | |
154 // The next character that differs determines the SL-type, | |
155 // so we reuse the last seen type. | |
156 | |
157 *sl_partition_it = previous_type; | |
158 previous_key = current_key; // For next round. | |
159 } | |
160 | |
161 return lms_count; | |
162 } | |
163 | |
164 // Find indices of LMS suffixes and write result to |lms_indices|. | |
165 static void FindLmsSuffixes(const std::vector<SLType>& sl_partition, | |
166 iterator lms_indices) { | |
167 // |previous_type| is initialized to S-type to avoid counting an extra | |
168 // LMS suffix at the beginning | |
169 SLType previous_type = SType; | |
170 for (size_type i = 0; i < sl_partition.size(); ++i) { | |
171 if (sl_partition[i] == SType && previous_type == LType) | |
172 *lms_indices++ = i; | |
173 previous_type = sl_partition[i]; | |
174 } | |
175 } | |
176 | |
177 template <class StrIt> | |
178 static std::vector<size_type> MakeBucketCount(StrIt str, | |
179 size_type length, | |
180 key_type key_bound) { | |
181 // Occurrence of every unique character is counted in |buckets| | |
182 std::vector<size_type> buckets(static_cast<size_type>(key_bound)); | |
183 | |
184 for (auto it = str; it != str + length; ++it) | |
185 ++buckets[*it]; | |
186 return buckets; | |
187 } | |
188 | |
189 // Apply induced sort from |lms_indices| to |suffix_array| associated with | |
190 // the string |str|. | |
191 template <class StrIt, class SAIt> | |
192 static void InducedSort(StrIt str, | |
193 size_type length, | |
194 const std::vector<SLType>& sl_partition, | |
195 const std::vector<size_type>& lms_indices, | |
196 const std::vector<size_type>& buckets, | |
197 SAIt suffix_array) { | |
198 // All indices are first marked as unset with the illegal value |length|. | |
199 std::fill(suffix_array, suffix_array + length, length); | |
200 | |
201 // Used to mark bucket boundaries (head or end) as indices in str. | |
202 DCHECK(!buckets.empty()); | |
203 std::vector<size_type> bucket_bounds(buckets.size()); | |
204 | |
205 // Step 1: Assign indices for LMS suffixes, populating the end of | |
206 // respective buckets but keeping relative order. | |
207 | |
208 // Find the end of each bucket and write it to |bucket_bounds|. | |
209 std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin()); | |
210 | |
211 // Process each |lms_indices| backward, and assign them to the end of | |
212 // their respective buckets, so relative order is preserved. | |
213 for (auto it = lms_indices.crbegin(); it != lms_indices.crend(); ++it) { | |
214 key_type key = str[*it]; | |
215 suffix_array[--bucket_bounds[key]] = *it; | |
216 } | |
217 | |
218 // Step 2 | |
219 // Scan forward |suffix_array|; for each modified suf(S,i) for which | |
220 // suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current | |
221 // head of the corresponding bucket and forward the bucket head to the | |
222 // right. | |
223 | |
224 // Find the head of each bucket and write it to |bucket_bounds|. Since | |
225 // only LMS suffixes where inserted in |suffix_array| during Step 1, | |
226 // |bucket_bounds| does not contains the head of each bucket and needs to | |
227 // be updated. | |
228 bucket_bounds[0] = 0; | |
229 std::partial_sum(buckets.begin(), buckets.end() - 1, | |
230 bucket_bounds.begin() + 1); | |
231 | |
232 // From Step 1, the sentinel $, which we treat implicitly, would have | |
233 // been placed at the beginning of |suffix_array|, since $ is always | |
234 // considered as the smallest character. We then have to deal with the | |
235 // previous (last) suffix. | |
236 if (sl_partition[length - 1] == LType) { | |
237 key_type key = str[length - 1]; | |
238 suffix_array[bucket_bounds[key]++] = length - 1; | |
239 } | |
240 for (auto it = suffix_array; it != suffix_array + length; ++it) { | |
241 size_type suffix_index = *it; | |
242 | |
243 // While the original algorithm marks unset suffixes with -1, | |
244 // we found that marking them with |length| is also possible and more | |
245 // convenient because we are working with unsigned integers. | |
246 if (suffix_index != length && suffix_index > 0 && | |
247 sl_partition[--suffix_index] == LType) { | |
248 key_type key = str[suffix_index]; | |
249 suffix_array[bucket_bounds[key]++] = suffix_index; | |
250 } | |
251 } | |
252 | |
253 // Step 3 | |
254 // Scan backward |suffix_array|; for each modified suf(S, i) for which | |
255 // suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current | |
256 // end of the corresponding bucket and forward the bucket head to the | |
257 // left. | |
258 | |
259 // Find the end of each bucket and write it to |bucket_bounds|. Since | |
260 // only L-type suffixes where inserted in |suffix_array| during Step 2, | |
261 // |bucket_bounds| does not contain the end of each bucket and needs to | |
262 // be updated. | |
263 std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin()); | |
264 | |
265 for (auto it = std::reverse_iterator<SAIt>(suffix_array + length); | |
266 it != std::reverse_iterator<SAIt>(suffix_array); ++it) { | |
267 size_type suffix_index = *it; | |
268 if (suffix_index != length && suffix_index > 0 && | |
269 sl_partition[--suffix_index] == SType) { | |
270 key_type key = str[suffix_index]; | |
271 suffix_array[--bucket_bounds[key]] = suffix_index; | |
272 } | |
273 } | |
274 // Deals with the last suffix, because of the sentinel. | |
275 if (sl_partition[length - 1] == SType) { | |
276 key_type key = str[length - 1]; | |
277 suffix_array[--bucket_bounds[key]] = length - 1; | |
278 } | |
279 } | |
280 | |
281 // Given a string S starting at |str| with length |length|, an array | |
282 // starting at |substring_array| containing lexicographically ordered LMS | |
283 // terminated substring indices of S and an SL-Type partition |sl_partition| | |
284 // of S, assigns a unique label to every unique LMS substring. The sorted | |
285 // labels for all LMS substrings are written to |lms_str|, while the indices | |
286 // of LMS suffixes are written to |lms_indices|. In addition, returns the | |
287 // total number of unique labels. | |
288 template <class StrIt, class SAIt> | |
289 static size_type LabelLmsSubstrings(StrIt str, | |
290 size_type length, | |
291 const std::vector<SLType>& sl_partition, | |
292 SAIt suffix_array, | |
293 iterator lms_indices, | |
294 iterator lms_str) { | |
295 // Labelling starts at 0. | |
296 size_type label = 0; | |
297 | |
298 // |previous_lms| is initialized to 0 to indicate it is unset. | |
299 // Note that suf(S,0) is never a LMS suffix. Substrings will be visited in | |
300 // lexicographical order. | |
301 size_type previous_lms = 0; | |
302 for (auto it = suffix_array; it != suffix_array + length; ++it) { | |
303 if (*it > 0 && sl_partition[*it] == SType && | |
304 sl_partition[*it - 1] == LType) { | |
305 // suf(S, *it) is a LMS suffix. | |
306 | |
307 size_type current_lms = *it; | |
308 if (previous_lms != 0) { | |
309 // There was a previous LMS suffix. Check if the current LMS | |
310 // substring is equal to the previous one. | |
311 SLType current_lms_type = SType; | |
312 SLType previous_lms_type = SType; | |
313 for (size_type k = 0;; ++k) { | |
314 // |current_lms_end| and |previous_lms_end| denote whether we have | |
315 // reached the end of the current and previous LMS substring, | |
316 // respectively | |
317 bool current_lms_end = false; | |
318 bool previous_lms_end = false; | |
319 | |
320 // Check for both previous and current substring ends. | |
321 // Note that it is more convenient to check if | |
322 // suf(S,current_lms + k) is an LMS suffix than to retrieve it | |
323 // from lms_indices. | |
324 if (current_lms + k >= length || | |
325 (current_lms_type == LType && | |
326 sl_partition[current_lms + k] == SType)) { | |
327 current_lms_end = true; | |
328 } | |
329 if (previous_lms + k >= length || | |
330 (previous_lms_type == LType && | |
331 sl_partition[previous_lms + k] == SType)) { | |
332 previous_lms_end = true; | |
333 } | |
334 | |
335 if (current_lms_end && previous_lms_end) { | |
336 break; // Previous and current substrings are identical. | |
337 } else if (current_lms_end != previous_lms_end || | |
338 str[current_lms + k] != str[previous_lms + k]) { | |
339 // Previous and current substrings differ, a new label is used. | |
340 ++label; | |
341 break; | |
342 } | |
343 | |
344 current_lms_type = sl_partition[current_lms + k]; | |
345 previous_lms_type = sl_partition[previous_lms + k]; | |
346 } | |
347 } | |
348 *lms_indices++ = *it; | |
349 *lms_str++ = label; | |
350 previous_lms = current_lms; | |
351 } | |
352 } | |
353 | |
354 return label + 1; | |
355 } | |
356 | |
357 // Implementation of the SA-IS algorithm. |str| must be a random access | |
358 // iterator pointing at the beginning of S with length |length|. The result | |
359 // is writtend in |suffix_array|, a random access iterator. | |
360 template <class StrIt, class SAIt> | |
361 static void SuffixSort(StrIt str, | |
362 size_type length, | |
363 key_type key_bound, | |
364 SAIt suffix_array) { | |
365 if (length == 1) | |
366 *suffix_array = 0; | |
367 if (length < 2) | |
368 return; | |
369 | |
370 std::vector<SLType> sl_partition(length); | |
371 size_type lms_count = | |
372 BuildSLPartition(str, length, key_bound, sl_partition.rbegin()); | |
373 std::vector<size_type> lms_indices(lms_count); | |
374 FindLmsSuffixes(sl_partition, lms_indices.begin()); | |
375 std::vector<size_type> buckets = MakeBucketCount(str, length, key_bound); | |
376 | |
377 if (lms_indices.size() > 1) { | |
378 // Given |lms_indices| in the same order they appear in |str|, induce | |
379 // LMS substrings relative order and write result to |suffix_array|. | |
380 InducedSort(str, length, sl_partition, lms_indices, buckets, | |
381 suffix_array); | |
382 std::vector<size_type> lms_str(lms_indices.size()); | |
383 | |
384 // Given LMS substrings in relative order found in |suffix_array|, | |
385 // map LMS substrings to unique labels to form a new string, |lms_str|. | |
386 size_type label_count = | |
387 LabelLmsSubstrings(str, length, sl_partition, suffix_array, | |
388 lms_indices.begin(), lms_str.begin()); | |
389 | |
390 if (label_count < lms_str.size()) { | |
391 // Reorder |lms_str| to have LMS suffixes in the same order they | |
392 // appear in |str|. | |
393 for (size_type i = 0; i < lms_indices.size(); ++i) | |
394 suffix_array[lms_indices[i]] = lms_str[i]; | |
395 | |
396 SLType previous_type = SType; | |
397 for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) { | |
398 if (sl_partition[i] == SType && previous_type == LType) { | |
399 lms_str[j] = suffix_array[i]; | |
400 lms_indices[j++] = i; | |
401 } | |
402 previous_type = sl_partition[i]; | |
403 } | |
404 | |
405 // Recursively apply SuffixSort on |lms_str|, which is formed from | |
406 // labeled LMS suffixes in the same order they appear in |str|. | |
407 // Note that |KeyType| will be size_type because |lms_str| contains | |
408 // indices. |lms_str| is at most half the length of |str|. | |
409 Implementation<size_type, size_type>::SuffixSort( | |
410 lms_str.begin(), static_cast<size_type>(lms_str.size()), | |
411 label_count, suffix_array); | |
412 | |
413 // Map LMS labels back to indices in |str| and write result to | |
414 // |lms_indices|. We're using |suffix_array| as a temporary buffer. | |
415 for (size_type i = 0; i < lms_indices.size(); ++i) | |
416 suffix_array[i] = lms_indices[suffix_array[i]]; | |
417 std::copy_n(suffix_array, lms_indices.size(), lms_indices.begin()); | |
418 | |
419 // At this point, |lms_indices| contains sorted LMS suffixes of |str|. | |
420 } | |
421 } | |
422 // Given |lms_indices| where LMS suffixes are sorted, induce the full | |
423 // order of suffixes in |str|. | |
424 InducedSort(str, length, sl_partition, lms_indices, buckets, | |
425 suffix_array); | |
426 } | |
427 }; | |
grt (UTC plus 2)
2017/07/24 09:09:42
?
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(I
etiennep1
2017/07/27 17:26:02
Done.
| |
428 }; | |
429 | |
430 // Generates a sorted suffix array for the input string |str| using the functor | |
431 // |Algorithm| which provides an interface equivalent to NaiveSuffixSort. | |
432 /// Characters found in |str| are assumed to be in range [0, |key_bound|). | |
433 // Returns the suffix array as a vector. | |
434 // |StrRng| is an input random access range. | |
435 // |KeyType| is an unsigned integer type. | |
436 template <class Algorithm, class StrRng, class KeyType> | |
437 std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str, | |
438 KeyType key_bound) { | |
439 Algorithm sort; | |
440 std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin()); | |
441 sort(str, key_bound, suffix_array.begin()); | |
442 return suffix_array; | |
443 } | |
444 | |
445 // Type requirements: | |
446 // |SARng| is an input random access range. | |
447 // |StrIt1| is a random access iterator. | |
448 // |StrIt2| is a forward iterator. | |
449 template <class SARng, class StrIt1, class StrIt2> | |
450 // Lexicographical lower bound using binary search for | |
451 // [|str2_first|, |str2_last|) in the suffix array |suffix_array| of a string | |
452 // starting at |str1_first|. This does not necessarily return the index of | |
453 // the longest matching substring. | |
454 auto SuffixLowerBound(const SARng& suffix_array, | |
455 StrIt1 str1_first, | |
456 StrIt2 str2_first, | |
457 StrIt2 str2_last) -> decltype(std::begin(suffix_array)) { | |
458 using size_type = typename SARng::value_type; | |
459 | |
460 size_t n = std::end(suffix_array) - std::begin(suffix_array); | |
461 auto it = std::lower_bound( | |
462 std::begin(suffix_array), std::end(suffix_array), str2_first, | |
463 [str1_first, str2_last, n](size_type a, StrIt2 b) { | |
464 return std::lexicographical_compare(str1_first + a, str1_first + n, b, | |
465 str2_last); | |
466 }); | |
467 return it; | |
468 } | |
469 | |
470 } // namespace zucchini | |
471 | |
472 #endif // CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_ | |
OLD | NEW |