chrome/installer/zucchini/suffix_array.h - Issue 2963463002: [Zucchini] Generic suffix array algorithms.

Side by Side Diff: chrome/installer/zucchini/suffix_array.h

Issue 2963463002: [Zucchini] Generic suffix array algorithms. (Closed)

Patch Set: Rebase Created 3 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 // Copyright 2017 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #ifndef CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_

	6 #define CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_

	7

	8 #include <algorithm>

	9 #include <cassert>

	10 #include <iterator>

	11 #include <numeric>

	12 #include <vector>

	13

	14 #include "base/logging.h"

	15

	16 namespace zucchini {

	17

	18 // A functor class that implements the naive suffix sorting algorithm that uses

	19 // std::sort with lexicographical compare. This is only meant as reference of

	20 // the interface.

	21 class NaiveSuffixSort {

	22 public:

	23 // Type requirements:

	24 // \|InputRng\| is an input random access range.

	25 // \|KeyType\| is an unsigned integer type.

	26 // \|SAIt\| is a random access iterator with mutable references.

	27 template <class InputRng, class KeyType, class SAIt>

	28 // \|str\| is the input string on which suffix sort is applied.

	29 // Characters found in \|str\| must be in the range [0, \|key_bound\|)

	30 // \|suffix_array\| is the beginning of the destination range, which is at least

	31 // as large as \|str\|.

	32 void operator()(const InputRng& str,

	33 KeyType key_bound,

	34 SAIt suffix_array) const {

	35 using size_type = typename SAIt::value_type;

	36

	37 size_type n = static_cast<size_type>(std::end(str) - std::begin(str));

	38

	39 // \|suffix_array\| is first filled with ordered indices of \|str\|.

	40 // Those indices are then sorted with lexicographical comparisons in \|str\|.

	41 std::iota(suffix_array, suffix_array + n, 0);

	42 std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) {

	43 return std::lexicographical_compare(std::begin(str) + i, std::end(str),

	44 std::begin(str) + j, std::end(str));

	45 });

	46 }

	47 };

	48

	49 // A functor class that implements suffix array induced sorting (SA-IS)

	50 // algorithm with linear time and memory complexity,

	51 // see http://ieeexplore.ieee.org/abstract/document/5582081/

	52 class Sais {
	grt (UTC plus 2) 2017/07/24 09:09:42 "Names should be descriptive; avoid abbreviation." "Names should be descriptive; avoid abbreviation." (https://google.github.io/styleguide/cppguide.html#General_Naming_Rules) etiennep1 2017/07/27 17:26:02 Done. Show quoted text On 2017/07/24 09:09:42, grt (UTC plus 2) wrote: > "Names should be descriptive; avoid abbreviation." > (https://google.github.io/styleguide/cppguide.html#General_Naming_Rules) Done.
	53 public:

	54 // Type requirements:

	55 // \|InputRng\| is an input random access range.

	56 // \|KeyType\| is an unsigned integer type.

	57 // \|SAIt\| is a random access iterator with mutable values.

	58 template <class InputRng, class KeyType, class SAIt>

	59 // \|str\| is the input string on which suffix sort is applied.

	60 // Characters found in \|str\| must be in the range [0, \|key_bound\|)

	61 // \|suffix_array\| is the beginning of the destination range, which is at least

	62 // as large as \|str\|.

	63 void operator()(const InputRng& str,

	64 KeyType key_bound,

	65 SAIt suffix_array) const {

	66 using value_type = typename InputRng::value_type;

	67 using size_type = typename SAIt::value_type;

	68

	69 static_assert(std::is_unsigned<value_type>::value,

	70 "Sais only supports input string with unsigned values");

	71 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");

	72

	73 size_type n = static_cast<size_type>(std::end(str) - std::begin(str));

	74

	75 Implementation<size_type, KeyType>::SuffixSort(std::begin(str), n,

	76 key_bound, suffix_array);

	77 }

	78

	79 // Given string S of length n. We assume S is terminated by a unique sentinel

	80 // $, which is considered as the smallest character. This sentinel does not

	81 // exist in memory and is only treated implicitly, hence \|n\| does not count

	82 // the sentinel in this implementation. We denote suf(S,i) the suffix formed

	83 // by S[i..n).

	84

	85 // A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1)

	86 // or suf(S,i) > suf(S,i+1), respectively.

	87 enum SLType : bool { SType, LType };

	88

	89 // A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is

	90 // S-type or L-type, respectively.

	91

	92 // A character S[i] is called LMS (leftmost S-type), if S[i] is S-type and

	93 // S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS

	94 // character.

	95

	96 // A substring S[i..j) is an LMS-substring if

	97 // (1) S[i] is LMS, S[j] is LMS or the sentinel $, and S[i..j) has no other

	98 // LMS characters, or

	99 // (2) S[i..j) is the sentinel $.

	100

	101 template <class SizeType, class KeyType>

	102 struct Implementation {
	grt (UTC plus 2) 2017/07/24 09:09:42 should this be in a private: section of Sais? should this be in a private: section of Sais? etiennep1 2017/07/27 17:26:02 It would make sense to have this private, but it i Show quoted text On 2017/07/24 09:09:42, grt (UTC plus 2) wrote: > should this be in a private: section of Sais? It would make sense to have this private, but it is convenient to have it public for unittest, and since everything is wrapped in Implementation class, it makes it pretty obvious this is an implementation detail. I also have a feeling friendship for unittest is frowned upon. Let me know if there's a better way to to this.
	103 static_assert(std::is_unsigned<SizeType>::value,

	104 "SizeType must be unsigned");

	105 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");

	106 using size_type = SizeType;

	107 using key_type = KeyType;

	108

	109 using iterator = typename std::vector<size_type>::iterator;

	110 using const_iterator = typename std::vector<size_type>::const_iterator;

	111

	112 // Partition every suffix based on SL-type. Returns the number of LMS

	113 // suffixes.

	114 template <class StrIt>

	115 static size_type BuildSLPartition(

	116 StrIt str,

	117 size_type length,

	118 key_type key_bound,

	119 std::vector<SLType>::reverse_iterator sl_partition_it) {

	120 // We will count LMS suffixes (S to L-type or last S-type).

	121 size_type lms_count = 0;

	122

	123 // \|previous_type\| is initialized to L-type to avoid counting an extra

	124 // LMS suffix at the end

	125 SLType previous_type = LType;

	126

	127 // Initialized to dummy, impossible key.

	128 key_type previous_key = key_bound;

	129

	130 // We're travelling backward to determine the partition,

	131 // as if we prepend one character at a time to the string, ex:

	132 // b$ is L-type because b > $.

	133 // ab$ is S-type because a < b, implying ab$ < b$.

	134 // bab$ is L-type because b > a, implying bab$ > ab$.

	135 // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$.

	136 for (auto str_it = std::reverse_iterator<StrIt>(str + length);

	137 str_it != std::reverse_iterator<StrIt>(str);

	138 ++str_it, ++sl_partition_it) {

	139 key_type current_key = *str_it;

	140

	141 if (current_key > previous_key \|\| previous_key == key_bound) {

	142 // S[i] > S[i + 1] or S[i] is last character.

	143 if (previous_type == SType)

	144 // suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore,

	145 // suf(S,i+1) was a LMS suffix.

	146 ++lms_count;

	147

	148 previous_type = LType; // For next round.

	149 } else if (current_key < previous_key) {

	150 // S[i] < S[i + 1]

	151 previous_type = SType; // For next round.

	152 }

	153 // Else, S[i] == S[i + 1]:

	154 // The next character that differs determines the SL-type,

	155 // so we reuse the last seen type.

	156

	157 *sl_partition_it = previous_type;

	158 previous_key = current_key; // For next round.

	159 }

	160

	161 return lms_count;

	162 }

	163

	164 // Find indices of LMS suffixes and write result to \|lms_indices\|.

	165 static void FindLmsSuffixes(const std::vector<SLType>& sl_partition,

	166 iterator lms_indices) {

	167 // \|previous_type\| is initialized to S-type to avoid counting an extra

	168 // LMS suffix at the beginning

	169 SLType previous_type = SType;

	170 for (size_type i = 0; i < sl_partition.size(); ++i) {

	171 if (sl_partition[i] == SType && previous_type == LType)

	172 *lms_indices++ = i;

	173 previous_type = sl_partition[i];

	174 }

	175 }

	176

	177 template <class StrIt>

	178 static std::vector<size_type> MakeBucketCount(StrIt str,

	179 size_type length,

	180 key_type key_bound) {

	181 // Occurrence of every unique character is counted in \|buckets\|

	182 std::vector<size_type> buckets(static_cast<size_type>(key_bound));

	183

	184 for (auto it = str; it != str + length; ++it)

	185 ++buckets[*it];

	186 return buckets;

	187 }

	188

	189 // Apply induced sort from \|lms_indices\| to \|suffix_array\| associated with

	190 // the string \|str\|.

	191 template <class StrIt, class SAIt>

	192 static void InducedSort(StrIt str,

	193 size_type length,

	194 const std::vector<SLType>& sl_partition,

	195 const std::vector<size_type>& lms_indices,

	196 const std::vector<size_type>& buckets,

	197 SAIt suffix_array) {

	198 // All indices are first marked as unset with the illegal value \|length\|.

	199 std::fill(suffix_array, suffix_array + length, length);

	200

	201 // Used to mark bucket boundaries (head or end) as indices in str.

	202 DCHECK(!buckets.empty());

	203 std::vector<size_type> bucket_bounds(buckets.size());

	204

	205 // Step 1: Assign indices for LMS suffixes, populating the end of

	206 // respective buckets but keeping relative order.

	207

	208 // Find the end of each bucket and write it to \|bucket_bounds\|.

	209 std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());

	210

	211 // Process each \|lms_indices\| backward, and assign them to the end of

	212 // their respective buckets, so relative order is preserved.

	213 for (auto it = lms_indices.crbegin(); it != lms_indices.crend(); ++it) {

	214 key_type key = str[*it];

	215 suffix_array[--bucket_bounds[key]] = *it;

	216 }

	217

	218 // Step 2

	219 // Scan forward \|suffix_array\|; for each modified suf(S,i) for which

	220 // suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current

	221 // head of the corresponding bucket and forward the bucket head to the

	222 // right.

	223

	224 // Find the head of each bucket and write it to \|bucket_bounds\|. Since

	225 // only LMS suffixes where inserted in \|suffix_array\| during Step 1,

	226 // \|bucket_bounds\| does not contains the head of each bucket and needs to

	227 // be updated.

	228 bucket_bounds[0] = 0;

	229 std::partial_sum(buckets.begin(), buckets.end() - 1,

	230 bucket_bounds.begin() + 1);

	231

	232 // From Step 1, the sentinel $, which we treat implicitly, would have

	233 // been placed at the beginning of \|suffix_array\|, since $ is always

	234 // considered as the smallest character. We then have to deal with the

	235 // previous (last) suffix.

	236 if (sl_partition[length - 1] == LType) {

	237 key_type key = str[length - 1];

	238 suffix_array[bucket_bounds[key]++] = length - 1;

	239 }

	240 for (auto it = suffix_array; it != suffix_array + length; ++it) {

	241 size_type suffix_index = *it;

	242

	243 // While the original algorithm marks unset suffixes with -1,

	244 // we found that marking them with \|length\| is also possible and more

	245 // convenient because we are working with unsigned integers.

	246 if (suffix_index != length && suffix_index > 0 &&

	247 sl_partition[--suffix_index] == LType) {

	248 key_type key = str[suffix_index];

	249 suffix_array[bucket_bounds[key]++] = suffix_index;

	250 }

	251 }

	252

	253 // Step 3

	254 // Scan backward \|suffix_array\|; for each modified suf(S, i) for which

	255 // suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current

	256 // end of the corresponding bucket and forward the bucket head to the

	257 // left.

	258

	259 // Find the end of each bucket and write it to \|bucket_bounds\|. Since

	260 // only L-type suffixes where inserted in \|suffix_array\| during Step 2,

	261 // \|bucket_bounds\| does not contain the end of each bucket and needs to

	262 // be updated.

	263 std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());

	264

	265 for (auto it = std::reverse_iterator<SAIt>(suffix_array + length);

	266 it != std::reverse_iterator<SAIt>(suffix_array); ++it) {

	267 size_type suffix_index = *it;

	268 if (suffix_index != length && suffix_index > 0 &&

	269 sl_partition[--suffix_index] == SType) {

	270 key_type key = str[suffix_index];

	271 suffix_array[--bucket_bounds[key]] = suffix_index;

	272 }

	273 }

	274 // Deals with the last suffix, because of the sentinel.

	275 if (sl_partition[length - 1] == SType) {

	276 key_type key = str[length - 1];

	277 suffix_array[--bucket_bounds[key]] = length - 1;

	278 }

	279 }

	280

	281 // Given a string S starting at \|str\| with length \|length\|, an array

	282 // starting at \|substring_array\| containing lexicographically ordered LMS

	283 // terminated substring indices of S and an SL-Type partition \|sl_partition\|

	284 // of S, assigns a unique label to every unique LMS substring. The sorted

	285 // labels for all LMS substrings are written to \|lms_str\|, while the indices

	286 // of LMS suffixes are written to \|lms_indices\|. In addition, returns the

	287 // total number of unique labels.

	288 template <class StrIt, class SAIt>

	289 static size_type LabelLmsSubstrings(StrIt str,

	290 size_type length,

	291 const std::vector<SLType>& sl_partition,

	292 SAIt suffix_array,

	293 iterator lms_indices,

	294 iterator lms_str) {

	295 // Labelling starts at 0.

	296 size_type label = 0;

	297

	298 // \|previous_lms\| is initialized to 0 to indicate it is unset.

	299 // Note that suf(S,0) is never a LMS suffix. Substrings will be visited in

	300 // lexicographical order.

	301 size_type previous_lms = 0;

	302 for (auto it = suffix_array; it != suffix_array + length; ++it) {

	303 if (it > 0 && sl_partition[it] == SType &&

	304 sl_partition[*it - 1] == LType) {

	305 // suf(S, *it) is a LMS suffix.

	306

	307 size_type current_lms = *it;

	308 if (previous_lms != 0) {

	309 // There was a previous LMS suffix. Check if the current LMS

	310 // substring is equal to the previous one.

	311 SLType current_lms_type = SType;

	312 SLType previous_lms_type = SType;

	313 for (size_type k = 0;; ++k) {

	314 // \|current_lms_end\| and \|previous_lms_end\| denote whether we have

	315 // reached the end of the current and previous LMS substring,

	316 // respectively

	317 bool current_lms_end = false;

	318 bool previous_lms_end = false;

	319

	320 // Check for both previous and current substring ends.

	321 // Note that it is more convenient to check if

	322 // suf(S,current_lms + k) is an LMS suffix than to retrieve it

	323 // from lms_indices.

	324 if (current_lms + k >= length \|\|

	325 (current_lms_type == LType &&

	326 sl_partition[current_lms + k] == SType)) {

	327 current_lms_end = true;

	328 }

	329 if (previous_lms + k >= length \|\|

	330 (previous_lms_type == LType &&

	331 sl_partition[previous_lms + k] == SType)) {

	332 previous_lms_end = true;

	333 }

	334

	335 if (current_lms_end && previous_lms_end) {

	336 break; // Previous and current substrings are identical.

	337 } else if (current_lms_end != previous_lms_end \|\|

	338 str[current_lms + k] != str[previous_lms + k]) {

	339 // Previous and current substrings differ, a new label is used.

	340 ++label;

	341 break;

	342 }

	343

	344 current_lms_type = sl_partition[current_lms + k];

	345 previous_lms_type = sl_partition[previous_lms + k];

	346 }

	347 }

	348 lms_indices++ = it;

	349 *lms_str++ = label;

	350 previous_lms = current_lms;

	351 }

	352 }

	353

	354 return label + 1;

	355 }

	356

	357 // Implementation of the SA-IS algorithm. \|str\| must be a random access

	358 // iterator pointing at the beginning of S with length \|length\|. The result

	359 // is writtend in \|suffix_array\|, a random access iterator.

	360 template <class StrIt, class SAIt>

	361 static void SuffixSort(StrIt str,

	362 size_type length,

	363 key_type key_bound,

	364 SAIt suffix_array) {

	365 if (length == 1)

	366 *suffix_array = 0;

	367 if (length < 2)

	368 return;

	369

	370 std::vector<SLType> sl_partition(length);

	371 size_type lms_count =

	372 BuildSLPartition(str, length, key_bound, sl_partition.rbegin());

	373 std::vector<size_type> lms_indices(lms_count);

	374 FindLmsSuffixes(sl_partition, lms_indices.begin());

	375 std::vector<size_type> buckets = MakeBucketCount(str, length, key_bound);

	376

	377 if (lms_indices.size() > 1) {

	378 // Given \|lms_indices\| in the same order they appear in \|str\|, induce

	379 // LMS substrings relative order and write result to \|suffix_array\|.

	380 InducedSort(str, length, sl_partition, lms_indices, buckets,

	381 suffix_array);

	382 std::vector<size_type> lms_str(lms_indices.size());

	383

	384 // Given LMS substrings in relative order found in \|suffix_array\|,

	385 // map LMS substrings to unique labels to form a new string, \|lms_str\|.

	386 size_type label_count =

	387 LabelLmsSubstrings(str, length, sl_partition, suffix_array,

	388 lms_indices.begin(), lms_str.begin());

	389

	390 if (label_count < lms_str.size()) {

	391 // Reorder \|lms_str\| to have LMS suffixes in the same order they

	392 // appear in \|str\|.

	393 for (size_type i = 0; i < lms_indices.size(); ++i)

	394 suffix_array[lms_indices[i]] = lms_str[i];

	395

	396 SLType previous_type = SType;

	397 for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) {

	398 if (sl_partition[i] == SType && previous_type == LType) {

	399 lms_str[j] = suffix_array[i];

	400 lms_indices[j++] = i;

	401 }

	402 previous_type = sl_partition[i];

	403 }

	404

	405 // Recursively apply SuffixSort on \|lms_str\|, which is formed from

	406 // labeled LMS suffixes in the same order they appear in \|str\|.

	407 // Note that \|KeyType\| will be size_type because \|lms_str\| contains

	408 // indices. \|lms_str\| is at most half the length of \|str\|.

	409 Implementation<size_type, size_type>::SuffixSort(

	410 lms_str.begin(), static_cast<size_type>(lms_str.size()),

	411 label_count, suffix_array);

	412

	413 // Map LMS labels back to indices in \|str\| and write result to

	414 // \|lms_indices\|. We're using \|suffix_array\| as a temporary buffer.

	415 for (size_type i = 0; i < lms_indices.size(); ++i)

	416 suffix_array[i] = lms_indices[suffix_array[i]];

	417 std::copy_n(suffix_array, lms_indices.size(), lms_indices.begin());

	418

	419 // At this point, \|lms_indices\| contains sorted LMS suffixes of \|str\|.

	420 }

	421 }

	422 // Given \|lms_indices\| where LMS suffixes are sorted, induce the full

	423 // order of suffixes in \|str\|.

	424 InducedSort(str, length, sl_partition, lms_indices, buckets,

	425 suffix_array);

	426 }

	427 };
	grt (UTC plus 2) 2017/07/24 09:09:42 ? private: DISALLOW_IMPLICIT_CONSTRUCTORS(I ? private: DISALLOW_IMPLICIT_CONSTRUCTORS(Implementation); }; etiennep1 2017/07/27 17:26:02 Done. Show quoted text On 2017/07/24 09:09:42, grt (UTC plus 2) wrote: > ? > private: > DISALLOW_IMPLICIT_CONSTRUCTORS(Implementation); > }; Done.
	428 };

	429

	430 // Generates a sorted suffix array for the input string \|str\| using the functor

	431 // \|Algorithm\| which provides an interface equivalent to NaiveSuffixSort.

	432 /// Characters found in \|str\| are assumed to be in range [0, \|key_bound\|).

	433 // Returns the suffix array as a vector.

	434 // \|StrRng\| is an input random access range.

	435 // \|KeyType\| is an unsigned integer type.

	436 template <class Algorithm, class StrRng, class KeyType>

	437 std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str,

	438 KeyType key_bound) {

	439 Algorithm sort;

	440 std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin());

	441 sort(str, key_bound, suffix_array.begin());

	442 return suffix_array;

	443 }

	444

	445 // Type requirements:

	446 // \|SARng\| is an input random access range.

	447 // \|StrIt1\| is a random access iterator.

	448 // \|StrIt2\| is a forward iterator.

	449 template <class SARng, class StrIt1, class StrIt2>

	450 // Lexicographical lower bound using binary search for

	451 // [\|str2_first\|, \|str2_last\|) in the suffix array \|suffix_array\| of a string

	452 // starting at \|str1_first\|. This does not necessarily return the index of

	453 // the longest matching substring.

	454 auto SuffixLowerBound(const SARng& suffix_array,

	455 StrIt1 str1_first,

	456 StrIt2 str2_first,

	457 StrIt2 str2_last) -> decltype(std::begin(suffix_array)) {

	458 using size_type = typename SARng::value_type;

	459

	460 size_t n = std::end(suffix_array) - std::begin(suffix_array);

	461 auto it = std::lower_bound(

	462 std::begin(suffix_array), std::end(suffix_array), str2_first,

	463 [str1_first, str2_last, n](size_type a, StrIt2 b) {

	464 return std::lexicographical_compare(str1_first + a, str1_first + n, b,

	465 str2_last);

	466 });

	467 return it;

	468 }

	469

	470 } // namespace zucchini

	471

	472 #endif // CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_

OLD	NEW

« no previous file with comments | « chrome/installer/zucchini/BUILD.gn ('k') | chrome/installer/zucchini/suffix_array_unittest.cc » ('j') | no next file with comments »