Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(52)

Side by Side Diff: chrome/installer/zucchini/suffix_array_unittest.cc

Issue 2963463002: [Zucchini] Generic suffix array algorithms. (Closed)
Patch Set: Rebase Created 3 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/installer/zucchini/suffix_array.h"
6
7 #include <algorithm>
8 #include <cstddef>
9 #include <initializer_list>
10 #include <string>
11 #include <vector>
12
13 #include "testing/gtest/include/gtest/gtest.h"
14
15 namespace zucchini {
16
17 using ustring = std::basic_string<unsigned char>;
18
19 constexpr uint16_t kNumChar = 256;
20
21 ustring MakeUnsignedString(const std::string& str) {
22 return {str.begin(), str.end()};
23 }
24
25 template <class T>
26 std::vector<T> MakeVector(const std::initializer_list<T>& ilist) {
27 return {ilist.begin(), ilist.end()};
28 }
29
30 void TestSlPartition(std::initializer_list<Sais::SLType> expected_sl_partition,
31 std::initializer_list<size_t> expected_lms_indices,
32 std::string str) {
33 using SaisImpl = Sais::Implementation<size_t, uint16_t>;
34
35 std::vector<Sais::SLType> sl_partition(str.size());
36 EXPECT_EQ(expected_lms_indices.size(),
37 SaisImpl::BuildSLPartition(str.begin(), str.size(), kNumChar,
38 sl_partition.rbegin()));
39 EXPECT_EQ(MakeVector(expected_sl_partition), sl_partition);
40
41 std::vector<size_t> lms_indices(expected_lms_indices.size());
42 SaisImpl::FindLmsSuffixes(expected_sl_partition, lms_indices.begin());
43 EXPECT_EQ(MakeVector(expected_lms_indices), lms_indices);
44 }
45
46 TEST(SaisTest, BuildSLPartition) {
47 TestSlPartition({}, {}, "");
48 TestSlPartition(
49 {
50 Sais::LType,
51 },
52 {}, "a");
53 TestSlPartition(
54 {
55 Sais::LType, Sais::LType,
56 },
57 {}, "ba");
58 TestSlPartition(
59 {
60 Sais::SType, Sais::LType,
61 },
62 {}, "ab");
63 TestSlPartition(
64 {
65 Sais::SType, Sais::SType, Sais::LType,
66 },
67 {}, "aab");
68 TestSlPartition(
69 {
70 Sais::LType, Sais::LType, Sais::LType,
71 },
72 {}, "bba");
73 TestSlPartition(
74 {
75 Sais::LType, Sais::SType, Sais::LType,
76 },
77 {1}, "bab");
78 TestSlPartition(
79 {
80 Sais::LType, Sais::SType, Sais::SType, Sais::LType,
81 },
82 {1}, "baab");
83
84 TestSlPartition(
85 {
86 Sais::LType, // zucchini
87 Sais::LType, // ucchini
88 Sais::SType, // cchini
89 Sais::SType, // chini
90 Sais::SType, // hini
91 Sais::SType, // ini
92 Sais::LType, // ni
93 Sais::LType, // i
94 },
95 {2}, "zucchini");
96 }
97
98 std::vector<size_t> BucketCount(const std::initializer_list<unsigned char> str,
99 uint16_t max_key) {
100 using SaisImpl = Sais::Implementation<size_t, uint16_t>;
101 return SaisImpl::MakeBucketCount(str.begin(), str.size(), max_key);
102 }
103
104 TEST(SaisTest, BucketCount) {
105 using vec = std::vector<size_t>;
106
107 EXPECT_EQ(vec({0, 0, 0, 0}), BucketCount({}, 4));
108 EXPECT_EQ(vec({1, 0, 0, 0}), BucketCount({0}, 4));
109 EXPECT_EQ(vec({0, 2, 0, 1}), BucketCount({1, 1, 3}, 4));
110 }
111
112 std::vector<size_t> InducedSortSubstring(ustring str) {
113 using SaisImpl = Sais::Implementation<size_t, uint16_t>;
114 std::vector<Sais::SLType> sl_partition(str.size());
115 size_t lms_count = SaisImpl::BuildSLPartition(
116 str.begin(), str.size(), kNumChar, sl_partition.rbegin());
117 std::vector<size_t> lms_indices(lms_count);
118 SaisImpl::FindLmsSuffixes(sl_partition, lms_indices.begin());
119 auto buckets = SaisImpl::MakeBucketCount(str.begin(), str.size(), kNumChar);
120
121 std::vector<size_t> suffix_array(str.size());
122 SaisImpl::InducedSort(str, str.size(), sl_partition, lms_indices, buckets,
123 suffix_array.begin());
124
125 return suffix_array;
126 }
127
128 TEST(SaisTest, InducedSortSubstring) {
129 using vec = std::vector<size_t>;
130
131 auto us = MakeUnsignedString;
132
133 // L; a$
134 EXPECT_EQ(vec({0}), InducedSortSubstring(us("a")));
135
136 // SL; ab$, b$
137 EXPECT_EQ(vec({0, 1}), InducedSortSubstring(us("ab")));
138
139 // LL; a$, ba$
140 EXPECT_EQ(vec({1, 0}), InducedSortSubstring(us("ba")));
141
142 // SLL; a$, aba$, ba$
143 EXPECT_EQ(vec({2, 0, 1}), InducedSortSubstring(us("aba")));
144
145 // LSL; ab$, b$, ba
146 EXPECT_EQ(vec({1, 2, 0}), InducedSortSubstring(us("bab")));
147
148 // SSL; aab$, ab$, b$
149 EXPECT_EQ(vec({0, 1, 2}), InducedSortSubstring(us("aab")));
150
151 // LSSL; aab$, ab$, b$, ba
152 EXPECT_EQ(vec({1, 2, 3, 0}), InducedSortSubstring(us("baab")));
153 }
154
155 template <class Algorithm>
156 void TestSuffixSort(ustring test_str) {
157 std::vector<size_t> suffix_array =
158 MakeSuffixArray<Algorithm>(test_str, kNumChar);
159 EXPECT_EQ(test_str.size(), suffix_array.size());
160
161 // Expect that I[] is a permutation of [0, len].
162 std::vector<size_t> sorted_suffix(suffix_array.begin(), suffix_array.end());
163 std::sort(sorted_suffix.begin(), sorted_suffix.end());
164 for (size_t i = 0; i < test_str.size(); ++i)
165 EXPECT_EQ(i, sorted_suffix[i]);
166
167 // Expect that all suffixes are strictly ordered.
168 auto end = test_str.end();
169 for (size_t i = 1; i < test_str.size(); ++i) {
170 auto suf1 = test_str.begin() + suffix_array[i - 1];
171 auto suf2 = test_str.begin() + suffix_array[i];
172 bool is_less = std::lexicographical_compare(suf1, end, suf2, end);
173 EXPECT_TRUE(is_less);
174 }
175 }
176
177 constexpr const char* test_strs[] = {
178 "",
179 "a",
180 "aa",
181 "za",
182 "CACAO",
183 "aaaaa",
184 "banana",
185 "tobeornottobe",
186 "The quick brown fox jumps over the lazy dog.",
187 "elephantelephantelephantelephantelephant",
188 "walawalawashington",
189 "-------------------------",
190 "011010011001011010010110011010010",
191 "3141592653589793238462643383279502884197169399375105",
192 "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD",
193 "abccbaabccbaabccbaabccbaabccbaabccbaabccbaabccba",
194 "0123456789876543210",
195 "9876543210123456789",
196 "aababcabcdabcdeabcdefabcdefg",
197 "asdhklgalksdjghalksdjghalksdjgh",
198 };
199
200 TEST(SuffixSortTest, NaiveSuffixSort) {
201 for (const std::string& test_str : test_strs) {
202 TestSuffixSort<NaiveSuffixSort>(MakeUnsignedString(test_str));
203 }
204 }
205
206 TEST(SuffixSortTest, SaisSort) {
207 for (const std::string& test_str : test_strs) {
208 TestSuffixSort<Sais>(MakeUnsignedString(test_str));
209 }
210 }
211
212 // Test with sequence that has every character.
213 TEST(SuffixSortTest, AllChar) {
214 std::vector<unsigned char> all_char(kNumChar);
215 std::iota(all_char.begin(), all_char.end(), 0);
216
217 {
218 std::vector<size_t> suffix_array =
219 MakeSuffixArray<Sais>(all_char, kNumChar);
220 for (size_t i = 0; i < kNumChar; ++i)
221 EXPECT_EQ(i, suffix_array[i]);
222 }
223
224 std::vector<unsigned char> all_char_reverse(all_char.rbegin(),
225 all_char.rend());
226 {
227 std::vector<size_t> suffix_array =
228 MakeSuffixArray<Sais>(all_char_reverse, kNumChar);
229 for (size_t i = 0; i < kNumChar; ++i)
230 EXPECT_EQ(kNumChar - i - 1, suffix_array[i]);
231 }
232 }
233
234 void TestSuffixLowerBound(ustring base_str, ustring search_str) {
235 std::vector<size_t> suffix_array =
236 MakeSuffixArray<NaiveSuffixSort>(base_str, kNumChar);
237
238 auto pos = SuffixLowerBound(suffix_array, base_str.begin(),
239 search_str.begin(), search_str.end());
240
241 auto end = base_str.end();
242 if (pos != suffix_array.begin()) {
243 // Previous suffix is less than |search_str|.
244 auto suf = base_str.begin() + pos[-1];
245 bool is_less = std::lexicographical_compare(suf, end, search_str.begin(),
246 search_str.end());
247 EXPECT_TRUE(is_less);
248 }
249 if (pos != suffix_array.end()) {
250 // Current suffix is greater of equal to |search_str|.
251 auto suf = base_str.begin() + *pos;
252 bool is_less = std::lexicographical_compare(suf, end, search_str.begin(),
253 search_str.end());
254 EXPECT_FALSE(is_less);
255 }
256 }
257
258 TEST(SuffixArrayTest, LowerBound) {
259 auto us = MakeUnsignedString;
260
261 TestSuffixLowerBound(us(""), us(""));
262 TestSuffixLowerBound(us(""), us("a"));
263 TestSuffixLowerBound(us("b"), us(""));
264 TestSuffixLowerBound(us("b"), us("a"));
265 TestSuffixLowerBound(us("b"), us("c"));
266 TestSuffixLowerBound(us("b"), us("bc"));
267 TestSuffixLowerBound(us("aa"), us("a"));
268 TestSuffixLowerBound(us("aa"), us("aa"));
269
270 ustring sentence = us("the quick brown fox jumps over the lazy dog.");
271 // Entire string: exact and unique.
272 TestSuffixLowerBound(sentence, sentence);
273 // Empty string: exact and non-unique.
274 TestSuffixLowerBound(sentence, us(""));
275 // Exact and unique suffix matches.
276 TestSuffixLowerBound(sentence, us("."));
277 TestSuffixLowerBound(sentence, us("the lazy dog."));
278 // Exact and unique non-suffix matches.
279 TestSuffixLowerBound(sentence, us("quick"));
280 TestSuffixLowerBound(sentence, us("the quick"));
281 // Partial and unique matches.
282 TestSuffixLowerBound(sentence, us("fox jumps with the hosps"));
283 TestSuffixLowerBound(sentence, us("xyz"));
284 // Exact and non-unique match: take lexicographical first.
285 TestSuffixLowerBound(sentence, us("the"));
286 TestSuffixLowerBound(sentence, us(" "));
287 // Partial and non-unique match.
288 // query < "the l"... < "the q"...
289 TestSuffixLowerBound(sentence, us("the apple"));
290 // "the l"... < query < "the q"...
291 TestSuffixLowerBound(sentence, us("the opera"));
292 // "the l"... < "the q"... < query
293 TestSuffixLowerBound(sentence, us("the zebra"));
294 // Prefix match dominates suffix match (unique).
295 TestSuffixLowerBound(sentence, us("over quick brown fox"));
296 // Empty matchs.
297 TestSuffixLowerBound(sentence, us(","));
298 TestSuffixLowerBound(sentence, us("1234"));
299 TestSuffixLowerBound(sentence, us("THE QUICK BROWN FOX"));
300 TestSuffixLowerBound(sentence, us("(the"));
301 }
302
303 TEST(SuffixArrayTest, LowerBoundExact) {
304 for (const std::string& test_str : test_strs) {
305 ustring test_ustr = MakeUnsignedString(test_str);
306
307 std::vector<size_t> suffix_array =
308 MakeSuffixArray<Sais>(test_ustr, kNumChar);
309
310 for (size_t lo = 0; lo < test_str.size(); ++lo) {
311 for (size_t hi = lo + 1; hi <= test_str.size(); ++hi) {
312 ustring query(test_ustr.begin() + lo, test_ustr.begin() + hi);
313 ASSERT_EQ(query.size(), hi - lo);
314 auto pos = SuffixLowerBound(suffix_array, test_ustr.begin(),
315 query.begin(), query.end());
316 EXPECT_TRUE(
317 std::equal(query.begin(), query.end(), test_ustr.begin() + *pos));
318 }
319 }
320 }
321 }
322
323 } // namespace zucchini
OLDNEW
« chrome/installer/zucchini/suffix_array.h ('K') | « chrome/installer/zucchini/suffix_array.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698