OLD | NEW |
(Empty) | |
| 1 // Copyright 2017 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "chrome/installer/zucchini/suffix_array.h" |
| 6 |
| 7 #include <algorithm> |
| 8 #include <cstddef> |
| 9 #include <initializer_list> |
| 10 #include <string> |
| 11 #include <vector> |
| 12 |
| 13 #include "testing/gtest/include/gtest/gtest.h" |
| 14 |
| 15 namespace zucchini { |
| 16 |
| 17 using ustring = std::basic_string<unsigned char>; |
| 18 |
| 19 constexpr uint16_t kNumChar = 256; |
| 20 |
| 21 ustring MakeUnsignedString(const std::string& str) { |
| 22 return {str.begin(), str.end()}; |
| 23 } |
| 24 |
| 25 template <class T> |
| 26 std::vector<T> MakeVector(const std::initializer_list<T>& ilist) { |
| 27 return {ilist.begin(), ilist.end()}; |
| 28 } |
| 29 |
| 30 void TestSlPartition(std::initializer_list<Sais::SLType> expected_sl_partition, |
| 31 std::initializer_list<size_t> expected_lms_indices, |
| 32 std::string str) { |
| 33 using SaisImpl = Sais::Implementation<size_t, uint16_t>; |
| 34 |
| 35 std::vector<Sais::SLType> sl_partition(str.size()); |
| 36 EXPECT_EQ(expected_lms_indices.size(), |
| 37 SaisImpl::BuildSLPartition(str.begin(), str.size(), kNumChar, |
| 38 sl_partition.rbegin())); |
| 39 EXPECT_EQ(MakeVector(expected_sl_partition), sl_partition); |
| 40 |
| 41 std::vector<size_t> lms_indices(expected_lms_indices.size()); |
| 42 SaisImpl::FindLmsSuffixes(expected_sl_partition, lms_indices.begin()); |
| 43 EXPECT_EQ(MakeVector(expected_lms_indices), lms_indices); |
| 44 } |
| 45 |
| 46 TEST(SaisTest, BuildSLPartition) { |
| 47 TestSlPartition({}, {}, ""); |
| 48 TestSlPartition( |
| 49 { |
| 50 Sais::LType, |
| 51 }, |
| 52 {}, "a"); |
| 53 TestSlPartition( |
| 54 { |
| 55 Sais::LType, Sais::LType, |
| 56 }, |
| 57 {}, "ba"); |
| 58 TestSlPartition( |
| 59 { |
| 60 Sais::SType, Sais::LType, |
| 61 }, |
| 62 {}, "ab"); |
| 63 TestSlPartition( |
| 64 { |
| 65 Sais::SType, Sais::SType, Sais::LType, |
| 66 }, |
| 67 {}, "aab"); |
| 68 TestSlPartition( |
| 69 { |
| 70 Sais::LType, Sais::LType, Sais::LType, |
| 71 }, |
| 72 {}, "bba"); |
| 73 TestSlPartition( |
| 74 { |
| 75 Sais::LType, Sais::SType, Sais::LType, |
| 76 }, |
| 77 {1}, "bab"); |
| 78 TestSlPartition( |
| 79 { |
| 80 Sais::LType, Sais::SType, Sais::SType, Sais::LType, |
| 81 }, |
| 82 {1}, "baab"); |
| 83 |
| 84 TestSlPartition( |
| 85 { |
| 86 Sais::LType, // zucchini |
| 87 Sais::LType, // ucchini |
| 88 Sais::SType, // cchini |
| 89 Sais::SType, // chini |
| 90 Sais::SType, // hini |
| 91 Sais::SType, // ini |
| 92 Sais::LType, // ni |
| 93 Sais::LType, // i |
| 94 }, |
| 95 {2}, "zucchini"); |
| 96 } |
| 97 |
| 98 std::vector<size_t> BucketCount(const std::initializer_list<unsigned char> str, |
| 99 uint16_t max_key) { |
| 100 using SaisImpl = Sais::Implementation<size_t, uint16_t>; |
| 101 return SaisImpl::MakeBucketCount(str.begin(), str.size(), max_key); |
| 102 } |
| 103 |
| 104 TEST(SaisTest, BucketCount) { |
| 105 using vec = std::vector<size_t>; |
| 106 |
| 107 EXPECT_EQ(vec({0, 0, 0, 0}), BucketCount({}, 4)); |
| 108 EXPECT_EQ(vec({1, 0, 0, 0}), BucketCount({0}, 4)); |
| 109 EXPECT_EQ(vec({0, 2, 0, 1}), BucketCount({1, 1, 3}, 4)); |
| 110 } |
| 111 |
| 112 std::vector<size_t> InducedSortSubstring(ustring str) { |
| 113 using SaisImpl = Sais::Implementation<size_t, uint16_t>; |
| 114 std::vector<Sais::SLType> sl_partition(str.size()); |
| 115 size_t lms_count = SaisImpl::BuildSLPartition( |
| 116 str.begin(), str.size(), kNumChar, sl_partition.rbegin()); |
| 117 std::vector<size_t> lms_indices(lms_count); |
| 118 SaisImpl::FindLmsSuffixes(sl_partition, lms_indices.begin()); |
| 119 auto buckets = SaisImpl::MakeBucketCount(str.begin(), str.size(), kNumChar); |
| 120 |
| 121 std::vector<size_t> suffix_array(str.size()); |
| 122 SaisImpl::InducedSort(str, str.size(), sl_partition, lms_indices, buckets, |
| 123 suffix_array.begin()); |
| 124 |
| 125 return suffix_array; |
| 126 } |
| 127 |
| 128 TEST(SaisTest, InducedSortSubstring) { |
| 129 using vec = std::vector<size_t>; |
| 130 |
| 131 auto us = MakeUnsignedString; |
| 132 |
| 133 // L; a$ |
| 134 EXPECT_EQ(vec({0}), InducedSortSubstring(us("a"))); |
| 135 |
| 136 // SL; ab$, b$ |
| 137 EXPECT_EQ(vec({0, 1}), InducedSortSubstring(us("ab"))); |
| 138 |
| 139 // LL; a$, ba$ |
| 140 EXPECT_EQ(vec({1, 0}), InducedSortSubstring(us("ba"))); |
| 141 |
| 142 // SLL; a$, aba$, ba$ |
| 143 EXPECT_EQ(vec({2, 0, 1}), InducedSortSubstring(us("aba"))); |
| 144 |
| 145 // LSL; ab$, b$, ba |
| 146 EXPECT_EQ(vec({1, 2, 0}), InducedSortSubstring(us("bab"))); |
| 147 |
| 148 // SSL; aab$, ab$, b$ |
| 149 EXPECT_EQ(vec({0, 1, 2}), InducedSortSubstring(us("aab"))); |
| 150 |
| 151 // LSSL; aab$, ab$, b$, ba |
| 152 EXPECT_EQ(vec({1, 2, 3, 0}), InducedSortSubstring(us("baab"))); |
| 153 } |
| 154 |
| 155 template <class Algorithm> |
| 156 void TestSuffixSort(ustring test_str) { |
| 157 std::vector<size_t> suffix_array = |
| 158 MakeSuffixArray<Algorithm>(test_str, kNumChar); |
| 159 EXPECT_EQ(test_str.size(), suffix_array.size()); |
| 160 |
| 161 // Expect that I[] is a permutation of [0, len]. |
| 162 std::vector<size_t> sorted_suffix(suffix_array.begin(), suffix_array.end()); |
| 163 std::sort(sorted_suffix.begin(), sorted_suffix.end()); |
| 164 for (size_t i = 0; i < test_str.size(); ++i) |
| 165 EXPECT_EQ(i, sorted_suffix[i]); |
| 166 |
| 167 // Expect that all suffixes are strictly ordered. |
| 168 auto end = test_str.end(); |
| 169 for (size_t i = 1; i < test_str.size(); ++i) { |
| 170 auto suf1 = test_str.begin() + suffix_array[i - 1]; |
| 171 auto suf2 = test_str.begin() + suffix_array[i]; |
| 172 bool is_less = std::lexicographical_compare(suf1, end, suf2, end); |
| 173 EXPECT_TRUE(is_less); |
| 174 } |
| 175 } |
| 176 |
| 177 constexpr const char* test_strs[] = { |
| 178 "", |
| 179 "a", |
| 180 "aa", |
| 181 "za", |
| 182 "CACAO", |
| 183 "aaaaa", |
| 184 "banana", |
| 185 "tobeornottobe", |
| 186 "The quick brown fox jumps over the lazy dog.", |
| 187 "elephantelephantelephantelephantelephant", |
| 188 "walawalawashington", |
| 189 "-------------------------", |
| 190 "011010011001011010010110011010010", |
| 191 "3141592653589793238462643383279502884197169399375105", |
| 192 "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD", |
| 193 "abccbaabccbaabccbaabccbaabccbaabccbaabccbaabccba", |
| 194 "0123456789876543210", |
| 195 "9876543210123456789", |
| 196 "aababcabcdabcdeabcdefabcdefg", |
| 197 "asdhklgalksdjghalksdjghalksdjgh", |
| 198 }; |
| 199 |
| 200 TEST(SuffixSortTest, NaiveSuffixSort) { |
| 201 for (const std::string& test_str : test_strs) { |
| 202 TestSuffixSort<NaiveSuffixSort>(MakeUnsignedString(test_str)); |
| 203 } |
| 204 } |
| 205 |
| 206 TEST(SuffixSortTest, SaisSort) { |
| 207 for (const std::string& test_str : test_strs) { |
| 208 TestSuffixSort<Sais>(MakeUnsignedString(test_str)); |
| 209 } |
| 210 } |
| 211 |
| 212 // Test with sequence that has every character. |
| 213 TEST(SuffixSortTest, AllChar) { |
| 214 std::vector<unsigned char> all_char(kNumChar); |
| 215 std::iota(all_char.begin(), all_char.end(), 0); |
| 216 |
| 217 { |
| 218 std::vector<size_t> suffix_array = |
| 219 MakeSuffixArray<Sais>(all_char, kNumChar); |
| 220 for (size_t i = 0; i < kNumChar; ++i) |
| 221 EXPECT_EQ(i, suffix_array[i]); |
| 222 } |
| 223 |
| 224 std::vector<unsigned char> all_char_reverse(all_char.rbegin(), |
| 225 all_char.rend()); |
| 226 { |
| 227 std::vector<size_t> suffix_array = |
| 228 MakeSuffixArray<Sais>(all_char_reverse, kNumChar); |
| 229 for (size_t i = 0; i < kNumChar; ++i) |
| 230 EXPECT_EQ(kNumChar - i - 1, suffix_array[i]); |
| 231 } |
| 232 } |
| 233 |
| 234 void TestSuffixLowerBound(ustring base_str, ustring search_str) { |
| 235 std::vector<size_t> suffix_array = |
| 236 MakeSuffixArray<NaiveSuffixSort>(base_str, kNumChar); |
| 237 |
| 238 auto pos = SuffixLowerBound(suffix_array, base_str.begin(), |
| 239 search_str.begin(), search_str.end()); |
| 240 |
| 241 auto end = base_str.end(); |
| 242 if (pos != suffix_array.begin()) { |
| 243 // Previous suffix is less than |search_str|. |
| 244 auto suf = base_str.begin() + pos[-1]; |
| 245 bool is_less = std::lexicographical_compare(suf, end, search_str.begin(), |
| 246 search_str.end()); |
| 247 EXPECT_TRUE(is_less); |
| 248 } |
| 249 if (pos != suffix_array.end()) { |
| 250 // Current suffix is greater of equal to |search_str|. |
| 251 auto suf = base_str.begin() + *pos; |
| 252 bool is_less = std::lexicographical_compare(suf, end, search_str.begin(), |
| 253 search_str.end()); |
| 254 EXPECT_FALSE(is_less); |
| 255 } |
| 256 } |
| 257 |
| 258 TEST(SuffixArrayTest, LowerBound) { |
| 259 auto us = MakeUnsignedString; |
| 260 |
| 261 TestSuffixLowerBound(us(""), us("")); |
| 262 TestSuffixLowerBound(us(""), us("a")); |
| 263 TestSuffixLowerBound(us("b"), us("")); |
| 264 TestSuffixLowerBound(us("b"), us("a")); |
| 265 TestSuffixLowerBound(us("b"), us("c")); |
| 266 TestSuffixLowerBound(us("b"), us("bc")); |
| 267 TestSuffixLowerBound(us("aa"), us("a")); |
| 268 TestSuffixLowerBound(us("aa"), us("aa")); |
| 269 |
| 270 ustring sentence = us("the quick brown fox jumps over the lazy dog."); |
| 271 // Entire string: exact and unique. |
| 272 TestSuffixLowerBound(sentence, sentence); |
| 273 // Empty string: exact and non-unique. |
| 274 TestSuffixLowerBound(sentence, us("")); |
| 275 // Exact and unique suffix matches. |
| 276 TestSuffixLowerBound(sentence, us(".")); |
| 277 TestSuffixLowerBound(sentence, us("the lazy dog.")); |
| 278 // Exact and unique non-suffix matches. |
| 279 TestSuffixLowerBound(sentence, us("quick")); |
| 280 TestSuffixLowerBound(sentence, us("the quick")); |
| 281 // Partial and unique matches. |
| 282 TestSuffixLowerBound(sentence, us("fox jumps with the hosps")); |
| 283 TestSuffixLowerBound(sentence, us("xyz")); |
| 284 // Exact and non-unique match: take lexicographical first. |
| 285 TestSuffixLowerBound(sentence, us("the")); |
| 286 TestSuffixLowerBound(sentence, us(" ")); |
| 287 // Partial and non-unique match. |
| 288 // query < "the l"... < "the q"... |
| 289 TestSuffixLowerBound(sentence, us("the apple")); |
| 290 // "the l"... < query < "the q"... |
| 291 TestSuffixLowerBound(sentence, us("the opera")); |
| 292 // "the l"... < "the q"... < query |
| 293 TestSuffixLowerBound(sentence, us("the zebra")); |
| 294 // Prefix match dominates suffix match (unique). |
| 295 TestSuffixLowerBound(sentence, us("over quick brown fox")); |
| 296 // Empty matchs. |
| 297 TestSuffixLowerBound(sentence, us(",")); |
| 298 TestSuffixLowerBound(sentence, us("1234")); |
| 299 TestSuffixLowerBound(sentence, us("THE QUICK BROWN FOX")); |
| 300 TestSuffixLowerBound(sentence, us("(the")); |
| 301 } |
| 302 |
| 303 TEST(SuffixArrayTest, LowerBoundExact) { |
| 304 for (const std::string& test_str : test_strs) { |
| 305 ustring test_ustr = MakeUnsignedString(test_str); |
| 306 |
| 307 std::vector<size_t> suffix_array = |
| 308 MakeSuffixArray<Sais>(test_ustr, kNumChar); |
| 309 |
| 310 for (size_t lo = 0; lo < test_str.size(); ++lo) { |
| 311 for (size_t hi = lo + 1; hi <= test_str.size(); ++hi) { |
| 312 ustring query(test_ustr.begin() + lo, test_ustr.begin() + hi); |
| 313 ASSERT_EQ(query.size(), hi - lo); |
| 314 auto pos = SuffixLowerBound(suffix_array, test_ustr.begin(), |
| 315 query.begin(), query.end()); |
| 316 EXPECT_TRUE( |
| 317 std::equal(query.begin(), query.end(), test_ustr.begin() + *pos)); |
| 318 } |
| 319 } |
| 320 } |
| 321 } |
| 322 |
| 323 } // namespace zucchini |
OLD | NEW |