OLD | NEW |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/dom_distiller/content/renderer/distillability_agent.h" | 5 #include "components/dom_distiller/content/renderer/distillability_agent.h" |
6 | 6 |
7 #include "base/metrics/histogram_macros.h" | 7 #include "base/metrics/histogram_macros.h" |
8 #include "base/strings/string_util.h" | 8 #include "base/strings/string_util.h" |
9 #include "components/dom_distiller/content/common/distillability_service.mojom.h" | 9 #include "components/dom_distiller/content/common/distillability_service.mojom.h" |
10 #include "components/dom_distiller/core/distillable_page_detector.h" | 10 #include "components/dom_distiller/core/distillable_page_detector.h" |
(...skipping 23 matching lines...) Expand all Loading... |
34 BLACKLISTED, | 34 BLACKLISTED, |
35 TOO_SHORT, | 35 TOO_SHORT, |
36 NOT_REJECTED, | 36 NOT_REJECTED, |
37 REJECTION_BUCKET_BOUNDARY | 37 REJECTION_BUCKET_BOUNDARY |
38 }; | 38 }; |
39 | 39 |
40 // Returns whether it is necessary to send updates back to the browser. | 40 // Returns whether it is necessary to send updates back to the browser. |
41 // The number of updates can be from 0 to 2. See the tests in | 41 // The number of updates can be from 0 to 2. See the tests in |
42 // "distillable_page_utils_browsertest.cc". | 42 // "distillable_page_utils_browsertest.cc". |
43 // Most heuristics types only require one update after parsing. | 43 // Most heuristics types only require one update after parsing. |
44 // Adaboost is the only one doing the second update, which is after loading. | 44 // Adaboost-based heuristics are the only ones doing the second update, |
| 45 // which is after loading. |
45 bool NeedToUpdate(bool is_loaded) { | 46 bool NeedToUpdate(bool is_loaded) { |
46 switch (GetDistillerHeuristicsType()) { | 47 switch (GetDistillerHeuristicsType()) { |
47 case DistillerHeuristicsType::ALWAYS_TRUE: | 48 case DistillerHeuristicsType::ALWAYS_TRUE: |
48 return !is_loaded; | 49 return !is_loaded; |
49 case DistillerHeuristicsType::OG_ARTICLE: | 50 case DistillerHeuristicsType::OG_ARTICLE: |
50 return !is_loaded; | 51 return !is_loaded; |
51 case DistillerHeuristicsType::ADABOOST_MODEL: | 52 case DistillerHeuristicsType::ADABOOST_MODEL: |
| 53 case DistillerHeuristicsType::ALL_ARTICLES: |
52 return true; | 54 return true; |
53 case DistillerHeuristicsType::NONE: | 55 case DistillerHeuristicsType::NONE: |
54 default: | 56 default: |
55 return false; | 57 return false; |
56 } | 58 } |
57 } | 59 } |
58 | 60 |
59 // Returns whether this update is the last one for the page. | 61 // Returns whether this update is the last one for the page. |
60 bool IsLast(bool is_loaded) { | 62 bool IsLast(bool is_loaded) { |
61 if (GetDistillerHeuristicsType() == DistillerHeuristicsType::ADABOOST_MODEL) | 63 if (GetDistillerHeuristicsType() == DistillerHeuristicsType::ADABOOST_MODEL || |
| 64 GetDistillerHeuristicsType() == DistillerHeuristicsType::ALL_ARTICLES) |
62 return is_loaded; | 65 return is_loaded; |
63 | 66 |
64 return true; | 67 return true; |
65 } | 68 } |
66 | 69 |
67 bool IsBlacklisted(const GURL& url) { | 70 bool IsBlacklisted(const GURL& url) { |
68 for (size_t i = 0; i < arraysize(kBlacklist); ++i) { | 71 for (size_t i = 0; i < arraysize(kBlacklist); ++i) { |
69 if (base::LowerCaseEqualsASCII(url.host(), kBlacklist[i])) { | 72 if (base::LowerCaseEqualsASCII(url.host(), kBlacklist[i])) { |
70 return true; | 73 return true; |
71 } | 74 } |
72 } | 75 } |
73 return false; | 76 return false; |
74 } | 77 } |
75 | 78 |
76 bool IsDistillablePageAdaboost(WebDocument& doc, | 79 bool IsDistillablePageAdaboost(WebDocument& doc, |
77 const DistillablePageDetector* detector, | 80 const DistillablePageDetector* detector, |
78 const DistillablePageDetector* long_page, | 81 const DistillablePageDetector* long_page, |
79 bool is_last) { | 82 bool is_last, |
| 83 bool exclude_mobile) { |
80 WebDistillabilityFeatures features = doc.DistillabilityFeatures(); | 84 WebDistillabilityFeatures features = doc.DistillabilityFeatures(); |
81 GURL parsed_url(doc.Url()); | 85 GURL parsed_url(doc.Url()); |
82 if (!parsed_url.is_valid()) { | 86 if (!parsed_url.is_valid()) { |
83 return false; | 87 return false; |
84 } | 88 } |
85 std::vector<double> derived = CalculateDerivedFeatures( | 89 std::vector<double> derived = CalculateDerivedFeatures( |
86 features.open_graph, parsed_url, features.element_count, | 90 features.open_graph, parsed_url, features.element_count, |
87 features.anchor_count, features.form_count, features.moz_score, | 91 features.anchor_count, features.form_count, features.moz_score, |
88 features.moz_score_all_sqrt, features.moz_score_all_linear); | 92 features.moz_score_all_sqrt, features.moz_score_all_linear); |
89 double score = detector->Score(derived) - detector->GetThreshold(); | 93 double score = detector->Score(derived) - detector->GetThreshold(); |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
138 TOO_SHORT, REJECTION_BUCKET_BOUNDARY); | 142 TOO_SHORT, REJECTION_BUCKET_BOUNDARY); |
139 } else { | 143 } else { |
140 UMA_HISTOGRAM_ENUMERATION("DomDistiller.DistillabilityRejection", | 144 UMA_HISTOGRAM_ENUMERATION("DomDistiller.DistillabilityRejection", |
141 NOT_REJECTED, REJECTION_BUCKET_BOUNDARY); | 145 NOT_REJECTED, REJECTION_BUCKET_BOUNDARY); |
142 } | 146 } |
143 } | 147 } |
144 | 148 |
145 if (blacklisted) { | 149 if (blacklisted) { |
146 return false; | 150 return false; |
147 } | 151 } |
148 if (features.is_mobile_friendly) { | 152 if (exclude_mobile && features.is_mobile_friendly) { |
149 return false; | 153 return false; |
150 } | 154 } |
151 return distillable && long_article; | 155 return distillable && long_article; |
152 } | 156 } |
153 | 157 |
154 bool IsDistillablePage(WebDocument& doc, bool is_last) { | 158 bool IsDistillablePage(WebDocument& doc, bool is_last) { |
155 switch (GetDistillerHeuristicsType()) { | 159 switch (GetDistillerHeuristicsType()) { |
156 case DistillerHeuristicsType::ALWAYS_TRUE: | 160 case DistillerHeuristicsType::ALWAYS_TRUE: |
157 return true; | 161 return true; |
158 case DistillerHeuristicsType::OG_ARTICLE: | 162 case DistillerHeuristicsType::OG_ARTICLE: |
159 return doc.DistillabilityFeatures().open_graph; | 163 return doc.DistillabilityFeatures().open_graph; |
160 case DistillerHeuristicsType::ADABOOST_MODEL: | 164 case DistillerHeuristicsType::ADABOOST_MODEL: |
161 return IsDistillablePageAdaboost(doc, | 165 return IsDistillablePageAdaboost( |
162 DistillablePageDetector::GetNewModel(), | 166 doc, DistillablePageDetector::GetNewModel(), |
163 DistillablePageDetector::GetLongPageModel(), is_last); | 167 DistillablePageDetector::GetLongPageModel(), is_last, true); |
| 168 case DistillerHeuristicsType::ALL_ARTICLES: |
| 169 return IsDistillablePageAdaboost( |
| 170 doc, DistillablePageDetector::GetNewModel(), |
| 171 DistillablePageDetector::GetLongPageModel(), is_last, false); |
164 case DistillerHeuristicsType::NONE: | 172 case DistillerHeuristicsType::NONE: |
165 default: | 173 default: |
166 return false; | 174 return false; |
167 } | 175 } |
168 } | 176 } |
169 | 177 |
170 } // namespace | 178 } // namespace |
171 | 179 |
172 DistillabilityAgent::DistillabilityAgent( | 180 DistillabilityAgent::DistillabilityAgent( |
173 content::RenderFrame* render_frame) | 181 content::RenderFrame* render_frame) |
(...skipping 30 matching lines...) Expand all Loading... |
204 IsDistillablePage(doc, is_last), is_last); | 212 IsDistillablePage(doc, is_last), is_last); |
205 } | 213 } |
206 | 214 |
207 DistillabilityAgent::~DistillabilityAgent() {} | 215 DistillabilityAgent::~DistillabilityAgent() {} |
208 | 216 |
209 void DistillabilityAgent::OnDestruct() { | 217 void DistillabilityAgent::OnDestruct() { |
210 delete this; | 218 delete this; |
211 } | 219 } |
212 | 220 |
213 } // namespace dom_distiller | 221 } // namespace dom_distiller |
OLD | NEW |