| // Copyright 2015 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "DocumentStatisticsCollector.h" |
| |
| #include "core/HTMLNames.h" |
| #include "core/InputTypeNames.h" |
| #include "core/dom/ElementTraversal.h" |
| #include "core/dom/NodeComputedStyle.h" |
| #include "core/dom/Text.h" |
| #include "core/frame/FrameHost.h" |
| #include "core/frame/LocalFrame.h" |
| #include "core/frame/VisualViewport.h" |
| #include "core/html/HTMLHeadElement.h" |
| #include "core/html/HTMLInputElement.h" |
| #include "core/html/HTMLMetaElement.h" |
| #include "platform/Histogram.h" |
| #include "public/platform/Platform.h" |
| #include "public/platform/WebDistillability.h" |
| |
| namespace blink { |
| |
| using namespace HTMLNames; |
| |
| namespace { |
| |
// The measured text length of a single paragraph is capped at this value to
// save time.
const int kTextContentLengthSaturation = 1000;

// Short P elements are filtered out. The threshold is roughly the length of
// two English sentences.
const unsigned kParagraphLengthThreshold = 140;

// The scores are capped as well to save time. Each cap is the score produced
// by 6 paragraphs of saturated length.

// 6 * sqrt(kTextContentLengthSaturation - kParagraphLengthThreshold)
const double kMozScoreSaturation = 175.954539583;

// 6 * sqrt(kTextContentLengthSaturation)
const double kMozScoreAllSqrtSaturation = 189.73665961;

// 6 * kTextContentLengthSaturation
const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation;
| |
| unsigned textContentLengthSaturated(const Element& root) |
| { |
| unsigned length = 0; |
| // This skips shadow DOM intentionally, to match the JavaScript implementation. |
| // We would like to use the same statistics extracted by the JavaScript implementation |
| // on iOS, and JavaScript cannot peek deeply into shadow DOM except on modern Chrome |
| // versions. |
| // Given shadow DOM rarely appears in <P> elements in long-form articles, the overall |
| // accuracy should not be largely affected. |
| for (Node& node : NodeTraversal::inclusiveDescendantsOf(root)) { |
| if (!node.isTextNode()) { |
| continue; |
| } |
| length += toText(node).length(); |
| if (length > kTextContentLengthSaturation) { |
| return kTextContentLengthSaturation; |
| } |
| } |
| return length; |
| } |
| |
| bool isVisible(const Element& element) |
| { |
| const ComputedStyle* style = element.computedStyle(); |
| if (!style) |
| return false; |
| return ( |
| style->display() != NONE |
| && style->visibility() != EVisibility::Hidden |
| && style->opacity() != 0 |
| ); |
| } |
| |
| bool matchAttributes(const Element& element, const Vector<String>& words) |
| { |
| const String& classes = element.getClassAttribute(); |
| const String& id = element.getIdAttribute(); |
| for (const String& word : words) { |
| if (classes.findIgnoringCase(word) != WTF::kNotFound |
| || id.findIgnoringCase(word) != WTF::kNotFound) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| bool isGoodForScoring(const WebDistillabilityFeatures& features, const Element& element) |
| { |
| DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ()); |
| if (unlikelyCandidates.isEmpty()) { |
| auto words = { |
| "banner", |
| "combx", |
| "comment", |
| "community", |
| "disqus", |
| "extra", |
| "foot", |
| "header", |
| "menu", |
| "related", |
| "remark", |
| "rss", |
| "share", |
| "shoutbox", |
| "sidebar", |
| "skyscraper", |
| "sponsor", |
| "ad-break", |
| "agegate", |
| "pagination", |
| "pager", |
| "popup" |
| }; |
| for (auto word : words) { |
| unlikelyCandidates.append(word); |
| } |
| } |
| DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ()); |
| if (highlyLikelyCandidates.isEmpty()) { |
| auto words = { |
| "and", |
| "article", |
| "body", |
| "column", |
| "main", |
| "shadow" |
| }; |
| for (auto word : words) { |
| highlyLikelyCandidates.append(word); |
| } |
| } |
| |
| if (!isVisible(element)) |
| return false; |
| if (features.mozScore >= kMozScoreSaturation |
| && features.mozScoreAllSqrt >= kMozScoreAllSqrtSaturation |
| && features.mozScoreAllLinear >= kMozScoreAllLinearSaturation) |
| return false; |
| if (matchAttributes(element, unlikelyCandidates) |
| && !matchAttributes(element, highlyLikelyCandidates)) |
| return false; |
| return true; |
| } |
| |
| // underListItem denotes that at least one of the ancesters is <li> element. |
| void collectFeatures(Element& root, WebDistillabilityFeatures& features, bool underListItem = false) |
| { |
| for (Node& node : NodeTraversal::childrenOf(root)) { |
| bool isListItem = false; |
| if (!node.isElementNode()) { |
| continue; |
| } |
| |
| features.elementCount++; |
| Element& element = toElement(node); |
| if (element.hasTagName(aTag)) { |
| features.anchorCount++; |
| } else if (element.hasTagName(formTag)) { |
| features.formCount++; |
| } else if (element.hasTagName(inputTag)) { |
| const HTMLInputElement& input = toHTMLInputElement(element); |
| if (input.type() == InputTypeNames::text) { |
| features.textInputCount++; |
| } else if (input.type() == InputTypeNames::password) { |
| features.passwordInputCount++; |
| } |
| } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) { |
| if (element.hasTagName(pTag)) { |
| features.pCount++; |
| } else { |
| features.preCount++; |
| } |
| if (!underListItem && isGoodForScoring(features, element)) { |
| unsigned length = textContentLengthSaturated(element); |
| if (length >= kParagraphLengthThreshold) { |
| features.mozScore += sqrt(length - kParagraphLengthThreshold); |
| features.mozScore = std::min(features.mozScore, kMozScoreSaturation); |
| } |
| features.mozScoreAllSqrt += sqrt(length); |
| features.mozScoreAllSqrt = std::min(features.mozScoreAllSqrt, kMozScoreAllSqrtSaturation); |
| |
| features.mozScoreAllLinear += length; |
| features.mozScoreAllLinear = std::min(features.mozScoreAllLinear, kMozScoreAllLinearSaturation); |
| } |
| } else if (element.hasTagName(liTag)) { |
| isListItem = true; |
| } |
| collectFeatures(element, features, underListItem || isListItem); |
| } |
| } |
| |
| bool hasOpenGraphArticle(const Element& head) |
| { |
| DEFINE_STATIC_LOCAL(AtomicString, ogType, ("og:type")); |
| DEFINE_STATIC_LOCAL(AtomicString, propertyAttr, ("property")); |
| for (const Element* child = ElementTraversal::firstChild(head); child; child = ElementTraversal::nextSibling(*child)) { |
| if (!isHTMLMetaElement(*child)) |
| continue; |
| const HTMLMetaElement& meta = toHTMLMetaElement(*child); |
| |
| if (meta.name() == ogType || meta.getAttribute(propertyAttr) == ogType) { |
| if (equalIgnoringCase(meta.content(), "article")) { |
| return true; |
| } |
| } |
| } |
| return false; |
| } |
| |
| bool isMobileFriendly(Document& document) |
| { |
| if (FrameHost* frameHost = document.frameHost()) |
| return frameHost->visualViewport().shouldDisableDesktopWorkarounds(); |
| return false; |
| } |
| |
| } // namespace |
| |
| WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document) |
| { |
| TRACE_EVENT0("blink", "DocumentStatisticsCollector::collectStatistics"); |
| |
| WebDistillabilityFeatures features = WebDistillabilityFeatures(); |
| |
| if (!document.frame() || !document.frame()->isMainFrame()) |
| return features; |
| |
| DCHECK(document.hasFinishedParsing()); |
| |
| HTMLElement* body = document.body(); |
| HTMLElement* head = document.head(); |
| |
| if (!body || !head) |
| return features; |
| |
| features.isMobileFriendly = isMobileFriendly(document); |
| |
| double startTime = monotonicallyIncreasingTime(); |
| |
| // This should be cheap since collectStatistics is only called right after layout. |
| document.updateStyleAndLayoutTree(); |
| |
| // Traverse the DOM tree and collect statistics. |
| collectFeatures(*body, features); |
| features.openGraph = hasOpenGraphArticle(*head); |
| |
| double elapsedTime = monotonicallyIncreasingTime() - startTime; |
| |
| DEFINE_STATIC_LOCAL(CustomCountHistogram, distillabilityHistogram, ("WebCore.DistillabilityUs", 1, 1000000, 50)); |
| distillabilityHistogram.count(static_cast<int>(1e6 * elapsedTime)); |
| |
| return features; |
| } |
| |
| } // namespace blink |