// blob: 6b47d4410f790fe71542499d378b4f0edbba5339 [file] [log] [blame]
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "core/dom/DocumentStatisticsCollector.h"

#include "core/dom/Document.h"
#include "core/frame/FrameView.h"
#include "core/html/HTMLHeadElement.h"
#include "core/html/HTMLLinkElement.h"
#include "core/testing/DummyPageHolder.h"
#include "public/platform/WebDistillability.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "wtf/text/StringBuilder.h"
#include <cmath>
#include <memory>
namespace blink {
// Cap applied to a paragraph's text length (saturating it saves time). The
// tests below use it to compute the expected saturated scores.
constexpr unsigned kTextContentLengthSaturation = 1000u;
// Short P elements are filtered out; the cut-off of 140 characters is about
// two English sentences.
constexpr unsigned kParagraphLengthThreshold = 140u;
// Test fixture: owns a DummyPageHolder so that each test can load markup into
// its document and hand that document to DocumentStatisticsCollector.
class DocumentStatisticsCollectorTest : public ::testing::Test {
 protected:
  // Defined out of line; creates the dummy page holder.
  void SetUp() override;

  // Runs a full garbage collection once a test has finished.
  void TearDown() override {
    ThreadState::current()->collectAllGarbage();
  }

  // Sets the inner HTML of the document element to the given markup.
  void setHtmlInnerHTML(const String&);

  // Document belonging to the dummy page holder.
  Document& document() const { return m_dummyPageHolder->document(); }

 private:
  std::unique_ptr<DummyPageHolder> m_dummyPageHolder;
};
void DocumentStatisticsCollectorTest::SetUp() {
m_dummyPageHolder = DummyPageHolder::create(IntSize(800, 600));
}
// Replaces the inner HTML of the document element with |htmlContent|.
// ASSERT_NO_EXCEPTION is passed as the exception state, so the markup is
// expected to be accepted without raising an exception.
void DocumentStatisticsCollectorTest::setHtmlInnerHTML(
    const String& htmlContent) {
  auto* rootElement = document().documentElement();
  rootElement->setInnerHTML(htmlContent, ASSERT_NO_EXCEPTION);
}
// Checks that a document whose og:type meta tag says "article" is reported as
// an open graph article.
TEST_F(DocumentStatisticsCollectorTest, HasOpenGraphArticle) {
  // The content value is deliberately mixed-case: the word "article" is
  // matched case-insensitively.
  const char* const markup =
      "<head>"
      " <meta property='og:type' content='arTiclE' />"
      "</head>";
  setHtmlInnerHTML(markup);

  const WebDistillabilityFeatures stats =
      DocumentStatisticsCollector::collectStatistics(document());

  EXPECT_TRUE(stats.openGraph);
}
// Checks that a document whose og:type is something other than "article"
// (here: "movie") is not reported as an open graph article.
TEST_F(DocumentStatisticsCollectorTest, NoOpenGraphArticle) {
  const char* const markup =
      "<head>"
      " <meta property='og:type' content='movie' />"
      "</head>";
  setHtmlInnerHTML(markup);

  const WebDistillabilityFeatures stats =
      DocumentStatisticsCollector::collectStatistics(document());

  EXPECT_FALSE(stats.openGraph);
}
// Checks the per-element-type counters reported for a small document.
TEST_F(DocumentStatisticsCollectorTest, CountElements) {
  // The markup holds 10 elements in total: 1 form, 2 inputs (one text, one
  // password), 1 pre, 2 p, 2 a, 1 ul and 1 li.
  const char* const markup =
      "<form>"
      " <input type='text'>"
      " <input type='password'>"
      "</form>"
      "<pre></pre>"
      "<p><a> </a></p>"
      "<ul><li><p><a> </a></p></li></ul>";
  setHtmlInnerHTML(markup);

  const WebDistillabilityFeatures stats =
      DocumentStatisticsCollector::collectStatistics(document());

  // No og:type meta tag is present in the markup.
  EXPECT_FALSE(stats.openGraph);

  EXPECT_EQ(10u, stats.elementCount);
  EXPECT_EQ(2u, stats.anchorCount);
  EXPECT_EQ(1u, stats.formCount);
  EXPECT_EQ(1u, stats.textInputCount);
  EXPECT_EQ(1u, stats.passwordInputCount);
  EXPECT_EQ(2u, stats.pCount);
  EXPECT_EQ(1u, stats.preCount);
}
// Checks the three score features (mozScore, mozScoreAllSqrt,
// mozScoreAllLinear) for a document mixing paragraphs that count towards the
// scores with paragraphs that are expected to be skipped.
TEST_F(DocumentStatisticsCollectorTest, CountScore) {
  setHtmlInnerHTML(
      // textContentLength = 1
      "<p class='menu' id='article'>1</p>"
      // textContentLength = 2, skipped because under li
      "<ul><li><p>12</p></li></ul>"
      // textContentLength = 3, skipped because unlikelyCandidates
      "<p class='menu'>123</p>"
      // textContentLength = 144
      "<p>"
      "12345678901234567890123456789012345678901234567890"
      "12345678901234567890123456789012345678901234567890"
      "12345678901234567890123456789012345678901234"
      "</p>"
      // textContentLength = 5, skipped because invisible
      "<p style='display:none'>12345</p>"
      // textContentLength = 6, skipped because invisible
      "<div style='display:none'><p>123456</p></div>"
      // textContentLength = 7, skipped because invisible
      "<div style='visibility:hidden'><p>1234567</p></div>"
      // textContentLength = 8, skipped because invisible
      "<p style='opacity:0'>12345678</p>"
      // textContentLength = 9
      "<p><a href='#'>1234 </a>6 <b> 9</b></p>"
      // textContentLength = 12
      "<ul><li></li><p>123456789012</p></ul>");

  const WebDistillabilityFeatures features =
      DocumentStatisticsCollector::collectStatistics(document());

  // Only the 144-character paragraph is longer than
  // kParagraphLengthThreshold, so it is the sole contributor to mozScore.
  EXPECT_DOUBLE_EQ(features.mozScore,
                   std::sqrt(144 - kParagraphLengthThreshold));
  // The "All" variants sum over every non-skipped paragraph (lengths 1, 144,
  // 9 and 12) without applying the length threshold.
  EXPECT_DOUBLE_EQ(features.mozScoreAllSqrt,
                   1 + std::sqrt(144) + std::sqrt(9) + std::sqrt(12));
  EXPECT_DOUBLE_EQ(features.mozScoreAllLinear, 1 + 144 + 9 + 12);
}
// Checks that the score features saturate when the document holds many very
// long paragraphs.
TEST_F(DocumentStatisticsCollectorTest, CountScoreSaturation) {
  // Build 10 paragraphs of 10,000 characters each, so every paragraph is far
  // longer than kTextContentLengthSaturation.
  StringBuilder html;
  for (int i = 0; i < 10; i++) {
    html.append("<p>");
    for (int j = 0; j < 1000; j++) {
      html.append("0123456789");
    }
    html.append("</p>");
  }
  setHtmlInnerHTML(html.toString());

  const WebDistillabilityFeatures features =
      DocumentStatisticsCollector::collectStatistics(document());

  // NOTE(review): the factor 6 (rather than the 10 paragraphs present)
  // presumably reflects an overall cap on each score equal to six saturated
  // paragraphs -- confirm against DocumentStatisticsCollector.cpp.
  const double error = 1e-5;
  EXPECT_NEAR(features.mozScore,
              6 * std::sqrt(kTextContentLengthSaturation -
                            kParagraphLengthThreshold),
              error);
  EXPECT_NEAR(features.mozScoreAllSqrt,
              6 * std::sqrt(kTextContentLengthSaturation), error);
  EXPECT_NEAR(features.mozScoreAllLinear, 6 * kTextContentLengthSaturation,
              error);
}
} // namespace blink