| /* |
| * Copyright (C) 2011 Google Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are |
| * met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above |
| * copyright notice, this list of conditions and the following disclaimer |
| * in the documentation and/or other materials provided with the |
| * distribution. |
| * * Neither the name of Google Inc. nor the names of its |
| * contributors may be used to endorse or promote products derived from |
| * this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "core/frame/FrameSerializer.h" |
| |
| #include "core/HTMLNames.h" |
| #include "core/InputTypeNames.h" |
| #include "core/css/CSSFontFaceRule.h" |
| #include "core/css/CSSFontFaceSrcValue.h" |
| #include "core/css/CSSImageValue.h" |
| #include "core/css/CSSImportRule.h" |
| #include "core/css/CSSRuleList.h" |
| #include "core/css/CSSStyleDeclaration.h" |
| #include "core/css/CSSStyleRule.h" |
| #include "core/css/CSSValueList.h" |
| #include "core/css/StylePropertySet.h" |
| #include "core/css/StyleRule.h" |
| #include "core/css/StyleSheetContents.h" |
| #include "core/dom/Document.h" |
| #include "core/dom/Element.h" |
| #include "core/dom/Text.h" |
| #include "core/editing/serializers/MarkupAccumulator.h" |
| #include "core/fetch/FontResource.h" |
| #include "core/fetch/ImageResource.h" |
| #include "core/frame/LocalFrame.h" |
| #include "core/html/HTMLFrameElementBase.h" |
| #include "core/html/HTMLImageElement.h" |
| #include "core/html/HTMLInputElement.h" |
| #include "core/html/HTMLLinkElement.h" |
| #include "core/html/HTMLMetaElement.h" |
| #include "core/html/HTMLStyleElement.h" |
| #include "core/html/ImageDocument.h" |
| #include "core/style/StyleFetchedImage.h" |
| #include "core/style/StyleImage.h" |
| #include "platform/Histogram.h" |
| #include "platform/SerializedResource.h" |
| #include "platform/graphics/Image.h" |
| #include "platform/heap/Handle.h" |
| #include "platform/tracing/TraceEvent.h" |
| #include "wtf/HashSet.h" |
| #include "wtf/text/CString.h" |
| #include "wtf/text/StringBuilder.h" |
| #include "wtf/text/TextEncoding.h" |
| #include "wtf/text/WTFString.h" |
| |
| namespace { |
| |
| const int32_t secondsToMicroseconds = 1000 * 1000; |
| const int32_t maxSerializationTimeUmaMicroseconds = 10 * secondsToMicroseconds; |
| |
| } // namespace |
| |
| namespace blink { |
| |
| static bool shouldIgnoreElement(const Element& element) { |
| if (isHTMLScriptElement(element)) |
| return true; |
| if (isHTMLNoScriptElement(element)) |
| return true; |
| return isHTMLMetaElement(element) && |
| toHTMLMetaElement(element).computeEncoding().isValid(); |
| } |
| |
| class SerializerMarkupAccumulator : public MarkupAccumulator { |
| STACK_ALLOCATED(); |
| |
| public: |
| SerializerMarkupAccumulator(FrameSerializer::Delegate&, |
| const Document&, |
| HeapVector<Member<Node>>&); |
| ~SerializerMarkupAccumulator() override; |
| |
| protected: |
| void appendText(StringBuilder& out, Text&) override; |
| bool shouldIgnoreAttribute(const Attribute&) override; |
| void appendElement(StringBuilder& out, Element&, Namespaces*) override; |
| void appendAttribute(StringBuilder& out, |
| const Element&, |
| const Attribute&, |
| Namespaces*) override; |
| void appendStartTag(Node&, Namespaces* = nullptr) override; |
| void appendEndTag(const Element&) override; |
| |
| private: |
| void appendAttributeValue(StringBuilder& out, const String& attributeValue); |
| void appendRewrittenAttribute(StringBuilder& out, |
| const Element&, |
| const String& attributeName, |
| const String& attributeValue); |
| |
| FrameSerializer::Delegate& m_delegate; |
| Member<const Document> m_document; |
| |
| // FIXME: |FrameSerializer| uses |m_nodes| for collecting nodes in document |
| // included into serialized text then extracts image, object, etc. The size |
| // of this vector isn't small for large document. It is better to use |
| // callback like functionality. |
| HeapVector<Member<Node>>& m_nodes; |
| |
| // Elements with links rewritten via appendAttribute method. |
| HeapHashSet<Member<const Element>> m_elementsWithRewrittenLinks; |
| }; |
| |
| SerializerMarkupAccumulator::SerializerMarkupAccumulator( |
| FrameSerializer::Delegate& delegate, |
| const Document& document, |
| HeapVector<Member<Node>>& nodes) |
| : MarkupAccumulator(ResolveAllURLs), |
| m_delegate(delegate), |
| m_document(&document), |
| m_nodes(nodes) {} |
| |
| SerializerMarkupAccumulator::~SerializerMarkupAccumulator() {} |
| |
| void SerializerMarkupAccumulator::appendText(StringBuilder& result, |
| Text& text) { |
| Element* parent = text.parentElement(); |
| if (parent && !shouldIgnoreElement(*parent)) |
| MarkupAccumulator::appendText(result, text); |
| } |
| |
| bool SerializerMarkupAccumulator::shouldIgnoreAttribute( |
| const Attribute& attribute) { |
| return m_delegate.shouldIgnoreAttribute(attribute); |
| } |
| |
| void SerializerMarkupAccumulator::appendElement(StringBuilder& result, |
| Element& element, |
| Namespaces* namespaces) { |
| if (!shouldIgnoreElement(element)) |
| MarkupAccumulator::appendElement(result, element, namespaces); |
| |
| // TODO(tiger): Refactor MarkupAccumulator so it is easier to append an |
| // element like this, without special cases for XHTML |
| if (isHTMLHeadElement(element)) { |
| result.append("<meta http-equiv=\"Content-Type\" content=\""); |
| appendAttributeValue(result, m_document->suggestedMIMEType()); |
| result.append("; charset="); |
| appendAttributeValue(result, m_document->characterSet()); |
| if (m_document->isXHTMLDocument()) |
| result.append("\" />"); |
| else |
| result.append("\">"); |
| } |
| |
| // FIXME: For object (plugins) tags and video tag we could replace them by an |
| // image of their current contents. |
| } |
| |
| void SerializerMarkupAccumulator::appendAttribute(StringBuilder& out, |
| const Element& element, |
| const Attribute& attribute, |
| Namespaces* namespaces) { |
| // Check if link rewriting can affect the attribute. |
| bool isLinkAttribute = element.hasLegalLinkAttribute(attribute.name()); |
| bool isSrcDocAttribute = isHTMLFrameElementBase(element) && |
| attribute.name() == HTMLNames::srcdocAttr; |
| if (isLinkAttribute || isSrcDocAttribute) { |
| // Check if the delegate wants to do link rewriting for the element. |
| String newLinkForTheElement; |
| if (m_delegate.rewriteLink(element, newLinkForTheElement)) { |
| if (isLinkAttribute) { |
| // Rewrite element links. |
| appendRewrittenAttribute(out, element, attribute.name().toString(), |
| newLinkForTheElement); |
| } else { |
| ASSERT(isSrcDocAttribute); |
| // Emit src instead of srcdoc attribute for frame elements - we want the |
| // serialized subframe to use html contents from the link provided by |
| // Delegate::rewriteLink rather than html contents from srcdoc |
| // attribute. |
| appendRewrittenAttribute(out, element, HTMLNames::srcAttr.localName(), |
| newLinkForTheElement); |
| } |
| return; |
| } |
| } |
| |
| // Fallback to appending the original attribute. |
| MarkupAccumulator::appendAttribute(out, element, attribute, namespaces); |
| } |
| |
| void SerializerMarkupAccumulator::appendStartTag(Node& node, |
| Namespaces* namespaces) { |
| MarkupAccumulator::appendStartTag(node, namespaces); |
| m_nodes.append(&node); |
| } |
| |
| void SerializerMarkupAccumulator::appendEndTag(const Element& element) { |
| if (!shouldIgnoreElement(element)) |
| MarkupAccumulator::appendEndTag(element); |
| } |
| |
| void SerializerMarkupAccumulator::appendAttributeValue( |
| StringBuilder& out, |
| const String& attributeValue) { |
| MarkupFormatter::appendAttributeValue(out, attributeValue, |
| m_document->isHTMLDocument()); |
| } |
| |
| void SerializerMarkupAccumulator::appendRewrittenAttribute( |
| StringBuilder& out, |
| const Element& element, |
| const String& attributeName, |
| const String& attributeValue) { |
| if (m_elementsWithRewrittenLinks.contains(&element)) |
| return; |
| m_elementsWithRewrittenLinks.add(&element); |
| |
| // Append the rewritten attribute. |
| // TODO(tiger): Refactor MarkupAccumulator so it is easier to append an |
| // attribute like this. |
| out.append(' '); |
| out.append(attributeName); |
| out.append("=\""); |
| appendAttributeValue(out, attributeValue); |
| out.append("\""); |
| } |
| |
| // TODO(tiger): Right now there is no support for rewriting URLs inside CSS |
| // documents which leads to bugs like <https://crbug.com/251898>. Not being |
| // able to rewrite URLs inside CSS documents means that resources imported from |
| // url(...) statements in CSS might not work when rewriting links for the |
| // "Webpage, Complete" method of saving a page. It will take some work but it |
| // needs to be done if we want to continue to support non-MHTML saved pages. |
| |
| FrameSerializer::FrameSerializer(Vector<SerializedResource>& resources, |
| Delegate& delegate) |
| : m_resources(&resources), |
| m_isSerializingCss(false), |
| m_delegate(delegate) {} |
| |
| void FrameSerializer::serializeFrame(const LocalFrame& frame) { |
| TRACE_EVENT0("page-serialization", "FrameSerializer::serializeFrame"); |
| ASSERT(frame.document()); |
| Document& document = *frame.document(); |
| KURL url = document.url(); |
| |
| // If frame is an image document, add the image and don't continue |
| if (document.isImageDocument()) { |
| ImageDocument& imageDocument = toImageDocument(document); |
| addImageToResources(imageDocument.cachedImage(), url); |
| return; |
| } |
| |
| HeapVector<Member<Node>> serializedNodes; |
| { |
| TRACE_EVENT0("page-serialization", "FrameSerializer::serializeFrame HTML"); |
| SCOPED_BLINK_UMA_HISTOGRAM_TIMER( |
| "PageSerialization.SerializationTime.Html"); |
| SerializerMarkupAccumulator accumulator(m_delegate, document, |
| serializedNodes); |
| String text = |
| serializeNodes<EditingStrategy>(accumulator, document, IncludeNode); |
| |
| CString frameHTML = |
| document.encoding().encode(text, WTF::EntitiesForUnencodables); |
| m_resources->append(SerializedResource( |
| url, document.suggestedMIMEType(), |
| SharedBuffer::create(frameHTML.data(), frameHTML.length()))); |
| } |
| |
| for (Node* node : serializedNodes) { |
| ASSERT(node); |
| if (!node->isElementNode()) |
| continue; |
| |
| Element& element = toElement(*node); |
| // We have to process in-line style as it might contain some resources |
| // (typically background images). |
| if (element.isStyledElement()) { |
| retrieveResourcesForProperties(element.inlineStyle(), document); |
| retrieveResourcesForProperties(element.presentationAttributeStyle(), |
| document); |
| } |
| |
| if (isHTMLImageElement(element)) { |
| HTMLImageElement& imageElement = toHTMLImageElement(element); |
| KURL url = |
| document.completeURL(imageElement.getAttribute(HTMLNames::srcAttr)); |
| ImageResource* cachedImage = imageElement.cachedImage(); |
| addImageToResources(cachedImage, url); |
| } else if (isHTMLInputElement(element)) { |
| HTMLInputElement& inputElement = toHTMLInputElement(element); |
| if (inputElement.type() == InputTypeNames::image && |
| inputElement.imageLoader()) { |
| KURL url = inputElement.src(); |
| ImageResource* cachedImage = inputElement.imageLoader()->image(); |
| addImageToResources(cachedImage, url); |
| } |
| } else if (isHTMLLinkElement(element)) { |
| HTMLLinkElement& linkElement = toHTMLLinkElement(element); |
| if (CSSStyleSheet* sheet = linkElement.sheet()) { |
| KURL url = |
| document.completeURL(linkElement.getAttribute(HTMLNames::hrefAttr)); |
| serializeCSSStyleSheet(*sheet, url); |
| } |
| } else if (isHTMLStyleElement(element)) { |
| HTMLStyleElement& styleElement = toHTMLStyleElement(element); |
| if (CSSStyleSheet* sheet = styleElement.sheet()) |
| serializeCSSStyleSheet(*sheet, KURL()); |
| } |
| } |
| } |
| |
| void FrameSerializer::serializeCSSStyleSheet(CSSStyleSheet& styleSheet, |
| const KURL& url) { |
| TRACE_EVENT2("page-serialization", "FrameSerializer::serializeCSSStyleSheet", |
| "type", "CSS", "url", url.elidedString().utf8().data()); |
| // Only report UMA metric if this is not a reentrant CSS serialization call. |
| double cssStartTime = 0; |
| if (!m_isSerializingCss) { |
| m_isSerializingCss = true; |
| cssStartTime = monotonicallyIncreasingTime(); |
| } |
| |
| StringBuilder cssText; |
| cssText.append("@charset \""); |
| cssText.append(styleSheet.contents()->charset().lower()); |
| cssText.append("\";\n\n"); |
| |
| for (unsigned i = 0; i < styleSheet.length(); ++i) { |
| CSSRule* rule = styleSheet.item(i); |
| String itemText = rule->cssText(); |
| if (!itemText.isEmpty()) { |
| cssText.append(itemText); |
| if (i < styleSheet.length() - 1) |
| cssText.append("\n\n"); |
| } |
| |
| // Some rules have resources associated with them that we need to retrieve. |
| serializeCSSRule(rule); |
| } |
| |
| if (shouldAddURL(url)) { |
| WTF::TextEncoding textEncoding(styleSheet.contents()->charset()); |
| ASSERT(textEncoding.isValid()); |
| String textString = cssText.toString(); |
| CString text = |
| textEncoding.encode(textString, WTF::CSSEncodedEntitiesForUnencodables); |
| m_resources->append( |
| SerializedResource(url, String("text/css"), |
| SharedBuffer::create(text.data(), text.length()))); |
| m_resourceURLs.add(url); |
| } |
| |
| if (cssStartTime != 0) { |
| m_isSerializingCss = false; |
| DEFINE_STATIC_LOCAL(CustomCountHistogram, cssHistogram, |
| ("PageSerialization.SerializationTime.CSSElement", 0, |
| maxSerializationTimeUmaMicroseconds, 50)); |
| cssHistogram.count( |
| static_cast<int64_t>((monotonicallyIncreasingTime() - cssStartTime) * |
| secondsToMicroseconds)); |
| } |
| } |
| |
| void FrameSerializer::serializeCSSRule(CSSRule* rule) { |
| ASSERT(rule->parentStyleSheet()->ownerDocument()); |
| Document& document = *rule->parentStyleSheet()->ownerDocument(); |
| |
| switch (rule->type()) { |
| case CSSRule::kStyleRule: |
| retrieveResourcesForProperties( |
| &toCSSStyleRule(rule)->styleRule()->properties(), document); |
| break; |
| |
| case CSSRule::kImportRule: { |
| CSSImportRule* importRule = toCSSImportRule(rule); |
| KURL sheetBaseURL = rule->parentStyleSheet()->baseURL(); |
| ASSERT(sheetBaseURL.isValid()); |
| KURL importURL = KURL(sheetBaseURL, importRule->href()); |
| if (m_resourceURLs.contains(importURL)) |
| break; |
| if (importRule->styleSheet()) |
| serializeCSSStyleSheet(*importRule->styleSheet(), importURL); |
| break; |
| } |
| |
| // Rules inheriting CSSGroupingRule |
| case CSSRule::kMediaRule: |
| case CSSRule::kSupportsRule: { |
| CSSRuleList* ruleList = rule->cssRules(); |
| for (unsigned i = 0; i < ruleList->length(); ++i) |
| serializeCSSRule(ruleList->item(i)); |
| break; |
| } |
| |
| case CSSRule::kFontFaceRule: |
| retrieveResourcesForProperties( |
| &toCSSFontFaceRule(rule)->styleRule()->properties(), document); |
| break; |
| |
| // Rules in which no external resources can be referenced |
| case CSSRule::kCharsetRule: |
| case CSSRule::kPageRule: |
| case CSSRule::kKeyframesRule: |
| case CSSRule::kKeyframeRule: |
| case CSSRule::kNamespaceRule: |
| case CSSRule::kViewportRule: |
| break; |
| } |
| } |
| |
| bool FrameSerializer::shouldAddURL(const KURL& url) { |
| return url.isValid() && !m_resourceURLs.contains(url) && |
| !url.protocolIsData() && !m_delegate.shouldSkipResourceWithURL(url); |
| } |
| |
| void FrameSerializer::addToResources(const Resource& resource, |
| PassRefPtr<const SharedBuffer> data, |
| const KURL& url) { |
| if (m_delegate.shouldSkipResource(resource)) |
| return; |
| |
| if (!data) { |
| DLOG(ERROR) << "No data for resource " << url.getString(); |
| return; |
| } |
| |
| String mimeType = resource.response().mimeType(); |
| m_resources->append(SerializedResource(url, mimeType, std::move(data))); |
| m_resourceURLs.add(url); |
| } |
| |
| void FrameSerializer::addImageToResources(ImageResource* image, |
| const KURL& url) { |
| if (!image || !image->hasImage() || image->errorOccurred() || |
| !shouldAddURL(url)) |
| return; |
| |
| TRACE_EVENT2("page-serialization", "FrameSerializer::addImageToResources", |
| "type", "image", "url", url.elidedString().utf8().data()); |
| double imageStartTime = monotonicallyIncreasingTime(); |
| |
| RefPtr<const SharedBuffer> data = image->getImage()->data(); |
| addToResources(*image, data, url); |
| |
| // If we're already reporting time for CSS serialization don't report it for |
| // this image to avoid reporting the same time twice. |
| if (!m_isSerializingCss) { |
| DEFINE_STATIC_LOCAL(CustomCountHistogram, imageHistogram, |
| ("PageSerialization.SerializationTime.ImageElement", 0, |
| maxSerializationTimeUmaMicroseconds, 50)); |
| imageHistogram.count( |
| static_cast<int64_t>((monotonicallyIncreasingTime() - imageStartTime) * |
| secondsToMicroseconds)); |
| } |
| } |
| |
| void FrameSerializer::addFontToResources(FontResource* font) { |
| if (!font || !font->isLoaded() || !font->resourceBuffer() || |
| !shouldAddURL(font->url())) |
| return; |
| |
| RefPtr<const SharedBuffer> data(font->resourceBuffer()); |
| |
| addToResources(*font, data, font->url()); |
| } |
| |
| void FrameSerializer::retrieveResourcesForProperties( |
| const StylePropertySet* styleDeclaration, |
| Document& document) { |
| if (!styleDeclaration) |
| return; |
| |
| // The background-image and list-style-image (for ul or ol) are the CSS |
| // properties that make use of images. We iterate to make sure we include any |
| // other image properties there might be. |
| unsigned propertyCount = styleDeclaration->propertyCount(); |
| for (unsigned i = 0; i < propertyCount; ++i) { |
| const CSSValue& cssValue = styleDeclaration->propertyAt(i).value(); |
| retrieveResourcesForCSSValue(cssValue, document); |
| } |
| } |
| |
| void FrameSerializer::retrieveResourcesForCSSValue(const CSSValue& cssValue, |
| Document& document) { |
| if (cssValue.isImageValue()) { |
| const CSSImageValue& imageValue = toCSSImageValue(cssValue); |
| if (imageValue.isCachePending()) |
| return; |
| StyleImage* styleImage = imageValue.cachedImage(); |
| if (!styleImage || !styleImage->isImageResource()) |
| return; |
| |
| addImageToResources(styleImage->cachedImage(), |
| styleImage->cachedImage()->url()); |
| } else if (cssValue.isFontFaceSrcValue()) { |
| const CSSFontFaceSrcValue& fontFaceSrcValue = |
| toCSSFontFaceSrcValue(cssValue); |
| if (fontFaceSrcValue.isLocal()) { |
| return; |
| } |
| |
| addFontToResources(fontFaceSrcValue.fetch(&document)); |
| } else if (cssValue.isValueList()) { |
| const CSSValueList& cssValueList = toCSSValueList(cssValue); |
| for (unsigned i = 0; i < cssValueList.length(); i++) |
| retrieveResourcesForCSSValue(cssValueList.item(i), document); |
| } |
| } |
| |
| // Returns MOTW (Mark of the Web) declaration before html tag which is in |
| // HTML comment, e.g. "<!-- saved from url=(%04d)%s -->" |
| // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx. |
| String FrameSerializer::markOfTheWebDeclaration(const KURL& url) { |
| StringBuilder builder; |
| bool emitsMinus = false; |
| CString orignalUrl = url.getString().ascii(); |
| for (const char* string = orignalUrl.data(); *string; ++string) { |
| const char ch = *string; |
| if (ch == '-' && emitsMinus) { |
| builder.append("%2D"); |
| emitsMinus = false; |
| continue; |
| } |
| emitsMinus = ch == '-'; |
| builder.append(ch); |
| } |
| CString escapedUrl = builder.toString().ascii(); |
| return String::format("saved from url=(%04d)%s", |
| static_cast<int>(escapedUrl.length()), |
| escapedUrl.data()); |
| } |
| |
| } // namespace blink |