third_party/blink/renderer/core/html/parser/xss_auditor.cc - chromium/src - Git at Google

 /*
  * Copyright (C) 2011 Adam Barth. All Rights Reserved.
  * Copyright (C) 2011 Daniel Bates (dbates@intudata.com).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #include "third_party/blink/renderer/core/html/parser/xss_auditor.h"

 #include <memory>

 #include "third_party/blink/renderer/core/dom/document.h"
 #include "third_party/blink/renderer/core/frame/local_frame.h"
 #include "third_party/blink/renderer/core/frame/settings.h"
 #include "third_party/blink/renderer/core/frame/use_counter.h"
 #include "third_party/blink/renderer/core/html/html_param_element.h"
 #include "third_party/blink/renderer/core/html/link_rel_attribute.h"
 #include "third_party/blink/renderer/core/html/parser/html_document_parser.h"
 #include "third_party/blink/renderer/core/html/parser/html_parser_idioms.h"
 #include "third_party/blink/renderer/core/html/parser/text_resource_decoder.h"
 #include "third_party/blink/renderer/core/html/parser/xss_auditor_delegate.h"
 #include "third_party/blink/renderer/core/html_names.h"
 #include "third_party/blink/renderer/core/inspector/console_message.h"
 #include "third_party/blink/renderer/core/loader/document_loader.h"
 #include "third_party/blink/renderer/core/loader/mixed_content_checker.h"
 #include "third_party/blink/renderer/core/svg_names.h"
 #include "third_party/blink/renderer/core/xlink_names.h"
 #include "third_party/blink/renderer/platform/network/encoded_form_data.h"
 #include "third_party/blink/renderer/platform/text/decode_escape_sequences.h"
 #include "third_party/blink/renderer/platform/wtf/ascii_ctype.h"

 namespace {

 // SecurityOrigin::urlWithUniqueSecurityOrigin() can't be used cross-thread, or
 // we'd use it instead.
 // In this file it is the replacement value written into injected form
 // action, formaction and import-link href attributes.
 const char kURLWithUniqueOrigin[] = "data:,";

 // Replacement value written into an attribute after an injected javascript:
 // URL has been erased from it.
 const char kSafeJavaScriptURL[] = "javascript:void(0)";

 }  // namespace

 namespace blink {

 using namespace html_names;

 static bool IsNonCanonicalCharacter(UChar c) {
   // Returns true for characters that canonicalization drops, so that the
   // request and the page still compare equal after common server-side
   // rewriting:
   //
   //   - Backslash, the digit zero and NUL. We don't emulate PHP
   //     stripslashes() (which, among other things, converts "\\0" to the \0
   //     character); dropping all three makes both forms canonicalize alike,
   //     with the adverse effect that legitimate zeros are dropped too.
   //   - Forward slash, because it is common for some servers to collapse
   //     successive path components, e.g. a//b becomes a/b.
   //   - Question mark, because some servers replace invalid high bytes with
   //     one; the high bytes themselves are dropped as well.
   //   - Percent, because some servers strip it when there's a malformed
   //     escape sequence.
   //   - DEL and everything above it (c >= 127).
   //
   // For instance, "http://localhost:8000?x" becomes "http:localhost:8x".
   switch (c) {
     case '\\':
     case '0':
     case '\0':
     case '/':
     case '?':
     case '%':
       return true;
     default:
       return c >= 127;
   }
 }

 // Matches the quote and angle-bracket characters. SetEncoding() discards a
 // canonicalized URL or body that contains none of them.
 static bool IsRequiredForInjection(UChar c) {
   switch (c) {
     case '\'':
     case '"':
     case '<':
     case '>':
       return true;
     default:
       return false;
   }
 }

 // Matches the characters at which TruncateForScriptLikeAttribute() cuts a
 // script-like attribute snippet.
 static bool IsTerminatingCharacter(UChar c) {
   switch (c) {
     case '&':
     case '/':
     case '"':
     case '\'':
     case '<':
     case '>':
     case ',':
     case ';':
       return true;
     default:
       return false;
   }
 }

 // True for a forward slash or a backslash.
 static bool IsSlash(UChar c) {
   if (c == '/')
     return true;
   return c == '\\';
 }

 // True for a double or single quote character.
 static bool IsHTMLQuote(UChar c) {
   switch (c) {
     case '"':
     case '\'':
       return true;
     default:
       return false;
   }
 }

 // True for the JavaScript line terminators: LF, CR, U+2028 and U+2029
 // (per ecma-262 section 7.3 Line Terminators).
 static bool IsJSNewline(UChar c) {
   switch (c) {
     case '\n':
     case '\r':
     case 0x2028:
     case 0x2029:
       return true;
     default:
       return false;
   }
 }

 // True when the four characters "<!--" begin at index |start| of |string|.
 static bool StartsHTMLOpenCommentAt(const String& string, wtf_size_t start) {
   // All four characters must lie inside the string.
   if (start + 3 >= string.length())
     return false;
   return string[start] == '<' && string[start + 1] == '!' &&
          string[start + 2] == '-' && string[start + 3] == '-';
 }

 // True when the three characters "-->" begin at index |start| of |string|.
 static bool StartsHTMLCloseCommentAt(const String& string, wtf_size_t start) {
   // All three characters must lie inside the string.
   if (start + 2 >= string.length())
     return false;
   return string[start] == '-' && string[start + 1] == '-' &&
          string[start + 2] == '>';
 }

 // True when the two characters "//" begin at index |start| of |string|.
 static bool StartsSingleLineCommentAt(const String& string, wtf_size_t start) {
   // Both characters must lie inside the string.
   if (start + 1 >= string.length())
     return false;
   return string[start] == '/' && string[start + 1] == '/';
 }

 // True when the two characters "/*" begin at index |start| of |string|.
 static bool StartsMultiLineCommentAt(const String& string, wtf_size_t start) {
   // Both characters must lie inside the string.
   if (start + 1 >= string.length())
     return false;
   return string[start] == '/' && string[start + 1] == '*';
 }

 // True when "<script" (compared ASCII case-insensitively) begins at index
 // |start| of |string|.
 static bool StartsOpeningScriptTagAt(const String& string, wtf_size_t start) {
   // TODO(esprehn): StringView should probably have startsWith.
   StringView script("<script");
   // The bounds check comes first so the view below never runs past the end.
   return start + 6 < string.length() &&
          EqualIgnoringASCIICase(StringView(string, start, script.length()),
                                 script);
 }

 // True when "</script" (compared ASCII case-insensitively) begins at index
 // |start| of |string|.
 static bool StartsClosingScriptTagAt(const String& string, wtf_size_t start) {
   // TODO(esprehn): StringView should probably have startsWith.
   StringView script("</script");
   // The bounds check comes first so the view below never runs past the end.
   return start + 7 < string.length() &&
          EqualIgnoringASCIICase(StringView(string, start, script.length()),
                                 script);
 }

 // Compares a vector of UTF-16 code units against the local name of |qname|.
 //
 // If other files need this, we should move this to
 // core/html/parser/html_parser_idioms.h
 template <wtf_size_t inlineCapacity>
 bool ThreadSafeMatch(const Vector<UChar, inlineCapacity>& vector,
                      const QualifiedName& qname) {
   auto* const local_name_impl = qname.LocalName().Impl();
   return EqualIgnoringNullity(vector, local_name_impl);
 }

 // True when the name of |token| matches the local name of |name|.
 static bool HasName(const HTMLToken& token, const QualifiedName& name) {
   const auto& token_name = token.GetName();
   return ThreadSafeMatch(token_name, name);
 }

 // Looks for the first attribute of |token| whose name equals |name|; names in
 // the XLink namespace are looked up with an "xlink:" prefix. On success,
 // stores the attribute's position in |index_of_matching_attribute| and
 // returns true. Otherwise returns false and leaves the out-parameter alone.
 static bool FindAttributeWithName(const HTMLToken& token,
                                   const QualifiedName& name,
                                   wtf_size_t& index_of_matching_attribute) {
   // Notice that we're careful not to ref the StringImpl here because we might
   // be on a background thread.
   const String& attr_name = name.NamespaceURI() == xlink_names::kNamespaceURI
                                 ? "xlink:" + name.LocalName().GetString()
                                 : name.LocalName().GetString();

   const auto& attributes = token.Attributes();
   for (wtf_size_t i = 0; i < attributes.size(); ++i) {
     if (!EqualIgnoringNullity(attributes.at(i).NameAsVector(), attr_name))
       continue;
     index_of_matching_attribute = i;
     return true;
   }
   return false;
 }

 // True when |name| is at least five code units long and starts with "on".
 static bool IsNameOfInlineEventHandler(const Vector<UChar, 32>& name) {
   // The shortest inline event handler name is "oncut".
   const wtf_size_t kLengthOfShortestInlineEventHandlerName = 5;
   return name.size() >= kLengthOfShortestInlineEventHandlerName &&
          name[0] == 'o' && name[1] == 'n';
 }

 static bool IsDangerousHTTPEquiv(const String& value) {
   String equiv = value.StripWhiteSpace();
   return DeprecatedEqualIgnoringCase(equiv, "refresh") ||
          DeprecatedEqualIgnoringCase(equiv, "set-cookie");
 }

 // Decodes %u-style escape sequences in |string|.
 static inline String Decode16BitUnicodeEscapeSequences(const String& string) {
   // Note, the encoding is ignored since each %u-escape sequence represents a
   // UTF-16 code unit; UTF-8 is passed only because an encoding is required.
   const auto& unused_encoding = UTF8Encoding();
   return DecodeEscapeSequences<Unicode16BitEscapeSequence>(string,
                                                            unused_encoding);
 }

 // Decodes URL escape sequences in |string| using |encoding|.
 static inline String DecodeStandardURLEscapeSequences(
     const String& string,
     const WTF::TextEncoding& encoding) {
   // DecodeEscapeSequences() is used rather than DecodeURLEscapeSequences()
   // (declared in weborigin/kurl.h) to avoid platform-specific URL decoding
   // differences (e.g. KURLGoogle).
   String decoded = DecodeEscapeSequences<URLEscapeSequence>(string, encoding);
   return decoded;
 }

 // Repeatedly applies URL-escape decoding followed by %u-escape decoding until
 // a pass no longer shortens the string, then turns every '+' into a space.
 static String FullyDecodeString(const String& string,
                                 const WTF::TextEncoding& encoding) {
   String working_string = string;
   for (;;) {
     const wtf_size_t length_before_pass = working_string.length();
     working_string = Decode16BitUnicodeEscapeSequences(
         DecodeStandardURLEscapeSequences(working_string, encoding));
     // A pass that did not shrink the string means we have finished.
     if (working_string.length() >= length_before_pass)
       break;
   }
   working_string.Replace('+', ' ');
   return working_string;
 }

 // XSSAuditor's task is to determine how much of any given content came
 // from a reflection vs. what occurs normally on the page. It must do
 // this in the face of an attacker avoiding detection by splicing on page
 // content in such a way as to remain syntactically valid. The next two
 // functions apply heuristics to get the longest possible fragment in
 // the face of such trickery.

 // Truncates |decoded_snippet| in place at the first position where content
 // from the page itself could have been spliced onto an injected URL. The
 // snippet is left untouched when no such position is found.
 static void TruncateForSrcLikeAttribute(String& decoded_snippet) {
   // In HTTP URLs, characters in the query string (following the first ?),
   // in the fragment (following the first #), or even in the path (typically
   // following the third slash but subject to generous interpretation of a
   // lack of leading slashes) may be merely ignored by an attacker's server
   // when a remote script or script-like resource is requested. Hence these
   // are places where organic page content may be spliced.
   //
   // In DATA URLs, the payload starts at the first comma, and the first
   // "/*", "//", or "<!--" may introduce a comment, which can then be used
   // to splice page data harmlessly onto the end of the payload.
   //
   // Also, DATA URLs may use the same string literal tricks as with script
   // content itself. In either case, content following this may come from the
   // page and may be ignored when the script is executed. Also, any of these
   // characters may now be represented by the (enlarged) set of html5 entities.
   //
   // For simplicity, we don't differentiate based on URL scheme, and stop at
   // any of the following:
   //   - the first &, since it might be part of an entity for any of the
   //     subsequent punctuation.
   //   - the first # or ?, since the query and fragment can be ignored.
   //   - the third slash, since this typically starts the path (but account
   //     for a possible lack of leading slashes following the scheme).
   //   - the first slash, <, ', or " once a comma is seen, since we
   //     may now be in a data URL payload.
   int slash_count = 0;
   bool comma_seen = false;
   bool colon_seen = false;
   for (wtf_size_t current_length = 0,
                   remaining_length = decoded_snippet.length();
        remaining_length; ++current_length, --remaining_length) {
     UChar current_char = decoded_snippet[current_length];
     if (current_char == ':' && !colon_seen) {
       // At the first colon only: each of the next two characters that is not
       // a slash is counted as if it were one, which compensates for a scheme
       // that is not followed by "//".
       if (remaining_length > 1 && !IsSlash(decoded_snippet[current_length + 1]))
         ++slash_count;
       if (remaining_length > 2 && !IsSlash(decoded_snippet[current_length + 2]))
         ++slash_count;
       colon_seen = true;
     }
     // Note that slash_count is only incremented for a slash seen before any
     // comma, because of the short-circuit on comma_seen.
     if (current_char == '&' || current_char == '?' || current_char == '#' ||
         (IsSlash(current_char) && (comma_seen || ++slash_count > 2)) ||
         (current_char == '<' && comma_seen) ||
         (current_char == '\'' && comma_seen) ||
         (current_char == '"' && comma_seen)) {
       decoded_snippet.Truncate(current_length);
       return;
     }
     if (current_char == ',')
       comma_seen = true;
   }
 }

 // Truncates |decoded_snippet| in place at the first terminating character
 // that follows the start of the attribute value. Leaves the snippet untouched
 // when it has no '=', no non-space character after the '=', or no
 // terminating character after that.
 static void TruncateForScriptLikeAttribute(String& decoded_snippet) {
   // Beware of trailing characters which came from the page itself, not the
   // injected vector. Excluding the terminating character covers common cases
   // where the page immediately ends the attribute, but doesn't cover more
   // complex cases where there is other page data following the injection.
   //
   // Generally, these won't parse as javascript, so the injected vector
   // typically excludes them from consideration via a single-line comment or
   // by enclosing them in a string literal terminated later by the page's own
   // closing punctuation. Since the snippet has not been parsed, the vector
   // may also try to introduce these via entities. As a result, we'd like to
   // stop before the first "//", the first <!--, the first entity, or the first
   // quote not immediately following the first equals sign (taking whitespace
   // into consideration).
   //
   // To keep things simpler, we don't try to distinguish between
   // entity-introducing ampersands vs. other uses, nor do we bother to check
   // for a second slash for a comment, nor do we bother to check for !--
   // following a less-than sign. We stop instead on any ampersand, slash, or
   // less-than sign.
   const wtf_size_t equals_position = decoded_snippet.Find("=");
   if (equals_position == kNotFound)
     return;

   const wtf_size_t value_start =
       decoded_snippet.Find(IsNotHTMLSpace<UChar>, equals_position + 1);
   if (value_start == kNotFound)
     return;

   // A quote that opens the value is skipped so it doesn't end the search.
   const wtf_size_t search_start =
       IsHTMLQuote(decoded_snippet[value_start]) ? value_start + 1
                                                 : value_start;
   const wtf_size_t terminator_position =
       decoded_snippet.Find(IsTerminatingCharacter, search_start);
   if (terminator_position == kNotFound)
     return;

   decoded_snippet.Truncate(terminator_position);
 }

 // Applies the script-like truncation, then cuts at the first remaining
 // semicolon, since semicolons can introduce page data.
 static void TruncateForSemicolonSeparatedScriptLikeAttribute(
     String& decoded_snippet) {
   TruncateForScriptLikeAttribute(decoded_snippet);
   const wtf_size_t first_semicolon = decoded_snippet.Find(";");
   if (first_semicolon == kNotFound)
     return;
   decoded_snippet.Truncate(first_semicolon);
 }

 // True when |attribute| is named like the SVG "values" attribute.
 static bool IsSemicolonSeparatedAttribute(
     const HTMLToken::Attribute& attribute) {
   const auto& attribute_name = attribute.NameAsVector();
   return ThreadSafeMatch(attribute_name, svg_names::kValuesAttr);
 }

 static bool IsSemicolonSeparatedValueContainingJavaScriptURL(
     const String& value) {
   Vector<String> value_list;
   value.Split(';', value_list);
   for (wtf_size_t i = 0; i < value_list.size(); ++i) {
     String stripped = StripLeadingAndTrailingHTMLSpaces(value_list[i]);
     if (ProtocolIsJavaScript(stripped))
       return true;
   }
   return false;
 }

 // Constructs a disabled auditor in the kUninitialized state, defaulting to
 // filter mode and UTF-8. FilterToken() DCHECKs that the state has left
 // kUninitialized, which Init() and InitForFragment() take care of.
 XSSAuditor::XSSAuditor()
     : is_enabled_(false),
       xss_protection_(kFilterReflectedXSS),
       did_send_valid_xss_protection_header_(false),
       state_(kUninitialized),
       script_tag_found_in_request_(false),
       script_tag_nesting_level_(0),
       encoding_(UTF8Encoding()) {
   // Although tempting to call Init() at this point, the various objects
   // we want to reference might not all have been constructed yet.
 }

 // Moves an uninitialized auditor to the kFilteringTokens state for fragment
 // parsing. Must be called on the main thread.
 void XSSAuditor::InitForFragment() {
   DCHECK(IsMainThread());
   DCHECK_EQ(state_, kUninitialized);
   // The XSS auditor stays disabled when parsing a fragment because it's too
   // much overhead.
   DCHECK(!is_enabled_);
   state_ = kFilteringTokens;
 }

 // Configures the auditor for |document|; must be called on the main thread.
 // Does nothing when the auditor has already left the kUninitialized state.
 // The auditor ends up disabled when the settings turn it off, when the
 // document has no frame, or when the document URL is empty or a data: URL.
 // Otherwise this reads the X-XSS-Protection response header, records use
 // counters, passes the report URL to |auditor_delegate| (when non-null),
 // flattens the loader's HTTP body to a string, and finally calls
 // SetEncoding() to canonicalize the URL and body.
 void XSSAuditor::Init(Document* document,
                       XSSAuditorDelegate* auditor_delegate) {
   DCHECK(IsMainThread());
   if (state_ != kUninitialized)
     return;
   state_ = kFilteringTokens;

   if (Settings* settings = document->GetSettings())
     is_enabled_ = settings->GetXSSAuditorEnabled();

   if (!is_enabled_)
     return;

   // Keep a fragment-less copy of the document URL.
   document_url_ = document->Url();
   document_url_.RemoveFragmentIdentifier();
   document_url_ = document_url_.Copy();  // Make thread safe.

   // In theory, the Document could have detached from the LocalFrame after the
   // XSSAuditor was constructed.
   if (!document->GetFrame()) {
     is_enabled_ = false;
     return;
   }

   if (document_url_.IsEmpty()) {
     // The URL can be empty when opening a new browser window or calling
     // window.open("").
     is_enabled_ = false;
     return;
   }

   if (document_url_.ProtocolIsData()) {
     is_enabled_ = false;
     return;
   }

   if (document->Encoding().IsValid())
     encoding_ = document->Encoding();

   if (DocumentLoader* document_loader =
           document->GetFrame()->Loader().GetDocumentLoader()) {
     const AtomicString& header_value =
         document_loader->GetResponse().HttpHeaderField(
             http_names::kXXSSProtection);
     String error_details;
     unsigned error_position = 0;
     String report_url;
     KURL xss_protection_report_url;

     ReflectedXSSDisposition xss_protection_header = ParseXSSProtectionHeader(
         header_value, error_details, error_position, report_url);

     // Record which disposition the header asked for. Nothing is counted for
     // any other value, such as kReflectedXSSUnset.
     if (xss_protection_header == kAllowReflectedXSS)
       UseCounter::Count(*document, WebFeature::kXSSAuditorDisabled);
     else if (xss_protection_header == kFilterReflectedXSS)
       UseCounter::Count(*document, WebFeature::kXSSAuditorEnabledFilter);
     else if (xss_protection_header == kBlockReflectedXSS)
       UseCounter::Count(*document, WebFeature::kXSSAuditorEnabledBlock);
     else if (xss_protection_header == kReflectedXSSInvalid)
       UseCounter::Count(*document, WebFeature::kXSSAuditorInvalid);

     did_send_valid_xss_protection_header_ =
         xss_protection_header != kReflectedXSSUnset &&
         xss_protection_header != kReflectedXSSInvalid;
     // A report URL that would be mixed content invalidates the whole header.
     if ((xss_protection_header == kFilterReflectedXSS ||
          xss_protection_header == kBlockReflectedXSS) &&
         !report_url.IsEmpty()) {
       xss_protection_report_url = document->CompleteURL(report_url);
       if (MixedContentChecker::IsMixedContent(document->GetSecurityOrigin(),
                                               xss_protection_report_url)) {
         error_details = "insecure reporting URL for secure page";
         xss_protection_header = kReflectedXSSInvalid;
         xss_protection_report_url = KURL();
       }
     }
     if (xss_protection_header == kReflectedXSSInvalid) {
       document->AddConsoleMessage(ConsoleMessage::Create(
           kSecurityMessageSource, kErrorMessageLevel,
           "Error parsing header X-XSS-Protection: " + header_value + ": " +
               error_details + " at character position " +
               String::Format("%u", error_position) +
               ". The default protections will be applied."));
     }

     // An invalid or absent header falls back to block mode.
     xss_protection_ = xss_protection_header;
     if (xss_protection_ == kReflectedXSSInvalid ||
         xss_protection_ == kReflectedXSSUnset) {
       xss_protection_ = kBlockReflectedXSS;
     }

     if (auditor_delegate)
       auditor_delegate->SetReportURL(xss_protection_report_url.Copy());

     // Keep the HTTP body as a string; SetEncoding() canonicalizes it.
     EncodedFormData* http_body = document_loader->HttpBody();
     if (http_body && !http_body->IsEmpty())
       http_body_as_string_ = http_body->FlattenToString();
   }

   SetEncoding(encoding_);
 }

 // Adopts |encoding| (ignored when invalid) and recomputes the canonicalized
 // URL and HTTP body. Either string is cleared when it contains no character
 // required for injection, and the auditor is disabled when both end up
 // empty. A suffix tree is built over sufficiently long bodies.
 void XSSAuditor::SetEncoding(const WTF::TextEncoding& encoding) {
   // FIXME: Tune this parameter.
   const wtf_size_t kMiniumLengthForSuffixTree = 512;
   const int kSuffixTreeDepth = 5;

   if (!encoding.IsValid())
     return;

   encoding_ = encoding;

   decoded_url_ = Canonicalize(document_url_.GetString(), kNoTruncation);
   const bool url_can_inject =
       decoded_url_.Find(IsRequiredForInjection) != kNotFound;
   if (!url_can_inject)
     decoded_url_ = String();

   if (!http_body_as_string_.IsEmpty()) {
     decoded_http_body_ = Canonicalize(http_body_as_string_, kNoTruncation);
     // The raw body is cleared once it has been canonicalized.
     http_body_as_string_ = String();
     const bool body_can_inject =
         decoded_http_body_.Find(IsRequiredForInjection) != kNotFound;
     if (!body_can_inject)
       decoded_http_body_ = String();
     if (decoded_http_body_.length() >= kMiniumLengthForSuffixTree) {
       decoded_http_body_suffix_tree_ =
           std::make_unique<SuffixTree<ASCIICodebook>>(decoded_http_body_,
                                                       kSuffixTreeDepth);
     }
   }

   const bool nothing_to_match =
       decoded_url_.IsEmpty() && decoded_http_body_.IsEmpty();
   if (nothing_to_match)
     is_enabled_ = false;
 }

 std::unique_ptr<XSSInfo> XSSAuditor::FilterToken(
     const FilterTokenRequest& request) {
   DCHECK_NE(state_, kUninitialized);
   if (!is_enabled_ || xss_protection_ == kAllowReflectedXSS)
     return nullptr;

   bool did_block_script = false;
   if (request.token.GetType() == HTMLToken::kStartTag)
     did_block_script = FilterStartToken(request);
   else if (script_tag_nesting_level_) {
     if (request.token.GetType() == HTMLToken::kCharacter)
       did_block_script = FilterCharacterToken(request);
     else if (request.token.GetType() == HTMLToken::kEndTag)
       FilterEndToken(request);
   }

   if (did_block_script) {
     bool did_block_entire_page = (xss_protection_ == kBlockReflectedXSS);
     std::unique_ptr<XSSInfo> xss_info =
         XSSInfo::Create(document_url_, did_block_entire_page,
                         did_send_valid_xss_protection_header_);
     return xss_info;
   }
   return nullptr;
 }

 bool XSSAuditor::FilterStartToken(const FilterTokenRequest& request) {
   state_ = kFilteringTokens;
   bool did_block_script = EraseDangerousAttributesIfInjected(request);

   if (HasName(request.token, kScriptTag)) {
     did_block_script |= FilterScriptToken(request);
     DCHECK(request.should_allow_cdata || !script_tag_nesting_level_);
     script_tag_nesting_level_++;
   } else if (HasName(request.token, kObjectTag))
     did_block_script |= FilterObjectToken(request);
   else if (HasName(request.token, kParamTag))
     did_block_script |= FilterParamToken(request);
   else if (HasName(request.token, kEmbedTag))
     did_block_script |= FilterEmbedToken(request);
   else if (HasName(request.token, kIFrameTag) ||
            HasName(request.token, kFrameTag))
     did_block_script |= FilterFrameToken(request);
   else if (HasName(request.token, kMetaTag))
     did_block_script |= FilterMetaToken(request);
   else if (HasName(request.token, kBaseTag))
     did_block_script |= FilterBaseToken(request);
   else if (HasName(request.token, kFormTag))
     did_block_script |= FilterFormToken(request);
   else if (HasName(request.token, kInputTag))
     did_block_script |= FilterInputToken(request);
   else if (HasName(request.token, kButtonTag))
     did_block_script |= FilterButtonToken(request);
   else if (HasName(request.token, kLinkTag))
     did_block_script |= FilterLinkToken(request);

   return did_block_script;
 }

 // Handles an end tag seen while inside a script element: resets the state to
 // kFilteringTokens and, for </script>, decrements the nesting level.
 void XSSAuditor::FilterEndToken(const FilterTokenRequest& request) {
   DCHECK(script_tag_nesting_level_);
   state_ = kFilteringTokens;
   if (!HasName(request.token, kScriptTag))
     return;

   script_tag_nesting_level_--;
   DCHECK(request.should_allow_cdata || !script_tag_nesting_level_);
 }

 bool XSSAuditor::FilterCharacterToken(const FilterTokenRequest& request) {
   DCHECK(script_tag_nesting_level_);
   DCHECK_NE(state_, kUninitialized);
   if (state_ == kPermittingAdjacentCharacterTokens)
     return false;

   if (state_ == kFilteringTokens && script_tag_found_in_request_) {
     String snippet = CanonicalizedSnippetForJavaScript(request);
     if (IsContainedInRequest(snippet))
       state_ = kSuppressingAdjacentCharacterTokens;
     else if (!snippet.IsEmpty())
       state_ = kPermittingAdjacentCharacterTokens;
   }
   if (state_ == kSuppressingAdjacentCharacterTokens) {
     request.token.EraseCharacters();
     // Technically, character tokens can't be empty.
     request.token.AppendToCharacter(' ');
     return true;
   }
   return false;
 }

 // Records whether the <script> tag itself appears in the request and, if so,
 // replaces injected src, SVG href and xlink:href values with the blank URL.
 // Returns true when any of them was replaced.
 bool XSSAuditor::FilterScriptToken(const FilterTokenRequest& request) {
   DCHECK_EQ(request.token.GetType(), HTMLToken::kStartTag);
   DCHECK(HasName(request.token, kScriptTag));

   script_tag_found_in_request_ =
       IsContainedInRequest(CanonicalizedSnippetForTagName(request));
   if (!script_tag_found_in_request_)
     return false;

   // All three attributes are examined; none of the calls is short-circuited.
   const bool src_erased = EraseAttributeIfInjected(
       request, kSrcAttr, BlankURL().GetString(), kSrcLikeAttributeTruncation);
   const bool svg_href_erased = EraseAttributeIfInjected(
       request, svg_names::kHrefAttr, BlankURL().GetString(),
       kSrcLikeAttributeTruncation);
   const bool xlink_href_erased = EraseAttributeIfInjected(
       request, xlink_names::kHrefAttr, BlankURL().GetString(),
       kSrcLikeAttributeTruncation);
   return src_erased || svg_href_erased || xlink_href_erased;
 }

 // When the <object> tag itself appears in the request, erases injected data,
 // type and classid attributes (data is replaced by the blank URL). Returns
 // true when any of them was erased.
 bool XSSAuditor::FilterObjectToken(const FilterTokenRequest& request) {
   DCHECK_EQ(request.token.GetType(), HTMLToken::kStartTag);
   DCHECK(HasName(request.token, kObjectTag));

   if (!IsContainedInRequest(CanonicalizedSnippetForTagName(request)))
     return false;

   // All three attributes are examined; none of the calls is short-circuited.
   const bool data_erased =
       EraseAttributeIfInjected(request, kDataAttr, BlankURL().GetString(),
                                kSrcLikeAttributeTruncation);
   const bool type_erased = EraseAttributeIfInjected(request, kTypeAttr);
   const bool classid_erased = EraseAttributeIfInjected(request, kClassidAttr);
   return data_erased || type_erased || classid_erased;
 }

 bool XSSAuditor::FilterParamToken(const FilterTokenRequest& request) {
   DCHECK_EQ(request.token.GetType(), HTMLToken::kStartTag);
   DCHECK(HasName(request.token, kParamTag));

   wtf_size_t index_of_name_attribute;
   if (!FindAttributeWithName(request.token, kNameAttr, index_of_name_attribute))
     return false;

   const HTMLToken::Attribute& name_attribute =
       request.token.Attributes().at(index_of_name_attribute);
   if (!HTMLParamElement::IsURLParameter(name_attribute.Value()))
     return false;

   return EraseAttributeIfInjected(request, kValueAttr, BlankURL().GetString(),
                                   kSrcLikeAttributeTruncation);
 }

 // When the <embed> tag itself appears in the request, erases injected code,
 // src and type attributes (src is replaced by the blank URL). Returns true
 // when any of them was erased.
 bool XSSAuditor::FilterEmbedToken(const FilterTokenRequest& request) {
   DCHECK_EQ(request.token.GetType(), HTMLToken::kStartTag);
   DCHECK(HasName(request.token, kEmbedTag));

   if (!IsContainedInRequest(CanonicalizedSnippetForTagName(request)))
     return false;

   // All three attributes are examined; none of the calls is short-circuited.
   const bool code_erased = EraseAttributeIfInjected(
       request, kCodeAttr, String(), kSrcLikeAttributeTruncation);
   const bool src_erased = EraseAttributeIfInjected(
       request, kSrcAttr, BlankURL().GetString(), kSrcLikeAttributeTruncation);
   const bool type_erased = EraseAttributeIfInjected(request, kTypeAttr);
   return code_erased || src_erased || type_erased;
 }

 // Erases an injected srcdoc attribute unconditionally, and an injected src
 // attribute only when the <iframe>/<frame> tag itself appears in the request.
 // Returns true when either was erased.
 bool XSSAuditor::FilterFrameToken(const FilterTokenRequest& request) {
   DCHECK_EQ(request.token.GetType(), HTMLToken::kStartTag);
   DCHECK(HasName(request.token, kIFrameTag) ||
          HasName(request.token, kFrameTag));

   const bool srcdoc_erased = EraseAttributeIfInjected(
       request, kSrcdocAttr, String(), kScriptLikeAttributeTruncation);
   if (!IsContainedInRequest(CanonicalizedSnippetForTagName(request)))
     return srcdoc_erased;

   const bool src_erased = EraseAttributeIfInjected(
       request, kSrcAttr, String(), kSrcLikeAttributeTruncation);
   return srcdoc_erased || src_erased;
 }

 // Erases an injected http-equiv attribute on a <meta> start tag. Returns
 // true when it was erased.
 bool XSSAuditor::FilterMetaToken(const FilterTokenRequest& request) {
   DCHECK_EQ(request.token.GetType(), HTMLToken::kStartTag);
   DCHECK(HasName(request.token, kMetaTag));

   const bool http_equiv_erased =
       EraseAttributeIfInjected(request, kHttpEquivAttr);
   return http_equiv_erased;
 }

 // Erases an injected href attribute on a <base> start tag, leaving it empty.
 // Returns true when it was erased.
 bool XSSAuditor::FilterBaseToken(const FilterTokenRequest& request) {
   DCHECK_EQ(request.token.GetType(), HTMLToken::kStartTag);
   DCHECK(HasName(request.token, kBaseTag));

   const bool href_erased = EraseAttributeIfInjected(
       request, kHrefAttr, String(), kSrcLikeAttributeTruncation);
   return href_erased;
 }

 // Replaces an injected action attribute on a <form> start tag with
 // kURLWithUniqueOrigin. Returns true when it was replaced.
 bool XSSAuditor::FilterFormToken(const FilterTokenRequest& request) {
   DCHECK_EQ(request.token.GetType(), HTMLToken::kStartTag);
   DCHECK(HasName(request.token, kFormTag));

   const bool action_erased = EraseAttributeIfInjected(
       request, kActionAttr, kURLWithUniqueOrigin, kSrcLikeAttributeTruncation);
   return action_erased;
 }

 // Replaces an injected formaction attribute on an <input> start tag with
 // kURLWithUniqueOrigin. Returns true when it was replaced.
 bool XSSAuditor::FilterInputToken(const FilterTokenRequest& request) {
   DCHECK_EQ(request.token.GetType(), HTMLToken::kStartTag);
   DCHECK(HasName(request.token, kInputTag));

   const bool formaction_erased =
       EraseAttributeIfInjected(request, kFormactionAttr, kURLWithUniqueOrigin,
                                kSrcLikeAttributeTruncation);
   return formaction_erased;
 }

 // Replaces an injected formaction attribute on a <button> start tag with
 // kURLWithUniqueOrigin. Returns true when it was replaced.
 bool XSSAuditor::FilterButtonToken(const FilterTokenRequest& request) {
   DCHECK_EQ(request.token.GetType(), HTMLToken::kStartTag);
   DCHECK(HasName(request.token, kButtonTag));

   const bool formaction_erased =
       EraseAttributeIfInjected(request, kFormactionAttr, kURLWithUniqueOrigin,
                                kSrcLikeAttributeTruncation);
   return formaction_erased;
 }

 bool XSSAuditor::FilterLinkToken(const FilterTokenRequest& request) {
   DCHECK_EQ(request.token.GetType(), HTMLToken::kStartTag);
   DCHECK(HasName(request.token, kLinkTag));

   wtf_size_t index_of_attribute = 0;
   if (!FindAttributeWithName(request.token, kRelAttr, index_of_attribute))
     return false;

   const HTMLToken::Attribute& attribute =
       request.token.Attributes().at(index_of_attribute);
   LinkRelAttribute parsed_attribute(attribute.Value());
   if (!parsed_attribute.IsImport())
     return false;

   return EraseAttributeIfInjected(request, kHrefAttr, kURLWithUniqueOrigin,
                                   kSrcLikeAttributeTruncation,
                                   kAllowSameOriginHref);
 }

 // Walks every attribute of the token and erases the value of each injected
 // inline event handler, semicolon-separated attribute containing a
 // javascript: URL, or attribute whose value is a javascript: URL. Erased
 // javascript: URLs are replaced by kSafeJavaScriptURL. Returns true when any
 // value was erased.
 bool XSSAuditor::EraseDangerousAttributesIfInjected(
     const FilterTokenRequest& request) {
   bool did_block_script = false;
   for (wtf_size_t i = 0; i < request.token.Attributes().size(); ++i) {
     const HTMLToken::Attribute& attribute = request.token.Attributes().at(i);
     bool is_java_script_url = false;
     bool should_erase = false;
     // FIXME: Don't create a new String for every attribute.value in the
     // document.
     if (IsNameOfInlineEventHandler(attribute.NameAsVector())) {
       should_erase = IsContainedInRequest(
           Canonicalize(SnippetFromAttribute(request, attribute),
                        kScriptLikeAttributeTruncation));
     } else if (IsSemicolonSeparatedAttribute(attribute)) {
       if (IsSemicolonSeparatedValueContainingJavaScriptURL(attribute.Value())) {
         is_java_script_url = true;
         // Both the attribute name and its snippet must appear in the
         // request; the snippet is only checked when the name was found.
         if (IsContainedInRequest(Canonicalize(
                 NameFromAttribute(request, attribute), kNoTruncation))) {
           should_erase = IsContainedInRequest(
               Canonicalize(SnippetFromAttribute(request, attribute),
                            kSemicolonSeparatedScriptLikeAttributeTruncation));
         }
       }
     } else if (ProtocolIsJavaScript(
                    StripLeadingAndTrailingHTMLSpaces(attribute.Value()))) {
       is_java_script_url = true;
       should_erase = IsContainedInRequest(
           Canonicalize(SnippetFromAttribute(request, attribute),
                        kScriptLikeAttributeTruncation));
     }

     if (should_erase) {
       request.token.EraseValueOfAttribute(i);
       if (is_java_script_url)
         request.token.AppendToAttributeValue(i, kSafeJavaScriptURL);
       did_block_script = true;
     }
   }
   return did_block_script;
 }

 bool XSSAuditor::EraseAttributeIfInjected(const FilterTokenRequest& request,
                                           const QualifiedName& attribute_name,
                                           const String& replacement_value,
                                           TruncationKind treatment,
                                           HrefRestriction restriction) {
   wtf_size_t index_of_attribute = 0;
   if (!FindAttributeWithName(request.token, attribute_name, index_of_attribute))
     return false;

   const HTMLToken::Attribute& attribute =
       request.token.Attributes().at(index_of_attribute);
   if (!IsContainedInRequest(
           Canonicalize(SnippetFromAttribute(request, attribute), treatment)))
     return false;

   if (ThreadSafeMatch(attribute_name, kSrcAttr) ||
       (restriction == kAllowSameOriginHref &&
        ThreadSafeMatch(attribute_name, kHrefAttr))) {
     if (IsLikelySafeResource(attribute.Value()))
       return false;
   } else if (ThreadSafeMatch(attribute_name, kHttpEquivAttr)) {
     if (!IsDangerousHTTPEquiv(attribute.Value()))
       return false;
   }

   request.token.EraseValueOfAttribute(index_of_attribute);
   if (!replacement_value.IsEmpty())
     request.token.AppendToAttributeValue(index_of_attribute, replacement_value);

   return true;
 }

 // Returns the canonicalized leading slice of the token's source text that
 // covers the "<" and the tag name.
 String XSSAuditor::CanonicalizedSnippetForTagName(
     const FilterTokenRequest& request) {
   String token_source = request.source_tracker.SourceForToken(request.token);

   // TODO(tsepez): fix HTMLSourceTracker not to include NULs.
   // The source tracker may hand back leading NULs as part of the source for
   // the token, so step past them before slicing.
   unsigned first_non_nul = 0;
   while (first_non_nul < token_source.length() &&
          token_source[first_non_nul] == '\0') {
     ++first_non_nul;
   }

   // Take as many characters as the token's name has, plus one to account for
   // the "<".
   const String name_snippet = token_source.Substring(
       first_non_nul, request.token.GetName().size() + 1);
   return Canonicalize(name_snippet, kNoTruncation);
 }

 // Returns the slice of the token's source running from the start of the
 // attribute name up to the start of its value.
 String XSSAuditor::NameFromAttribute(const FilterTokenRequest& request,
                                      const HTMLToken::Attribute& attribute) {
   // The slice includes the character which terminates the name, so an input
   // of |name="value"| yields |name=|. Offsets are made relative to the token
   // start because the source string begins at the token.
   const int name_begin =
       attribute.NameRange().start - request.token.StartIndex();
   const int value_begin =
       attribute.ValueRange().start - request.token.StartIndex();
   const int name_length = value_begin - name_begin;

   const String token_source =
       request.source_tracker.SourceForToken(request.token);
   return token_source.Substring(name_begin, name_length);
 }

 // Returns the slice of the token's source running from the start of the
 // attribute name to the end of its value.
 String XSSAuditor::SnippetFromAttribute(const FilterTokenRequest& request,
                                         const HTMLToken::Attribute& attribute) {
   // The slice stops before the character which terminates the value: for
   // |name="value"| it is |name="value|, and for a space terminated unquoted
   // |name=value | it is |name=value|.
   // FIXME: We should grab one character before the name also.
   const int name_begin =
       attribute.NameRange().start - request.token.StartIndex();
   const int value_begin =
       attribute.ValueRange().start - request.token.StartIndex();
   const int value_finish =
       attribute.ValueRange().end - request.token.StartIndex();

   // An empty unquoted value at the end of a token has no trailing space, so
   // take one extra character to avoid clipping off the equals-sign.
   const int snippet_length = (value_begin == value_finish)
                                  ? value_finish - name_begin + 1
                                  : value_finish - name_begin;

   const String token_source =
       request.source_tracker.SourceForToken(request.token);
   return token_source.Substring(name_begin, snippet_length);
 }

 // Decodes |snippet|, shortens it as directed by |treatment| (nothing is cut
 // for kNoTruncation), and finally removes every character for which
 // IsNonCanonicalCharacter holds.
 String XSSAuditor::Canonicalize(String snippet, TruncationKind treatment) {
   String canonical = FullyDecodeString(snippet, encoding_);

   if (treatment != kNoTruncation) {
     const wtf_size_t decoded_length = canonical.length();
     if (decoded_length > kMaximumFragmentLengthTarget) {
       // Let the page influence the stopping point to avoid disclosing leading
       // fragments: past the length target, cut at the first whitespace, since
       // that is unlikely to be part of a leading fragment.
       wtf_size_t cut = kMaximumFragmentLengthTarget;
       for (; cut < decoded_length; ++cut) {
         if (IsHTMLSpace(canonical[cut]))
           break;
       }
       canonical.Truncate(cut);
     }

     // Apply the attribute-specific truncation, if the kind has one.
     switch (treatment) {
       case kSrcLikeAttributeTruncation:
         TruncateForSrcLikeAttribute(canonical);
         break;
       case kScriptLikeAttributeTruncation:
         TruncateForScriptLikeAttribute(canonical);
         break;
       case kSemicolonSeparatedScriptLikeAttributeTruncation:
         TruncateForSemicolonSeparatedScriptLikeAttribute(canonical);
         break;
       default:
         break;
     }
   }

   return canonical.RemoveCharacters(&IsNonCanonicalCharacter);
 }

 // Returns a canonicalized fragment taken from the source text of the token.
 // Leading whitespace and comments are skipped first; fragments are then cut
 // at the stopping points described below until one of them canonicalizes to
 // a non-empty string. Returns an empty String if the source is exhausted
 // without producing one.
 String XSSAuditor::CanonicalizedSnippetForJavaScript(
     const FilterTokenRequest& request) {
   String string = request.source_tracker.SourceForToken(request.token);
   wtf_size_t start_position = 0;
   wtf_size_t end_position = string.length();
   wtf_size_t found_position = kNotFound;
   wtf_size_t last_non_space_position = kNotFound;

   // Skip over initial comments to find start of code.
   while (start_position < end_position) {
     while (start_position < end_position &&
            IsHTMLSpace<UChar>(string[start_position]))
       start_position++;

     // Under SVG/XML rules, only HTML comment syntax matters and the parser
     // returns these as separate comment tokens. Having consumed whitespace,
     // we need not look further for these.
     if (request.should_allow_cdata)
       break;

     // Under HTML rules, both the HTML and JS comment syntax matter, and the
     // HTML comment ends at the end of the line, not with -->.
     if (StartsHTMLOpenCommentAt(string, start_position) ||
         StartsSingleLineCommentAt(string, start_position)) {
       while (start_position < end_position &&
              !IsJSNewline(string[start_position]))
         start_position++;
     } else if (StartsMultiLineCommentAt(string, start_position)) {
       // Jump past the closing "*/"; if there is none, the rest of the source
       // is treated as comment and nothing remains to scan.
       if (start_position + 2 < end_position &&
           (found_position = string.Find("*/", start_position + 2)) != kNotFound)
         start_position = found_position + 2;
       else
         start_position = end_position;
     } else
       break;
   }

   // Keep cutting candidate fragments until one canonicalizes to a non-empty
   // string or the source is exhausted.
   String result;
   while (start_position < end_position && !result.length()) {
     // Stop at next comment (using the same rules as above for SVG/XML vs HTML),
     // when we encounter a comma, when we encounter a backtick, when we hit an
     // opening <script> tag, when we encounter a HTML closing comment, or when
     // we exceed the maximum length target.
     // - The comma rule covers a common parameter concatenation case performed
     //   by some web servers.
     // - The backtick rule covers the ECMA6 multi-line template string feature.
     // - The HTML closing comment rule covers the generous interpretation in
     //   https://tc39.github.io/ecma262/#prod-annexB-HTMLCloseComment.
     last_non_space_position = kNotFound;
     for (found_position = start_position; found_position < end_position;
          found_position++) {
       if (StartsSingleLineCommentAt(string, found_position) ||
           StartsMultiLineCommentAt(string, found_position)) {
         break;
       }
       if (!request.should_allow_cdata) {
         if (StartsHTMLOpenCommentAt(string, found_position) ||
             StartsHTMLCloseCommentAt(string, found_position)) {
           break;
         }
       }
       if (string[found_position] == ',' || string[found_position] == '`')
         break;

       // A script tag only ends the fragment once a non-space character has
       // been seen in it; the fragment is then cut right after the last such
       // character.
       if (last_non_space_position != kNotFound &&
           (StartsOpeningScriptTagAt(string, found_position) ||
            StartsClosingScriptTagAt(string, found_position))) {
         found_position = last_non_space_position + 1;
         break;
       }
       if (found_position > start_position + kMaximumFragmentLengthTarget) {
         // After hitting the length target, we can only stop at a point where we
         // know we are not in the middle of a %-escape sequence. For the sake of
         // simplicity, approximate not stopping inside a (possibly multiply
         // encoded) %-escape sequence by breaking on whitespace only. We should
         // have enough text in these cases to avoid false positives.
         if (IsHTMLSpace<UChar>(string[found_position]))
           break;
       }
       if (!IsHTMLSpace<UChar>(string[found_position]))
         last_non_space_position = found_position;
     }
     result = Canonicalize(
         string.Substring(start_position, found_position - start_position),
         kNoTruncation);
     // Resume just past the stopping point in case this fragment canonicalized
     // to an empty string and another pass is needed.
     start_position = found_position + 1;
   }

   return result;
 }

 // Reports whether |decoded_snippet| occurs, ignoring case, in the decoded URL
 // or in the decoded HTTP body. An empty snippet never matches.
 bool XSSAuditor::IsContainedInRequest(const String& decoded_snippet) {
   if (decoded_snippet.IsEmpty())
     return false;

   const bool found_in_url =
       decoded_url_.FindIgnoringCase(decoded_snippet, 0) != kNotFound;
   if (found_in_url)
     return true;

   // When a suffix tree exists for the body and it says the snippet cannot be
   // there, skip the full body search.
   const bool ruled_out_by_suffix_tree =
       decoded_http_body_suffix_tree_ &&
       !decoded_http_body_suffix_tree_->MightContain(decoded_snippet);
   if (ruled_out_by_suffix_tree)
     return false;

   const bool found_in_body =
       decoded_http_body_.FindIgnoringCase(decoded_snippet, 0) != kNotFound;
   return found_in_body;
 }

 bool XSSAuditor::IsLikelySafeResource(const String& url) {
   // Give empty URLs and about:blank a pass. Making a resourceURL from an
   // empty string below will likely later fail the "no query args test" as
   // it inherits the document's query args.
   if (url.IsEmpty() || url == BlankURL().GetString())
     return true;

   // If the resource is loaded from the same host as the enclosing page, it's
   // probably not an XSS attack, so we reduce false positives by allowing the
   // request, ignoring scheme and port considerations. If the resource has a
   // query string, we're more suspicious, however, because that's pretty rare
   // and the attacker might be able to trick a server-side script into doing
   // something dangerous with the query string.
   if (document_url_.Host().IsEmpty())
     return false;

   KURL resource_url(document_url_, url);
   return (document_url_.Host() == resource_url.Host() &&
           resource_url.Query().IsEmpty());
 }

 // True only when the document URL, the decoded URL, the decoded HTTP body and
 // the HTTP body string each report being safe to send to another thread.
 bool XSSAuditor::IsSafeToSendToAnotherThread() const {
   if (!document_url_.IsSafeToSendToAnotherThread())
     return false;
   if (!decoded_url_.IsSafeToSendToAnotherThread())
     return false;
   if (!decoded_http_body_.IsSafeToSendToAnotherThread())
     return false;
   return http_body_as_string_.IsSafeToSendToAnotherThread();
 }

 }  // namespace blink