blob: 97311d45034c93af7058e8c52c237205f177f0ed [file] [log] [blame]
/*
* Copyright (C) 2013 Google, Inc. All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef HTMLToken_h
#define HTMLToken_h
#include "core/dom/Attribute.h"
#include "core/html/parser/HTMLParserIdioms.h"
#include "wtf/Forward.h"
#include "wtf/PtrUtil.h"
#include <memory>
namespace blink {
// Plain data holder for the DOCTYPE-specific parts of a token. HTMLToken
// allocates one in beginDOCTYPE() and hands it off via releaseDoctypeData().
// All three flags start out false and both identifier buffers start out empty.
class DoctypeData {
  USING_FAST_MALLOC(DoctypeData);
  WTF_MAKE_NONCOPYABLE(DoctypeData);

 public:
  DoctypeData() = default;

  // Set once HTMLToken::setPublicIdentifierToEmptyString() has been called.
  bool m_hasPublicIdentifier = false;
  // Set once HTMLToken::setSystemIdentifierToEmptyString() has been called.
  bool m_hasSystemIdentifier = false;
  // Characters accumulated by HTMLToken::appendToPublicIdentifier().
  WTF::Vector<UChar> m_publicIdentifier;
  // Characters accumulated by HTMLToken::appendToSystemIdentifier().
  WTF::Vector<UChar> m_systemIdentifier;
  // Set by HTMLToken::setForceQuirks(); never cleared again by this header.
  bool m_forceQuirks = false;
};
// Returns a pointer to the first element of |attributes| whose name matches
// |name| (as decided by QualifiedName::matches), or nullptr when no element
// matches. The returned pointer refers to storage owned by |attributes|.
static inline Attribute* findAttributeInVector(Vector<Attribute>& attributes,
                                               const QualifiedName& name) {
  for (Attribute& attribute : attributes) {
    if (attribute.name().matches(name))
      return &attribute;
  }
  return nullptr;
}
// A single token under construction. One HTMLToken instance is reused across
// tokens: clear() puts it back into the Uninitialized state, after which
// exactly one of the begin*/ensureIsCharacterToken/makeEndOfFile entry points
// selects the token type. Most accessors assert that the token currently has
// a type for which the accessed data is meaningful.
//
// All character data (tag name, comment text, character data, DOCTYPE name)
// shares the single m_data buffer; which interpretation applies depends on
// m_type.
class HTMLToken {
  WTF_MAKE_NONCOPYABLE(HTMLToken);
  USING_FAST_MALLOC(HTMLToken);

 public:
  enum TokenType {
    Uninitialized,
    DOCTYPE,
    StartTag,
    EndTag,
    Comment,
    Character,
    EndOfFile,
  };

  // One attribute of a start/end tag: its name and value characters plus the
  // offset ranges recorded for each while lexing.
  class Attribute {
    DISALLOW_NEW_EXCEPT_PLACEMENT_NEW();

   public:
    // A [start, end] pair of offsets. The offsets stored by HTMLToken are
    // relative to the token's base offset (see HTMLToken::setBaseOffset).
    class Range {
      DISALLOW_NEW();

     public:
      static constexpr int kInvalidOffset = -1;

      // Marks both offsets as invalid, but only in ENABLE(ASSERT) builds;
      // in other builds this is a no-op and the offsets keep their values.
      inline void clear() {
#if ENABLE(ASSERT)
        start = kInvalidOffset;
        end = kInvalidOffset;
#endif
      }

      // Check Range instance that is actively being parsed.
      inline void checkValidStart() const {
        DCHECK_NE(start, kInvalidOffset);
        DCHECK_GE(start, 0);
      }

      // Check Range instance which finished parse.
      inline void checkValid() const {
        checkValidStart();
        DCHECK_NE(end, kInvalidOffset);
        DCHECK_GE(end, 0);
        DCHECK_LE(start, end);
      }

      int start;
      int end;
    };

    // Attribute name, converted to the requested string type on each call.
    AtomicString name() const { return AtomicString(m_name); }
    String nameAttemptStaticStringCreation() const {
      return attemptStaticStringCreation(m_name, Likely8Bit);
    }
    const Vector<UChar, 32>& nameAsVector() const { return m_name; }
    void appendToName(UChar c) { m_name.append(c); }

    // Attribute value, built via StringImpl::create8BitIfPossible.
    PassRefPtr<StringImpl> value8BitIfNecessary() const {
      return StringImpl::create8BitIfPossible(m_value);
    }
    String value() const { return String(m_value); }
    void appendToValue(UChar c) { m_value.append(c); }
    void appendToValue(const String& value) { value.appendTo(m_value); }
    void clearValue() { m_value.clear(); }

    const Range& nameRange() const { return m_nameRange; }
    const Range& valueRange() const { return m_valueRange; }
    Range& mutableNameRange() { return m_nameRange; }
    Range& mutableValueRange() { return m_valueRange; }

   private:
    Vector<UChar, 32> m_name;
    Vector<UChar, 32> m_value;
    Range m_nameRange;
    Range m_valueRange;
  };

  typedef Vector<Attribute, 10> AttributeList;

  // By using an inline capacity of 256, we avoid spilling over into a malloced
  // buffer approximately 99% of the time based on a non-scientific browse
  // around a number of popular web sites on 23 May 2013.
  typedef Vector<UChar, 256> DataVector;

  // clear() does not touch m_selfClosing or m_currentAttribute (they are only
  // assigned when a tag token begins), so give them defined values here
  // instead of leaving them indeterminate.
  HTMLToken() : m_selfClosing(false), m_currentAttribute(nullptr) { clear(); }

  // Returns the token to the Uninitialized state so it can be reused. Resets
  // the type, the token range start, the base offset, the character data and
  // the 8-bit tracking accumulator. Attribute and DOCTYPE state is left alone;
  // it is reset by the corresponding begin* call.
  void clear() {
    m_type = Uninitialized;
    m_range.clear();
    m_range.start = 0;
    m_baseOffset = 0;
    // Don't call Vector::clear() as that would destroy the
    // alloced VectorBuffer. If the innerHTML'd content has
    // two 257 character text nodes in a row, we'll needlessly
    // thrash malloc. When we finally finish the parse the
    // HTMLToken will be destroyed and the VectorBuffer released.
    m_data.shrink(0);
    m_orAllData = 0;
  }

  bool isUninitialized() const { return m_type == Uninitialized; }
  TokenType type() const { return m_type; }

  // Turns an Uninitialized token into an EndOfFile token.
  void makeEndOfFile() {
    ASSERT(m_type == Uninitialized);
    m_type = EndOfFile;
  }

  // Range and offset methods exposed for HTMLSourceTracker and
  // HTMLViewSourceParser.
  int startIndex() const { return m_range.start; }
  int endIndex() const { return m_range.end; }
  // Sets the offset that is subtracted from every absolute offset passed to
  // end() and to the attribute range methods.
  void setBaseOffset(int offset) { m_baseOffset = offset; }
  // Records the end of the token, stored relative to the base offset.
  void end(int endOffset) { m_range.end = endOffset - m_baseOffset; }

  // Raw character data of a Character, Comment, StartTag or EndTag token.
  const DataVector& data() const {
    ASSERT(m_type == Character || m_type == Comment || m_type == StartTag ||
           m_type == EndTag);
    return m_data;
  }

  // True when no UChar OR-ed into m_orAllData since the last reset had a bit
  // set above 0xff. Only the UChar-taking append paths update the
  // accumulator; the LChar/char paths skip it (presumably because those
  // values always fit in 8 bits).
  bool isAll8BitData() const { return (m_orAllData <= 0xff); }

  // Name of a StartTag, EndTag or DOCTYPE token (stored in m_data).
  const DataVector& name() const {
    ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
    return m_data;
  }

  // Appends a non-null character to the tag/DOCTYPE name.
  void appendToName(UChar character) {
    ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
    ASSERT(character);
    m_data.append(character);
    m_orAllData |= character;
  }

  /* DOCTYPE Tokens */

  bool forceQuirks() const {
    ASSERT(m_type == DOCTYPE);
    return m_doctypeData->m_forceQuirks;
  }

  void setForceQuirks() {
    ASSERT(m_type == DOCTYPE);
    m_doctypeData->m_forceQuirks = true;
  }

  // Turns an Uninitialized token into a DOCTYPE token with fresh DoctypeData.
  void beginDOCTYPE() {
    ASSERT(m_type == Uninitialized);
    m_type = DOCTYPE;
    m_doctypeData = wrapUnique(new DoctypeData);
  }

  // Same as beginDOCTYPE(), additionally storing |character| as the first
  // character of the DOCTYPE name.
  void beginDOCTYPE(UChar character) {
    ASSERT(character);
    beginDOCTYPE();
    m_data.append(character);
    m_orAllData |= character;
  }

  // FIXME: Distinguish between a missing public identifier and an empty one.
  const WTF::Vector<UChar>& publicIdentifier() const {
    ASSERT(m_type == DOCTYPE);
    return m_doctypeData->m_publicIdentifier;
  }

  // FIXME: Distinguish between a missing system identifier and an empty one.
  const WTF::Vector<UChar>& systemIdentifier() const {
    ASSERT(m_type == DOCTYPE);
    return m_doctypeData->m_systemIdentifier;
  }

  // Marks the public identifier as present and resets it to empty. Must be
  // called before appendToPublicIdentifier(), which asserts the flag.
  void setPublicIdentifierToEmptyString() {
    ASSERT(m_type == DOCTYPE);
    m_doctypeData->m_hasPublicIdentifier = true;
    m_doctypeData->m_publicIdentifier.clear();
  }

  // Marks the system identifier as present and resets it to empty. Must be
  // called before appendToSystemIdentifier(), which asserts the flag.
  void setSystemIdentifierToEmptyString() {
    ASSERT(m_type == DOCTYPE);
    m_doctypeData->m_hasSystemIdentifier = true;
    m_doctypeData->m_systemIdentifier.clear();
  }

  void appendToPublicIdentifier(UChar character) {
    ASSERT(character);
    ASSERT(m_type == DOCTYPE);
    ASSERT(m_doctypeData->m_hasPublicIdentifier);
    m_doctypeData->m_publicIdentifier.append(character);
  }

  void appendToSystemIdentifier(UChar character) {
    ASSERT(character);
    ASSERT(m_type == DOCTYPE);
    ASSERT(m_doctypeData->m_hasSystemIdentifier);
    m_doctypeData->m_systemIdentifier.append(character);
  }

  // Transfers ownership of the DOCTYPE data to the caller; m_doctypeData is
  // left empty afterwards.
  std::unique_ptr<DoctypeData> releaseDoctypeData() {
    return std::move(m_doctypeData);
  }

  /* Start/End Tag Tokens */

  bool selfClosing() const {
    ASSERT(m_type == StartTag || m_type == EndTag);
    return m_selfClosing;
  }

  void setSelfClosing() {
    ASSERT(m_type == StartTag || m_type == EndTag);
    m_selfClosing = true;
  }

  // Turns an Uninitialized token into a StartTag token whose name begins with
  // the non-null |character|, discarding any attributes left from before.
  void beginStartTag(UChar character) {
    ASSERT(character);
    ASSERT(m_type == Uninitialized);
    m_type = StartTag;
    m_selfClosing = false;
    m_currentAttribute = nullptr;
    m_attributes.clear();
    m_data.append(character);
    m_orAllData |= character;
  }

  // Turns an Uninitialized token into an EndTag token whose name begins with
  // |character|. m_orAllData is not updated on this LChar path.
  void beginEndTag(LChar character) {
    ASSERT(m_type == Uninitialized);
    m_type = EndTag;
    m_selfClosing = false;
    m_currentAttribute = nullptr;
    m_attributes.clear();
    m_data.append(character);
  }

  // Turns an Uninitialized token into an EndTag token whose name starts with
  // all of |characters|. m_orAllData is not updated on this LChar path.
  void beginEndTag(const Vector<LChar, 32>& characters) {
    ASSERT(m_type == Uninitialized);
    m_type = EndTag;
    m_selfClosing = false;
    m_currentAttribute = nullptr;
    m_attributes.clear();
    m_data.appendVector(characters);
  }

  // Appends a new attribute to the list and makes it the current attribute
  // that the begin/end/appendToAttribute* methods below operate on.
  void addNewAttribute() {
    ASSERT(m_type == StartTag || m_type == EndTag);
    m_attributes.grow(m_attributes.size() + 1);
    m_currentAttribute = &m_attributes.last();
    m_currentAttribute->mutableNameRange().clear();
    m_currentAttribute->mutableValueRange().clear();
  }

  // The four range methods below take absolute offsets and store them
  // relative to m_baseOffset on the current attribute. They require a prior
  // addNewAttribute() call (m_currentAttribute is dereferenced unchecked).
  void beginAttributeName(int offset) {
    m_currentAttribute->mutableNameRange().start = offset - m_baseOffset;
    m_currentAttribute->nameRange().checkValidStart();
  }

  void endAttributeName(int offset) {
    int index = offset - m_baseOffset;
    m_currentAttribute->mutableNameRange().end = index;
    m_currentAttribute->nameRange().checkValid();
    // Until a value is seen, the value range is the empty range located at
    // the end of the name.
    m_currentAttribute->mutableValueRange().start = index;
    m_currentAttribute->mutableValueRange().end = index;
  }

  void beginAttributeValue(int offset) {
    m_currentAttribute->mutableValueRange().clear();
    m_currentAttribute->mutableValueRange().start = offset - m_baseOffset;
    m_currentAttribute->valueRange().checkValidStart();
  }

  void endAttributeValue(int offset) {
    m_currentAttribute->mutableValueRange().end = offset - m_baseOffset;
    m_currentAttribute->valueRange().checkValid();
  }

  // Appends a non-null character to the current attribute's name.
  void appendToAttributeName(UChar character) {
    ASSERT(character);
    ASSERT(m_type == StartTag || m_type == EndTag);
    m_currentAttribute->nameRange().checkValidStart();
    m_currentAttribute->appendToName(character);
  }

  // Appends a non-null character to the current attribute's value.
  void appendToAttributeValue(UChar character) {
    ASSERT(character);
    ASSERT(m_type == StartTag || m_type == EndTag);
    m_currentAttribute->valueRange().checkValidStart();
    m_currentAttribute->appendToValue(character);
  }

  // Appends the non-empty string |value| to the value of the attribute at
  // index |i| (not necessarily the current attribute).
  void appendToAttributeValue(size_t i, const String& value) {
    ASSERT(!value.isEmpty());
    ASSERT(m_type == StartTag || m_type == EndTag);
    m_attributes[i].appendToValue(value);
  }

  const AttributeList& attributes() const {
    ASSERT(m_type == StartTag || m_type == EndTag);
    return m_attributes;
  }

  // Returns the first attribute whose name equals |name|'s local name, or
  // nullptr when there is none. Only the local name takes part in the
  // comparison.
  const Attribute* getAttributeItem(const QualifiedName& name) const {
    for (unsigned i = 0; i < m_attributes.size(); ++i) {
      if (m_attributes.at(i).name() == name.localName())
        return &m_attributes.at(i);
    }
    return nullptr;
  }

  // Used by the XSSAuditor to nuke XSS-laden attributes.
  void eraseValueOfAttribute(size_t i) {
    ASSERT(m_type == StartTag || m_type == EndTag);
    m_attributes[i].clearValue();
  }

  /* Character Tokens */

  // Starting a character token works slightly differently than starting
  // other types of tokens because we want to save a per-character branch.
  void ensureIsCharacterToken() {
    ASSERT(m_type == Uninitialized || m_type == Character);
    m_type = Character;
  }

  const DataVector& characters() const {
    ASSERT(m_type == Character);
    return m_data;
  }

  // Appends a narrow character; m_orAllData is not updated on this path.
  void appendToCharacter(char character) {
    ASSERT(m_type == Character);
    m_data.append(character);
  }

  void appendToCharacter(UChar character) {
    ASSERT(m_type == Character);
    m_data.append(character);
    m_orAllData |= character;
  }

  // Appends a run of LChars; m_orAllData is not updated on this path.
  void appendToCharacter(const Vector<LChar, 32>& characters) {
    ASSERT(m_type == Character);
    m_data.appendVector(characters);
  }

  /* Comment Tokens */

  const DataVector& comment() const {
    ASSERT(m_type == Comment);
    return m_data;
  }

  // Turns an Uninitialized token into a Comment token.
  void beginComment() {
    ASSERT(m_type == Uninitialized);
    m_type = Comment;
  }

  void appendToComment(UChar character) {
    ASSERT(character);
    ASSERT(m_type == Comment);
    m_data.append(character);
    m_orAllData |= character;
  }

  // Only for XSSAuditor
  void eraseCharacters() {
    ASSERT(m_type == Character);
    m_data.clear();
    m_orAllData = 0;
  }

 private:
  TokenType m_type;
  Attribute::Range m_range;  // Always starts at zero.
  int m_baseOffset;
  DataVector m_data;
  // Bitwise OR of the UChars appended to m_data; see isAll8BitData().
  UChar m_orAllData;

  // For StartTag and EndTag
  bool m_selfClosing;
  AttributeList m_attributes;

  // A pointer into m_attributes used during lexing.
  Attribute* m_currentAttribute;

  // For DOCTYPE
  std::unique_ptr<DoctypeData> m_doctypeData;
};
#ifndef NDEBUG
// Declared only when NDEBUG is not defined. Takes a token type and returns a
// C string; the definition is not part of this header. Presumably it yields
// the type's name for debug output -- confirm against the definition.
const char* toString(HTMLToken::TokenType);
#endif
} // namespace blink
#endif