blob: 51a241c27f484954834c40f8146bd3ad6ed3a3c8 [file] [log] [blame]
/*
* (C) 1999 Lars Knoll (knoll@kde.org)
* Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2012 Apple Inc. All rights
* reserved.
* Copyright (C) 2007-2009 Torch Mobile, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public License
* along with this library; see the file COPYING.LIB. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#include "wtf/text/WTFString.h"
#include "wtf/ASCIICType.h"
#include "wtf/DataLog.h"
#include "wtf/HexNumber.h"
#include "wtf/MathExtras.h"
#include "wtf/StringExtras.h"
#include "wtf/Vector.h"
#include "wtf/dtoa.h"
#include "wtf/text/CString.h"
#include "wtf/text/CharacterNames.h"
#include "wtf/text/IntegerToStringConversion.h"
#include "wtf/text/UTF8.h"
#include "wtf/text/Unicode.h"
#include <algorithm>
#include <stdarg.h>
namespace WTF {
using namespace Unicode;
// Builds a String from |length| UTF-16 code units. A null |characters|
// pointer leaves the String null.
String::String(const UChar* characters, unsigned length) {
  if (characters)
    m_impl = StringImpl::create(characters, length);
}
// Builds a String from a null-terminated UTF-16 buffer; the length is taken
// from lengthOfNullTerminatedString(). A null |str| leaves the String null.
String::String(const UChar* str)
    : m_impl(str ? StringImpl::create(str, lengthOfNullTerminatedString(str))
                 : nullptr) {}
// Builds a String from |length| Latin-1 characters. A null |characters|
// pointer leaves the String null.
String::String(const LChar* characters, unsigned length) {
  if (characters)
    m_impl = StringImpl::create(characters, length);
}
// Same as the LChar/length constructor, but accepts plain chars; the bytes
// are reinterpreted as LChar. A null |characters| pointer leaves the String
// null.
String::String(const char* characters, unsigned length) {
  if (!characters)
    return;
  const LChar* latin1 = reinterpret_cast<const LChar*>(characters);
  m_impl = StringImpl::create(latin1, length);
}
// Builds a String from a null-terminated Latin-1 buffer. A null |characters|
// pointer leaves the String null.
String::String(const LChar* characters) {
  if (characters)
    m_impl = StringImpl::create(characters);
}
// Builds a String from a null-terminated char buffer whose bytes are
// reinterpreted as LChar. A null |characters| pointer leaves the String null.
String::String(const char* characters) {
  if (!characters)
    return;
  const LChar* latin1 = reinterpret_cast<const LChar*>(characters);
  m_impl = StringImpl::create(latin1);
}
void String::append(const StringView& string) {
if (string.isEmpty())
return;
if (!m_impl) {
m_impl = string.toString().releaseImpl();
return;
}
// FIXME: This is extremely inefficient. So much so that we might want to
// take this out of String's API. We can make it better by optimizing the
// case where exactly one String is pointing at this StringImpl, but even
// then it's going to require a call into the allocator every single time.
if (m_impl->is8Bit() && string.is8Bit()) {
LChar* data;
RELEASE_ASSERT(string.length() <=
std::numeric_limits<unsigned>::max() - m_impl->length());
RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(
m_impl->length() + string.length(), data);
memcpy(data, m_impl->characters8(), m_impl->length() * sizeof(LChar));
memcpy(data + m_impl->length(), string.characters8(),
string.length() * sizeof(LChar));
m_impl = newImpl.release();
return;
}
UChar* data;
RELEASE_ASSERT(string.length() <=
std::numeric_limits<unsigned>::max() - m_impl->length());
RefPtr<StringImpl> newImpl =
StringImpl::createUninitialized(m_impl->length() + string.length(), data);
if (m_impl->is8Bit())
StringImpl::copyChars(data, m_impl->characters8(), m_impl->length());
else
StringImpl::copyChars(data, m_impl->characters16(), m_impl->length());
if (string.is8Bit())
StringImpl::copyChars(data + m_impl->length(), string.characters8(),
string.length());
else
StringImpl::copyChars(data + m_impl->length(), string.characters16(),
string.length());
m_impl = newImpl.release();
}
template <typename CharacterType>
inline void String::appendInternal(CharacterType c) {
// FIXME: This is extremely inefficient. So much so that we might want to
// take this out of String's API. We can make it better by optimizing the
// case where exactly one String is pointing at this StringImpl, but even
// then it's going to require a call into the allocator every single time.
if (!m_impl) {
m_impl = StringImpl::create(&c, 1);
return;
}
// FIXME: We should be able to create an 8 bit string via this code path.
UChar* data;
RELEASE_ASSERT(m_impl->length() < std::numeric_limits<unsigned>::max());
RefPtr<StringImpl> newImpl =
StringImpl::createUninitialized(m_impl->length() + 1, data);
if (m_impl->is8Bit())
StringImpl::copyChars(data, m_impl->characters8(), m_impl->length());
else
StringImpl::copyChars(data, m_impl->characters16(), m_impl->length());
data[m_impl->length()] = c;
m_impl = newImpl.release();
}
// Appends a single Latin-1 character.
void String::append(LChar c) {
  appendInternal<LChar>(c);
}
// Appends a single UTF-16 code unit.
void String::append(UChar c) {
  appendInternal<UChar>(c);
}
int codePointCompare(const String& a, const String& b) {
return codePointCompare(a.impl(), b.impl());
}
int codePointCompareIgnoringASCIICase(const String& a, const char* b) {
return codePointCompareIgnoringASCIICase(a.impl(),
reinterpret_cast<const LChar*>(b));
}
// Returns a new 16-bit StringImpl consisting of the first |position|
// characters of |impl|, then |lengthToInsert| characters from
// |charactersToInsert|, then the remainder of |impl|. When nothing is to be
// inserted, |impl| is handed back unchanged.
template <typename CharType>
PassRefPtr<StringImpl> insertInternal(PassRefPtr<StringImpl> impl,
                                      const CharType* charactersToInsert,
                                      unsigned lengthToInsert,
                                      unsigned position) {
  if (!lengthToInsert)
    return impl;

  ASSERT(charactersToInsert);
  const unsigned oldLength = impl->length();
  // The combined length has to fit in an unsigned.
  RELEASE_ASSERT(lengthToInsert <=
                 std::numeric_limits<unsigned>::max() - oldLength);

  UChar* data;  // FIXME: We should be able to create an 8 bit string here.
  RefPtr<StringImpl> newImpl =
      StringImpl::createUninitialized(oldLength + lengthToInsert, data);

  // Destination and size of the part of |impl| that follows the insertion.
  UChar* const suffix = data + position + lengthToInsert;
  const unsigned suffixLength = oldLength - position;

  if (impl->is8Bit()) {
    const LChar* source = impl->characters8();
    StringImpl::copyChars(data, source, position);
    StringImpl::copyChars(suffix, source + position, suffixLength);
  } else {
    const UChar* source = impl->characters16();
    StringImpl::copyChars(data, source, position);
    StringImpl::copyChars(suffix, source + position, suffixLength);
  }
  StringImpl::copyChars(data + position, charactersToInsert, lengthToInsert);
  return newImpl.release();
}
// Inserts |string| into this String in front of the character at |position|.
//
//  * A null |string| changes nothing.
//  * An empty, non-null |string| only has an effect on a null String, which
//    takes over the implementation produced by string.toString().
//  * A |position| at or beyond the current length appends instead.
void String::insert(const StringView& string, unsigned position) {
  if (string.isEmpty()) {
    if (string.isNull())
      return;
    if (isNull())
      m_impl = string.toString().releaseImpl();
    return;
  }

  // append() handles both 8-bit and 16-bit views itself, so there is no need
  // to branch on the character width here.
  if (position >= length()) {
    append(string);
    return;
  }

  // position < length() implies this String is not null.
  DCHECK(m_impl);
  if (string.is8Bit())
    m_impl = insertInternal(m_impl.release(), string.characters8(),
                            string.length(), position);
  else
    m_impl = insertInternal(m_impl.release(), string.characters16(),
                            string.length(), position);
}
// Forwards to StringImpl::characterStartingAt() for an in-range index;
// returns 0 for a null String or an index at or past the end.
UChar32 String::characterStartingAt(unsigned i) const {
  if (m_impl && i < m_impl->length())
    return m_impl->characterStartingAt(i);
  return 0;
}
// Switches an 8-bit String over to a 16-bit implementation. Null Strings and
// Strings that are already 16-bit are left alone.
void String::ensure16Bit() {
  if (isNull() || !is8Bit())
    return;

  const unsigned length = this->length();
  if (!length) {
    m_impl = StringImpl::empty16Bit();
    return;
  }
  m_impl =
      make16BitFrom8BitSource(m_impl->characters8(), length).releaseImpl();
}
// Replaces the implementation with the result of StringImpl::truncate();
// does nothing for a null String.
void String::truncate(unsigned length) {
  if (!m_impl)
    return;
  m_impl = m_impl->truncate(length);
}
// Replaces the implementation with the result of StringImpl::remove();
// does nothing for a null String.
void String::remove(unsigned start, unsigned lengthToRemove) {
  if (!m_impl)
    return;
  m_impl = m_impl->remove(start, lengthToRemove);
}
// Forwards to StringImpl::substring(); a null String yields a null String.
String String::substring(unsigned pos, unsigned len) const {
  if (m_impl)
    return m_impl->substring(pos, len);
  return String();
}
// Forwards to StringImpl::lower(); a null String yields a null String.
String String::lower() const {
  if (m_impl)
    return m_impl->lower();
  return String();
}
// Forwards to StringImpl::upper(); a null String yields a null String.
String String::upper() const {
  if (m_impl)
    return m_impl->upper();
  return String();
}
// Forwards to the locale-taking StringImpl::lower(); a null String yields a
// null String.
String String::lower(const AtomicString& localeIdentifier) const {
  if (m_impl)
    return m_impl->lower(localeIdentifier);
  return String();
}
// Forwards to the locale-taking StringImpl::upper(); a null String yields a
// null String.
String String::upper(const AtomicString& localeIdentifier) const {
  if (m_impl)
    return m_impl->upper(localeIdentifier);
  return String();
}
// Forwards to StringImpl::stripWhiteSpace(); a null String yields a null
// String.
String String::stripWhiteSpace() const {
  if (m_impl)
    return m_impl->stripWhiteSpace();
  return String();
}
// Forwards to StringImpl::stripWhiteSpace() with a caller-supplied
// whitespace predicate; a null String yields a null String.
String String::stripWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace) const {
  if (m_impl)
    return m_impl->stripWhiteSpace(isWhiteSpace);
  return String();
}
// Forwards to StringImpl::simplifyWhiteSpace(); a null String yields a null
// String.
String String::simplifyWhiteSpace(StripBehavior stripBehavior) const {
  if (m_impl)
    return m_impl->simplifyWhiteSpace(stripBehavior);
  return String();
}
// Forwards to StringImpl::simplifyWhiteSpace() with a caller-supplied
// whitespace predicate; a null String yields a null String.
String String::simplifyWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace,
                                  StripBehavior stripBehavior) const {
  if (m_impl)
    return m_impl->simplifyWhiteSpace(isWhiteSpace, stripBehavior);
  return String();
}
// Forwards to StringImpl::removeCharacters() with the given match predicate;
// a null String yields a null String.
String String::removeCharacters(CharacterMatchFunctionPtr findMatch) const {
  if (m_impl)
    return m_impl->removeCharacters(findMatch);
  return String();
}
// Forwards to StringImpl::foldCase(); a null String yields a null String.
String String::foldCase() const {
  if (m_impl)
    return m_impl->foldCase();
  return String();
}
// printf-style formatting into a String.
//
// The arguments are formatted twice: a first pass only measures the output,
// a second pass writes into a buffer of exactly that size (plus terminator).
// Returns a null String if the measuring pass reports an error and an empty
// String if the output is zero characters long.
String String::format(const char* format, ...) {
  va_list args;
  va_start(args, format);

  // Measuring pass.
#if COMPILER(MSVC)
  const int formattedLength = _vscprintf(format, args);
#else
  char scratch;
  const int formattedLength = vsnprintf(&scratch, 1, format, args);
#endif
  // We need to call va_end() and then va_start() again before the second
  // pass, as the contents of args is undefined after the call to vsnprintf
  // according to http://man.cx/snprintf(3)
  //
  // Not calling va_end/va_start here happens to work on lots of
  // systems, but fails e.g. on 64bit Linux.
  va_end(args);

  if (formattedLength < 0)
    return String();
  if (!formattedLength)
    return String("");

  const unsigned len = formattedLength;
  Vector<char, 256> buffer;
  buffer.grow(len + 1);  // One extra slot for the terminating NUL.

  // Writing pass, guaranteed to fit.
  va_start(args, format);
  vsnprintf(buffer.data(), buffer.size(), format, args);
  va_end(args);

  return StringImpl::create(reinterpret_cast<const LChar*>(buffer.data()), len);
}
// Converts |input| to its textual form with IntegerToStringConverter and
// wraps the resulting 8-bit characters in a String.
template <typename IntegerType>
static String integerToString(IntegerType input) {
  IntegerToStringConverter<IntegerType> digits(input);
  const LChar* characters = digits.characters8();
  return StringImpl::create(characters, digits.length());
}
// Textual form of an int.
String String::number(int number) {
  return integerToString<int>(number);
}
// Textual form of an unsigned int.
String String::number(unsigned number) {
  return integerToString<unsigned>(number);
}
// Textual form of a long.
String String::number(long number) {
  return integerToString<long>(number);
}
// Textual form of an unsigned long.
String String::number(unsigned long number) {
  return integerToString<unsigned long>(number);
}
// Textual form of a long long.
String String::number(long long number) {
  return integerToString<long long>(number);
}
// Textual form of an unsigned long long.
String String::number(unsigned long long number) {
  return integerToString<unsigned long long>(number);
}
// Formats |number| with numberToFixedPrecisionString() into a local
// NumberToStringBuffer and returns the text as a String.
String String::number(double number, unsigned precision) {
  NumberToStringBuffer scratch;
  const char* text = numberToFixedPrecisionString(number, precision, scratch);
  return String(text);
}
// Formats |number| with numberToString() into a local NumberToStringBuffer
// and returns the text as a String.
String String::numberToStringECMAScript(double number) {
  NumberToStringBuffer scratch;
  const char* text = numberToString(number, scratch);
  return String(text);
}
// Formats |number| with numberToFixedWidthString() into a local
// NumberToStringBuffer and returns the text as a String.
String String::numberToStringFixedWidth(double number, unsigned decimalPlaces) {
  NumberToStringBuffer scratch;
  const char* text = numberToFixedWidthString(number, decimalPlaces, scratch);
  return String(text);
}
// Forwards to StringImpl::toIntStrict(). For a null String, stores false in
// |ok| (when provided) and returns 0.
int String::toIntStrict(bool* ok, int base) const {
  if (m_impl)
    return m_impl->toIntStrict(ok, base);
  if (ok)
    *ok = false;
  return 0;
}
// Forwards to StringImpl::toUIntStrict(). For a null String, stores false in
// |ok| (when provided) and returns 0.
unsigned String::toUIntStrict(bool* ok, int base) const {
  if (m_impl)
    return m_impl->toUIntStrict(ok, base);
  if (ok)
    *ok = false;
  return 0;
}
// Forwards to StringImpl::toInt64Strict(). For a null String, stores false
// in |ok| (when provided) and returns 0.
int64_t String::toInt64Strict(bool* ok, int base) const {
  if (m_impl)
    return m_impl->toInt64Strict(ok, base);
  if (ok)
    *ok = false;
  return 0;
}
// Forwards to StringImpl::toUInt64Strict(). For a null String, stores false
// in |ok| (when provided) and returns 0.
uint64_t String::toUInt64Strict(bool* ok, int base) const {
  if (m_impl)
    return m_impl->toUInt64Strict(ok, base);
  if (ok)
    *ok = false;
  return 0;
}
// Forwards to StringImpl::toInt(). For a null String, stores false in |ok|
// (when provided) and returns 0.
int String::toInt(bool* ok) const {
  if (m_impl)
    return m_impl->toInt(ok);
  if (ok)
    *ok = false;
  return 0;
}
// Forwards to StringImpl::toUInt(). For a null String, stores false in |ok|
// (when provided) and returns 0.
unsigned String::toUInt(bool* ok) const {
  if (m_impl)
    return m_impl->toUInt(ok);
  if (ok)
    *ok = false;
  return 0;
}
// Forwards to StringImpl::toInt64(). For a null String, stores false in |ok|
// (when provided) and returns 0.
int64_t String::toInt64(bool* ok) const {
  if (m_impl)
    return m_impl->toInt64(ok);
  if (ok)
    *ok = false;
  return 0;
}
// Forwards to StringImpl::toUInt64(). For a null String, stores false in
// |ok| (when provided) and returns 0.
uint64_t String::toUInt64(bool* ok) const {
  if (m_impl)
    return m_impl->toUInt64(ok);
  if (ok)
    *ok = false;
  return 0;
}
// Forwards to StringImpl::toDouble(). For a null String, stores false in
// |ok| (when provided) and returns 0.0.
double String::toDouble(bool* ok) const {
  if (m_impl)
    return m_impl->toDouble(ok);
  if (ok)
    *ok = false;
  return 0.0;
}
// Forwards to StringImpl::toFloat(). For a null String, stores false in |ok|
// (when provided) and returns 0.0f.
float String::toFloat(bool* ok) const {
  if (m_impl)
    return m_impl->toFloat(ok);
  if (ok)
    *ok = false;
  return 0.0f;
}
// Forwards to StringImpl::isolatedCopy(); a null String yields a null String.
String String::isolatedCopy() const {
  if (m_impl)
    return m_impl->isolatedCopy();
  return String();
}
// A null String is always reported as safe; otherwise the answer comes from
// StringImpl::isSafeToSendToAnotherThread().
bool String::isSafeToSendToAnotherThread() const {
  if (!m_impl)
    return true;
  return m_impl->isSafeToSendToAnotherThread();
}
// Splits this String at every occurrence of |separator| and stores the
// pieces in |result|, which is cleared first.
//
// |allowEmptyEntries| controls whether zero-length pieces (produced by
// adjacent, leading or trailing separators) are kept or dropped.
//
// A zero-length |separator| is treated as "no separator": the whole string
// becomes the single piece. Scanning for it cannot make progress, because
// the scan position advances by separator.length() after each match, so any
// match reported by find() would be reported again forever.
void String::split(const String& separator,
                   bool allowEmptyEntries,
                   Vector<String>& result) const {
  result.clear();
  unsigned startPos = 0;
  const unsigned separatorLength = separator.length();
  if (separatorLength) {
    size_t endPos;
    while ((endPos = find(separator, startPos)) != kNotFound) {
      if (allowEmptyEntries || startPos != endPos)
        result.append(substring(startPos, endPos - startPos));
      startPos = endPos + separatorLength;
    }
  }
  // Whatever follows the last separator (possibly the whole string).
  if (allowEmptyEntries || startPos != length())
    result.append(substring(startPos));
}
// Splits this String at every occurrence of the code unit |separator| and
// stores the pieces in |result|, which is cleared first. Zero-length pieces
// are kept only when |allowEmptyEntries| is true.
void String::split(UChar separator,
                   bool allowEmptyEntries,
                   Vector<String>& result) const {
  result.clear();

  unsigned pieceStart = 0;
  for (size_t hit = find(separator, pieceStart); hit != kNotFound;
       hit = find(separator, pieceStart)) {
    if (allowEmptyEntries || pieceStart != hit)
      result.append(substring(pieceStart, hit - pieceStart));
    pieceStart = hit + 1;  // Resume right after the separator.
  }

  // Whatever follows the last separator (possibly the whole string).
  if (allowEmptyEntries || pieceStart != length())
    result.append(substring(pieceStart));
}
// Returns a CString of the same length as this String in which the null
// character and code units 0x20..0x7f are copied through and every other
// code unit is replaced by '?'.
CString String::ascii() const {
  const unsigned length = this->length();
  char* out;
  CString result = CString::newUninitialized(length, out);
  if (!length)
    return result;

  if (this->is8Bit()) {
    const LChar* source = this->characters8();
    for (unsigned i = 0; i < length; ++i) {
      const LChar ch = source[i];
      const bool replace = ch && (ch < 0x20 || ch > 0x7f);
      out[i] = replace ? '?' : static_cast<char>(ch);
    }
    return result;
  }

  const UChar* source = this->characters16();
  for (unsigned i = 0; i < length; ++i) {
    const UChar ch = source[i];
    const bool replace = ch && (ch < 0x20 || ch > 0x7f);
    out[i] = replace ? '?' : static_cast<char>(ch);
  }
  return result;
}
// Returns this String as Latin-1 bytes: code units 0..0xff are kept, any
// larger code unit becomes '?'. A zero-length String yields an empty
// CString.
CString String::latin1() const {
  const unsigned length = this->length();
  if (!length)
    return CString("", 0);

  // 8-bit data is copied through unchanged.
  if (is8Bit())
    return CString(reinterpret_cast<const char*>(this->characters8()), length);

  char* out;
  CString result = CString::newUninitialized(length, out);
  const UChar* source = this->characters16();
  const UChar* const sourceEnd = source + length;
  for (; source != sourceEnd; ++source, ++out)
    *out = *source > 0xff ? '?' : static_cast<char>(*source);
  return result;
}
// Writes |ch| (which must be >= 0x0800) as a three-byte UTF-8 sequence at
// |buffer| and advances |buffer| past it. The caller is responsible for
// making sure three bytes of room are available.
static inline void putUTF8Triple(char*& buffer, UChar ch) {
  ASSERT(ch >= 0x0800);
  buffer[0] = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);  // 1110xxxx
  buffer[1] = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);   // 10xxxxxx
  buffer[2] = static_cast<char>((ch & 0x3F) | 0x80);          // 10xxxxxx
  buffer += 3;
}
// Converts this String to UTF-8 and returns the bytes as a CString.
//
// A zero-length String yields an empty CString. A String too long for the
// worst-case output size (3 bytes per code unit) to fit in an unsigned
// yields a default-constructed CString.
//
// |mode| only matters for 16-bit strings:
//  * StrictUTF8ConversionReplacingUnpairedSurrogatesWithFFFD: each code unit
//    the strict converter rejects is written as U+FFFD and conversion
//    continues.
//  * StrictUTF8Conversion: a default-constructed CString is returned if the
//    converter reports illegal or truncated input.
//  * Any other mode: non-strict conversion; a trailing unpaired high
//    surrogate is encoded directly as a three-byte sequence.
CString String::utf8(UTF8ConversionMode mode) const {
  unsigned length = this->length();
  if (!length)
    return CString("", 0);

  // Allocate a buffer big enough to hold all the characters
  // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
  // Optimization ideas, if we find this function is hot:
  //  * We could speculatively create a CStringBuffer to contain 'length'
  //    characters, and resize if necessary (i.e. if the buffer contains
  //    non-ascii characters). (Alternatively, scan the buffer first for
  //    ascii characters, so we know this will be sufficient).
  //  * We could allocate a CStringBuffer with an appropriate size to
  //    have a good chance of being able to write the string into the
  //    buffer without reallocing (say, 1.5 x length).
  if (length > std::numeric_limits<unsigned>::max() / 3)
    return CString();
  Vector<char, 1024> bufferVector(length * 3);

  // |buffer| is the write cursor and is advanced by the converters;
  // |bufferEnd| is fixed and is computed before the cursor moves.
  char* buffer = bufferVector.data();
  char* const bufferEnd = buffer + bufferVector.size();

  if (is8Bit()) {
    const LChar* characters = this->characters8();
    ConversionResult result = convertLatin1ToUTF8(
        &characters, characters + length, &buffer, bufferEnd);
    // (length * 3) should be sufficient for any conversion
    ASSERT_UNUSED(result, result != targetExhausted);
  } else {
    const UChar* characters = this->characters16();
    if (mode == StrictUTF8ConversionReplacingUnpairedSurrogatesWithFFFD) {
      const UChar* charactersEnd = characters + length;
      while (characters < charactersEnd) {
        // Use strict conversion to detect unpaired surrogates.
        ConversionResult result = convertUTF16ToUTF8(&characters, charactersEnd,
                                                     &buffer, bufferEnd, true);
        ASSERT(result != targetExhausted);
        // Conversion fails when there is an unpaired surrogate. Put
        // replacement character (U+FFFD) instead of the unpaired
        // surrogate.
        if (result != conversionOK) {
          ASSERT((0xD800 <= *characters && *characters <= 0xDFFF));
          // There should be room left, since one UChar hasn't been
          // converted.
          ASSERT((buffer + 3) <= bufferEnd);
          putUTF8Triple(buffer, replacementCharacter);
          ++characters;
        }
      }
    } else {
      bool strict = mode == StrictUTF8Conversion;
      ConversionResult result = convertUTF16ToUTF8(
          &characters, characters + length, &buffer, bufferEnd, strict);
      // (length * 3) should be sufficient for any conversion
      ASSERT(result != targetExhausted);
      // Only produced from strict conversion.
      if (result == sourceIllegal) {
        ASSERT(strict);
        return CString();
      }
      // Check for an unconverted high surrogate.
      if (result == sourceExhausted) {
        if (strict)
          return CString();
        // This should be one unpaired high surrogate. Treat it the same
        // way as an unpaired high surrogate would have been handled in
        // the middle of a string with non-strict conversion - which is
        // to say, simply encode it to UTF-8.
        ASSERT((characters + 1) == (this->characters16() + length));
        ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
        // There should be room left, since one UChar hasn't been
        // converted. Check against the real end of the buffer; measuring
        // from the already-advanced cursor would make this always true.
        ASSERT((buffer + 3) <= bufferEnd);
        putUTF8Triple(buffer, *characters);
      }
    }
  }
  // Only the bytes actually written are returned.
  return CString(bufferVector.data(), buffer - bufferVector.data());
}
// Creates an 8-bit String of |length| characters filled from the 16-bit
// |source| via copyLCharsFromUCharSource(). A zero length yields
// emptyString().
String String::make8BitFrom16BitSource(const UChar* source, size_t length) {
  if (length) {
    LChar* narrow;
    String result = String::createUninitialized(length, narrow);
    copyLCharsFromUCharSource(narrow, source, length);
    return result;
  }
  return emptyString();
}
// Creates a 16-bit String of |length| characters filled from the 8-bit
// |source| via StringImpl::copyChars(). A zero length yields
// emptyString16Bit().
String String::make16BitFrom8BitSource(const LChar* source, size_t length) {
  if (length) {
    UChar* wide;
    String result = String::createUninitialized(length, wide);
    StringImpl::copyChars(wide, source, length);
    return result;
  }
  return emptyString16Bit();
}
// Decodes |length| bytes of UTF-8 starting at |stringStart|.
//
// Returns a null String for a null pointer or when convertUTF8ToUTF16() does
// not report conversionOK, and emptyString() for a zero length. Input that
// is entirely ASCII is turned into a String directly from the bytes.
String String::fromUTF8(const LChar* stringStart, size_t length) {
  RELEASE_ASSERT(length <= std::numeric_limits<unsigned>::max());

  if (!stringStart)
    return String();
  if (!length)
    return emptyString();

  if (charactersAreAllASCII(stringStart, length))
    return StringImpl::create(stringStart, length);

  // One UTF-16 slot per input byte is provided for the conversion output.
  Vector<UChar, 1024> utf16(length);
  UChar* const utf16Begin = utf16.data();
  UChar* utf16Cursor = utf16Begin;

  const char* utf8Cursor = reinterpret_cast<const char*>(stringStart);
  const char* const utf8End = utf8Cursor + length;

  if (convertUTF8ToUTF16(&utf8Cursor, utf8End, &utf16Cursor,
                         utf16Begin + utf16.size()) != conversionOK)
    return String();

  unsigned utf16Length = utf16Cursor - utf16Begin;
  ASSERT(utf16Length < length);
  return StringImpl::create(utf16Begin, utf16Length);
}
// Decodes a null-terminated UTF-8 buffer; the length is measured with
// strlen(). A null pointer yields a null String.
String String::fromUTF8(const LChar* string) {
  if (string) {
    const size_t byteCount = strlen(reinterpret_cast<const char*>(string));
    return fromUTF8(string, byteCount);
  }
  return String();
}
// Decodes the UTF-8 bytes exposed by s.data().
String String::fromUTF8(const CString& s) {
  const char* bytes = s.data();
  return fromUTF8(bytes);
}
// Tries to decode the bytes as UTF-8 first; if fromUTF8() produces a String
// for which operator! is true, the same bytes are used as Latin-1 instead.
String String::fromUTF8WithLatin1Fallback(const LChar* string, size_t size) {
  String decoded = fromUTF8(string, size);
  if (!decoded)
    return String(string, size);
  return decoded;
}
// Returns a reference to a function-local static String constructed from
// StringImpl::empty().
const String& emptyString() {
  DEFINE_STATIC_LOCAL(String, sharedEmpty, (StringImpl::empty()));
  return sharedEmpty;
}
// Returns a reference to a function-local static String constructed from
// StringImpl::empty16Bit().
const String& emptyString16Bit() {
  DEFINE_STATIC_LOCAL(String, sharedEmpty16Bit, (StringImpl::empty16Bit()));
  return sharedEmpty16Bit;
}
// Writes |string| to |out| as a double-quoted literal for logging/testing.
//
// A null String prints as <null>. Tab, newline, carriage return, the double
// quote and the backslash are printed as backslash escapes; other printable
// ASCII is printed as-is; everything else is printed as \uXXXX (four
// uppercase hex digits per code unit).
//
// The stream's format flags and fill character are restored before
// returning, so the hex/uppercase/zero-fill settings used for the \uXXXX
// escapes do not leak into whatever the caller prints next.
std::ostream& operator<<(std::ostream& out, const String& string) {
  if (string.isNull())
    return out << "<null>";

  const std::ios_base::fmtflags savedFlags = out.flags();
  const char savedFill = out.fill();

  out << '"';
  for (unsigned index = 0; index < string.length(); ++index) {
    // Print shorthands for select cases.
    UChar character = string[index];
    switch (character) {
      case '\t':
        out << "\\t";
        break;
      case '\n':
        out << "\\n";
        break;
      case '\r':
        out << "\\r";
        break;
      case '"':
        out << "\\\"";
        break;
      case '\\':
        out << "\\\\";
        break;
      default:
        if (isASCIIPrintable(character)) {
          out << static_cast<char>(character);
        } else {
          // Print "\uXXXX" for control or non-ASCII characters.
          out << "\\u";
          out.width(4);  // Applies to the next insertion only.
          out.fill('0');
          out.setf(std::ios_base::hex, std::ios_base::basefield);
          out.setf(std::ios_base::uppercase);
          // Insert the code unit as a number, independent of whether UChar
          // is an integer or a character type.
          out << static_cast<unsigned>(character);
        }
        break;
    }
  }
  out << '"';

  out.flags(savedFlags);
  out.fill(savedFill);
  return out;
}
} // namespace WTF
#ifndef NDEBUG
// For use in the debugger. These helpers are only compiled when NDEBUG is
// not defined; their definitions follow below.

// Heap-allocates a String from a C string. The allocation is never freed.
String* string(const char*);
// Return a NUL-terminated, ASCII-only rendering of the given string in which
// non-printable characters appear as \uXXXX.
Vector<char> asciiDebug(StringImpl*);
Vector<char> asciiDebug(String&);
void String::show() const {
dataLogF("%s\n", asciiDebug(impl()).data());
}
// Debugger convenience: returns a heap-allocated String built from |s|.
String* string(const char* s) {
  // leaks memory! Nothing ever deletes the returned object.
  String* leaked = new String(s);
  return leaked;
}
// Renders |impl| as a NUL-terminated char vector for debugging. Printable
// ASCII is copied (a backslash is doubled); every other code unit is written
// as \u followed by four hex digits. A null |impl| renders as "[null]".
Vector<char> asciiDebug(StringImpl* impl) {
  if (!impl)
    return asciiDebug(String("[null]").impl());

  Vector<char> buffer;
  const unsigned length = impl->length();
  for (unsigned i = 0; i < length; ++i) {
    const UChar ch = (*impl)[i];
    if (!isASCIIPrintable(ch)) {
      buffer.append('\\');
      buffer.append('u');
      appendUnsignedAsHexFixedSize(ch, buffer, 4);
      continue;
    }
    if (ch == '\\')
      buffer.append('\\');
    buffer.append(static_cast<char>(ch));
  }
  buffer.append('\0');
  return buffer;
}
// Convenience overload: renders the implementation behind |string|.
Vector<char> asciiDebug(String& string) {
  StringImpl* impl = string.impl();
  return asciiDebug(impl);
}
#endif