third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp - chromium/src - Git at Google

 /*
  * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #include "wtf/text/TextCodecUTF8.h"

 #include "wtf/PtrUtil.h"
 #include "wtf/text/CString.h"
 #include "wtf/text/CharacterNames.h"
 #include "wtf/text/StringBuffer.h"
 #include "wtf/text/TextCodecASCIIFastPath.h"
 #include <memory>

 namespace WTF {

 using namespace WTF::Unicode;

 // Sentinel returned by decodeNonASCIISequence() for a byte sequence that is
 // not well-formed UTF-8. It is negative, so it never equals a decoded code
 // point.
 const int nonCharacter = -1;

 // Factory handed to the codec registrar by registerCodecs(). Neither the
 // encoding nor the additional-data pointer is consulted; every call returns
 // a newly allocated TextCodecUTF8.
 std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&,
                                                  const void*) {
   std::unique_ptr<TextCodec> codec = wrapUnique(new TextCodecUTF8);
   return codec;
 }

 // Registers the canonical name "UTF-8" followed by every alias this codec
 // answers to, each mapped to "UTF-8". The registration order is the order of
 // the table below.
 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) {
   const char* const canonicalName = "UTF-8";
   const char* const names[] = {
       // The canonical name is registered under itself first.
       "UTF-8",

       // Additional aliases that originally were present in the encoding
       // table in WebKit on Macintosh, and subsequently added by
       // TextCodecICU. Perhaps we can prove some are not used on the web
       // and remove them.
       "unicode11utf8",
       "unicode20utf8",
       "utf8",
       "x-unicode20utf8",

       // Additional aliases present in the WHATWG Encoding Standard
       // (http://encoding.spec.whatwg.org/)
       // and Firefox (24), but not in ICU 4.6.
       "unicode-1-1-utf-8",
   };

   for (const char* name : names)
     registrar(name, canonicalName);
 }

 // Registers create() as the factory for the canonical name "UTF-8". No
 // additional data is attached to the registration.
 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) {
   registrar("UTF-8", &TextCodecUTF8::create, nullptr);
 }

 // Returns the total length in bytes (2, 3 or 4) of the UTF-8 sequence that
 // |firstByte| introduces, or 0 when |firstByte| is ASCII or lies in
 // 0xF5-0xFF. Bytes 0x80-0xC1 report 2 even though they are not valid lead
 // bytes; decodeNonASCIISequence() rejects them.
 static inline int nonASCIISequenceLength(uint8_t firstByte) {
   // One row per high nibble of |firstByte|.
   static const uint8_t lengths[256] = {
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x00-0x0F
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x10-0x1F
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20-0x2F
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30-0x3F
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40-0x4F
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50-0x5F
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60-0x6F
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70-0x7F
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0x80-0x8F
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0x90-0x9F
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xA0-0xAF
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xB0-0xBF
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xC0-0xCF
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xD0-0xDF
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xE0-0xEF
       4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0-0xFF
   };
   return lengths[firstByte];
 }

 // Decodes the |length|-byte (2, 3 or 4) UTF-8 sequence starting at
 // |sequence|, whose first byte must not be ASCII. Returns the decoded code
 // point, or nonCharacter when the bytes are not well-formed. The caller must
 // make sure |length| bytes are readable.
 static inline int decodeNonASCIISequence(const uint8_t* sequence,
                                          unsigned length) {
   ASSERT(!isASCII(sequence[0]));
   const uint8_t lead = sequence[0];

   if (length == 2) {
     ASSERT(lead <= 0xDF);
     // Lead bytes below 0xC2 never start a valid two-byte sequence.
     if (lead < 0xC2)
       return nonCharacter;
     const uint8_t trail = sequence[1];
     if (!(trail >= 0x80 && trail <= 0xBF))
       return nonCharacter;
     return ((lead << 6) + trail) - 0x00003080;
   }

   // Range accepted for the second byte. It is the plain continuation range
   // unless the lead byte narrows it.
   uint8_t secondMin = 0x80;
   uint8_t secondMax = 0xBF;

   if (length == 3) {
     ASSERT(lead >= 0xE0 && lead <= 0xEF);
     if (lead == 0xE0)
       secondMin = 0xA0;
     else if (lead == 0xED)
       secondMax = 0x9F;

     const uint8_t second = sequence[1];
     if (second < secondMin || second > secondMax)
       return nonCharacter;
     const uint8_t third = sequence[2];
     if (!(third >= 0x80 && third <= 0xBF))
       return nonCharacter;
     return ((lead << 12) + (second << 6) + third) - 0x000E2080;
   }

   ASSERT(length == 4);
   ASSERT(lead >= 0xF0 && lead <= 0xF4);
   if (lead == 0xF0)
     secondMin = 0x90;
   else if (lead == 0xF4)
     secondMax = 0x8F;

   const uint8_t second = sequence[1];
   if (second < secondMin || second > secondMax)
     return nonCharacter;
   const uint8_t third = sequence[2];
   if (!(third >= 0x80 && third <= 0xBF))
     return nonCharacter;
   const uint8_t fourth = sequence[3];
   if (!(fourth >= 0x80 && fourth <= 0xBF))
     return nonCharacter;
   return ((lead << 18) + (second << 12) + (third << 6) + fourth) - 0x03C82080;
 }

 // Writes |character| at |destination| as UTF-16: one code unit for a BMP
 // character, otherwise a lead/trail surrogate pair. Returns the position
 // just past what was written. |character| must be neither nonCharacter nor a
 // surrogate code point.
 static inline UChar* appendCharacter(UChar* destination, int character) {
   ASSERT(character != nonCharacter);
   ASSERT(!U_IS_SURROGATE(character));
   if (!U_IS_BMP(character)) {
     destination[0] = U16_LEAD(character);
     destination[1] = U16_TRAIL(character);
     return destination + 2;
   }
   destination[0] = static_cast<UChar>(character);
   return destination + 1;
 }

 // Drops the first byte of the buffered partial sequence and moves the
 // remaining bytes to the front of the buffer.
 void TextCodecUTF8::consumePartialSequenceByte() {
   m_partialSequenceSize -= 1;
   memmove(&m_partialSequence[0], &m_partialSequence[1],
           m_partialSequenceSize);
 }

 // Records a decoding error found in the buffered partial sequence.
 // |sawError| is always set. Unless |stopOnError| is set, one replacement
 // character is written to |destination| and one byte of the partial sequence
 // is consumed.
 void TextCodecUTF8::handleError(UChar*& destination,
                                 bool stopOnError,
                                 bool& sawError) {
   sawError = true;
   if (!stopOnError) {
     // Each error generates a replacement character and consumes one byte.
     *destination = replacementCharacter;
     ++destination;
     consumePartialSequenceByte();
   }
 }

 // 8-bit variant: tries to finish the buffered partial sequence while the
 // output is still an LChar string, taking extra bytes from |source| when the
 // sequence is incomplete.
 //
 // Returns true when the 8-bit path cannot go on and the caller must switch to
 // the 16-bit path: the buffered lead byte has sequence length 0, the sequence
 // is still incomplete while |flush| is set, or the decoded value is
 // nonCharacter or does not fit in 8 bits. In those cases the bytes are left
 // in m_partialSequence. Returns false otherwise, including when the new data
 // was merely appended to the partial sequence.
 //
 // The two unnamed parameters (stopOnError and sawError in the UChar variant)
 // are unused here; this variant never reports errors itself.
 template <>
 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination,
                                                  const uint8_t*& source,
                                                  const uint8_t* end,
                                                  bool flush,
                                                  bool,
                                                  bool&) {
   ASSERT(m_partialSequenceSize);
   do {
     if (isASCII(m_partialSequence[0])) {
       *destination++ = m_partialSequence[0];
       consumePartialSequenceByte();
       continue;
     }
     int count = nonASCIISequenceLength(m_partialSequence[0]);
     if (!count)
       return true;

     if (count > m_partialSequenceSize) {
       // More bytes are needed than are buffered; see whether the new input
       // can supply them.
       if (count - m_partialSequenceSize > end - source) {
         if (!flush) {
           // The new data is not enough to complete the sequence, so
           // add it to the existing partial sequence.
           memcpy(m_partialSequence + m_partialSequenceSize, source,
                  end - source);
           m_partialSequenceSize += end - source;
           return false;
         }
         // An incomplete partial sequence at the end is an error, but it will
         // create a 16 bit string due to the replacementCharacter. Let the 16
         // bit path handle the error.
         return true;
       }
       memcpy(m_partialSequence + m_partialSequenceSize, source,
              count - m_partialSequenceSize);
       source += count - m_partialSequenceSize;
       m_partialSequenceSize = count;
     }
     int character = decodeNonASCIISequence(m_partialSequence, count);
     // Non-zero for nonCharacter (-1) as well as for any value above 0xFF.
     if (character & ~0xff)
       return true;

     m_partialSequenceSize -= count;
     *destination++ = static_cast<LChar>(character);
   } while (m_partialSequenceSize);

   return false;
 }

 // 16-bit variant: drains the buffered partial sequence into a UChar string,
 // taking extra bytes from |source| when the sequence is incomplete.
 //
 // Errors (a lead byte with sequence length 0, a sequence still incomplete
 // while |flush| is set, or an ill-formed sequence) go through handleError():
 // |sawError| is set and, unless |stopOnError| is set, a replacement character
 // is emitted and one buffered byte is dropped before the loop continues. With
 // |stopOnError| set the function returns at once and the bytes stay buffered.
 //
 // Always returns false. The caller checks m_partialSequenceSize afterwards to
 // see whether bytes are still buffered.
 template <>
 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination,
                                                  const uint8_t*& source,
                                                  const uint8_t* end,
                                                  bool flush,
                                                  bool stopOnError,
                                                  bool& sawError) {
   ASSERT(m_partialSequenceSize);
   do {
     if (isASCII(m_partialSequence[0])) {
       *destination++ = m_partialSequence[0];
       consumePartialSequenceByte();
       continue;
     }
     int count = nonASCIISequenceLength(m_partialSequence[0]);
     if (!count) {
       handleError(destination, stopOnError, sawError);
       if (stopOnError)
         return false;
       continue;
     }
     if (count > m_partialSequenceSize) {
       // More bytes are needed than are buffered; see whether the new input
       // can supply them.
       if (count - m_partialSequenceSize > end - source) {
         if (!flush) {
           // The new data is not enough to complete the sequence, so
           // add it to the existing partial sequence.
           memcpy(m_partialSequence + m_partialSequenceSize, source,
                  end - source);
           m_partialSequenceSize += end - source;
           return false;
         }
         // An incomplete partial sequence at the end is an error.
         handleError(destination, stopOnError, sawError);
         if (stopOnError)
           return false;
         continue;
       }
       memcpy(m_partialSequence + m_partialSequenceSize, source,
              count - m_partialSequenceSize);
       source += count - m_partialSequenceSize;
       m_partialSequenceSize = count;
     }
     int character = decodeNonASCIISequence(m_partialSequence, count);
     if (character == nonCharacter) {
       handleError(destination, stopOnError, sawError);
       if (stopOnError)
         return false;
       continue;
     }

     m_partialSequenceSize -= count;
     destination = appendCharacter(destination, character);
   } while (m_partialSequenceSize);

   return false;
 }

 // Decodes |length| bytes starting at |bytes| as UTF-8 and returns the result.
 //
 // Output is first built as 8-bit (LChar) characters. When a character above
 // 0xFF is decoded, when an invalid sequence is found while |stopOnError| is
 // false, or when the 8-bit handlePartialSequence() returns true, control
 // jumps to upConvertTo16Bit. That section copies what was produced so far
 // into a 16-bit buffer and continues from the same source position.
 //
 // A sequence cut off by the end of the input is saved in m_partialSequence
 // and picked up by handlePartialSequence() at the start of the next pass.
 // While |flush| is set, the outer do/while loops repeat as long as bytes
 // remain buffered.
 //
 // |sawError| is set to true when an invalid sequence is found and is never
 // cleared here. With |stopOnError| set, decoding stops at the first invalid
 // sequence and the characters decoded so far are returned.
 String TextCodecUTF8::decode(const char* bytes,
                              size_t length,
                              FlushBehavior flush,
                              bool stopOnError,
                              bool& sawError) {
   // Each input byte might turn into a character.
   // That includes all bytes in the partial-sequence buffer because
   // each byte in an invalid sequence will turn into a replacement character.
   StringBuffer<LChar> buffer(m_partialSequenceSize + length);

   const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
   const uint8_t* end = source + length;
   const uint8_t* alignedEnd = alignToMachineWord(end);
   LChar* destination = buffer.characters();

   // 8-bit phase.
   do {
     if (m_partialSequenceSize) {
       // Explicitly copy destination and source pointers to avoid taking
       // pointers to the local variables, which may harm code generation by
       // disabling some optimizations in some compilers.
       LChar* destinationForHandlePartialSequence = destination;
       const uint8_t* sourceForHandlePartialSequence = source;
       if (handlePartialSequence(destinationForHandlePartialSequence,
                                 sourceForHandlePartialSequence, end, flush,
                                 stopOnError, sawError)) {
         source = sourceForHandlePartialSequence;
         goto upConvertTo16Bit;
       }
       destination = destinationForHandlePartialSequence;
       source = sourceForHandlePartialSequence;
       if (m_partialSequenceSize)
         break;
     }

     while (source < end) {
       if (isASCII(*source)) {
         // Fast path for ASCII. Most UTF-8 text will be ASCII.
         if (isAlignedToMachineWord(source)) {
           while (source < alignedEnd) {
             MachineWord chunk =
                 *reinterpret_cast_ptr<const MachineWord*>(source);
             if (!isAllASCII<LChar>(chunk))
               break;
             copyASCIIMachineWord(destination, source);
             source += sizeof(MachineWord);
             destination += sizeof(MachineWord);
           }
           if (source == end)
             break;
           if (!isASCII(*source))
             continue;
         }
         *destination++ = *source++;
         continue;
       }
       int count = nonASCIISequenceLength(*source);
       int character;
       if (count == 0) {
         character = nonCharacter;
       } else {
         if (count > end - source) {
           // The sequence runs past the end of the input: buffer its bytes
           // for the next pass or the next call.
           ASSERT_WITH_SECURITY_IMPLICATION(
               end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
           ASSERT(!m_partialSequenceSize);
           m_partialSequenceSize = end - source;
           memcpy(m_partialSequence, source, m_partialSequenceSize);
           source = end;
           break;
         }
         character = decodeNonASCIISequence(source, count);
       }
       if (character == nonCharacter) {
         sawError = true;
         if (stopOnError)
           break;

         // The replacement character needs 16 bits; |source| is left on the
         // offending byte so the 16-bit loop reports it.
         goto upConvertTo16Bit;
       }
       if (character > 0xff)
         goto upConvertTo16Bit;

       source += count;
       *destination++ = static_cast<LChar>(character);
     }
   } while (flush && m_partialSequenceSize);

   buffer.shrink(destination - buffer.characters());

   return String::adopt(buffer);

 upConvertTo16Bit:
   StringBuffer<UChar> buffer16(m_partialSequenceSize + length);

   UChar* destination16 = buffer16.characters();

   // Copy the already converted characters
   for (LChar* converted8 = buffer.characters(); converted8 < destination;)
     *destination16++ = *converted8++;

   // 16-bit phase: same structure as the 8-bit loop, but errors are resolved
   // here with replacement characters.
   do {
     if (m_partialSequenceSize) {
       // Explicitly copy destination and source pointers to avoid taking
       // pointers to the local variables, which may harm code generation by
       // disabling some optimizations in some compilers.
       UChar* destinationForHandlePartialSequence = destination16;
       const uint8_t* sourceForHandlePartialSequence = source;
       handlePartialSequence(destinationForHandlePartialSequence,
                             sourceForHandlePartialSequence, end, flush,
                             stopOnError, sawError);
       destination16 = destinationForHandlePartialSequence;
       source = sourceForHandlePartialSequence;
       if (m_partialSequenceSize)
         break;
     }

     while (source < end) {
       if (isASCII(*source)) {
         // Fast path for ASCII. Most UTF-8 text will be ASCII.
         if (isAlignedToMachineWord(source)) {
           while (source < alignedEnd) {
             MachineWord chunk =
                 *reinterpret_cast_ptr<const MachineWord*>(source);
             if (!isAllASCII<LChar>(chunk))
               break;
             copyASCIIMachineWord(destination16, source);
             source += sizeof(MachineWord);
             destination16 += sizeof(MachineWord);
           }
           if (source == end)
             break;
           if (!isASCII(*source))
             continue;
         }
         *destination16++ = *source++;
         continue;
       }
       int count = nonASCIISequenceLength(*source);
       int character;
       if (count == 0) {
         character = nonCharacter;
       } else {
         if (count > end - source) {
           // The sequence runs past the end of the input: buffer its bytes
           // for the next pass or the next call.
           ASSERT_WITH_SECURITY_IMPLICATION(
               end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
           ASSERT(!m_partialSequenceSize);
           m_partialSequenceSize = end - source;
           memcpy(m_partialSequence, source, m_partialSequenceSize);
           source = end;
           break;
         }
         character = decodeNonASCIISequence(source, count);
       }
       if (character == nonCharacter) {
         sawError = true;
         if (stopOnError)
           break;
         // Each error generates a replacement character and consumes one byte.
         *destination16++ = replacementCharacter;
         ++source;
         continue;
       }
       source += count;
       destination16 = appendCharacter(destination16, character);
     }
   } while (flush && m_partialSequenceSize);

   buffer16.shrink(destination16 - buffer16.characters());

   return String::adopt(buffer16);
 }

 // Encodes |length| code units at |characters| as UTF-8 and returns the bytes
 // as a CString. An unpaired surrogate in the input is encoded as U+FFFD
 // (REPLACEMENT CHARACTER). Crashes deliberately if the worst-case output
 // size would overflow size_t.
 template <typename CharType>
 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) {
   // Worst case is 3 output bytes per input code unit: a BMP character is one
   // code unit and at most 3 bytes (3x), a non-BMP character is two code
   // units and at most 4 bytes (2x).
   const size_t maxEncodableLength = std::numeric_limits<size_t>::max() / 3;
   if (length > maxEncodableLength)
     CRASH();
   Vector<uint8_t> bytes(length * 3);

   size_t bytesWritten = 0;
   for (size_t i = 0; i < length;) {
     UChar32 character;
     U16_NEXT(characters, i, length, character);
     // U16_NEXT hands back an unmatched surrogate unchanged, so it has to be
     // replaced with U+FFFD here.
     if (U_IS_SURROGATE(character))
       character = replacementCharacter;
     U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
   }

   return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
 }

 // Encodes a UTF-16 string as UTF-8. The UnencodableHandling argument is
 // ignored.
 CString TextCodecUTF8::encode(const UChar* characters,
                               size_t length,
                               UnencodableHandling) {
   return encodeCommon<UChar>(characters, length);
 }

 // Encodes an 8-bit (LChar) string as UTF-8. The UnencodableHandling argument
 // is ignored.
 CString TextCodecUTF8::encode(const LChar* characters,
                               size_t length,
                               UnencodableHandling) {
   return encodeCommon<LChar>(characters, length);
 }

 }  // namespace WTF
	/*
	* Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#include "wtf/text/TextCodecUTF8.h"

	#include "wtf/PtrUtil.h"
	#include "wtf/text/CString.h"
	#include "wtf/text/CharacterNames.h"
	#include "wtf/text/StringBuffer.h"
	#include "wtf/text/TextCodecASCIIFastPath.h"
	#include <memory>

	namespace WTF {

	using namespace WTF::Unicode;

	// Sentinel returned by decodeNonASCIISequence() for a byte sequence that is
	// not well-formed UTF-8. It is negative, so it never equals a decoded code
	// point.
	const int nonCharacter = -1;

	// Factory handed to the codec registrar by registerCodecs(). Neither the
	// encoding nor the additional-data pointer is consulted; every call returns
	// a newly allocated TextCodecUTF8.
	std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&,
	                                                 const void*) {
	  std::unique_ptr<TextCodec> codec = wrapUnique(new TextCodecUTF8);
	  return codec;
	}

	// Registers the canonical name "UTF-8" followed by every alias this codec
	// answers to, each mapped to "UTF-8". The registration order is the order of
	// the table below.
	void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) {
	  const char* const canonicalName = "UTF-8";
	  const char* const names[] = {
	      // The canonical name is registered under itself first.
	      "UTF-8",

	      // Additional aliases that originally were present in the encoding
	      // table in WebKit on Macintosh, and subsequently added by
	      // TextCodecICU. Perhaps we can prove some are not used on the web
	      // and remove them.
	      "unicode11utf8",
	      "unicode20utf8",
	      "utf8",
	      "x-unicode20utf8",

	      // Additional aliases present in the WHATWG Encoding Standard
	      // (http://encoding.spec.whatwg.org/)
	      // and Firefox (24), but not in ICU 4.6.
	      "unicode-1-1-utf-8",
	  };

	  for (const char* name : names)
	    registrar(name, canonicalName);
	}

	// Registers create() as the factory for the canonical name "UTF-8". No
	// additional data is attached to the registration.
	void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) {
	  registrar("UTF-8", &TextCodecUTF8::create, nullptr);
	}

	// Returns the total length in bytes (2, 3 or 4) of the UTF-8 sequence that
	// |firstByte| introduces, or 0 when |firstByte| is ASCII or lies in
	// 0xF5-0xFF. Bytes 0x80-0xC1 report 2 even though they are not valid lead
	// bytes; decodeNonASCIISequence() rejects them.
	static inline int nonASCIISequenceLength(uint8_t firstByte) {
	  // One row per high nibble of |firstByte|.
	  static const uint8_t lengths[256] = {
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x00-0x0F
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x10-0x1F
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20-0x2F
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30-0x3F
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40-0x4F
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50-0x5F
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60-0x6F
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70-0x7F
	      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0x80-0x8F
	      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0x90-0x9F
	      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xA0-0xAF
	      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xB0-0xBF
	      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xC0-0xCF
	      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xD0-0xDF
	      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xE0-0xEF
	      4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0-0xFF
	  };
	  return lengths[firstByte];
	}

	// Decodes the |length|-byte (2, 3 or 4) UTF-8 sequence starting at
	// |sequence|, whose first byte must not be ASCII. Returns the decoded code
	// point, or nonCharacter when the bytes are not well-formed. The caller must
	// make sure |length| bytes are readable.
	static inline int decodeNonASCIISequence(const uint8_t* sequence,
	                                         unsigned length) {
	  ASSERT(!isASCII(sequence[0]));
	  if (length == 2) {
	    ASSERT(sequence[0] <= 0xDF);
	    // Lead bytes below 0xC2 never start a valid two-byte sequence.
	    if (sequence[0] < 0xC2)
	      return nonCharacter;
	    if (sequence[1] < 0x80 || sequence[1] > 0xBF)
	      return nonCharacter;
	    return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
	  }
	  if (length == 3) {
	    ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
	    // The lead byte can narrow the range accepted for the second byte.
	    switch (sequence[0]) {
	      case 0xE0:
	        if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
	          return nonCharacter;
	        break;
	      case 0xED:
	        if (sequence[1] < 0x80 || sequence[1] > 0x9F)
	          return nonCharacter;
	        break;
	      default:
	        if (sequence[1] < 0x80 || sequence[1] > 0xBF)
	          return nonCharacter;
	    }
	    if (sequence[2] < 0x80 || sequence[2] > 0xBF)
	      return nonCharacter;
	    return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) -
	           0x000E2080;
	  }
	  ASSERT(length == 4);
	  ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
	  // The lead byte can narrow the range accepted for the second byte.
	  switch (sequence[0]) {
	    case 0xF0:
	      if (sequence[1] < 0x90 || sequence[1] > 0xBF)
	        return nonCharacter;
	      break;
	    case 0xF4:
	      if (sequence[1] < 0x80 || sequence[1] > 0x8F)
	        return nonCharacter;
	      break;
	    default:
	      if (sequence[1] < 0x80 || sequence[1] > 0xBF)
	        return nonCharacter;
	  }
	  if (sequence[2] < 0x80 || sequence[2] > 0xBF)
	    return nonCharacter;
	  if (sequence[3] < 0x80 || sequence[3] > 0xBF)
	    return nonCharacter;
	  return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) +
	          sequence[3]) -
	         0x03C82080;
	}

	// Writes |character| at |destination| as UTF-16: one code unit for a BMP
	// character, otherwise a lead/trail surrogate pair. Returns the position
	// just past what was written. |character| must be neither nonCharacter nor a
	// surrogate code point.
	static inline UChar* appendCharacter(UChar* destination, int character) {
	  ASSERT(character != nonCharacter);
	  ASSERT(!U_IS_SURROGATE(character));
	  if (!U_IS_BMP(character)) {
	    destination[0] = U16_LEAD(character);
	    destination[1] = U16_TRAIL(character);
	    return destination + 2;
	  }
	  destination[0] = static_cast<UChar>(character);
	  return destination + 1;
	}

	// Drops the first byte of the buffered partial sequence and moves the
	// remaining bytes to the front of the buffer.
	void TextCodecUTF8::consumePartialSequenceByte() {
	  m_partialSequenceSize -= 1;
	  memmove(&m_partialSequence[0], &m_partialSequence[1],
	          m_partialSequenceSize);
	}

	// Records a decoding error found in the buffered partial sequence.
	// |sawError| is always set. Unless |stopOnError| is set, one replacement
	// character is written to |destination| and one byte of the partial sequence
	// is consumed.
	void TextCodecUTF8::handleError(UChar*& destination,
	                                bool stopOnError,
	                                bool& sawError) {
	  sawError = true;
	  if (!stopOnError) {
	    // Each error generates a replacement character and consumes one byte.
	    *destination = replacementCharacter;
	    ++destination;
	    consumePartialSequenceByte();
	  }
	}

	// 8-bit variant: tries to finish the buffered partial sequence while the
	// output is still an LChar string, taking extra bytes from |source| when the
	// sequence is incomplete.
	//
	// Returns true when the 8-bit path cannot go on and the caller must switch to
	// the 16-bit path: the buffered lead byte has sequence length 0, the sequence
	// is still incomplete while |flush| is set, or the decoded value is
	// nonCharacter or does not fit in 8 bits. In those cases the bytes are left
	// in m_partialSequence. Returns false otherwise, including when the new data
	// was merely appended to the partial sequence.
	//
	// The two unnamed parameters (stopOnError and sawError in the UChar variant)
	// are unused here; this variant never reports errors itself.
	template <>
	bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination,
	const uint8_t*& source,
	const uint8_t* end,
	bool flush,
	bool,
	bool&) {
	ASSERT(m_partialSequenceSize);
	do {
	if (isASCII(m_partialSequence[0])) {
	*destination++ = m_partialSequence[0];
	consumePartialSequenceByte();
	continue;
	}
	int count = nonASCIISequenceLength(m_partialSequence[0]);
	if (!count)
	return true;

	if (count > m_partialSequenceSize) {
	// More bytes are needed than are buffered; see whether the new input
	// can supply them.
	if (count - m_partialSequenceSize > end - source) {
	if (!flush) {
	// The new data is not enough to complete the sequence, so
	// add it to the existing partial sequence.
	memcpy(m_partialSequence + m_partialSequenceSize, source,
	end - source);
	m_partialSequenceSize += end - source;
	return false;
	}
	// An incomplete partial sequence at the end is an error, but it will
	// create a 16 bit string due to the replacementCharacter. Let the 16
	// bit path handle the error.
	return true;
	}
	memcpy(m_partialSequence + m_partialSequenceSize, source,
	count - m_partialSequenceSize);
	source += count - m_partialSequenceSize;
	m_partialSequenceSize = count;
	}
	int character = decodeNonASCIISequence(m_partialSequence, count);
	// Non-zero for nonCharacter (-1) as well as for any value above 0xFF.
	if (character & ~0xff)
	return true;

	m_partialSequenceSize -= count;
	*destination++ = static_cast<LChar>(character);
	} while (m_partialSequenceSize);

	return false;
	}

	// 16-bit variant: drains the buffered partial sequence into a UChar string,
	// taking extra bytes from |source| when the sequence is incomplete.
	//
	// Errors (a lead byte with sequence length 0, a sequence still incomplete
	// while |flush| is set, or an ill-formed sequence) go through handleError():
	// |sawError| is set and, unless |stopOnError| is set, a replacement character
	// is emitted and one buffered byte is dropped before the loop continues. With
	// |stopOnError| set the function returns at once and the bytes stay buffered.
	//
	// Always returns false. The caller checks m_partialSequenceSize afterwards to
	// see whether bytes are still buffered.
	template <>
	bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination,
	const uint8_t*& source,
	const uint8_t* end,
	bool flush,
	bool stopOnError,
	bool& sawError) {
	ASSERT(m_partialSequenceSize);
	do {
	if (isASCII(m_partialSequence[0])) {
	*destination++ = m_partialSequence[0];
	consumePartialSequenceByte();
	continue;
	}
	int count = nonASCIISequenceLength(m_partialSequence[0]);
	if (!count) {
	handleError(destination, stopOnError, sawError);
	if (stopOnError)
	return false;
	continue;
	}
	if (count > m_partialSequenceSize) {
	// More bytes are needed than are buffered; see whether the new input
	// can supply them.
	if (count - m_partialSequenceSize > end - source) {
	if (!flush) {
	// The new data is not enough to complete the sequence, so
	// add it to the existing partial sequence.
	memcpy(m_partialSequence + m_partialSequenceSize, source,
	end - source);
	m_partialSequenceSize += end - source;
	return false;
	}
	// An incomplete partial sequence at the end is an error.
	handleError(destination, stopOnError, sawError);
	if (stopOnError)
	return false;
	continue;
	}
	memcpy(m_partialSequence + m_partialSequenceSize, source,
	count - m_partialSequenceSize);
	source += count - m_partialSequenceSize;
	m_partialSequenceSize = count;
	}
	int character = decodeNonASCIISequence(m_partialSequence, count);
	if (character == nonCharacter) {
	handleError(destination, stopOnError, sawError);
	if (stopOnError)
	return false;
	continue;
	}

	m_partialSequenceSize -= count;
	destination = appendCharacter(destination, character);
	} while (m_partialSequenceSize);

	return false;
	}

	// Decodes |length| bytes starting at |bytes| as UTF-8 and returns the result.
	//
	// Output is first built as 8-bit (LChar) characters. When a character above
	// 0xFF is decoded, when an invalid sequence is found while |stopOnError| is
	// false, or when the 8-bit handlePartialSequence() returns true, control
	// jumps to upConvertTo16Bit. That section copies what was produced so far
	// into a 16-bit buffer and continues from the same source position.
	//
	// A sequence cut off by the end of the input is saved in m_partialSequence
	// and picked up by handlePartialSequence() at the start of the next pass.
	// While |flush| is set, the outer do/while loops repeat as long as bytes
	// remain buffered.
	//
	// |sawError| is set to true when an invalid sequence is found and is never
	// cleared here. With |stopOnError| set, decoding stops at the first invalid
	// sequence and the characters decoded so far are returned.
	String TextCodecUTF8::decode(const char* bytes,
	                             size_t length,
	                             FlushBehavior flush,
	                             bool stopOnError,
	                             bool& sawError) {
	  // Each input byte might turn into a character.
	  // That includes all bytes in the partial-sequence buffer because
	  // each byte in an invalid sequence will turn into a replacement character.
	  StringBuffer<LChar> buffer(m_partialSequenceSize + length);

	  const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
	  const uint8_t* end = source + length;
	  const uint8_t* alignedEnd = alignToMachineWord(end);
	  LChar* destination = buffer.characters();

	  // 8-bit phase.
	  do {
	    if (m_partialSequenceSize) {
	      // Explicitly copy destination and source pointers to avoid taking
	      // pointers to the local variables, which may harm code generation by
	      // disabling some optimizations in some compilers.
	      LChar* destinationForHandlePartialSequence = destination;
	      const uint8_t* sourceForHandlePartialSequence = source;
	      if (handlePartialSequence(destinationForHandlePartialSequence,
	                                sourceForHandlePartialSequence, end, flush,
	                                stopOnError, sawError)) {
	        source = sourceForHandlePartialSequence;
	        goto upConvertTo16Bit;
	      }
	      destination = destinationForHandlePartialSequence;
	      source = sourceForHandlePartialSequence;
	      if (m_partialSequenceSize)
	        break;
	    }

	    while (source < end) {
	      if (isASCII(*source)) {
	        // Fast path for ASCII. Most UTF-8 text will be ASCII.
	        if (isAlignedToMachineWord(source)) {
	          while (source < alignedEnd) {
	            MachineWord chunk =
	                *reinterpret_cast_ptr<const MachineWord*>(source);
	            if (!isAllASCII<LChar>(chunk))
	              break;
	            copyASCIIMachineWord(destination, source);
	            source += sizeof(MachineWord);
	            destination += sizeof(MachineWord);
	          }
	          if (source == end)
	            break;
	          if (!isASCII(*source))
	            continue;
	        }
	        *destination++ = *source++;
	        continue;
	      }
	      int count = nonASCIISequenceLength(*source);
	      int character;
	      if (count == 0) {
	        character = nonCharacter;
	      } else {
	        if (count > end - source) {
	          // The sequence runs past the end of the input: buffer its bytes
	          // for the next pass or the next call.
	          ASSERT_WITH_SECURITY_IMPLICATION(
	              end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
	          ASSERT(!m_partialSequenceSize);
	          m_partialSequenceSize = end - source;
	          memcpy(m_partialSequence, source, m_partialSequenceSize);
	          source = end;
	          break;
	        }
	        character = decodeNonASCIISequence(source, count);
	      }
	      if (character == nonCharacter) {
	        sawError = true;
	        if (stopOnError)
	          break;

	        // The replacement character needs 16 bits; |source| is left on the
	        // offending byte so the 16-bit loop reports it.
	        goto upConvertTo16Bit;
	      }
	      if (character > 0xff)
	        goto upConvertTo16Bit;

	      source += count;
	      *destination++ = static_cast<LChar>(character);
	    }
	  } while (flush && m_partialSequenceSize);

	  buffer.shrink(destination - buffer.characters());

	  return String::adopt(buffer);

	upConvertTo16Bit:
	  StringBuffer<UChar> buffer16(m_partialSequenceSize + length);

	  UChar* destination16 = buffer16.characters();

	  // Copy the already converted characters
	  for (LChar* converted8 = buffer.characters(); converted8 < destination;)
	    *destination16++ = *converted8++;

	  // 16-bit phase: same structure as the 8-bit loop, but errors are resolved
	  // here with replacement characters.
	  do {
	    if (m_partialSequenceSize) {
	      // Explicitly copy destination and source pointers to avoid taking
	      // pointers to the local variables, which may harm code generation by
	      // disabling some optimizations in some compilers.
	      UChar* destinationForHandlePartialSequence = destination16;
	      const uint8_t* sourceForHandlePartialSequence = source;
	      handlePartialSequence(destinationForHandlePartialSequence,
	                            sourceForHandlePartialSequence, end, flush,
	                            stopOnError, sawError);
	      destination16 = destinationForHandlePartialSequence;
	      source = sourceForHandlePartialSequence;
	      if (m_partialSequenceSize)
	        break;
	    }

	    while (source < end) {
	      if (isASCII(*source)) {
	        // Fast path for ASCII. Most UTF-8 text will be ASCII.
	        if (isAlignedToMachineWord(source)) {
	          while (source < alignedEnd) {
	            MachineWord chunk =
	                *reinterpret_cast_ptr<const MachineWord*>(source);
	            if (!isAllASCII<LChar>(chunk))
	              break;
	            copyASCIIMachineWord(destination16, source);
	            source += sizeof(MachineWord);
	            destination16 += sizeof(MachineWord);
	          }
	          if (source == end)
	            break;
	          if (!isASCII(*source))
	            continue;
	        }
	        *destination16++ = *source++;
	        continue;
	      }
	      int count = nonASCIISequenceLength(*source);
	      int character;
	      if (count == 0) {
	        character = nonCharacter;
	      } else {
	        if (count > end - source) {
	          // The sequence runs past the end of the input: buffer its bytes
	          // for the next pass or the next call.
	          ASSERT_WITH_SECURITY_IMPLICATION(
	              end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
	          ASSERT(!m_partialSequenceSize);
	          m_partialSequenceSize = end - source;
	          memcpy(m_partialSequence, source, m_partialSequenceSize);
	          source = end;
	          break;
	        }
	        character = decodeNonASCIISequence(source, count);
	      }
	      if (character == nonCharacter) {
	        sawError = true;
	        if (stopOnError)
	          break;
	        // Each error generates a replacement character and consumes one byte.
	        *destination16++ = replacementCharacter;
	        ++source;
	        continue;
	      }
	      source += count;
	      destination16 = appendCharacter(destination16, character);
	    }
	  } while (flush && m_partialSequenceSize);

	  buffer16.shrink(destination16 - buffer16.characters());

	  return String::adopt(buffer16);
	}

	// Encodes |length| code units at |characters| as UTF-8 and returns the bytes
	// as a CString. An unpaired surrogate in the input is encoded as U+FFFD
	// (REPLACEMENT CHARACTER). Crashes deliberately if the worst-case output
	// size would overflow size_t.
	template <typename CharType>
	CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) {
	  // Worst case is 3 output bytes per input code unit: a BMP character is one
	  // code unit and at most 3 bytes (3x), a non-BMP character is two code
	  // units and at most 4 bytes (2x).
	  const size_t maxEncodableLength = std::numeric_limits<size_t>::max() / 3;
	  if (length > maxEncodableLength)
	    CRASH();
	  Vector<uint8_t> bytes(length * 3);

	  size_t bytesWritten = 0;
	  for (size_t i = 0; i < length;) {
	    UChar32 character;
	    U16_NEXT(characters, i, length, character);
	    // U16_NEXT hands back an unmatched surrogate unchanged, so it has to be
	    // replaced with U+FFFD here.
	    if (U_IS_SURROGATE(character))
	      character = replacementCharacter;
	    U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
	  }

	  return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
	}

	// Encodes a UTF-16 string as UTF-8. The UnencodableHandling argument is
	// ignored.
	CString TextCodecUTF8::encode(const UChar* characters,
	                              size_t length,
	                              UnencodableHandling) {
	  return encodeCommon<UChar>(characters, length);
	}

	// Encodes an 8-bit (LChar) string as UTF-8. The UnencodableHandling argument
	// is ignored.
	CString TextCodecUTF8::encode(const LChar* characters,
	                              size_t length,
	                              UnencodableHandling) {
	  return encodeCommon<LChar>(characters, length);
	}

	} // namespace WTF