// src/parsing/scanner-character-streams.cc - v8/v8 - Git at Google

 // Copyright 2011 the V8 project authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "src/parsing/scanner-character-streams.h"

 #include <memory>
 #include <vector>

 #include "include/v8.h"
 #include "src/counters.h"
 #include "src/globals.h"
 #include "src/handles.h"
 #include "src/objects-inl.h"
 #include "src/parsing/scanner.h"
 #include "src/unicode-inl.h"

 namespace v8 {
 namespace internal {

 // RAII guard for the resource backing an external string: the constructors
 // call Lock() on the resource and the destructor calls Unlock().
 class ScopedExternalStringLock {
  public:
   // Locks the resource of |string|, which must be a non-null external
   // one-byte or two-byte string with a non-null resource.
   explicit ScopedExternalStringLock(ExternalString* string) {
     DCHECK(string);
     if (string->IsExternalOneByteString()) {
       resource_ = ExternalOneByteString::cast(string)->resource();
     } else {
       DCHECK(string->IsExternalTwoByteString());
       resource_ = ExternalTwoByteString::cast(string)->resource();
     }
     DCHECK(resource_);
     resource_->Lock();
   }

   // Copying a lock increases the locking depth.
   ScopedExternalStringLock(const ScopedExternalStringLock& other)
       : resource_(other.resource_) {
     resource_->Lock();
   }

   // Assignment would overwrite resource_ without unlocking the old resource
   // or locking the new one, leaving Lock()/Unlock() calls unbalanced.
   ScopedExternalStringLock& operator=(const ScopedExternalStringLock&) =
       delete;

   ~ScopedExternalStringLock() { resource_->Unlock(); }

  private:
   // Not nullptr.
   const v8::String::ExternalStringResourceBase* resource_;
 };

 namespace {
 // The code point U+FEFF. The utf-8 decoders in this file compare decoded
 // characters against it to recognize a byte order mark at the beginning of a
 // stream, and also use it as the upper bound of their fast path.
 const unibrow::uchar kUtf8Bom = 0xFEFF;
 }  // namespace

 // Maps a character type to the sequential and external string classes that
 // store characters of that type.
 template <typename Char>
 struct CharTraits;

 // One-byte characters.
 template <>
 struct CharTraits<uint8_t> {
   using String = SeqOneByteString;
   using ExternalString = ExternalOneByteString;
 };

 // Two-byte characters.
 template <>
 struct CharTraits<uint16_t> {
   using String = SeqTwoByteString;
   using ExternalString = ExternalTwoByteString;
 };

 // A half-open range [start, end) of Chars.
 template <typename Char>
 struct Range {
   const Char* start;
   const Char* end;

   // Number of Chars between |start| and |end|.
   size_t length() const { return static_cast<size_t>(end - start); }

   // True if the address of |start| is not a multiple of sizeof(Char). Testing
   // for a non-zero remainder (rather than a remainder of exactly 1) keeps the
   // check correct for character types wider than two bytes.
   bool unaligned_start() const {
     return reinterpret_cast<intptr_t>(start) % sizeof(Char) != 0;
   }
 };

 // Character stream reading directly from an on-heap SeqOneByteString or
 // SeqTwoByteString held through a handle.
 template <typename Char>
 class OnHeapStream {
  public:
   using String = typename CharTraits<Char>::String;

   // |start_offset| is added to every character index; |end| is stored as the
   // length of the readable window that begins at |start_offset|.
   OnHeapStream(Handle<String> string, size_t start_offset, size_t end)
       : string_(string), start_offset_(start_offset), length_(end) {}

   // On-heap streams are never copied (see kCanBeCloned).
   OnHeapStream(const OnHeapStream& other) : start_offset_(0), length_(0) {
     UNREACHABLE();
   }

   // Returns the characters from |pos| (clamped to the window length) up to
   // the end of the window.
   Range<Char> GetDataAt(size_t pos) {
     const size_t first = start_offset_ + Min(length_, pos);
     const size_t last = start_offset_ + length_;
     return {&string_->GetChars()[first], &string_->GetChars()[last]};
   }

   static const bool kCanBeCloned = false;
   static const bool kCanAccessHeap = true;

  private:
   Handle<String> string_;
   const size_t start_offset_;
   const size_t length_;
 };

 // Character stream over the off-heap characters of an ExternalOneByteString
 // or ExternalTwoByteString. The string's resource is locked by lock_ for the
 // lifetime of the stream and of every copy made from it.
 template <typename Char>
 class ExternalStringStream {
   using ExternalString = typename CharTraits<Char>::ExternalString;

  public:
   // Reads |length| characters' worth of window starting |start_offset|
   // characters into |string|'s data.
   ExternalStringStream(ExternalString* string, size_t start_offset,
                        size_t length)
       : lock_(string),
         data_(string->GetChars() + start_offset),
         length_(length) {}

   // Copies share the data pointer; copying lock_ locks the resource again.
   ExternalStringStream(const ExternalStringStream& other)
       : lock_(other.lock_), data_(other.data_), length_(other.length_) {}

   // Returns the characters from |pos| (clamped to the length) to the end.
   Range<Char> GetDataAt(size_t pos) {
     const size_t first = Min(length_, pos);
     return {&data_[first], &data_[length_]};
   }

   static const bool kCanBeCloned = true;
   static const bool kCanAccessHeap = false;

  private:
   ScopedExternalStringLock lock_;
   const Char* const data_;
   const size_t length_;
 };

 // Character stream over a caller-provided C array. Testing only.
 template <typename Char>
 class TestingStream {
  public:
   TestingStream(const Char* data, size_t length)
       : data_(data), length_(length) {}

   // Returns the characters from |pos| (clamped to the length) to the end.
   Range<Char> GetDataAt(size_t pos) {
     const size_t first = Min(length_, pos);
     return {&data_[first], &data_[length_]};
   }

   static const bool kCanBeCloned = true;
   static const bool kCanAccessHeap = false;

  private:
   const Char* const data_;
   const size_t length_;
 };

 // Character stream assembled from off-heap chunks that are requested on
 // demand from a source stream. Every fetched chunk is kept in chunks_ and its
 // data is released with delete[] when the stream is destroyed.
 template <typename Char>
 class ChunkedStream {
  public:
   ChunkedStream(ScriptCompiler::ExternalSourceStream* source,
                 RuntimeCallStats* stats)
       : source_(source), stats_(stats) {}

   ChunkedStream(const ChunkedStream& other) {
     // TODO(rmcilroy): Implement cloning for chunked streams.
     UNREACHABLE();
   }

   // Returns the characters from |pos| to the end of the chunk selected by
   // FindChunk(pos). The range is empty if |pos| lies at or beyond that
   // chunk's end.
   Range<Char> GetDataAt(size_t pos) {
     const Chunk found = FindChunk(pos);
     const size_t chunk_end = found.length;
     const size_t chunk_offset = Min(chunk_end, pos - found.position);
     return {&found.data[chunk_offset], &found.data[chunk_end]};
   }

   ~ChunkedStream() {
     for (size_t i = 0; i < chunks_.size(); ++i) delete[] chunks_[i].data;
   }

   static const bool kCanBeCloned = false;
   static const bool kCanAccessHeap = false;

  private:
   // One piece of source data together with its place in the stream.
   struct Chunk {
     Chunk(const Char* const data, size_t position, size_t length)
         : data(data), position(position), length(length) {}
     const Char* const data;
     // The logical position of data.
     const size_t position;
     const size_t length;
     size_t end_position() const { return position + length; }
   };

   // Fetches chunks until |position| is covered or a zero-length chunk has
   // been appended, then returns the last chunk that starts at or before
   // |position|.
   Chunk FindChunk(size_t position) {
     while (V8_UNLIKELY(chunks_.empty())) FetchChunk(size_t{0});

     // Keep fetching while the requested position lies past the newest chunk
     // and that chunk is not the zero-length one.
     for (;;) {
       const Chunk& newest = chunks_.back();
       if (position < newest.end_position() || newest.length == 0) break;
       FetchChunk(newest.end_position());
     }

     // Scan from the newest chunk towards the oldest.
     for (size_t i = chunks_.size(); i-- > 0;) {
       const Chunk& candidate = chunks_[i];
       if (candidate.position <= position) return candidate;
     }

     UNREACHABLE();
   }

   // Records the |length| bytes at |data| as a chunk starting at the logical
   // |position|.
   virtual void ProcessChunk(const uint8_t* data, size_t position,
                             size_t length) {
     // Incoming data has to be aligned to Char size.
     DCHECK_EQ(0, length % sizeof(Char));
     const Char* const chars = reinterpret_cast<const Char*>(data);
     const size_t char_count = length / sizeof(Char);
     chunks_.emplace_back(chars, position, char_count);
   }

   // Requests more data from the source (timed under the GetMoreData runtime
   // call counter) and hands it to ProcessChunk().
   void FetchChunk(size_t position) {
     const uint8_t* bytes = nullptr;
     size_t byte_count;
     {
       RuntimeCallTimerScope scope(stats_,
                                   RuntimeCallCounterId::kGetMoreDataCallback);
       byte_count = source_->GetMoreData(&bytes);
     }
     ProcessChunk(bytes, position, byte_count);
   }

   ScriptCompiler::ExternalSourceStream* source_;
   RuntimeCallStats* stats_;

  protected:
   std::vector<struct Chunk> chunks_;
 };

 // A ChunkedStream<uint16_t> whose incoming chunks are utf-8 bytes. Each
 // incoming chunk is decoded into a freshly allocated utf-16 chunk; the decoder
 // state is carried across chunks so that a character split between two
 // incoming chunks is decoded correctly. The Char parameter is only used by the
 // size assertion below.
 template <typename Char>
 class Utf8ChunkedStream : public ChunkedStream<uint16_t> {
  public:
   Utf8ChunkedStream(ScriptCompiler::ExternalSourceStream* source,
                     RuntimeCallStats* stats)
       : ChunkedStream<uint16_t>(source, stats) {}

   STATIC_ASSERT(sizeof(Char) == sizeof(uint16_t));
   // Decodes |length| utf-8 bytes at |data| and appends the result as a chunk
   // at the logical |position|. The incoming |data| is always deleted before
   // returning.
   void ProcessChunk(const uint8_t* data, size_t position, size_t length) final {
     // A zero-length chunk: flush the decoder. If a partial character was still
     // pending it becomes a single kBadChar chunk, followed by an empty chunk.
     if (length == 0) {
       unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state_);
       if (t != unibrow::Utf8::kBufferEmpty) {
         DCHECK_EQ(t, unibrow::Utf8::kBadChar);
         incomplete_char_ = 0;
         uint16_t* result = new uint16_t[1];
         result[0] = unibrow::Utf8::kBadChar;
         chunks_.emplace_back(result, position, 1);
         position++;
       }
       chunks_.emplace_back(nullptr, position, 0);
       delete[] data;
       return;
     }

     // First count the number of complete characters that can be produced.
     // This pass runs on copies of the decoder state so that the second pass
     // can replay the same bytes from the same starting state.

     unibrow::Utf8::State state = state_;
     uint32_t incomplete_char = incomplete_char_;
     bool seen_bom = seen_bom_;

     size_t i = 0;
     size_t chars = 0;
     while (i < length) {
       unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(data[i], &i, &state,
                                                            &incomplete_char);
       if (!seen_bom && t == kUtf8Bom && position + chars == 0) {
         seen_bom = true;
         // BOM detected at beginning of the stream. Don't copy it.
       } else if (t != unibrow::Utf8::kIncomplete) {
         chars++;
         // Values above kMaxNonSurrogateCharCode take two utf-16 code units.
         if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
       }
     }

     // Process the data.

     // If there aren't any complete characters, update the state without
     // producing a chunk.
     if (chars == 0) {
       state_ = state;
       incomplete_char_ = incomplete_char;
       seen_bom_ = seen_bom;
       delete[] data;
       return;
     }

     // Update the state and produce a chunk with complete characters.
     uint16_t* result = new uint16_t[chars];
     uint16_t* cursor = result;
     i = 0;

     while (i < length) {
       unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(data[i], &i, &state_,
                                                            &incomplete_char_);
       if (V8_LIKELY(t < kUtf8Bom)) {
         *(cursor++) = static_cast<uc16>(t);  // The by most frequent case.
       } else if (t == unibrow::Utf8::kIncomplete) {
         continue;
       } else if (!seen_bom_ && t == kUtf8Bom && position == 0 &&
                  cursor == result) {
         // BOM detected at beginning of the stream. Don't copy it.
         seen_bom_ = true;
       } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
         *(cursor++) = static_cast<uc16>(t);
       } else {
         // Split into a lead/trail surrogate pair.
         *(cursor++) = unibrow::Utf16::LeadSurrogate(t);
         *(cursor++) = unibrow::Utf16::TrailSurrogate(t);
       }
     }

     chunks_.emplace_back(result, position, chars);
     delete[] data;
   }

  private:
   // Decoder state carried over from one incoming chunk to the next.
   uint32_t incomplete_char_ = 0;
   unibrow::Utf8::State state_ = unibrow::Utf8::State::kAccept;
   // Set once a BOM at the beginning of the stream has been skipped.
   bool seen_bom_ = false;
 };

 // Provides a buffered utf-16 view on the bytes from the underlying ByteStream.
 // Chars are buffered if either the underlying stream isn't utf-16 or the
 // underlying utf-16 stream might move (is on-heap).
 template <template <typename T> class ByteStream>
 class BufferedCharacterStream : public Utf16CharacterStream {
  public:
   // Creates a stream whose first block starts at |pos|. |args| are passed on
   // to the ByteStream<uint8_t> constructor.
   template <class... TArgs>
   BufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) {
     buffer_pos_ = pos;
   }

   // Queries the same ByteStream<uint8_t> specialization that byte_stream_
   // and can_access_heap() use.
   bool can_be_cloned() const final {
     return ByteStream<uint8_t>::kCanBeCloned;
   }

   // Returns a new stream built with the private copy constructor below.
   std::unique_ptr<Utf16CharacterStream> Clone() const override {
     CHECK(can_be_cloned());
     return std::unique_ptr<Utf16CharacterStream>(
         new BufferedCharacterStream<ByteStream>(*this));
   }

  protected:
   // Refills buffer_ with up to kBufferSize characters starting at pos().
   // Returns false, leaving an empty buffer, if the byte stream has no data at
   // that position.
   bool ReadBlock() final {
     size_t position = pos();
     buffer_pos_ = position;
     buffer_start_ = &buffer_[0];
     buffer_cursor_ = buffer_start_;

     Range<uint8_t> range = byte_stream_.GetDataAt(position);
     if (range.length() == 0) {
       buffer_end_ = buffer_start_;
       return false;
     }

     // Widen the one-byte characters into the utf-16 buffer.
     size_t length = Min(kBufferSize, range.length());
     i::CopyCharsUnsigned(buffer_, range.start, length);
     buffer_end_ = &buffer_[length];
     return true;
   }

   bool can_access_heap() const final {
     return ByteStream<uint8_t>::kCanAccessHeap;
   }

  private:
   // Only byte_stream_ is copied; buffer_ and the base class are not
   // initialized from |other|.
   BufferedCharacterStream(const BufferedCharacterStream<ByteStream>& other)
       : byte_stream_(other.byte_stream_) {}

   static const size_t kBufferSize = 512;
   uc16 buffer_[kBufferSize];
   ByteStream<uint8_t> byte_stream_;
 };

 // Provides an unbuffered utf-16 view on the bytes from the underlying
 // ByteStream: the buffer pointers refer directly into the data returned by the
 // byte stream instead of into a private copy.
 template <template <typename T> class ByteStream>
 class UnbufferedCharacterStream : public Utf16CharacterStream {
  public:
   // Creates a stream whose first block starts at |pos|. |args| are passed on
   // to the ByteStream<uint16_t> constructor.
   template <class... TArgs>
   UnbufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) {
     buffer_pos_ = pos;
   }

   bool can_access_heap() const final {
     return ByteStream<uint16_t>::kCanAccessHeap;
   }

   bool can_be_cloned() const final {
     return ByteStream<uint16_t>::kCanBeCloned;
   }

   // Returns a new stream built with the copy constructor below. As in
   // BufferedCharacterStream::Clone(), cloning a stream that reports
   // !can_be_cloned() is a fatal error.
   std::unique_ptr<Utf16CharacterStream> Clone() const override {
     CHECK(can_be_cloned());
     return std::unique_ptr<Utf16CharacterStream>(
         new UnbufferedCharacterStream<ByteStream>(*this));
   }

  protected:
   // Points the buffer at the byte stream's data for pos(). Returns false if
   // there is no data at that position.
   bool ReadBlock() final {
     size_t position = pos();
     buffer_pos_ = position;
     Range<uint16_t> range = byte_stream_.GetDataAt(position);
     buffer_start_ = range.start;
     buffer_end_ = range.end;
     buffer_cursor_ = buffer_start_;
     if (range.length() == 0) return false;

     DCHECK(!range.unaligned_start());
     DCHECK_LE(buffer_start_, buffer_end_);
     return true;
   }

   // Only byte_stream_ is copied; the base class is not initialized from
   // |other|.
   UnbufferedCharacterStream(const UnbufferedCharacterStream<ByteStream>& other)
       : byte_stream_(other.byte_stream_) {}

   ByteStream<uint16_t> byte_stream_;
 };

 // An unbuffered utf-16 stream over an on-heap SeqTwoByteString. The stream
 // registers a GC epilogue callback for its whole lifetime; the callback
 // re-derives the buffer pointers from the string and rebases them if they
 // changed.
 class RelocatingCharacterStream
     : public UnbufferedCharacterStream<OnHeapStream> {
  public:
   // |pos| and |args| are passed on to UnbufferedCharacterStream; |isolate|'s
   // heap is where the callback is registered.
   template <class... TArgs>
   RelocatingCharacterStream(Isolate* isolate, size_t pos, TArgs... args)
       : UnbufferedCharacterStream<OnHeapStream>(pos, args...),
         isolate_(isolate) {
     isolate->heap()->AddGCEpilogueCallback(UpdateBufferPointersCallback,
                                            v8::kGCTypeAll, this);
   }

  private:
   ~RelocatingCharacterStream() final {
     isolate_->heap()->RemoveGCEpilogueCallback(UpdateBufferPointersCallback,
                                                this);
   }

   // GC epilogue callback; |stream| is the RelocatingCharacterStream that was
   // registered in the constructor.
   static void UpdateBufferPointersCallback(v8::Isolate* v8_isolate,
                                            v8::GCType type,
                                            v8::GCCallbackFlags flags,
                                            void* stream) {
     reinterpret_cast<RelocatingCharacterStream*>(stream)
         ->UpdateBufferPointers();
   }

   // Rebases buffer_start_/buffer_cursor_/buffer_end_ onto the string's
   // current character data, preserving the cursor's offset in the buffer.
   void UpdateBufferPointers() {
     // ReadBlock() derives buffer_start_ from GetDataAt(buffer_pos_), so the
     // same position must be used here. Asking for position 0 would make the
     // comparison fail whenever buffer_pos_ != 0 and would rewind the buffer
     // to the start of the string while buffer_pos_ still names a later
     // position.
     Range<uint16_t> range = byte_stream_.GetDataAt(buffer_pos_);
     if (range.start != buffer_start_) {
       buffer_cursor_ = (buffer_cursor_ - buffer_start_) + range.start;
       buffer_start_ = range.start;
       buffer_end_ = range.end;
     }
   }

   Isolate* isolate_;
 };

 // ----------------------------------------------------------------------------
 // BufferedUtf16CharacterStream
 //
 // A buffered character stream based on a random access character
 // source (ReadBlock can be called with pos() pointing to any position,
 // even positions before the current).
 //
 // TODO(verwaest): Remove together with Utf8 external streaming streams.
 class BufferedUtf16CharacterStream : public Utf16CharacterStream {
  public:
   BufferedUtf16CharacterStream();

  protected:
   // Capacity of buffer_, in utf-16 code units.
   static const size_t kBufferSize = 512;

   // Refills buffer_ through FillBuffer() for the current pos(). Returns true
   // if at least one character is available afterwards.
   bool ReadBlock() final;

   // FillBuffer should read up to kBufferSize characters at position and store
   // them into buffer_[0..]. It returns the number of characters stored.
   virtual size_t FillBuffer(size_t position) = 0;

   // Fixed sized buffer that this class reads from.
   // The base class' buffer_start_ should always point to buffer_.
   uc16 buffer_[kBufferSize];
 };

 // Passes buffer_ for all three pointer arguments and 0 as the last argument of
 // the base constructor. NOTE(review): presumably these are (buffer_start,
 // buffer_cursor, buffer_end, buffer_pos), i.e. an empty buffer at position 0
 // -- confirm against the Utf16CharacterStream declaration.
 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
     : Utf16CharacterStream(buffer_, buffer_, buffer_, 0) {}

 // Restarts the buffer at the current stream position and asks FillBuffer()
 // for the characters found there. Returns whether any characters are
 // available afterwards.
 bool BufferedUtf16CharacterStream::ReadBlock() {
   DCHECK_EQ(buffer_start_, buffer_);

   const size_t block_start = pos();
   buffer_pos_ = block_start;
   buffer_cursor_ = buffer_;
   const size_t filled = FillBuffer(block_start);
   buffer_end_ = buffer_ + filled;
   DCHECK_EQ(pos(), block_start);
   DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize);
   return buffer_cursor_ < buffer_end_;
 }

 // ----------------------------------------------------------------------------
 // Utf8ExternalStreamingStream - chunked streaming of Utf-8 data.
 //
 // This implementation is fairly complex, since data arrives in chunks which
 // may 'cut' arbitrarily into utf-8 characters. Also, seeking to a given
 // character position is tricky because the byte position cannot be derived
 // from the character position.
 //
 // TODO(verwaest): Decode utf8 chunks into utf16 chunks on the blink side
 // instead so we don't need to buffer.

 class Utf8ExternalStreamingStream : public BufferedUtf16CharacterStream {
  public:
   // Starts at chunk 0, byte 0, char 0 with the decoder in its accepting state.
   Utf8ExternalStreamingStream(
       ScriptCompiler::ExternalSourceStream* source_stream,
       RuntimeCallStats* stats)
       : current_({0, {0, 0, 0, unibrow::Utf8::State::kAccept}}),
         source_stream_(source_stream),
         stats_(stats) {}
   // Releases the data of every chunk that was fetched.
   ~Utf8ExternalStreamingStream() final {
     for (size_t i = 0; i < chunks_.size(); i++) delete[] chunks_[i].data;
   }

   bool can_access_heap() const final { return false; }

   bool can_be_cloned() const final { return false; }

   // Cloning is not supported (see can_be_cloned()).
   std::unique_ptr<Utf16CharacterStream> Clone() const override {
     UNREACHABLE();
   }

  protected:
   // Decodes characters starting at |position| into buffer_ and returns how
   // many were stored.
   size_t FillBuffer(size_t position) final;

  private:
   // A position within the data stream. It stores:
   // - The 'physical' position (# of bytes in the stream),
   // - the 'logical' position (# of ucs-2 characters, also within the stream),
   // - a possibly incomplete utf-8 char at the current 'physical' position.
   struct StreamPosition {
     size_t bytes;
     size_t chars;
     uint32_t incomplete_char;
     unibrow::Utf8::State state;
   };

   // Position contains a StreamPosition and the index of the chunk the position
   // points into. (The chunk_no could be derived from pos, but that'd be
   // an expensive search through all chunks.)
   struct Position {
     size_t chunk_no;
     StreamPosition pos;
   };

   // A chunk in the list of chunks, containing:
   // - The chunk data (data pointer and length), and
   // - the position at the first byte of the chunk.
   struct Chunk {
     const uint8_t* data;
     size_t length;
     StreamPosition start;
   };

   // Within the current chunk, skip forward from current_ towards position.
   bool SkipToPosition(size_t position);
   // Within the current chunk, fill the buffer_ (while it has capacity).
   void FillBufferFromCurrentChunk();
   // Fetch a new chunk (assuming current_ is at the end of the current data).
   bool FetchChunk();
   // Search through the chunks and set current_ to point to the given position.
   // (This call is potentially expensive.)
   void SearchPosition(size_t position);

   // All chunks fetched so far, in stream order.
   std::vector<Chunk> chunks_;
   // Where decoding will continue.
   Position current_;
   ScriptCompiler::ExternalSourceStream* source_stream_;
   RuntimeCallStats* stats_;
 };

 // Advances current_ within the current chunk towards the character
 // |position| by decoding utf-8 data without storing it. Returns whether
 // current_ ends up exactly at |position|. If the whole chunk was consumed,
 // current_.chunk_no is advanced to the next chunk index.
 bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) {
   DCHECK_LE(current_.pos.chars, position);  // We can only skip forward.

   // Already there? Then return immediately.
   if (current_.pos.chars == position) return true;

   const Chunk& chunk = chunks_[current_.chunk_no];
   DCHECK(current_.pos.bytes >= chunk.start.bytes);

   unibrow::Utf8::State state = chunk.start.state;
   uint32_t incomplete_char = chunk.start.incomplete_char;
   // |it| is a byte offset relative to the start of the chunk.
   size_t it = current_.pos.bytes - chunk.start.bytes;
   size_t chars = chunk.start.chars;
   while (it < chunk.length && chars < position) {
     unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
         chunk.data[it], &it, &state, &incomplete_char);
     if (t == kUtf8Bom && chunk.start.bytes + it == 3) {
       // BOM detected at beginning of the stream. Don't count it. Only a
       // U+FEFF that ends at stream byte 3 qualifies, which matches what
       // FillBufferFromCurrentChunk() leaves out of the buffer; any later
       // U+FEFF is an ordinary character and must be counted.
     } else if (t != unibrow::Utf8::kIncomplete) {
       chars++;
       // Values above kMaxNonSurrogateCharCode take two utf-16 code units.
       if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
     }
   }

   // |it| is chunk-relative, so the absolute byte position is rebuilt from the
   // chunk start rather than added onto the previous position.
   current_.pos.bytes = chunk.start.bytes + it;
   current_.pos.chars = chars;
   current_.pos.incomplete_char = incomplete_char;
   current_.pos.state = state;
   current_.chunk_no += (it == chunk.length);

   return current_.pos.chars == position;
 }

 // Decodes utf-8 bytes of the current chunk, starting at current_, into
 // buffer_ behind buffer_end_. Stops at the end of the chunk or when fewer than
 // two free buffer slots remain, then updates current_ and buffer_end_.
 void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
   DCHECK_LT(current_.chunk_no, chunks_.size());
   DCHECK_EQ(buffer_start_, buffer_cursor_);
   DCHECK_LT(buffer_end_ + 1, buffer_start_ + kBufferSize);

   const Chunk& chunk = chunks_[current_.chunk_no];

   // The buffer_ is writable, but buffer_*_ members are const. So we get a
   // non-const pointer into buffer that points to the same char as buffer_end_.
   uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
   DCHECK_EQ(cursor, buffer_end_);

   unibrow::Utf8::State state = current_.pos.state;
   uint32_t incomplete_char = current_.pos.incomplete_char;

   // If the current chunk is the last (empty) chunk we'll have to process
   // any left-over, partial characters.
   if (chunk.length == 0) {
     unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
     if (t != unibrow::Utf8::kBufferEmpty) {
       DCHECK_EQ(t, unibrow::Utf8::kBadChar);
       *cursor = static_cast<uc16>(t);
       buffer_end_++;
       current_.pos.chars++;
       current_.pos.incomplete_char = 0;
       current_.pos.state = state;
     }
     return;
   }

   // The loop condition keeps one slot in reserve because a single decoded
   // character may be written as a surrogate pair.
   size_t it = current_.pos.bytes - chunk.start.bytes;
   while (it < chunk.length && cursor + 1 < buffer_start_ + kBufferSize) {
     unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
         chunk.data[it], &it, &state, &incomplete_char);
     if (V8_LIKELY(t < kUtf8Bom)) {
       *(cursor++) = static_cast<uc16>(t);  // The by most frequent case.
     } else if (t == unibrow::Utf8::kIncomplete) {
       continue;
     } else if (t == kUtf8Bom && current_.pos.bytes + it == 3) {
       // BOM detected at beginning of the stream. Don't copy it.
     } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
       *(cursor++) = static_cast<uc16>(t);
     } else {
       *(cursor++) = unibrow::Utf16::LeadSurrogate(t);
       *(cursor++) = unibrow::Utf16::TrailSurrogate(t);
     }
   }

   // |cursor - buffer_end_| is the number of code units written by this call.
   current_.pos.bytes = chunk.start.bytes + it;
   current_.pos.chars += (cursor - buffer_end_);
   current_.pos.incomplete_char = incomplete_char;
   current_.pos.state = state;
   current_.chunk_no += (it == chunk.length);

   buffer_end_ = cursor;
 }

 // Asks the source stream for more data and appends it to chunks_ as a chunk
 // starting at current_.pos. The whole call is timed under the GetMoreData
 // runtime call counter. Returns false if the new chunk is empty.
 bool Utf8ExternalStreamingStream::FetchChunk() {
   RuntimeCallTimerScope scope(stats_,
                               RuntimeCallCounterId::kGetMoreDataCallback);
   DCHECK_EQ(current_.chunk_no, chunks_.size());
   DCHECK(chunks_.empty() || chunks_.back().length != 0);

   const uint8_t* bytes = nullptr;
   const size_t byte_count = source_stream_->GetMoreData(&bytes);
   chunks_.push_back({bytes, byte_count, current_.pos});
   return byte_count > 0;
 }

 // Sets current_ to the character |position|, fetching further chunks if the
 // position lies beyond the data received so far. If the stream ends before
 // |position|, current_ is left pointing at the terminating (zero-length)
 // chunk.
 void Utf8ExternalStreamingStream::SearchPosition(size_t position) {
   // If current_ already points to the right position, we're done.
   //
   // This is expected to be the common case, since we typically call
   // FillBuffer right after the current buffer.
   if (current_.pos.chars == position) return;

   // No chunks. Fetch at least one, so we can assume !chunks_.empty() below.
   if (chunks_.empty()) {
     DCHECK_EQ(current_.chunk_no, 0u);
     DCHECK_EQ(current_.pos.bytes, 0u);
     DCHECK_EQ(current_.pos.chars, 0u);
     FetchChunk();
   }

   // Search for the last chunk whose start position is less or equal to
   // position.
   size_t chunk_no = chunks_.size() - 1;
   while (chunk_no > 0 && chunks_[chunk_no].start.chars > position) {
     chunk_no--;
   }

   // Did we find the terminating (zero-length) chunk? Then we're seeking
   // behind the end of the data, and position does not exist.
   // Set current_ to point to the terminating chunk.
   if (chunks_[chunk_no].length == 0) {
     current_ = {chunk_no, chunks_[chunk_no].start};
     return;
   }

   // Did we find the non-last chunk? Then our position must be within chunk_no.
   if (chunk_no + 1 < chunks_.size()) {
     // Fancy-pants optimization for ASCII chunks within a utf-8 stream.
     // (Many web sites declare utf-8 encoding, but use only (or almost only) the
     //  ASCII subset for their JavaScript sources. We can exploit this, by
     //  checking whether the # bytes in a chunk are equal to the # chars, and if
     //  so avoid the expensive SkipToPosition.)
     bool ascii_only_chunk =
         chunks_[chunk_no].start.incomplete_char == 0 &&
         (chunks_[chunk_no + 1].start.bytes - chunks_[chunk_no].start.bytes) ==
             (chunks_[chunk_no + 1].start.chars - chunks_[chunk_no].start.chars);
     if (ascii_only_chunk) {
       // One byte per char, so the byte and char offsets advance in lockstep.
       size_t skip = position - chunks_[chunk_no].start.chars;
       current_ = {chunk_no,
                   {chunks_[chunk_no].start.bytes + skip,
                    chunks_[chunk_no].start.chars + skip, 0,
                    unibrow::Utf8::State::kAccept}};
     } else {
       current_ = {chunk_no, chunks_[chunk_no].start};
       SkipToPosition(position);
     }

     // Since position was within the chunk, SkipToPosition should have found
     // something.
     DCHECK_EQ(position, current_.pos.chars);
     return;
   }

   // What's left: We're in the last, non-terminating chunk. Our position
   // may be in the chunk, but it may also be in 'future' chunks, which we'll
   // have to obtain.
   DCHECK_EQ(chunk_no, chunks_.size() - 1);
   current_ = {chunk_no, chunks_[chunk_no].start};
   bool have_more_data = true;
   bool found = SkipToPosition(position);
   while (have_more_data && !found) {
     DCHECK_EQ(current_.chunk_no, chunks_.size());
     have_more_data = FetchChunk();
     found = have_more_data && SkipToPosition(position);
   }

   // We'll return with a position != the desired position only if we're out
   // of data. In that case, we'll point to the terminating chunk.
   DCHECK_EQ(found, current_.pos.chars == position);
   DCHECK_EQ(have_more_data, chunks_.back().length != 0);
   DCHECK_IMPLIES(!found, !have_more_data);
   DCHECK_IMPLIES(!found, current_.chunk_no == chunks_.size() - 1);
 }

 // Empties the buffer, seeks to the character |position| and decodes
 // characters from there into buffer_. Returns the number of characters now in
 // the buffer; 0 means no data is available at |position|.
 size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) {
   buffer_cursor_ = buffer_;
   buffer_end_ = buffer_;

   SearchPosition(position);
   // Out of data: current_ is on a zero-length chunk and there is no pending
   // partial character left to flush.
   bool out_of_data = current_.chunk_no != chunks_.size() &&
                      chunks_[current_.chunk_no].length == 0 &&
                      current_.pos.incomplete_char == 0;

   if (out_of_data) return 0;

   // Fill the buffer, until we have at least one char (or are out of data).
   // (The embedder might give us 1-byte blocks within a utf-8 char, so we
   //  can't guarantee progress with one chunk. Thus we iterate.)
   while (!out_of_data && buffer_cursor_ == buffer_end_) {
     // At end of current data, but there might be more? Then fetch it.
     if (current_.chunk_no == chunks_.size()) {
       out_of_data = !FetchChunk();
     }
     FillBufferFromCurrentChunk();
   }

   DCHECK_EQ(current_.pos.chars - position,
             static_cast<size_t>(buffer_end_ - buffer_cursor_));
   return buffer_end_ - buffer_cursor_;
 }

 // ----------------------------------------------------------------------------
 // ScannerStream: Create stream instances.

 // Creates a stream covering all of |data|, i.e. the range [0, length).
 Utf16CharacterStream* ScannerStream::For(Isolate* isolate,
                                          Handle<String> data) {
   const int whole_length = data->length();
   return ScannerStream::For(isolate, data, 0, whole_length);
 }

 // Creates a stream over |data| that starts at |start_pos| and ends at
 // |end_pos|. The concrete stream class is picked from the representation of
 // the string that actually holds the characters.
 Utf16CharacterStream* ScannerStream::For(Isolate* isolate, Handle<String> data,
                                          int start_pos, int end_pos) {
   DCHECK_GE(start_pos, 0);
   DCHECK_LE(start_pos, end_pos);
   DCHECK_LE(end_pos, data->length());

   // A sliced string is read through its parent (looking through a thin
   // parent), offset by the slice's start. Anything else is flattened.
   size_t start_offset = 0;
   if (!data->IsSlicedString()) {
     data = String::Flatten(isolate, data);
   } else {
     SlicedString* slice = SlicedString::cast(*data);
     start_offset = slice->offset();
     String* backing = slice->parent();
     if (backing->IsThinString()) {
       backing = ThinString::cast(backing)->actual();
     }
     data = handle(backing, isolate);
   }

   const size_t first = static_cast<size_t>(start_pos);
   const size_t limit = static_cast<size_t>(end_pos);

   if (data->IsExternalOneByteString()) {
     return new BufferedCharacterStream<ExternalStringStream>(
         first, ExternalOneByteString::cast(*data), start_offset, limit);
   }
   if (data->IsExternalTwoByteString()) {
     return new UnbufferedCharacterStream<ExternalStringStream>(
         first, ExternalTwoByteString::cast(*data), start_offset, limit);
   }
   if (data->IsSeqOneByteString()) {
     return new BufferedCharacterStream<OnHeapStream>(
         first, Handle<SeqOneByteString>::cast(data), start_offset, limit);
   }
   if (data->IsSeqTwoByteString()) {
     return new RelocatingCharacterStream(
         isolate, first, Handle<SeqTwoByteString>::cast(data), start_offset,
         limit);
   }
   UNREACHABLE();
 }

 // Creates a testing stream over the NUL-terminated string |data|.
 std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
     const char* data) {
   const size_t length = strlen(data);
   return ScannerStream::ForTesting(data, length);
 }

 // Creates a buffered testing stream, positioned at 0, that reads the |length|
 // bytes at |data| as one-byte characters.
 std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
     const char* data, size_t length) {
   const uint8_t* bytes = reinterpret_cast<const uint8_t*>(data);
   std::unique_ptr<Utf16CharacterStream> stream(
       new BufferedCharacterStream<TestingStream>(static_cast<size_t>(0), bytes,
                                                  length));
   return stream;
 }

 // Creates a stream, positioned at 0, over data delivered in chunks by
 // |source_stream|. The stream class is selected by |encoding|.
 Utf16CharacterStream* ScannerStream::For(
     ScriptCompiler::ExternalSourceStream* source_stream,
     v8::ScriptCompiler::StreamedSource::Encoding encoding,
     RuntimeCallStats* stats) {
   const size_t start_position = 0;
   switch (encoding) {
     case v8::ScriptCompiler::StreamedSource::UTF8:
       return new Utf8ExternalStreamingStream(source_stream, stats);
     case v8::ScriptCompiler::StreamedSource::ONE_BYTE:
       return new BufferedCharacterStream<ChunkedStream>(start_position,
                                                         source_stream, stats);
     case v8::ScriptCompiler::StreamedSource::TWO_BYTE:
       return new UnbufferedCharacterStream<ChunkedStream>(
           start_position, source_stream, stats);
   }
   UNREACHABLE();
 }

 }  // namespace internal
 }  // namespace v8