| /* |
| * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Apple Inc. All |
| * rights reserved. |
| * Copyright (C) 2005 Alexey Proskuryakov. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "third_party/blink/renderer/core/editing/iterators/search_buffer.h" |
| |
| #include "third_party/blink/renderer/core/dom/document.h" |
| #include "third_party/blink/renderer/core/editing/ephemeral_range.h" |
| #include "third_party/blink/renderer/core/editing/iterators/character_iterator.h" |
| #include "third_party/blink/renderer/core/editing/iterators/simplified_backwards_text_iterator.h" |
| #include "third_party/blink/renderer/core/editing/iterators/text_searcher_icu.h" |
| #include "third_party/blink/renderer/platform/text/character.h" |
| #include "third_party/blink/renderer/platform/text/text_boundaries.h" |
| #include "third_party/blink/renderer/platform/text/unicode_utilities.h" |
| #include "third_party/blink/renderer/platform/wtf/text/string_view.h" |
| |
| namespace blink { |
| |
| namespace { |
| |
| const wtf_size_t kMinimumSearchBufferSize = 8192; |
| |
| UChar32 GetCodePointAt(const UChar* str, wtf_size_t index, wtf_size_t length) { |
| UChar32 c; |
| U16_GET(str, 0, index, length, c); |
| return c; |
| } |
| |
| } // namespace |
| |
| inline SearchBuffer::SearchBuffer(const String& target, FindOptions options) |
| : options_(options), |
| prefix_length_(0), |
| number_of_characters_just_appended_(0), |
| at_break_(true), |
| needs_more_context_(options & kWholeWord), |
| target_requires_kana_workaround_(ContainsKanaLetters(target)) { |
| DCHECK(!target.IsEmpty()) << target; |
| target.AppendTo(target_); |
| |
| // FIXME: We'd like to tailor the searcher to fold quote marks for us instead |
| // of doing it in a separate replacement pass here, but ICU doesn't offer a |
| // way to add tailoring on top of the locale-specific tailoring as of this |
| // writing. |
| FoldQuoteMarksAndSoftHyphens(target_.data(), target_.size()); |
| |
| wtf_size_t target_length = target_.size(); |
| buffer_.ReserveInitialCapacity( |
| std::max(target_length * 8, kMinimumSearchBufferSize)); |
| overlap_ = buffer_.capacity() / 4; |
| |
| if ((options_ & kWholeWord) && target_length) { |
| const UChar32 target_first_character = |
| GetCodePointAt(target_.data(), 0, target_length); |
| // Characters in the separator category never really occur at the beginning |
| // of a word, so if the target begins with such a character, we just ignore |
| // the AtWordStart option. |
| if (IsSeparator(target_first_character)) { |
| options_ &= ~kWholeWord; |
| needs_more_context_ = false; |
| } |
| } |
| |
| text_searcher_ = std::make_unique<TextSearcherICU>(); |
| text_searcher_->SetPattern(StringView(target_.data(), target_.size()), |
| options_); |
| |
| // The kana workaround requires a normalized copy of the target string. |
| if (target_requires_kana_workaround_) |
| NormalizeCharactersIntoNFCForm(target_.data(), target_.size(), |
| normalized_target_); |
| } |
| |
| inline SearchBuffer::~SearchBuffer() = default; |
| |
| template <typename CharType> |
| inline void SearchBuffer::Append(const CharType* characters, |
| wtf_size_t length) { |
| DCHECK(length); |
| |
| if (at_break_) { |
| buffer_.Shrink(0); |
| prefix_length_ = 0; |
| at_break_ = false; |
| } else if (buffer_.size() == buffer_.capacity()) { |
| memcpy(buffer_.data(), buffer_.data() + buffer_.size() - overlap_, |
| overlap_ * sizeof(UChar)); |
| prefix_length_ -= std::min(prefix_length_, buffer_.size() - overlap_); |
| buffer_.Shrink(overlap_); |
| } |
| |
| wtf_size_t old_length = buffer_.size(); |
| wtf_size_t usable_length = std::min(buffer_.capacity() - old_length, length); |
| DCHECK(usable_length); |
| buffer_.resize(old_length + usable_length); |
| UChar* destination = buffer_.data() + old_length; |
| StringImpl::CopyChars(destination, characters, usable_length); |
| FoldQuoteMarksAndSoftHyphens(destination, usable_length); |
| number_of_characters_just_appended_ = usable_length; |
| } |
| |
| inline bool SearchBuffer::NeedsMoreContext() const { |
| return needs_more_context_; |
| } |
| |
| inline void SearchBuffer::PrependContext(const UChar* characters, |
| wtf_size_t length) { |
| DCHECK(needs_more_context_); |
| DCHECK_EQ(prefix_length_, buffer_.size()); |
| |
| if (!length) |
| return; |
| |
| at_break_ = false; |
| |
| wtf_size_t word_boundary_context_start = length; |
| if (word_boundary_context_start) { |
| U16_BACK_1(characters, 0, word_boundary_context_start); |
| word_boundary_context_start = |
| StartOfLastWordBoundaryContext(characters, word_boundary_context_start); |
| } |
| |
| wtf_size_t usable_length = std::min(buffer_.capacity() - prefix_length_, |
| length - word_boundary_context_start); |
| buffer_.push_front(characters + length - usable_length, usable_length); |
| prefix_length_ += usable_length; |
| |
| if (word_boundary_context_start || prefix_length_ == buffer_.capacity()) |
| needs_more_context_ = false; |
| } |
| |
| inline bool SearchBuffer::AtBreak() const { |
| return at_break_; |
| } |
| |
| inline void SearchBuffer::ReachedBreak() { |
| at_break_ = true; |
| } |
| |
| inline bool SearchBuffer::IsBadMatch(const UChar* match, |
| wtf_size_t match_length) const { |
| // This function implements the kana workaround. If usearch treats |
| // it as a match, but we do not want to, then it's a "bad match". |
| if (!target_requires_kana_workaround_) |
| return false; |
| |
| // Normalize into a match buffer. We reuse a single buffer rather than |
| // creating a new one each time. |
| NormalizeCharactersIntoNFCForm(match, match_length, normalized_match_); |
| |
| return !CheckOnlyKanaLettersInStrings( |
| normalized_target_.begin(), normalized_target_.size(), |
| normalized_match_.begin(), normalized_match_.size()); |
| } |
| |
| inline bool SearchBuffer::IsWordStartMatch(wtf_size_t start, |
| wtf_size_t length) const { |
| DCHECK(options_ & kWholeWord); |
| |
| if (!start) |
| return true; |
| |
| int size = buffer_.size(); |
| int offset = start; |
| UChar32 first_character = GetCodePointAt(buffer_.data(), offset, size); |
| |
| // Chinese and Japanese lack word boundary marks, and there is no clear |
| // agreement on what constitutes a word, so treat the position before any CJK |
| // character as a word start. |
| if (Character::IsCJKIdeographOrSymbol(first_character)) |
| return true; |
| |
| wtf_size_t word_break_search_start = start + length; |
| while (word_break_search_start > start) { |
| word_break_search_start = FindNextWordBackward( |
| buffer_.data(), buffer_.size(), word_break_search_start); |
| } |
| if (word_break_search_start != start) |
| return false; |
| if (options_ & kWholeWord) |
| return static_cast<int>(start + length) == |
| FindWordEndBoundary(buffer_.data(), buffer_.size(), |
| word_break_search_start); |
| return true; |
| } |
| |
| inline wtf_size_t SearchBuffer::Search(wtf_size_t& start) { |
| wtf_size_t size = buffer_.size(); |
| if (at_break_) { |
| if (!size) |
| return 0; |
| } else { |
| if (size != buffer_.capacity()) |
| return 0; |
| } |
| |
| text_searcher_->SetText(buffer_.data(), size); |
| text_searcher_->SetOffset(prefix_length_); |
| |
| MatchResultICU match; |
| |
| nextMatch: |
| if (!text_searcher_->NextMatchResult(match)) |
| return 0; |
| |
| // Matches that start in the overlap area are only tentative. |
| // The same match may appear later, matching more characters, |
| // possibly including a combining character that's not yet in the buffer. |
| if (!at_break_ && match.start >= size - overlap_) { |
| wtf_size_t overlap = overlap_; |
| if (options_ & kWholeWord) { |
| // Ensure that there is sufficient context before matchStart the next time |
| // around for determining if it is at a word boundary. |
| int word_boundary_context_start = match.start; |
| U16_BACK_1(buffer_.data(), 0, word_boundary_context_start); |
| word_boundary_context_start = StartOfLastWordBoundaryContext( |
| buffer_.data(), word_boundary_context_start); |
| overlap = std::min(size - 1, |
| std::max(overlap, size - word_boundary_context_start)); |
| } |
| memcpy(buffer_.data(), buffer_.data() + size - overlap, |
| overlap * sizeof(UChar)); |
| prefix_length_ -= std::min(prefix_length_, size - overlap); |
| buffer_.Shrink(overlap); |
| return 0; |
| } |
| |
| SECURITY_DCHECK(match.start + match.length <= size); |
| |
| // If this match is "bad", move on to the next match. |
| if (IsBadMatch(buffer_.data() + match.start, match.length) || |
| ((options_ & kWholeWord) && |
| !IsWordStartMatch(match.start, match.length))) { |
| goto nextMatch; |
| } |
| |
| wtf_size_t new_size = size - (match.start + 1); |
| memmove(buffer_.data(), buffer_.data() + match.start + 1, |
| new_size * sizeof(UChar)); |
| prefix_length_ -= std::min<wtf_size_t>(prefix_length_, match.start + 1); |
| buffer_.Shrink(new_size); |
| |
| start = size - match.start; |
| return match.length; |
| } |
| |
| // Check if there's any unpaird surrogate code point. |
| // Non-character code points are not checked. |
| static bool IsValidUTF16(const String& s) { |
| if (s.Is8Bit()) |
| return true; |
| const UChar* ustr = s.Characters16(); |
| wtf_size_t length = s.length(); |
| wtf_size_t position = 0; |
| while (position < length) { |
| UChar32 character; |
| U16_NEXT(ustr, position, length, character); |
| if (U_IS_SURROGATE(character)) |
| return false; |
| } |
| return true; |
| } |
| |
| template <typename Strategy> |
| static wtf_size_t FindPlainTextInternal( |
| CharacterIteratorAlgorithm<Strategy>& it, |
| const String& target, |
| FindOptions options, |
| wtf_size_t& match_start) { |
| match_start = 0; |
| wtf_size_t match_length = 0; |
| |
| if (!IsValidUTF16(target)) |
| return 0; |
| |
| SearchBuffer buffer(target, options); |
| |
| if (buffer.NeedsMoreContext()) { |
| for (SimplifiedBackwardsTextIteratorAlgorithm<Strategy> backwards_iterator( |
| EphemeralRangeTemplate<Strategy>( |
| PositionTemplate<Strategy>::FirstPositionInNode( |
| it.OwnerDocument()), |
| PositionTemplate<Strategy>(it.CurrentContainer(), |
| it.StartOffset()))); |
| !backwards_iterator.AtEnd(); backwards_iterator.Advance()) { |
| BackwardsTextBuffer characters; |
| backwards_iterator.CopyTextTo(&characters); |
| buffer.PrependContext(characters.Data(), characters.Size()); |
| if (!buffer.NeedsMoreContext()) |
| break; |
| } |
| } |
| |
| while (!it.AtEnd()) { |
| ForwardsTextBuffer characters; |
| it.CopyTextTo(&characters); |
| buffer.Append(characters.Data(), characters.Size()); |
| it.Advance(buffer.NumberOfCharactersJustAppended()); |
| tryAgain: |
| wtf_size_t match_start_offset; |
| if (wtf_size_t new_match_length = buffer.Search(match_start_offset)) { |
| // Note that we found a match, and where we found it. |
| wtf_size_t last_character_in_buffer_offset = it.CharacterOffset(); |
| DCHECK_GE(last_character_in_buffer_offset, match_start_offset); |
| match_start = last_character_in_buffer_offset - match_start_offset; |
| match_length = new_match_length; |
| // If searching forward, stop on the first match. |
| // If searching backward, don't stop, so we end up with the last match. |
| if (!(options & kBackwards)) |
| break; |
| goto tryAgain; |
| } |
| if (it.AtBreak() && !buffer.AtBreak()) { |
| buffer.ReachedBreak(); |
| goto tryAgain; |
| } |
| } |
| |
| return match_length; |
| } |
| |
| template <typename Strategy> |
| static EphemeralRangeTemplate<Strategy> FindPlainTextAlgorithm( |
| const EphemeralRangeTemplate<Strategy>& input_range, |
| const String& target, |
| FindOptions options) { |
| // CharacterIterator requires layoutObjects to be up to date. |
| if (!input_range.StartPosition().IsConnected()) |
| return EphemeralRangeTemplate<Strategy>(); |
| DCHECK_EQ(input_range.StartPosition().GetDocument(), |
| input_range.EndPosition().GetDocument()); |
| |
| const TextIteratorBehavior& iterator_flags_for_find_plain_text = |
| TextIteratorBehavior::Builder() |
| .SetEntersTextControls(true) |
| .SetEntersOpenShadowRoots(true) |
| .SetDoesNotBreakAtReplacedElement(true) |
| .SetCollapseTrailingSpace(true) |
| .Build(); |
| |
| // FIXME: Reduce the code duplication with above (but how?). |
| wtf_size_t match_start; |
| wtf_size_t match_length; |
| { |
| const TextIteratorBehavior& behavior = |
| TextIteratorBehavior::Builder(iterator_flags_for_find_plain_text) |
| .SetForWindowFind(options & kFindAPICall) |
| .Build(); |
| CharacterIteratorAlgorithm<Strategy> find_iterator(input_range, behavior); |
| match_length = |
| FindPlainTextInternal(find_iterator, target, options, match_start); |
| if (!match_length) |
| return EphemeralRangeTemplate<Strategy>(options & kBackwards |
| ? input_range.StartPosition() |
| : input_range.EndPosition()); |
| } |
| |
| CharacterIteratorAlgorithm<Strategy> compute_range_iterator( |
| input_range, iterator_flags_for_find_plain_text); |
| return compute_range_iterator.CalculateCharacterSubrange(match_start, |
| match_length); |
| } |
| |
| EphemeralRange FindPlainText(const EphemeralRange& input_range, |
| const String& target, |
| FindOptions options) { |
| return FindPlainTextAlgorithm<EditingStrategy>(input_range, target, options); |
| } |
| |
| EphemeralRangeInFlatTree FindPlainText( |
| const EphemeralRangeInFlatTree& input_range, |
| const String& target, |
| FindOptions options) { |
| return FindPlainTextAlgorithm<EditingInFlatTreeStrategy>(input_range, target, |
| options); |
| } |
| |
| } // namespace blink |