blob: dc6a3145241aa5d5fb183842d2e2e3f0a29761c9 [file] [log] [blame]
/*
* Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Apple Inc. All
* rights reserved.
* Copyright (C) 2005 Alexey Proskuryakov.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "third_party/blink/renderer/core/editing/iterators/search_buffer.h"
#include "third_party/blink/renderer/core/dom/document.h"
#include "third_party/blink/renderer/core/editing/ephemeral_range.h"
#include "third_party/blink/renderer/core/editing/iterators/character_iterator.h"
#include "third_party/blink/renderer/core/editing/iterators/simplified_backwards_text_iterator.h"
#include "third_party/blink/renderer/core/editing/iterators/text_searcher_icu.h"
#include "third_party/blink/renderer/platform/text/character.h"
#include "third_party/blink/renderer/platform/text/text_boundaries.h"
#include "third_party/blink/renderer/platform/text/unicode_utilities.h"
#include "third_party/blink/renderer/platform/wtf/text/string_view.h"
namespace blink {
namespace {
const wtf_size_t kMinimumSearchBufferSize = 8192;
UChar32 GetCodePointAt(const UChar* str, wtf_size_t index, wtf_size_t length) {
UChar32 c;
U16_GET(str, 0, index, length, c);
return c;
}
} // namespace
inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
: options_(options),
prefix_length_(0),
number_of_characters_just_appended_(0),
at_break_(true),
needs_more_context_(options & kWholeWord),
target_requires_kana_workaround_(ContainsKanaLetters(target)) {
DCHECK(!target.IsEmpty()) << target;
target.AppendTo(target_);
// FIXME: We'd like to tailor the searcher to fold quote marks for us instead
// of doing it in a separate replacement pass here, but ICU doesn't offer a
// way to add tailoring on top of the locale-specific tailoring as of this
// writing.
FoldQuoteMarksAndSoftHyphens(target_.data(), target_.size());
wtf_size_t target_length = target_.size();
buffer_.ReserveInitialCapacity(
std::max(target_length * 8, kMinimumSearchBufferSize));
overlap_ = buffer_.capacity() / 4;
if ((options_ & kWholeWord) && target_length) {
const UChar32 target_first_character =
GetCodePointAt(target_.data(), 0, target_length);
// Characters in the separator category never really occur at the beginning
// of a word, so if the target begins with such a character, we just ignore
// the AtWordStart option.
if (IsSeparator(target_first_character)) {
options_ &= ~kWholeWord;
needs_more_context_ = false;
}
}
text_searcher_ = std::make_unique<TextSearcherICU>();
text_searcher_->SetPattern(StringView(target_.data(), target_.size()),
options_);
// The kana workaround requires a normalized copy of the target string.
if (target_requires_kana_workaround_)
NormalizeCharactersIntoNFCForm(target_.data(), target_.size(),
normalized_target_);
}
inline SearchBuffer::~SearchBuffer() = default;
template <typename CharType>
inline void SearchBuffer::Append(const CharType* characters,
wtf_size_t length) {
DCHECK(length);
if (at_break_) {
buffer_.Shrink(0);
prefix_length_ = 0;
at_break_ = false;
} else if (buffer_.size() == buffer_.capacity()) {
memcpy(buffer_.data(), buffer_.data() + buffer_.size() - overlap_,
overlap_ * sizeof(UChar));
prefix_length_ -= std::min(prefix_length_, buffer_.size() - overlap_);
buffer_.Shrink(overlap_);
}
wtf_size_t old_length = buffer_.size();
wtf_size_t usable_length = std::min(buffer_.capacity() - old_length, length);
DCHECK(usable_length);
buffer_.resize(old_length + usable_length);
UChar* destination = buffer_.data() + old_length;
StringImpl::CopyChars(destination, characters, usable_length);
FoldQuoteMarksAndSoftHyphens(destination, usable_length);
number_of_characters_just_appended_ = usable_length;
}
inline bool SearchBuffer::NeedsMoreContext() const {
return needs_more_context_;
}
inline void SearchBuffer::PrependContext(const UChar* characters,
wtf_size_t length) {
DCHECK(needs_more_context_);
DCHECK_EQ(prefix_length_, buffer_.size());
if (!length)
return;
at_break_ = false;
wtf_size_t word_boundary_context_start = length;
if (word_boundary_context_start) {
U16_BACK_1(characters, 0, word_boundary_context_start);
word_boundary_context_start =
StartOfLastWordBoundaryContext(characters, word_boundary_context_start);
}
wtf_size_t usable_length = std::min(buffer_.capacity() - prefix_length_,
length - word_boundary_context_start);
buffer_.push_front(characters + length - usable_length, usable_length);
prefix_length_ += usable_length;
if (word_boundary_context_start || prefix_length_ == buffer_.capacity())
needs_more_context_ = false;
}
inline bool SearchBuffer::AtBreak() const {
return at_break_;
}
inline void SearchBuffer::ReachedBreak() {
at_break_ = true;
}
inline bool SearchBuffer::IsBadMatch(const UChar* match,
wtf_size_t match_length) const {
// This function implements the kana workaround. If usearch treats
// it as a match, but we do not want to, then it's a "bad match".
if (!target_requires_kana_workaround_)
return false;
// Normalize into a match buffer. We reuse a single buffer rather than
// creating a new one each time.
NormalizeCharactersIntoNFCForm(match, match_length, normalized_match_);
return !CheckOnlyKanaLettersInStrings(
normalized_target_.begin(), normalized_target_.size(),
normalized_match_.begin(), normalized_match_.size());
}
inline bool SearchBuffer::IsWordStartMatch(wtf_size_t start,
wtf_size_t length) const {
DCHECK(options_ & kWholeWord);
if (!start)
return true;
int size = buffer_.size();
int offset = start;
UChar32 first_character = GetCodePointAt(buffer_.data(), offset, size);
// Chinese and Japanese lack word boundary marks, and there is no clear
// agreement on what constitutes a word, so treat the position before any CJK
// character as a word start.
if (Character::IsCJKIdeographOrSymbol(first_character))
return true;
wtf_size_t word_break_search_start = start + length;
while (word_break_search_start > start) {
word_break_search_start = FindNextWordBackward(
buffer_.data(), buffer_.size(), word_break_search_start);
}
if (word_break_search_start != start)
return false;
if (options_ & kWholeWord)
return static_cast<int>(start + length) ==
FindWordEndBoundary(buffer_.data(), buffer_.size(),
word_break_search_start);
return true;
}
inline wtf_size_t SearchBuffer::Search(wtf_size_t& start) {
wtf_size_t size = buffer_.size();
if (at_break_) {
if (!size)
return 0;
} else {
if (size != buffer_.capacity())
return 0;
}
text_searcher_->SetText(buffer_.data(), size);
text_searcher_->SetOffset(prefix_length_);
MatchResultICU match;
nextMatch:
if (!text_searcher_->NextMatchResult(match))
return 0;
// Matches that start in the overlap area are only tentative.
// The same match may appear later, matching more characters,
// possibly including a combining character that's not yet in the buffer.
if (!at_break_ && match.start >= size - overlap_) {
wtf_size_t overlap = overlap_;
if (options_ & kWholeWord) {
// Ensure that there is sufficient context before matchStart the next time
// around for determining if it is at a word boundary.
int word_boundary_context_start = match.start;
U16_BACK_1(buffer_.data(), 0, word_boundary_context_start);
word_boundary_context_start = StartOfLastWordBoundaryContext(
buffer_.data(), word_boundary_context_start);
overlap = std::min(size - 1,
std::max(overlap, size - word_boundary_context_start));
}
memcpy(buffer_.data(), buffer_.data() + size - overlap,
overlap * sizeof(UChar));
prefix_length_ -= std::min(prefix_length_, size - overlap);
buffer_.Shrink(overlap);
return 0;
}
SECURITY_DCHECK(match.start + match.length <= size);
// If this match is "bad", move on to the next match.
if (IsBadMatch(buffer_.data() + match.start, match.length) ||
((options_ & kWholeWord) &&
!IsWordStartMatch(match.start, match.length))) {
goto nextMatch;
}
wtf_size_t new_size = size - (match.start + 1);
memmove(buffer_.data(), buffer_.data() + match.start + 1,
new_size * sizeof(UChar));
prefix_length_ -= std::min<wtf_size_t>(prefix_length_, match.start + 1);
buffer_.Shrink(new_size);
start = size - match.start;
return match.length;
}
// Check if there's any unpaird surrogate code point.
// Non-character code points are not checked.
static bool IsValidUTF16(const String& s) {
if (s.Is8Bit())
return true;
const UChar* ustr = s.Characters16();
wtf_size_t length = s.length();
wtf_size_t position = 0;
while (position < length) {
UChar32 character;
U16_NEXT(ustr, position, length, character);
if (U_IS_SURROGATE(character))
return false;
}
return true;
}
template <typename Strategy>
static wtf_size_t FindPlainTextInternal(
CharacterIteratorAlgorithm<Strategy>& it,
const String& target,
FindOptions options,
wtf_size_t& match_start) {
match_start = 0;
wtf_size_t match_length = 0;
if (!IsValidUTF16(target))
return 0;
SearchBuffer buffer(target, options);
if (buffer.NeedsMoreContext()) {
for (SimplifiedBackwardsTextIteratorAlgorithm<Strategy> backwards_iterator(
EphemeralRangeTemplate<Strategy>(
PositionTemplate<Strategy>::FirstPositionInNode(
it.OwnerDocument()),
PositionTemplate<Strategy>(it.CurrentContainer(),
it.StartOffset())));
!backwards_iterator.AtEnd(); backwards_iterator.Advance()) {
BackwardsTextBuffer characters;
backwards_iterator.CopyTextTo(&characters);
buffer.PrependContext(characters.Data(), characters.Size());
if (!buffer.NeedsMoreContext())
break;
}
}
while (!it.AtEnd()) {
ForwardsTextBuffer characters;
it.CopyTextTo(&characters);
buffer.Append(characters.Data(), characters.Size());
it.Advance(buffer.NumberOfCharactersJustAppended());
tryAgain:
wtf_size_t match_start_offset;
if (wtf_size_t new_match_length = buffer.Search(match_start_offset)) {
// Note that we found a match, and where we found it.
wtf_size_t last_character_in_buffer_offset = it.CharacterOffset();
DCHECK_GE(last_character_in_buffer_offset, match_start_offset);
match_start = last_character_in_buffer_offset - match_start_offset;
match_length = new_match_length;
// If searching forward, stop on the first match.
// If searching backward, don't stop, so we end up with the last match.
if (!(options & kBackwards))
break;
goto tryAgain;
}
if (it.AtBreak() && !buffer.AtBreak()) {
buffer.ReachedBreak();
goto tryAgain;
}
}
return match_length;
}
template <typename Strategy>
static EphemeralRangeTemplate<Strategy> FindPlainTextAlgorithm(
const EphemeralRangeTemplate<Strategy>& input_range,
const String& target,
FindOptions options) {
// CharacterIterator requires layoutObjects to be up to date.
if (!input_range.StartPosition().IsConnected())
return EphemeralRangeTemplate<Strategy>();
DCHECK_EQ(input_range.StartPosition().GetDocument(),
input_range.EndPosition().GetDocument());
const TextIteratorBehavior& iterator_flags_for_find_plain_text =
TextIteratorBehavior::Builder()
.SetEntersTextControls(true)
.SetEntersOpenShadowRoots(true)
.SetDoesNotBreakAtReplacedElement(true)
.SetCollapseTrailingSpace(true)
.Build();
// FIXME: Reduce the code duplication with above (but how?).
wtf_size_t match_start;
wtf_size_t match_length;
{
const TextIteratorBehavior& behavior =
TextIteratorBehavior::Builder(iterator_flags_for_find_plain_text)
.SetForWindowFind(options & kFindAPICall)
.Build();
CharacterIteratorAlgorithm<Strategy> find_iterator(input_range, behavior);
match_length =
FindPlainTextInternal(find_iterator, target, options, match_start);
if (!match_length)
return EphemeralRangeTemplate<Strategy>(options & kBackwards
? input_range.StartPosition()
: input_range.EndPosition());
}
CharacterIteratorAlgorithm<Strategy> compute_range_iterator(
input_range, iterator_flags_for_find_plain_text);
return compute_range_iterator.CalculateCharacterSubrange(match_start,
match_length);
}
EphemeralRange FindPlainText(const EphemeralRange& input_range,
const String& target,
FindOptions options) {
return FindPlainTextAlgorithm<EditingStrategy>(input_range, target, options);
}
EphemeralRangeInFlatTree FindPlainText(
const EphemeralRangeInFlatTree& input_range,
const String& target,
FindOptions options) {
return FindPlainTextAlgorithm<EditingInFlatTreeStrategy>(input_range, target,
options);
}
} // namespace blink