blob: 85a10542cdc7badf7b560eb4caaa16283aefe41c [file] [log] [blame]
/*
* Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Apple Inc. All
* rights reserved.
* Copyright (C) 2005 Alexey Proskuryakov.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "core/editing/iterators/SearchBuffer.h"
#include "core/dom/Document.h"
#include "core/editing/iterators/CharacterIterator.h"
#include "core/editing/iterators/SimplifiedBackwardsTextIterator.h"
#include "platform/text/Character.h"
#include "platform/text/TextBoundaries.h"
#include "platform/text/TextBreakIteratorInternalICU.h"
#include "platform/text/UnicodeUtilities.h"
#include "wtf/text/CharacterNames.h"
#include <unicode/usearch.h>
namespace blink {
static const size_t minimumSearchBufferSize = 8192;
#if DCHECK_IS_ON()
static bool searcherInUse;
#endif
static UStringSearch* createSearcher() {
// Provide a non-empty pattern and non-empty text so usearch_open will not
// fail, but it doesn't matter exactly what it is, since we don't perform any
// searches without setting both the pattern and the text.
UErrorCode status = U_ZERO_ERROR;
String searchCollatorName =
currentSearchLocaleID() + String("@collation=search");
UStringSearch* searcher =
usearch_open(&newlineCharacter, 1, &newlineCharacter, 1,
searchCollatorName.utf8().data(), 0, &status);
DCHECK(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING ||
status == U_USING_DEFAULT_WARNING)
<< status;
return searcher;
}
static UStringSearch* searcher() {
static UStringSearch* searcher = createSearcher();
return searcher;
}
static inline void lockSearcher() {
#if DCHECK_IS_ON()
DCHECK(!searcherInUse);
searcherInUse = true;
#endif
}
static inline void unlockSearcher() {
#if DCHECK_IS_ON()
DCHECK(searcherInUse);
searcherInUse = false;
#endif
}
inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
: m_options(options),
m_prefixLength(0),
m_numberOfCharactersJustAppended(0),
m_atBreak(true),
m_needsMoreContext(options & AtWordStarts),
m_targetRequiresKanaWorkaround(containsKanaLetters(target)) {
DCHECK(!target.isEmpty()) << target;
target.appendTo(m_target);
// FIXME: We'd like to tailor the searcher to fold quote marks for us instead
// of doing it in a separate replacement pass here, but ICU doesn't offer a
// way to add tailoring on top of the locale-specific tailoring as of this
// writing.
foldQuoteMarksAndSoftHyphens(m_target.data(), m_target.size());
size_t targetLength = m_target.size();
m_buffer.reserveInitialCapacity(
std::max(targetLength * 8, minimumSearchBufferSize));
m_overlap = m_buffer.capacity() / 4;
if ((m_options & AtWordStarts) && targetLength) {
UChar32 targetFirstCharacter;
U16_GET(m_target.data(), 0, 0, targetLength, targetFirstCharacter);
// Characters in the separator category never really occur at the beginning
// of a word, so if the target begins with such a character, we just ignore
// the AtWordStart option.
if (isSeparator(targetFirstCharacter)) {
m_options &= ~AtWordStarts;
m_needsMoreContext = false;
}
}
// Grab the single global searcher.
// If we ever have a reason to do more than once search buffer at once, we'll
// have to move to multiple searchers.
lockSearcher();
UStringSearch* searcher = blink::searcher();
UCollator* collator = usearch_getCollator(searcher);
UCollationStrength strength =
m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
if (ucol_getStrength(collator) != strength) {
ucol_setStrength(collator, strength);
usearch_reset(searcher);
}
UErrorCode status = U_ZERO_ERROR;
usearch_setPattern(searcher, m_target.data(), targetLength, &status);
DCHECK_EQ(status, U_ZERO_ERROR);
// The kana workaround requires a normalized copy of the target string.
if (m_targetRequiresKanaWorkaround)
normalizeCharactersIntoNFCForm(m_target.data(), m_target.size(),
m_normalizedTarget);
}
inline SearchBuffer::~SearchBuffer() {
// Leave the static object pointing to valid strings (pattern=targer,
// text=buffer). Otheriwse, usearch_reset() will results in 'use-after-free'
// error.
UErrorCode status = U_ZERO_ERROR;
usearch_setPattern(blink::searcher(), &newlineCharacter, 1, &status);
usearch_setText(blink::searcher(), &newlineCharacter, 1, &status);
DCHECK_EQ(status, U_ZERO_ERROR);
unlockSearcher();
}
template <typename CharType>
inline void SearchBuffer::append(const CharType* characters, size_t length) {
DCHECK(length);
if (m_atBreak) {
m_buffer.shrink(0);
m_prefixLength = 0;
m_atBreak = false;
} else if (m_buffer.size() == m_buffer.capacity()) {
memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap,
m_overlap * sizeof(UChar));
m_prefixLength -= std::min(m_prefixLength, m_buffer.size() - m_overlap);
m_buffer.shrink(m_overlap);
}
size_t oldLength = m_buffer.size();
size_t usableLength = std::min(m_buffer.capacity() - oldLength, length);
DCHECK(usableLength);
m_buffer.resize(oldLength + usableLength);
UChar* destination = m_buffer.data() + oldLength;
StringImpl::copyChars(destination, characters, usableLength);
foldQuoteMarksAndSoftHyphens(destination, usableLength);
m_numberOfCharactersJustAppended = usableLength;
}
inline bool SearchBuffer::needsMoreContext() const {
return m_needsMoreContext;
}
inline void SearchBuffer::prependContext(const UChar* characters,
size_t length) {
DCHECK(m_needsMoreContext);
DCHECK_EQ(m_prefixLength, m_buffer.size());
if (!length)
return;
m_atBreak = false;
size_t wordBoundaryContextStart = length;
if (wordBoundaryContextStart) {
U16_BACK_1(characters, 0, wordBoundaryContextStart);
wordBoundaryContextStart =
startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
}
size_t usableLength = std::min(m_buffer.capacity() - m_prefixLength,
length - wordBoundaryContextStart);
m_buffer.prepend(characters + length - usableLength, usableLength);
m_prefixLength += usableLength;
if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
m_needsMoreContext = false;
}
inline bool SearchBuffer::atBreak() const {
return m_atBreak;
}
inline void SearchBuffer::reachedBreak() {
m_atBreak = true;
}
inline bool SearchBuffer::isBadMatch(const UChar* match,
size_t matchLength) const {
// This function implements the kana workaround. If usearch treats
// it as a match, but we do not want to, then it's a "bad match".
if (!m_targetRequiresKanaWorkaround)
return false;
// Normalize into a match buffer. We reuse a single buffer rather than
// creating a new one each time.
normalizeCharactersIntoNFCForm(match, matchLength, m_normalizedMatch);
return !checkOnlyKanaLettersInStrings(
m_normalizedTarget.begin(), m_normalizedTarget.size(),
m_normalizedMatch.begin(), m_normalizedMatch.size());
}
inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const {
DCHECK(m_options & AtWordStarts);
if (!start)
return true;
int size = m_buffer.size();
int offset = start;
UChar32 firstCharacter;
U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
if (m_options & TreatMedialCapitalAsWordStart) {
UChar32 previousCharacter;
U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
if (isSeparator(firstCharacter)) {
// The start of a separator run is a word start (".org" in "webkit.org").
if (!isSeparator(previousCharacter))
return true;
} else if (isASCIIUpper(firstCharacter)) {
// The start of an uppercase run is a word start ("Kit" in "WebKit").
if (!isASCIIUpper(previousCharacter))
return true;
// The last character of an uppercase run followed by a non-separator,
// non-digit is a word start ("Request" in "XMLHTTPRequest").
offset = start;
U16_FWD_1(m_buffer.data(), offset, size);
UChar32 nextCharacter = 0;
if (offset < size)
U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) &&
!isSeparator(nextCharacter))
return true;
} else if (isASCIIDigit(firstCharacter)) {
// The start of a digit run is a word start ("2" in "WebKit2").
if (!isASCIIDigit(previousCharacter))
return true;
} else if (isSeparator(previousCharacter) ||
isASCIIDigit(previousCharacter)) {
// The start of a non-separator, non-uppercase, non-digit run is a word
// start, except after an uppercase. ("org" in "webkit.org", but not "ore"
// in "WebCore").
return true;
}
}
// Chinese and Japanese lack word boundary marks, and there is no clear
// agreement on what constitutes a word, so treat the position before any CJK
// character as a word start.
if (Character::isCJKIdeographOrSymbol(firstCharacter))
return true;
size_t wordBreakSearchStart = start + length;
while (wordBreakSearchStart > start)
wordBreakSearchStart =
findNextWordFromIndex(m_buffer.data(), m_buffer.size(),
wordBreakSearchStart, false /* backwards */);
if (wordBreakSearchStart != start)
return false;
if (m_options & WholeWord)
return static_cast<int>(start + length) ==
findWordEndBoundary(m_buffer.data(), m_buffer.size(),
wordBreakSearchStart);
return true;
}
inline size_t SearchBuffer::search(size_t& start) {
size_t size = m_buffer.size();
if (m_atBreak) {
if (!size)
return 0;
} else {
if (size != m_buffer.capacity())
return 0;
}
UStringSearch* searcher = blink::searcher();
UErrorCode status = U_ZERO_ERROR;
usearch_setText(searcher, m_buffer.data(), size, &status);
DCHECK_EQ(status, U_ZERO_ERROR);
usearch_setOffset(searcher, m_prefixLength, &status);
DCHECK_EQ(status, U_ZERO_ERROR);
int matchStart = usearch_next(searcher, &status);
DCHECK_EQ(status, U_ZERO_ERROR);
nextMatch:
if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
DCHECK_EQ(matchStart, USEARCH_DONE);
return 0;
}
// Matches that start in the overlap area are only tentative.
// The same match may appear later, matching more characters,
// possibly including a combining character that's not yet in the buffer.
if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
size_t overlap = m_overlap;
if (m_options & AtWordStarts) {
// Ensure that there is sufficient context before matchStart the next time
// around for determining if it is at a word boundary.
int wordBoundaryContextStart = matchStart;
U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
wordBoundaryContextStart = startOfLastWordBoundaryContext(
m_buffer.data(), wordBoundaryContextStart);
overlap = std::min(size - 1,
std::max(overlap, size - wordBoundaryContextStart));
}
memcpy(m_buffer.data(), m_buffer.data() + size - overlap,
overlap * sizeof(UChar));
m_prefixLength -= std::min(m_prefixLength, size - overlap);
m_buffer.shrink(overlap);
return 0;
}
size_t matchedLength = usearch_getMatchedLength(searcher);
ASSERT_WITH_SECURITY_IMPLICATION(matchStart + matchedLength <= size);
// If this match is "bad", move on to the next match.
if (isBadMatch(m_buffer.data() + matchStart, matchedLength) ||
((m_options & AtWordStarts) &&
!isWordStartMatch(matchStart, matchedLength))) {
matchStart = usearch_next(searcher, &status);
DCHECK_EQ(status, U_ZERO_ERROR);
goto nextMatch;
}
size_t newSize = size - (matchStart + 1);
memmove(m_buffer.data(), m_buffer.data() + matchStart + 1,
newSize * sizeof(UChar));
m_prefixLength -= std::min<size_t>(m_prefixLength, matchStart + 1);
m_buffer.shrink(newSize);
start = size - matchStart;
return matchedLength;
}
// Check if there's any unpaird surrogate code point.
// Non-character code points are not checked.
static bool isValidUTF16(const String& s) {
if (s.is8Bit())
return true;
const UChar* ustr = s.characters16();
size_t length = s.length();
size_t position = 0;
while (position < length) {
UChar32 character;
U16_NEXT(ustr, position, length, character);
if (U_IS_SURROGATE(character))
return false;
}
return true;
}
template <typename Strategy>
static size_t findPlainTextInternal(CharacterIteratorAlgorithm<Strategy>& it,
const String& target,
FindOptions options,
size_t& matchStart) {
matchStart = 0;
size_t matchLength = 0;
if (!isValidUTF16(target))
return 0;
SearchBuffer buffer(target, options);
if (buffer.needsMoreContext()) {
for (SimplifiedBackwardsTextIteratorAlgorithm<Strategy> backwardsIterator(
PositionTemplate<Strategy>::firstPositionInNode(
it.ownerDocument()),
PositionTemplate<Strategy>(it.currentContainer(),
it.startOffset()));
!backwardsIterator.atEnd(); backwardsIterator.advance()) {
BackwardsTextBuffer characters;
backwardsIterator.copyTextTo(&characters);
buffer.prependContext(characters.data(), characters.size());
if (!buffer.needsMoreContext())
break;
}
}
while (!it.atEnd()) {
// TODO(xiaochengh): Should allow copying text to SearchBuffer directly
ForwardsTextBuffer characters;
it.copyTextTo(&characters);
buffer.append(characters.data(), characters.size());
it.advance(buffer.numberOfCharactersJustAppended());
tryAgain:
size_t matchStartOffset;
if (size_t newMatchLength = buffer.search(matchStartOffset)) {
// Note that we found a match, and where we found it.
size_t lastCharacterInBufferOffset = it.characterOffset();
DCHECK_GE(lastCharacterInBufferOffset, matchStartOffset);
matchStart = lastCharacterInBufferOffset - matchStartOffset;
matchLength = newMatchLength;
// If searching forward, stop on the first match.
// If searching backward, don't stop, so we end up with the last match.
if (!(options & Backwards))
break;
goto tryAgain;
}
if (it.atBreak() && !buffer.atBreak()) {
buffer.reachedBreak();
goto tryAgain;
}
}
return matchLength;
}
static const TextIteratorBehaviorFlags iteratorFlagsForFindPlainText =
TextIteratorEntersTextControls | TextIteratorEntersOpenShadowRoots |
TextIteratorDoesNotBreakAtReplacedElement |
TextIteratorCollapseTrailingSpace;
template <typename Strategy>
static EphemeralRangeTemplate<Strategy> findPlainTextAlgorithm(
const EphemeralRangeTemplate<Strategy>& inputRange,
const String& target,
FindOptions options) {
// CharacterIterator requires layoutObjects to be up to date.
if (!inputRange.startPosition().isConnected())
return EphemeralRangeTemplate<Strategy>();
DCHECK_EQ(inputRange.startPosition().document(),
inputRange.endPosition().document());
// FIXME: Reduce the code duplication with above (but how?).
size_t matchStart;
size_t matchLength;
{
TextIteratorBehaviorFlags behavior = iteratorFlagsForFindPlainText;
if (options & FindAPICall)
behavior |= TextIteratorForWindowFind;
CharacterIteratorAlgorithm<Strategy> findIterator(inputRange, behavior);
matchLength =
findPlainTextInternal(findIterator, target, options, matchStart);
if (!matchLength)
return EphemeralRangeTemplate<Strategy>(options & Backwards
? inputRange.startPosition()
: inputRange.endPosition());
}
CharacterIteratorAlgorithm<Strategy> computeRangeIterator(
inputRange, iteratorFlagsForFindPlainText);
return computeRangeIterator.calculateCharacterSubrange(matchStart,
matchLength);
}
EphemeralRange findPlainText(const EphemeralRange& inputRange,
const String& target,
FindOptions options) {
return findPlainTextAlgorithm<EditingStrategy>(inputRange, target, options);
}
EphemeralRangeInFlatTree findPlainText(
const EphemeralRangeInFlatTree& inputRange,
const String& target,
FindOptions options) {
return findPlainTextAlgorithm<EditingInFlatTreeStrategy>(inputRange, target,
options);
}
} // namespace blink