| /* |
| * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Apple Inc. All rights reserved. |
| * Copyright (C) 2005 Alexey Proskuryakov. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "core/editing/iterators/SearchBuffer.h" |
| |
| #include "core/dom/Document.h" |
| #include "core/editing/iterators/CharacterIterator.h" |
| #include "core/editing/iterators/SimplifiedBackwardsTextIterator.h" |
| #include "platform/fonts/Character.h" |
| #include "platform/text/TextBoundaries.h" |
| #include "platform/text/TextBreakIteratorInternalICU.h" |
| #include "platform/text/UnicodeUtilities.h" |
| #include "wtf/text/CharacterNames.h" |
| #include <unicode/usearch.h> |
| |
| namespace blink { |
| |
| static const size_t minimumSearchBufferSize = 8192; |
| |
| #if ENABLE(ASSERT) |
| static bool searcherInUse; |
| #endif |
| |
| static UStringSearch* createSearcher() |
| { |
| // Provide a non-empty pattern and non-empty text so usearch_open will not fail, |
| // but it doesn't matter exactly what it is, since we don't perform any searches |
| // without setting both the pattern and the text. |
| UErrorCode status = U_ZERO_ERROR; |
| String searchCollatorName = currentSearchLocaleID() + String("@collation=search"); |
| UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, searchCollatorName.utf8().data(), 0, &status); |
| ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING); |
| return searcher; |
| } |
| |
| static UStringSearch* searcher() |
| { |
| static UStringSearch* searcher = createSearcher(); |
| return searcher; |
| } |
| |
| static inline void lockSearcher() |
| { |
| #if ENABLE(ASSERT) |
| ASSERT(!searcherInUse); |
| searcherInUse = true; |
| #endif |
| } |
| |
| static inline void unlockSearcher() |
| { |
| #if ENABLE(ASSERT) |
| ASSERT(searcherInUse); |
| searcherInUse = false; |
| #endif |
| } |
| |
| inline SearchBuffer::SearchBuffer(const String& target, FindOptions options) |
| : m_options(options) |
| , m_prefixLength(0) |
| , m_numberOfCharactersJustAppended(0) |
| , m_atBreak(true) |
| , m_needsMoreContext(options & AtWordStarts) |
| , m_targetRequiresKanaWorkaround(containsKanaLetters(target)) |
| { |
| ASSERT(!target.isEmpty()); |
| target.appendTo(m_target); |
| |
| // FIXME: We'd like to tailor the searcher to fold quote marks for us instead |
| // of doing it in a separate replacement pass here, but ICU doesn't offer a way |
| // to add tailoring on top of the locale-specific tailoring as of this writing. |
| foldQuoteMarksAndSoftHyphens(m_target.data(), m_target.size()); |
| |
| size_t targetLength = m_target.size(); |
| m_buffer.reserveInitialCapacity(std::max(targetLength * 8, minimumSearchBufferSize)); |
| m_overlap = m_buffer.capacity() / 4; |
| |
| if ((m_options & AtWordStarts) && targetLength) { |
| UChar32 targetFirstCharacter; |
| U16_GET(m_target.data(), 0, 0, targetLength, targetFirstCharacter); |
| // Characters in the separator category never really occur at the beginning of a word, |
| // so if the target begins with such a character, we just ignore the AtWordStart option. |
| if (isSeparator(targetFirstCharacter)) { |
| m_options &= ~AtWordStarts; |
| m_needsMoreContext = false; |
| } |
| } |
| |
| // Grab the single global searcher. |
| // If we ever have a reason to do more than once search buffer at once, we'll have |
| // to move to multiple searchers. |
| lockSearcher(); |
| |
| UStringSearch* searcher = blink::searcher(); |
| UCollator* collator = usearch_getCollator(searcher); |
| |
| UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY; |
| if (ucol_getStrength(collator) != strength) { |
| ucol_setStrength(collator, strength); |
| usearch_reset(searcher); |
| } |
| |
| UErrorCode status = U_ZERO_ERROR; |
| usearch_setPattern(searcher, m_target.data(), targetLength, &status); |
| ASSERT(status == U_ZERO_ERROR); |
| |
| // The kana workaround requires a normalized copy of the target string. |
| if (m_targetRequiresKanaWorkaround) |
| normalizeCharactersIntoNFCForm(m_target.data(), m_target.size(), m_normalizedTarget); |
| } |
| |
| inline SearchBuffer::~SearchBuffer() |
| { |
| // Leave the static object pointing to valid strings (pattern=targer, |
| // text=buffer). Otheriwse, usearch_reset() will results in 'use-after-free' |
| // error. |
| UErrorCode status = U_ZERO_ERROR; |
| usearch_setPattern(blink::searcher(), &newlineCharacter, 1, &status); |
| usearch_setText(blink::searcher(), &newlineCharacter, 1, &status); |
| ASSERT(status == U_ZERO_ERROR); |
| |
| unlockSearcher(); |
| } |
| |
| template<typename CharType> |
| inline void SearchBuffer::append(const CharType* characters, size_t length) |
| { |
| ASSERT(length); |
| |
| if (m_atBreak) { |
| m_buffer.shrink(0); |
| m_prefixLength = 0; |
| m_atBreak = false; |
| } else if (m_buffer.size() == m_buffer.capacity()) { |
| memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar)); |
| m_prefixLength -= std::min(m_prefixLength, m_buffer.size() - m_overlap); |
| m_buffer.shrink(m_overlap); |
| } |
| |
| size_t oldLength = m_buffer.size(); |
| size_t usableLength = std::min(m_buffer.capacity() - oldLength, length); |
| ASSERT(usableLength); |
| m_buffer.resize(oldLength + usableLength); |
| UChar* destination = m_buffer.data() + oldLength; |
| StringImpl::copyChars(destination, characters, usableLength); |
| foldQuoteMarksAndSoftHyphens(destination, usableLength); |
| m_numberOfCharactersJustAppended = usableLength; |
| } |
| |
| inline bool SearchBuffer::needsMoreContext() const |
| { |
| return m_needsMoreContext; |
| } |
| |
| inline void SearchBuffer::prependContext(const UChar* characters, size_t length) |
| { |
| ASSERT(m_needsMoreContext); |
| ASSERT(m_prefixLength == m_buffer.size()); |
| |
| if (!length) |
| return; |
| |
| m_atBreak = false; |
| |
| size_t wordBoundaryContextStart = length; |
| if (wordBoundaryContextStart) { |
| U16_BACK_1(characters, 0, wordBoundaryContextStart); |
| wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart); |
| } |
| |
| size_t usableLength = std::min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart); |
| m_buffer.prepend(characters + length - usableLength, usableLength); |
| m_prefixLength += usableLength; |
| |
| if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity()) |
| m_needsMoreContext = false; |
| } |
| |
| inline bool SearchBuffer::atBreak() const |
| { |
| return m_atBreak; |
| } |
| |
| inline void SearchBuffer::reachedBreak() |
| { |
| m_atBreak = true; |
| } |
| |
| inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const |
| { |
| // This function implements the kana workaround. If usearch treats |
| // it as a match, but we do not want to, then it's a "bad match". |
| if (!m_targetRequiresKanaWorkaround) |
| return false; |
| |
| // Normalize into a match buffer. We reuse a single buffer rather than |
| // creating a new one each time. |
| normalizeCharactersIntoNFCForm(match, matchLength, m_normalizedMatch); |
| |
| return !checkOnlyKanaLettersInStrings(m_normalizedTarget.begin(), m_normalizedTarget.size(), m_normalizedMatch.begin(), m_normalizedMatch.size()); |
| } |
| |
| inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const |
| { |
| ASSERT(m_options & AtWordStarts); |
| |
| if (!start) |
| return true; |
| |
| int size = m_buffer.size(); |
| int offset = start; |
| UChar32 firstCharacter; |
| U16_GET(m_buffer.data(), 0, offset, size, firstCharacter); |
| |
| if (m_options & TreatMedialCapitalAsWordStart) { |
| UChar32 previousCharacter; |
| U16_PREV(m_buffer.data(), 0, offset, previousCharacter); |
| |
| if (isSeparator(firstCharacter)) { |
| // The start of a separator run is a word start (".org" in "webkit.org"). |
| if (!isSeparator(previousCharacter)) |
| return true; |
| } else if (isASCIIUpper(firstCharacter)) { |
| // The start of an uppercase run is a word start ("Kit" in "WebKit"). |
| if (!isASCIIUpper(previousCharacter)) |
| return true; |
| // The last character of an uppercase run followed by a non-separator, non-digit |
| // is a word start ("Request" in "XMLHTTPRequest"). |
| offset = start; |
| U16_FWD_1(m_buffer.data(), offset, size); |
| UChar32 nextCharacter = 0; |
| if (offset < size) |
| U16_GET(m_buffer.data(), 0, offset, size, nextCharacter); |
| if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter)) |
| return true; |
| } else if (isASCIIDigit(firstCharacter)) { |
| // The start of a digit run is a word start ("2" in "WebKit2"). |
| if (!isASCIIDigit(previousCharacter)) |
| return true; |
| } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) { |
| // The start of a non-separator, non-uppercase, non-digit run is a word start, |
| // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore"). |
| return true; |
| } |
| } |
| |
| // Chinese and Japanese lack word boundary marks, and there is no clear agreement on what constitutes |
| // a word, so treat the position before any CJK character as a word start. |
| if (Character::isCJKIdeographOrSymbol(firstCharacter)) |
| return true; |
| |
| size_t wordBreakSearchStart = start + length; |
| while (wordBreakSearchStart > start) |
| wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */); |
| if (wordBreakSearchStart != start) |
| return false; |
| if (m_options & WholeWord) |
| return static_cast<int>(start + length) == findWordEndBoundary(m_buffer.data(), m_buffer.size(), wordBreakSearchStart); |
| return true; |
| } |
| |
| inline size_t SearchBuffer::search(size_t& start) |
| { |
| size_t size = m_buffer.size(); |
| if (m_atBreak) { |
| if (!size) |
| return 0; |
| } else { |
| if (size != m_buffer.capacity()) |
| return 0; |
| } |
| |
| UStringSearch* searcher = blink::searcher(); |
| |
| UErrorCode status = U_ZERO_ERROR; |
| usearch_setText(searcher, m_buffer.data(), size, &status); |
| ASSERT(status == U_ZERO_ERROR); |
| |
| usearch_setOffset(searcher, m_prefixLength, &status); |
| ASSERT(status == U_ZERO_ERROR); |
| |
| int matchStart = usearch_next(searcher, &status); |
| ASSERT(status == U_ZERO_ERROR); |
| |
| nextMatch: |
| if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) { |
| ASSERT(matchStart == USEARCH_DONE); |
| return 0; |
| } |
| |
| // Matches that start in the overlap area are only tentative. |
| // The same match may appear later, matching more characters, |
| // possibly including a combining character that's not yet in the buffer. |
| if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) { |
| size_t overlap = m_overlap; |
| if (m_options & AtWordStarts) { |
| // Ensure that there is sufficient context before matchStart the next time around for |
| // determining if it is at a word boundary. |
| int wordBoundaryContextStart = matchStart; |
| U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart); |
| wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart); |
| overlap = std::min(size - 1, std::max(overlap, size - wordBoundaryContextStart)); |
| } |
| memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar)); |
| m_prefixLength -= std::min(m_prefixLength, size - overlap); |
| m_buffer.shrink(overlap); |
| return 0; |
| } |
| |
| size_t matchedLength = usearch_getMatchedLength(searcher); |
| ASSERT_WITH_SECURITY_IMPLICATION(matchStart + matchedLength <= size); |
| |
| // If this match is "bad", move on to the next match. |
| if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) { |
| matchStart = usearch_next(searcher, &status); |
| ASSERT(status == U_ZERO_ERROR); |
| goto nextMatch; |
| } |
| |
| size_t newSize = size - (matchStart + 1); |
| memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar)); |
| m_prefixLength -= std::min<size_t>(m_prefixLength, matchStart + 1); |
| m_buffer.shrink(newSize); |
| |
| start = size - matchStart; |
| return matchedLength; |
| } |
| |
| // Check if there's any unpaird surrogate code point. |
| // Non-character code points are not checked. |
| static bool isValidUTF16(const String& s) |
| { |
| if (s.is8Bit()) |
| return true; |
| const UChar* ustr = s.characters16(); |
| size_t length = s.length(); |
| size_t position = 0; |
| while (position < length) { |
| UChar32 character; |
| U16_NEXT(ustr, position, length, character); |
| if (U_IS_SURROGATE(character)) |
| return false; |
| } |
| return true; |
| } |
| |
| template <typename Strategy> |
| static size_t findPlainTextInternal(CharacterIteratorAlgorithm<Strategy>& it, const String& target, FindOptions options, size_t& matchStart) |
| { |
| matchStart = 0; |
| size_t matchLength = 0; |
| |
| if (!isValidUTF16(target)) |
| return 0; |
| |
| SearchBuffer buffer(target, options); |
| |
| if (buffer.needsMoreContext()) { |
| for (SimplifiedBackwardsTextIteratorAlgorithm<Strategy> backwardsIterator(PositionTemplate<Strategy>::firstPositionInNode(it.ownerDocument()), PositionTemplate<Strategy>(it.currentContainer(), it.startOffset())); !backwardsIterator.atEnd(); backwardsIterator.advance()) { |
| BackwardsTextBuffer characters; |
| backwardsIterator.copyTextTo(&characters); |
| buffer.prependContext(characters.data(), characters.size()); |
| if (!buffer.needsMoreContext()) |
| break; |
| } |
| } |
| |
| while (!it.atEnd()) { |
| // TODO(xiaochengh): Should allow copying text to SearchBuffer directly |
| ForwardsTextBuffer characters; |
| it.copyTextTo(&characters); |
| buffer.append(characters.data(), characters.size()); |
| it.advance(buffer.numberOfCharactersJustAppended()); |
| tryAgain: |
| size_t matchStartOffset; |
| if (size_t newMatchLength = buffer.search(matchStartOffset)) { |
| // Note that we found a match, and where we found it. |
| size_t lastCharacterInBufferOffset = it.characterOffset(); |
| ASSERT(lastCharacterInBufferOffset >= matchStartOffset); |
| matchStart = lastCharacterInBufferOffset - matchStartOffset; |
| matchLength = newMatchLength; |
| // If searching forward, stop on the first match. |
| // If searching backward, don't stop, so we end up with the last match. |
| if (!(options & Backwards)) |
| break; |
| goto tryAgain; |
| } |
| if (it.atBreak() && !buffer.atBreak()) { |
| buffer.reachedBreak(); |
| goto tryAgain; |
| } |
| } |
| |
| return matchLength; |
| } |
| |
| static const TextIteratorBehaviorFlags iteratorFlagsForFindPlainText = TextIteratorEntersTextControls | TextIteratorEntersOpenShadowRoots | TextIteratorDoesNotBreakAtReplacedElement | TextIteratorCollapseTrailingSpace; |
| |
| template <typename Strategy> |
| static EphemeralRangeTemplate<Strategy> findPlainTextAlgorithm(const EphemeralRangeTemplate<Strategy>& inputRange, const String& target, FindOptions options) |
| { |
| // CharacterIterator requires layoutObjects to be up-to-date. |
| if (!inputRange.startPosition().inDocument()) |
| return EphemeralRangeTemplate<Strategy>(); |
| ASSERT(inputRange.startPosition().document() == inputRange.endPosition().document()); |
| |
| // FIXME: Reduce the code duplication with above (but how?). |
| size_t matchStart; |
| size_t matchLength; |
| { |
| TextIteratorBehaviorFlags behavior = iteratorFlagsForFindPlainText; |
| if (options & FindAPICall) |
| behavior |= TextIteratorForWindowFind; |
| CharacterIteratorAlgorithm<Strategy> findIterator(inputRange, behavior); |
| matchLength = findPlainTextInternal(findIterator, target, options, matchStart); |
| if (!matchLength) |
| return EphemeralRangeTemplate<Strategy>(options & Backwards ? inputRange.startPosition() : inputRange.endPosition()); |
| } |
| |
| CharacterIteratorAlgorithm<Strategy> computeRangeIterator(inputRange, iteratorFlagsForFindPlainText); |
| return computeRangeIterator.calculateCharacterSubrange(matchStart, matchLength); |
| } |
| |
| EphemeralRange findPlainText(const EphemeralRange& inputRange, const String& target, FindOptions options) |
| { |
| return findPlainTextAlgorithm<EditingStrategy>(inputRange, target, options); |
| } |
| |
| EphemeralRangeInFlatTree findPlainText(const EphemeralRangeInFlatTree& inputRange, const String& target, FindOptions options) |
| { |
| return findPlainTextAlgorithm<EditingInFlatTreeStrategy>(inputRange, target, options); |
| } |
| |
| } // namespace blink |