| /* |
| * Copyright (C) 2006 Lars Knoll <lars@trolltech.com> |
| * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved. |
| * |
| * This library is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Library General Public |
| * License as published by the Free Software Foundation; either |
| * version 2 of the License, or (at your option) any later version. |
| * |
| * This library is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Library General Public License for more details. |
| * |
| * You should have received a copy of the GNU Library General Public License |
| * along with this library; see the file COPYING.LIB. If not, write to |
| * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
| * Boston, MA 02110-1301, USA. |
| * |
| */ |
| |
| #include "platform/text/TextBreakIterator.h" |
| |
| #include "platform/text/TextBreakIteratorInternalICU.h" |
| #include "wtf/Assertions.h" |
| #include "wtf/HashMap.h" |
| #include "wtf/PtrUtil.h" |
| #include "wtf/ThreadSpecific.h" |
| #include "wtf/ThreadingPrimitives.h" |
| #include "wtf/text/WTFString.h" |
| #include <memory> |
| #include <unicode/rbbi.h> |
| #include <unicode/ubrk.h> |
| |
| using namespace WTF; |
| |
| namespace blink { |
| |
| class LineBreakIteratorPool final { |
| USING_FAST_MALLOC(LineBreakIteratorPool); |
| WTF_MAKE_NONCOPYABLE(LineBreakIteratorPool); |
| |
| public: |
| static LineBreakIteratorPool& sharedPool() { |
| static WTF::ThreadSpecific<LineBreakIteratorPool>* pool = |
| new WTF::ThreadSpecific<LineBreakIteratorPool>; |
| return **pool; |
| } |
| |
| static std::unique_ptr<LineBreakIteratorPool> create() { |
| return wrapUnique(new LineBreakIteratorPool); |
| } |
| |
| icu::BreakIterator* take(const AtomicString& locale) { |
| icu::BreakIterator* iterator = 0; |
| for (size_t i = 0; i < m_pool.size(); ++i) { |
| if (m_pool[i].first == locale) { |
| iterator = m_pool[i].second; |
| m_pool.remove(i); |
| break; |
| } |
| } |
| |
| if (!iterator) { |
| UErrorCode openStatus = U_ZERO_ERROR; |
| bool localeIsEmpty = locale.isEmpty(); |
| iterator = icu::BreakIterator::createLineInstance( |
| localeIsEmpty ? icu::Locale(currentTextBreakLocaleID()) |
| : icu::Locale(locale.utf8().data()), |
| openStatus); |
| // locale comes from a web page and it can be invalid, leading ICU |
| // to fail, in which case we fall back to the default locale. |
| if (!localeIsEmpty && U_FAILURE(openStatus)) { |
| openStatus = U_ZERO_ERROR; |
| iterator = icu::BreakIterator::createLineInstance( |
| icu::Locale(currentTextBreakLocaleID()), openStatus); |
| } |
| |
| if (U_FAILURE(openStatus)) { |
| DLOG(ERROR) << "icu::BreakIterator construction failed with status " |
| << openStatus; |
| return 0; |
| } |
| } |
| |
| ASSERT(!m_vendedIterators.contains(iterator)); |
| m_vendedIterators.set(iterator, locale); |
| return iterator; |
| } |
| |
| void put(icu::BreakIterator* iterator) { |
| DCHECK(m_vendedIterators.contains(iterator)); |
| |
| if (m_pool.size() == capacity) { |
| delete (m_pool[0].second); |
| m_pool.remove(0); |
| } |
| |
| m_pool.append(Entry(m_vendedIterators.take(iterator), iterator)); |
| } |
| |
| private: |
| LineBreakIteratorPool() {} |
| |
| static const size_t capacity = 4; |
| |
| typedef std::pair<AtomicString, icu::BreakIterator*> Entry; |
| typedef Vector<Entry, capacity> Pool; |
| Pool m_pool; |
| HashMap<icu::BreakIterator*, AtomicString> m_vendedIterators; |
| |
| friend WTF::ThreadSpecific<LineBreakIteratorPool>:: |
| operator LineBreakIteratorPool*(); |
| }; |
| |
// Which backing buffer the current UText chunk refers to: none yet, the prior
// context, or the primary text.
enum TextContext {
  NoContext,
  PriorContext,
  PrimaryContext,
};

// Number of UChars in the scratch buffer embedded in UTextWithBuffer.
const int textBufferCapacity = 16;
| |
| typedef struct { |
| DISALLOW_NEW(); |
| UText text; |
| UChar buffer[textBufferCapacity]; |
| } UTextWithBuffer; |
| |
// Clamps |index| in place: negative values become 0, values above |limit|
// become |limit|. The (possibly updated) index is also returned.
static inline int64_t textPinIndex(int64_t& index, int64_t limit) {
  if (index < 0) {
    index = 0;
    return index;
  }
  if (index > limit)
    index = limit;
  return index;
}
| |
| static inline int64_t textNativeLength(UText* text) { |
| return text->a + text->b; |
| } |
| |
| // Relocate pointer from source into destination as required. |
| static void textFixPointer(const UText* source, |
| UText* destination, |
| const void*& pointer) { |
| if (pointer >= source->pExtra && |
| pointer < static_cast<char*>(source->pExtra) + source->extraSize) { |
| // Pointer references source extra buffer. |
| pointer = static_cast<char*>(destination->pExtra) + |
| (static_cast<const char*>(pointer) - |
| static_cast<const char*>(source->pExtra)); |
| } else if (pointer >= source && |
| pointer < |
| reinterpret_cast<const char*>(source) + source->sizeOfStruct) { |
| // Pointer references source text structure, but not source extra buffer. |
| pointer = reinterpret_cast<char*>(destination) + |
| (static_cast<const char*>(pointer) - |
| reinterpret_cast<const char*>(source)); |
| } |
| } |
| |
| static UText* textClone(UText* destination, |
| const UText* source, |
| UBool deep, |
| UErrorCode* status) { |
| ASSERT_UNUSED(deep, !deep); |
| if (U_FAILURE(*status)) |
| return 0; |
| int32_t extraSize = source->extraSize; |
| destination = utext_setup(destination, extraSize, status); |
| if (U_FAILURE(*status)) |
| return destination; |
| void* extraNew = destination->pExtra; |
| int32_t flags = destination->flags; |
| int sizeToCopy = std::min(source->sizeOfStruct, destination->sizeOfStruct); |
| memcpy(destination, source, sizeToCopy); |
| destination->pExtra = extraNew; |
| destination->flags = flags; |
| memcpy(destination->pExtra, source->pExtra, extraSize); |
| textFixPointer(source, destination, destination->context); |
| textFixPointer(source, destination, destination->p); |
| textFixPointer(source, destination, destination->q); |
| ASSERT(!destination->r); |
| const void* chunkContents = |
| static_cast<const void*>(destination->chunkContents); |
| textFixPointer(source, destination, chunkContents); |
| destination->chunkContents = static_cast<const UChar*>(chunkContents); |
| return destination; |
| } |
| |
| static int32_t textExtract(UText*, |
| int64_t, |
| int64_t, |
| UChar*, |
| int32_t, |
| UErrorCode* errorCode) { |
| // In the present context, this text provider is used only with ICU functions |
| // that do not perform an extract operation. |
| ASSERT_NOT_REACHED(); |
| *errorCode = U_UNSUPPORTED_ERROR; |
| return 0; |
| } |
| |
| static void textClose(UText* text) { |
| text->context = 0; |
| } |
| |
| static inline TextContext textGetContext(const UText* text, |
| int64_t nativeIndex, |
| UBool forward) { |
| if (!text->b || nativeIndex > text->b) |
| return PrimaryContext; |
| if (nativeIndex == text->b) |
| return forward ? PrimaryContext : PriorContext; |
| return PriorContext; |
| } |
| |
| static inline TextContext textLatin1GetCurrentContext(const UText* text) { |
| if (!text->chunkContents) |
| return NoContext; |
| return text->chunkContents == text->pExtra ? PrimaryContext : PriorContext; |
| } |
| |
| static void textLatin1MoveInPrimaryContext(UText* text, |
| int64_t nativeIndex, |
| int64_t nativeLength, |
| UBool forward) { |
| ASSERT(text->chunkContents == text->pExtra); |
| if (forward) { |
| ASSERT(nativeIndex >= text->b && nativeIndex < nativeLength); |
| text->chunkNativeStart = nativeIndex; |
| text->chunkNativeLimit = nativeIndex + text->extraSize / sizeof(UChar); |
| if (text->chunkNativeLimit > nativeLength) |
| text->chunkNativeLimit = nativeLength; |
| } else { |
| ASSERT(nativeIndex > text->b && nativeIndex <= nativeLength); |
| text->chunkNativeLimit = nativeIndex; |
| text->chunkNativeStart = nativeIndex - text->extraSize / sizeof(UChar); |
| if (text->chunkNativeStart < text->b) |
| text->chunkNativeStart = text->b; |
| } |
| int64_t length = text->chunkNativeLimit - text->chunkNativeStart; |
| // Ensure chunk length is well defined if computed length exceeds int32_t |
| // range. |
| ASSERT(length <= std::numeric_limits<int32_t>::max()); |
| text->chunkLength = length <= std::numeric_limits<int32_t>::max() |
| ? static_cast<int32_t>(length) |
| : 0; |
| text->nativeIndexingLimit = text->chunkLength; |
| text->chunkOffset = forward ? 0 : text->chunkLength; |
| StringImpl::copyChars( |
| const_cast<UChar*>(text->chunkContents), |
| static_cast<const LChar*>(text->p) + (text->chunkNativeStart - text->b), |
| static_cast<unsigned>(text->chunkLength)); |
| } |
| |
| static void textLatin1SwitchToPrimaryContext(UText* text, |
| int64_t nativeIndex, |
| int64_t nativeLength, |
| UBool forward) { |
| ASSERT(!text->chunkContents || text->chunkContents == text->q); |
| text->chunkContents = static_cast<const UChar*>(text->pExtra); |
| textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward); |
| } |
| |
| static void textLatin1MoveInPriorContext(UText* text, |
| int64_t nativeIndex, |
| int64_t nativeLength, |
| UBool forward) { |
| ASSERT(text->chunkContents == text->q); |
| ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b); |
| ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength |
| : nativeIndex <= nativeLength); |
| ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength |
| : nativeIndex <= nativeLength); |
| text->chunkNativeStart = 0; |
| text->chunkNativeLimit = text->b; |
| text->chunkLength = text->b; |
| text->nativeIndexingLimit = text->chunkLength; |
| int64_t offset = nativeIndex - text->chunkNativeStart; |
| // Ensure chunk offset is well defined if computed offset exceeds int32_t |
| // range or chunk length. |
| ASSERT(offset <= std::numeric_limits<int32_t>::max()); |
| text->chunkOffset = std::min(offset <= std::numeric_limits<int32_t>::max() |
| ? static_cast<int32_t>(offset) |
| : 0, |
| text->chunkLength); |
| } |
| |
| static void textLatin1SwitchToPriorContext(UText* text, |
| int64_t nativeIndex, |
| int64_t nativeLength, |
| UBool forward) { |
| ASSERT(!text->chunkContents || text->chunkContents == text->pExtra); |
| text->chunkContents = static_cast<const UChar*>(text->q); |
| textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward); |
| } |
| |
| static inline bool textInChunkOrOutOfRange(UText* text, |
| int64_t nativeIndex, |
| int64_t nativeLength, |
| UBool forward, |
| UBool& isAccessible) { |
| if (forward) { |
| if (nativeIndex >= text->chunkNativeStart && |
| nativeIndex < text->chunkNativeLimit) { |
| int64_t offset = nativeIndex - text->chunkNativeStart; |
| // Ensure chunk offset is well formed if computed offset exceeds int32_t |
| // range. |
| ASSERT(offset <= std::numeric_limits<int32_t>::max()); |
| text->chunkOffset = offset <= std::numeric_limits<int32_t>::max() |
| ? static_cast<int32_t>(offset) |
| : 0; |
| isAccessible = TRUE; |
| return true; |
| } |
| if (nativeIndex >= nativeLength && text->chunkNativeLimit == nativeLength) { |
| text->chunkOffset = text->chunkLength; |
| isAccessible = FALSE; |
| return true; |
| } |
| } else { |
| if (nativeIndex > text->chunkNativeStart && |
| nativeIndex <= text->chunkNativeLimit) { |
| int64_t offset = nativeIndex - text->chunkNativeStart; |
| // Ensure chunk offset is well formed if computed offset exceeds int32_t |
| // range. |
| ASSERT(offset <= std::numeric_limits<int32_t>::max()); |
| text->chunkOffset = offset <= std::numeric_limits<int32_t>::max() |
| ? static_cast<int32_t>(offset) |
| : 0; |
| isAccessible = TRUE; |
| return true; |
| } |
| if (nativeIndex <= 0 && !text->chunkNativeStart) { |
| text->chunkOffset = 0; |
| isAccessible = FALSE; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| static UBool textLatin1Access(UText* text, int64_t nativeIndex, UBool forward) { |
| if (!text->context) |
| return FALSE; |
| int64_t nativeLength = textNativeLength(text); |
| UBool isAccessible; |
| if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, |
| isAccessible)) |
| return isAccessible; |
| nativeIndex = textPinIndex(nativeIndex, nativeLength - 1); |
| TextContext currentContext = textLatin1GetCurrentContext(text); |
| TextContext newContext = textGetContext(text, nativeIndex, forward); |
| ASSERT(newContext != NoContext); |
| if (newContext == currentContext) { |
| if (currentContext == PrimaryContext) { |
| textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward); |
| } else { |
| textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward); |
| } |
| } else if (newContext == PrimaryContext) { |
| textLatin1SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward); |
| } else { |
| ASSERT(newContext == PriorContext); |
| textLatin1SwitchToPriorContext(text, nativeIndex, nativeLength, forward); |
| } |
| return TRUE; |
| } |
| |
| static const struct UTextFuncs textLatin1Funcs = { |
| sizeof(UTextFuncs), 0, 0, 0, textClone, textNativeLength, |
| textLatin1Access, textExtract, 0, 0, 0, 0, |
| textClose, 0, 0, 0, |
| }; |
| |
| static void textInit(UText* text, |
| const UTextFuncs* funcs, |
| const void* string, |
| unsigned length, |
| const UChar* priorContext, |
| int priorContextLength) { |
| text->pFuncs = funcs; |
| text->providerProperties = 1 << UTEXT_PROVIDER_STABLE_CHUNKS; |
| text->context = string; |
| text->p = string; |
| text->a = length; |
| text->q = priorContext; |
| text->b = priorContextLength; |
| } |
| |
| static UText* textOpenLatin1(UTextWithBuffer* utWithBuffer, |
| const LChar* string, |
| unsigned length, |
| const UChar* priorContext, |
| int priorContextLength, |
| UErrorCode* status) { |
| if (U_FAILURE(*status)) |
| return 0; |
| |
| if (!string || |
| length > static_cast<unsigned>(std::numeric_limits<int32_t>::max())) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| UText* text = |
| utext_setup(&utWithBuffer->text, sizeof(utWithBuffer->buffer), status); |
| if (U_FAILURE(*status)) { |
| ASSERT(!text); |
| return 0; |
| } |
| textInit(text, &textLatin1Funcs, string, length, priorContext, |
| priorContextLength); |
| return text; |
| } |
| |
| static inline TextContext textUTF16GetCurrentContext(const UText* text) { |
| if (!text->chunkContents) |
| return NoContext; |
| return text->chunkContents == text->p ? PrimaryContext : PriorContext; |
| } |
| |
| static void textUTF16MoveInPrimaryContext(UText* text, |
| int64_t nativeIndex, |
| int64_t nativeLength, |
| UBool forward) { |
| ASSERT(text->chunkContents == text->p); |
| ASSERT_UNUSED(forward, |
| forward ? nativeIndex >= text->b : nativeIndex > text->b); |
| ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength |
| : nativeIndex <= nativeLength); |
| text->chunkNativeStart = text->b; |
| text->chunkNativeLimit = nativeLength; |
| int64_t length = text->chunkNativeLimit - text->chunkNativeStart; |
| // Ensure chunk length is well defined if computed length exceeds int32_t |
| // range. |
| ASSERT(length <= std::numeric_limits<int32_t>::max()); |
| text->chunkLength = length <= std::numeric_limits<int32_t>::max() |
| ? static_cast<int32_t>(length) |
| : 0; |
| text->nativeIndexingLimit = text->chunkLength; |
| int64_t offset = nativeIndex - text->chunkNativeStart; |
| // Ensure chunk offset is well defined if computed offset exceeds int32_t |
| // range or chunk length. |
| ASSERT(offset <= std::numeric_limits<int32_t>::max()); |
| text->chunkOffset = std::min(offset <= std::numeric_limits<int32_t>::max() |
| ? static_cast<int32_t>(offset) |
| : 0, |
| text->chunkLength); |
| } |
| |
| static void textUTF16SwitchToPrimaryContext(UText* text, |
| int64_t nativeIndex, |
| int64_t nativeLength, |
| UBool forward) { |
| ASSERT(!text->chunkContents || text->chunkContents == text->q); |
| text->chunkContents = static_cast<const UChar*>(text->p); |
| textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward); |
| } |
| |
| static void textUTF16MoveInPriorContext(UText* text, |
| int64_t nativeIndex, |
| int64_t nativeLength, |
| UBool forward) { |
| ASSERT(text->chunkContents == text->q); |
| ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b); |
| ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength |
| : nativeIndex <= nativeLength); |
| ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength |
| : nativeIndex <= nativeLength); |
| text->chunkNativeStart = 0; |
| text->chunkNativeLimit = text->b; |
| text->chunkLength = text->b; |
| text->nativeIndexingLimit = text->chunkLength; |
| int64_t offset = nativeIndex - text->chunkNativeStart; |
| // Ensure chunk offset is well defined if computed offset exceeds int32_t |
| // range or chunk length. |
| ASSERT(offset <= std::numeric_limits<int32_t>::max()); |
| text->chunkOffset = std::min(offset <= std::numeric_limits<int32_t>::max() |
| ? static_cast<int32_t>(offset) |
| : 0, |
| text->chunkLength); |
| } |
| |
| static void textUTF16SwitchToPriorContext(UText* text, |
| int64_t nativeIndex, |
| int64_t nativeLength, |
| UBool forward) { |
| ASSERT(!text->chunkContents || text->chunkContents == text->p); |
| text->chunkContents = static_cast<const UChar*>(text->q); |
| textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward); |
| } |
| |
| static UBool textUTF16Access(UText* text, int64_t nativeIndex, UBool forward) { |
| if (!text->context) |
| return FALSE; |
| int64_t nativeLength = textNativeLength(text); |
| UBool isAccessible; |
| if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, |
| isAccessible)) |
| return isAccessible; |
| nativeIndex = textPinIndex(nativeIndex, nativeLength - 1); |
| TextContext currentContext = textUTF16GetCurrentContext(text); |
| TextContext newContext = textGetContext(text, nativeIndex, forward); |
| ASSERT(newContext != NoContext); |
| if (newContext == currentContext) { |
| if (currentContext == PrimaryContext) { |
| textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward); |
| } else { |
| textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward); |
| } |
| } else if (newContext == PrimaryContext) { |
| textUTF16SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward); |
| } else { |
| ASSERT(newContext == PriorContext); |
| textUTF16SwitchToPriorContext(text, nativeIndex, nativeLength, forward); |
| } |
| return TRUE; |
| } |
| |
| static const struct UTextFuncs textUTF16Funcs = { |
| sizeof(UTextFuncs), 0, 0, 0, textClone, textNativeLength, |
| textUTF16Access, textExtract, 0, 0, 0, 0, |
| textClose, 0, 0, 0, |
| }; |
| |
| static UText* textOpenUTF16(UText* text, |
| const UChar* string, |
| unsigned length, |
| const UChar* priorContext, |
| int priorContextLength, |
| UErrorCode* status) { |
| if (U_FAILURE(*status)) |
| return 0; |
| |
| if (!string || |
| length > static_cast<unsigned>(std::numeric_limits<int32_t>::max())) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| |
| text = utext_setup(text, 0, status); |
| if (U_FAILURE(*status)) { |
| ASSERT(!text); |
| return 0; |
| } |
| textInit(text, &textUTF16Funcs, string, length, priorContext, |
| priorContextLength); |
| return text; |
| } |
| |
| static UText emptyText = UTEXT_INITIALIZER; |
| |
| static TextBreakIterator* wordBreakIterator(const LChar* string, int length) { |
| UErrorCode errorCode = U_ZERO_ERROR; |
| static TextBreakIterator* breakIter = 0; |
| if (!breakIter) { |
| breakIter = icu::BreakIterator::createWordInstance( |
| icu::Locale(currentTextBreakLocaleID()), errorCode); |
| DCHECK(U_SUCCESS(errorCode)) |
| << "ICU could not open a break iterator: " << u_errorName(errorCode) |
| << " (" << errorCode << ")"; |
| if (!breakIter) |
| return 0; |
| } |
| |
| UTextWithBuffer textLocal; |
| textLocal.text = emptyText; |
| textLocal.text.extraSize = sizeof(textLocal.buffer); |
| textLocal.text.pExtra = textLocal.buffer; |
| |
| UErrorCode openStatus = U_ZERO_ERROR; |
| UText* text = textOpenLatin1(&textLocal, string, length, 0, 0, &openStatus); |
| if (U_FAILURE(openStatus)) { |
| DLOG(ERROR) << "textOpenLatin1 failed with status " << openStatus; |
| return 0; |
| } |
| |
| UErrorCode setTextStatus = U_ZERO_ERROR; |
| breakIter->setText(text, setTextStatus); |
| if (U_FAILURE(setTextStatus)) |
| DLOG(ERROR) << "BreakIterator::seText failed with status " << setTextStatus; |
| |
| utext_close(text); |
| |
| return breakIter; |
| } |
| |
| static void setText16(TextBreakIterator* iter, |
| const UChar* string, |
| int length) { |
| UErrorCode errorCode = U_ZERO_ERROR; |
| UText uText = UTEXT_INITIALIZER; |
| utext_openUChars(&uText, string, length, &errorCode); |
| if (U_FAILURE(errorCode)) |
| return; |
| iter->setText(&uText, errorCode); |
| } |
| |
| TextBreakIterator* wordBreakIterator(const UChar* string, int length) { |
| UErrorCode errorCode = U_ZERO_ERROR; |
| static TextBreakIterator* breakIter = 0; |
| if (!breakIter) { |
| breakIter = icu::BreakIterator::createWordInstance( |
| icu::Locale(currentTextBreakLocaleID()), errorCode); |
| DCHECK(U_SUCCESS(errorCode)) |
| << "ICU could not open a break iterator: " << u_errorName(errorCode) |
| << " (" << errorCode << ")"; |
| if (!breakIter) |
| return 0; |
| } |
| setText16(breakIter, string, length); |
| return breakIter; |
| } |
| |
| TextBreakIterator* wordBreakIterator(const String& string, |
| int start, |
| int length) { |
| if (string.isEmpty()) |
| return 0; |
| if (string.is8Bit()) |
| return wordBreakIterator(string.characters8() + start, length); |
| return wordBreakIterator(string.characters16() + start, length); |
| } |
| |
| TextBreakIterator* acquireLineBreakIterator(const LChar* string, |
| int length, |
| const AtomicString& locale, |
| const UChar* priorContext, |
| unsigned priorContextLength) { |
| TextBreakIterator* iterator = |
| LineBreakIteratorPool::sharedPool().take(locale); |
| if (!iterator) |
| return 0; |
| |
| UTextWithBuffer textLocal; |
| textLocal.text = emptyText; |
| textLocal.text.extraSize = sizeof(textLocal.buffer); |
| textLocal.text.pExtra = textLocal.buffer; |
| |
| UErrorCode openStatus = U_ZERO_ERROR; |
| UText* text = textOpenLatin1(&textLocal, string, length, priorContext, |
| priorContextLength, &openStatus); |
| if (U_FAILURE(openStatus)) { |
| DLOG(ERROR) << "textOpenLatin1 failed with status " << openStatus; |
| return 0; |
| } |
| |
| UErrorCode setTextStatus = U_ZERO_ERROR; |
| iterator->setText(text, setTextStatus); |
| if (U_FAILURE(setTextStatus)) { |
| DLOG(ERROR) << "ubrk_setUText failed with status " << setTextStatus; |
| return 0; |
| } |
| |
| utext_close(text); |
| |
| return iterator; |
| } |
| |
| TextBreakIterator* acquireLineBreakIterator(const UChar* string, |
| int length, |
| const AtomicString& locale, |
| const UChar* priorContext, |
| unsigned priorContextLength) { |
| TextBreakIterator* iterator = |
| LineBreakIteratorPool::sharedPool().take(locale); |
| if (!iterator) |
| return 0; |
| |
| UText textLocal = UTEXT_INITIALIZER; |
| |
| UErrorCode openStatus = U_ZERO_ERROR; |
| UText* text = textOpenUTF16(&textLocal, string, length, priorContext, |
| priorContextLength, &openStatus); |
| if (U_FAILURE(openStatus)) { |
| DLOG(ERROR) << "textOpenUTF16 failed with status " << openStatus; |
| return 0; |
| } |
| |
| UErrorCode setTextStatus = U_ZERO_ERROR; |
| iterator->setText(text, setTextStatus); |
| if (U_FAILURE(setTextStatus)) { |
| DLOG(ERROR) << "ubrk_setUText failed with status " << setTextStatus; |
| return 0; |
| } |
| |
| utext_close(text); |
| |
| return iterator; |
| } |
| |
| void releaseLineBreakIterator(TextBreakIterator* iterator) { |
| DCHECK(iterator); |
| LineBreakIteratorPool::sharedPool().put(iterator); |
| } |
| |
| static TextBreakIterator* nonSharedCharacterBreakIterator; |
| |
| static inline bool compareAndSwapNonSharedCharacterBreakIterator( |
| TextBreakIterator* expected, |
| TextBreakIterator* newValue) { |
| DEFINE_STATIC_LOCAL(Mutex, nonSharedCharacterBreakIteratorMutex, ()); |
| MutexLocker locker(nonSharedCharacterBreakIteratorMutex); |
| if (nonSharedCharacterBreakIterator != expected) |
| return false; |
| nonSharedCharacterBreakIterator = newValue; |
| return true; |
| } |
| |
| NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator( |
| const String& string) |
| : m_is8Bit(true), m_charaters8(0), m_offset(0), m_length(0), m_iterator(0) { |
| if (string.isEmpty()) |
| return; |
| |
| m_is8Bit = string.is8Bit(); |
| |
| if (m_is8Bit) { |
| m_charaters8 = string.characters8(); |
| m_offset = 0; |
| m_length = string.length(); |
| return; |
| } |
| |
| createIteratorForBuffer(string.characters16(), string.length()); |
| } |
| |
| NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator( |
| const UChar* buffer, |
| unsigned length) |
| : m_is8Bit(false), |
| m_charaters8(0), |
| m_offset(0), |
| m_length(0), |
| m_iterator(0) { |
| createIteratorForBuffer(buffer, length); |
| } |
| |
| void NonSharedCharacterBreakIterator::createIteratorForBuffer( |
| const UChar* buffer, |
| unsigned length) { |
| m_iterator = nonSharedCharacterBreakIterator; |
| bool createdIterator = |
| m_iterator && |
| compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0); |
| if (!createdIterator) { |
| UErrorCode errorCode = U_ZERO_ERROR; |
| m_iterator = icu::BreakIterator::createCharacterInstance( |
| icu::Locale(currentTextBreakLocaleID()), errorCode); |
| DCHECK(U_SUCCESS(errorCode)) |
| << "ICU could not open a break iterator: " << u_errorName(errorCode) |
| << " (" << errorCode << ")"; |
| } |
| |
| setText16(m_iterator, buffer, length); |
| } |
| |
| NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator() { |
| if (m_is8Bit) |
| return; |
| if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator)) |
| delete m_iterator; |
| } |
| |
| int NonSharedCharacterBreakIterator::next() { |
| if (!m_is8Bit) |
| return m_iterator->next(); |
| |
| if (m_offset >= m_length) |
| return TextBreakDone; |
| |
| m_offset += clusterLengthStartingAt(m_offset); |
| return m_offset; |
| } |
| |
| int NonSharedCharacterBreakIterator::current() { |
| if (!m_is8Bit) |
| return m_iterator->current(); |
| return m_offset; |
| } |
| |
| bool NonSharedCharacterBreakIterator::isBreak(int offset) const { |
| if (!m_is8Bit) |
| return m_iterator->isBoundary(offset); |
| return !isLFAfterCR(offset); |
| } |
| |
| int NonSharedCharacterBreakIterator::preceding(int offset) const { |
| if (!m_is8Bit) |
| return m_iterator->preceding(offset); |
| if (offset <= 0) |
| return TextBreakDone; |
| if (isLFAfterCR(offset)) |
| return offset - 2; |
| return offset - 1; |
| } |
| |
| int NonSharedCharacterBreakIterator::following(int offset) const { |
| if (!m_is8Bit) |
| return m_iterator->following(offset); |
| if (static_cast<unsigned>(offset) >= m_length) |
| return TextBreakDone; |
| return offset + clusterLengthStartingAt(offset); |
| } |
| |
| TextBreakIterator* sentenceBreakIterator(const UChar* string, int length) { |
| UErrorCode openStatus = U_ZERO_ERROR; |
| static TextBreakIterator* iterator = 0; |
| if (!iterator) { |
| iterator = icu::BreakIterator::createSentenceInstance( |
| icu::Locale(currentTextBreakLocaleID()), openStatus); |
| DCHECK(U_SUCCESS(openStatus)) |
| << "ICU could not open a break iterator: " << u_errorName(openStatus) |
| << " (" << openStatus << ")"; |
| if (!iterator) |
| return 0; |
| } |
| |
| setText16(iterator, string, length); |
| return iterator; |
| } |
| |
| bool isWordTextBreak(TextBreakIterator* iterator) { |
| icu::RuleBasedBreakIterator* ruleBasedBreakIterator = |
| static_cast<icu::RuleBasedBreakIterator*>(iterator); |
| int ruleStatus = ruleBasedBreakIterator->getRuleStatus(); |
| return ruleStatus != UBRK_WORD_NONE; |
| } |
| |
| static TextBreakIterator* setUpIteratorWithRules(const char* breakRules, |
| const UChar* string, |
| int length) { |
| if (!string) |
| return 0; |
| |
| static TextBreakIterator* iterator = 0; |
| if (!iterator) { |
| UParseError parseStatus; |
| UErrorCode openStatus = U_ZERO_ERROR; |
| Vector<UChar> rules; |
| String(breakRules).appendTo(rules); |
| |
| iterator = new icu::RuleBasedBreakIterator( |
| icu::UnicodeString(rules.data(), rules.size()), parseStatus, |
| openStatus); |
| DCHECK(U_SUCCESS(openStatus)) |
| << "ICU could not open a break iterator: " << u_errorName(openStatus) |
| << " (" << openStatus << ")"; |
| if (!iterator) |
| return 0; |
| } |
| |
| setText16(iterator, string, length); |
| return iterator; |
| } |
| |
| TextBreakIterator* cursorMovementIterator(const UChar* string, int length) { |
| // This rule set is based on character-break iterator rules of ICU 4.0 |
| // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>. |
| // The major differences from the original ones are listed below: |
| // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with |
| // '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier; |
| // * Removed rules that prevent a cursor from moving after prepend characters |
| // (Bug 24342); |
| // * Added rules that prevent a cursor from moving after virama signs of Indic |
| // languages except Tamil (Bug 15790), and; |
| // * Added rules that prevent a cursor from moving before Japanese half-width |
| // katakara voiced marks. |
| // * Added rules for regional indicator symbols. |
| static const char* const kRules = |
| "$CR = [\\p{Grapheme_Cluster_Break = CR}];" |
| "$LF = [\\p{Grapheme_Cluster_Break = LF}];" |
| "$Control = [\\p{Grapheme_Cluster_Break = Control}];" |
| "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced |
| // marks |
| "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 " |
| "\\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];" |
| "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];" |
| "$L = [\\p{Grapheme_Cluster_Break = L}];" |
| "$V = [\\p{Grapheme_Cluster_Break = V}];" |
| "$T = [\\p{Grapheme_Cluster_Break = T}];" |
| "$LV = [\\p{Grapheme_Cluster_Break = LV}];" |
| "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];" |
| "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha |
| "$HinV = \\u094D;" // Devanagari Sign Virama |
| "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha |
| "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha |
| "$BenV = \\u09CD;" // Bengali Sign Virama |
| "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha |
| "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha |
| "$PanV = \\u0A4D;" // Gurmukhi Sign Virama |
| "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha |
| "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha |
| "$GujV = \\u0ACD;" // Gujarati Sign Virama |
| "$Guj1 = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha |
| "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha |
| "$OriV = \\u0B4D;" // Oriya Sign Virama |
| "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha |
| "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha |
| "$TelV = \\u0C4D;" // Telugu Sign Virama |
| "$Tel1 = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha |
| "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha |
| "$KanV = \\u0CCD;" // Kannada Sign Virama |
| "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter A,...,Ha |
| "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha |
| "$MalV = \\u0D4D;" // Malayalam Sign Virama |
| "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter A,...,Ha |
| "$RI = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators |
| "!!chain;" |
| "!!forward;" |
| "$CR $LF;" |
| "$L ($L | $V | $LV | $LVT);" |
| "($LV | $V) ($V | $T);" |
| "($LVT | $T) $T;" |
| "[^$Control $CR $LF] $Extend;" |
| "[^$Control $CR $LF] $SpacingMark;" |
| "$RI $RI / $RI;" |
| "$RI $RI;" |
| "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward) |
| "$Ben0 $BenV $Ben1;" // Bengali Virama (forward) |
| "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward) |
| "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward) |
| "$Ori0 $OriV $Ori1;" // Oriya Virama (forward) |
| "$Tel0 $TelV $Tel1;" // Telugu Virama (forward) |
| "$Kan0 $KanV $Kan1;" // Kannada Virama (forward) |
| "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward) |
| "!!reverse;" |
| "$LF $CR;" |
| "($L | $V | $LV | $LVT) $L;" |
| "($V | $T) ($LV | $V);" |
| "$T ($LVT | $T);" |
| "$Extend [^$Control $CR $LF];" |
| "$SpacingMark [^$Control $CR $LF];" |
| "$RI $RI / $RI $RI;" |
| "$RI $RI;" |
| "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward) |
| "$Ben1 $BenV $Ben0;" // Bengali Virama (backward) |
| "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward) |
| "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward) |
| "$Ori1 $OriV $Ori0;" // Gujarati Virama (backward) |
| "$Tel1 $TelV $Tel0;" // Telugu Virama (backward) |
| "$Kan1 $KanV $Kan0;" // Kannada Virama (backward) |
| "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward) |
| "!!safe_reverse;" |
| "!!safe_forward;"; |
| |
| return setUpIteratorWithRules(kRules, string, length); |
| } |
| |
| } // namespace blink |