| /* |
| * Copyright (C) 2006 Lars Knoll <lars@trolltech.com> |
| * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved. |
| * |
| * This library is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Library General Public |
| * License as published by the Free Software Foundation; either |
| * version 2 of the License, or (at your option) any later version. |
| * |
| * This library is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Library General Public License for more details. |
| * |
| * You should have received a copy of the GNU Library General Public License |
| * along with this library; see the file COPYING.LIB. If not, write to |
| * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
| * Boston, MA 02110-1301, USA. |
| * |
| */ |
| |
| #include "third_party/blink/renderer/platform/text/text_break_iterator.h" |
| |
| #include <unicode/rbbi.h> |
| #include <unicode/ubrk.h> |
| #include <algorithm> |
| #include <limits> |
| #include <memory> |
| #include <utility> |
| |
| #include "base/macros.h" |
| #include "base/memory/ptr_util.h" |
| #include "third_party/blink/renderer/platform/text/icu_error.h" |
| #include "third_party/blink/renderer/platform/text/text_break_iterator_internal_icu.h" |
| #include "third_party/blink/renderer/platform/wtf/assertions.h" |
| #include "third_party/blink/renderer/platform/wtf/hash_map.h" |
| #include "third_party/blink/renderer/platform/wtf/text/atomic_string_hash.h" |
| #include "third_party/blink/renderer/platform/wtf/text/wtf_string.h" |
| #include "third_party/blink/renderer/platform/wtf/thread_specific.h" |
| #include "third_party/blink/renderer/platform/wtf/threading_primitives.h" |
| |
| namespace blink { |
| |
| class LineBreakIteratorPool final { |
| USING_FAST_MALLOC(LineBreakIteratorPool); |
| |
| public: |
| static LineBreakIteratorPool& SharedPool() { |
| static WTF::ThreadSpecific<LineBreakIteratorPool>* pool = |
| new WTF::ThreadSpecific<LineBreakIteratorPool>; |
| return **pool; |
| } |
| |
| static std::unique_ptr<LineBreakIteratorPool> Create() { |
| return base::WrapUnique(new LineBreakIteratorPool); |
| } |
| |
| icu::BreakIterator* Take(const AtomicString& locale) { |
| icu::BreakIterator* iterator = nullptr; |
| for (size_t i = 0; i < pool_.size(); ++i) { |
| if (pool_[i].first == locale) { |
| iterator = pool_[i].second; |
| pool_.EraseAt(i); |
| break; |
| } |
| } |
| |
| if (!iterator) { |
| UErrorCode open_status = U_ZERO_ERROR; |
| bool locale_is_empty = locale.IsEmpty(); |
| iterator = icu::BreakIterator::createLineInstance( |
| locale_is_empty ? icu::Locale(CurrentTextBreakLocaleID()) |
| : icu::Locale(locale.Utf8().data()), |
| open_status); |
| // locale comes from a web page and it can be invalid, leading ICU |
| // to fail, in which case we fall back to the default locale. |
| if (!locale_is_empty && U_FAILURE(open_status)) { |
| open_status = U_ZERO_ERROR; |
| iterator = icu::BreakIterator::createLineInstance( |
| icu::Locale(CurrentTextBreakLocaleID()), open_status); |
| } |
| |
| if (U_FAILURE(open_status)) { |
| DLOG(ERROR) << "icu::BreakIterator construction failed with status " |
| << open_status; |
| return nullptr; |
| } |
| } |
| |
| DCHECK(!vended_iterators_.Contains(iterator)); |
| vended_iterators_.Set(iterator, locale); |
| return iterator; |
| } |
| |
| void Put(icu::BreakIterator* iterator) { |
| DCHECK(vended_iterators_.Contains(iterator)); |
| |
| if (pool_.size() == kCapacity) { |
| delete (pool_[0].second); |
| pool_.EraseAt(0); |
| } |
| |
| pool_.push_back(Entry(vended_iterators_.Take(iterator), iterator)); |
| } |
| |
| private: |
| LineBreakIteratorPool() = default; |
| |
| static const size_t kCapacity = 4; |
| |
| typedef std::pair<AtomicString, icu::BreakIterator*> Entry; |
| typedef Vector<Entry, kCapacity> Pool; |
| Pool pool_; |
| HashMap<icu::BreakIterator*, AtomicString> vended_iterators_; |
| |
| friend WTF::ThreadSpecific<LineBreakIteratorPool>:: |
| operator LineBreakIteratorPool*(); |
| |
| DISALLOW_COPY_AND_ASSIGN(LineBreakIteratorPool); |
| }; |
| |
// Identifies which backing store the current UText chunk refers to.
enum TextContext {
  kNoContext,      // No chunk has been set up yet.
  kPriorContext,   // The chunk covers the prior-context characters.
  kPrimaryContext  // The chunk covers the primary string.
};
| |
// Number of UChars in the scratch buffer embedded in UTextWithBuffer.
constexpr int kTextBufferCapacity = 16;
| |
| typedef struct { |
| DISALLOW_NEW(); |
| UText text; |
| UChar buffer[kTextBufferCapacity]; |
| } UTextWithBuffer; |
| |
// Pins |index| in place: a negative value becomes 0, otherwise a value above
// |limit| becomes |limit|. Returns the pinned value.
static inline int64_t TextPinIndex(int64_t& index, int64_t limit) {
  index = index < 0 ? 0 : (index > limit ? limit : index);
  return index;
}
| |
| static inline int64_t TextNativeLength(UText* text) { |
| return text->a + text->b; |
| } |
| |
| // Relocate pointer from source into destination as required. |
| static void TextFixPointer(const UText* source, |
| UText* destination, |
| const void*& pointer) { |
| if (pointer >= source->pExtra && |
| pointer < static_cast<char*>(source->pExtra) + source->extraSize) { |
| // Pointer references source extra buffer. |
| pointer = static_cast<char*>(destination->pExtra) + |
| (static_cast<const char*>(pointer) - |
| static_cast<const char*>(source->pExtra)); |
| } else if (pointer >= source && |
| pointer < |
| reinterpret_cast<const char*>(source) + source->sizeOfStruct) { |
| // Pointer references source text structure, but not source extra buffer. |
| pointer = reinterpret_cast<char*>(destination) + |
| (static_cast<const char*>(pointer) - |
| reinterpret_cast<const char*>(source)); |
| } |
| } |
| |
| static UText* TextClone(UText* destination, |
| const UText* source, |
| UBool deep, |
| UErrorCode* status) { |
| DCHECK(!deep); |
| if (U_FAILURE(*status)) |
| return nullptr; |
| int32_t extra_size = source->extraSize; |
| destination = utext_setup(destination, extra_size, status); |
| if (U_FAILURE(*status)) |
| return destination; |
| void* extra_new = destination->pExtra; |
| int32_t flags = destination->flags; |
| int size_to_copy = std::min(source->sizeOfStruct, destination->sizeOfStruct); |
| memcpy(destination, source, size_to_copy); |
| destination->pExtra = extra_new; |
| destination->flags = flags; |
| memcpy(destination->pExtra, source->pExtra, extra_size); |
| TextFixPointer(source, destination, destination->context); |
| TextFixPointer(source, destination, destination->p); |
| TextFixPointer(source, destination, destination->q); |
| DCHECK(!destination->r); |
| const void* chunk_contents = |
| static_cast<const void*>(destination->chunkContents); |
| TextFixPointer(source, destination, chunk_contents); |
| destination->chunkContents = static_cast<const UChar*>(chunk_contents); |
| return destination; |
| } |
| |
| static int32_t TextExtract(UText*, |
| int64_t, |
| int64_t, |
| UChar*, |
| int32_t, |
| UErrorCode* error_code) { |
| // In the present context, this text provider is used only with ICU functions |
| // that do not perform an extract operation. |
| NOTREACHED(); |
| *error_code = U_UNSUPPORTED_ERROR; |
| return 0; |
| } |
| |
| static void TextClose(UText* text) { |
| text->context = nullptr; |
| } |
| |
| static inline TextContext TextGetContext(const UText* text, |
| int64_t native_index, |
| UBool forward) { |
| if (!text->b || native_index > text->b) |
| return kPrimaryContext; |
| if (native_index == text->b) |
| return forward ? kPrimaryContext : kPriorContext; |
| return kPriorContext; |
| } |
| |
| static inline TextContext TextLatin1GetCurrentContext(const UText* text) { |
| if (!text->chunkContents) |
| return kNoContext; |
| return text->chunkContents == text->pExtra ? kPrimaryContext : kPriorContext; |
| } |
| |
| static void TextLatin1MoveInPrimaryContext(UText* text, |
| int64_t native_index, |
| int64_t native_length, |
| UBool forward) { |
| DCHECK_EQ(text->chunkContents, text->pExtra); |
| if (forward) { |
| DCHECK_GE(native_index, text->b); |
| DCHECK_LT(native_index, native_length); |
| text->chunkNativeStart = native_index; |
| text->chunkNativeLimit = native_index + text->extraSize / sizeof(UChar); |
| if (text->chunkNativeLimit > native_length) |
| text->chunkNativeLimit = native_length; |
| } else { |
| DCHECK_GT(native_index, text->b); |
| DCHECK_LE(native_index, native_length); |
| text->chunkNativeLimit = native_index; |
| text->chunkNativeStart = native_index - text->extraSize / sizeof(UChar); |
| if (text->chunkNativeStart < text->b) |
| text->chunkNativeStart = text->b; |
| } |
| int64_t length = text->chunkNativeLimit - text->chunkNativeStart; |
| // Ensure chunk length is well defined if computed length exceeds int32_t |
| // range. |
| DCHECK_LE(length, std::numeric_limits<int32_t>::max()); |
| text->chunkLength = length <= std::numeric_limits<int32_t>::max() |
| ? static_cast<int32_t>(length) |
| : 0; |
| text->nativeIndexingLimit = text->chunkLength; |
| text->chunkOffset = forward ? 0 : text->chunkLength; |
| StringImpl::CopyChars( |
| const_cast<UChar*>(text->chunkContents), |
| static_cast<const LChar*>(text->p) + (text->chunkNativeStart - text->b), |
| static_cast<unsigned>(text->chunkLength)); |
| } |
| |
| static void TextLatin1SwitchToPrimaryContext(UText* text, |
| int64_t native_index, |
| int64_t native_length, |
| UBool forward) { |
| DCHECK(!text->chunkContents || text->chunkContents == text->q); |
| text->chunkContents = static_cast<const UChar*>(text->pExtra); |
| TextLatin1MoveInPrimaryContext(text, native_index, native_length, forward); |
| } |
| |
| static void TextLatin1MoveInPriorContext(UText* text, |
| int64_t native_index, |
| int64_t native_length, |
| UBool forward) { |
| DCHECK_EQ(text->chunkContents, text->q); |
| DCHECK(forward ? native_index < text->b : native_index <= text->b); |
| DCHECK(forward ? native_index < native_length |
| : native_index <= native_length); |
| DCHECK(forward ? native_index < native_length |
| : native_index <= native_length); |
| text->chunkNativeStart = 0; |
| text->chunkNativeLimit = text->b; |
| text->chunkLength = text->b; |
| text->nativeIndexingLimit = text->chunkLength; |
| int64_t offset = native_index - text->chunkNativeStart; |
| // Ensure chunk offset is well defined if computed offset exceeds int32_t |
| // range or chunk length. |
| DCHECK_LE(offset, std::numeric_limits<int32_t>::max()); |
| text->chunkOffset = std::min(offset <= std::numeric_limits<int32_t>::max() |
| ? static_cast<int32_t>(offset) |
| : 0, |
| text->chunkLength); |
| } |
| |
| static void TextLatin1SwitchToPriorContext(UText* text, |
| int64_t native_index, |
| int64_t native_length, |
| UBool forward) { |
| DCHECK(!text->chunkContents || text->chunkContents == text->pExtra); |
| text->chunkContents = static_cast<const UChar*>(text->q); |
| TextLatin1MoveInPriorContext(text, native_index, native_length, forward); |
| } |
| |
| static inline bool TextInChunkOrOutOfRange(UText* text, |
| int64_t native_index, |
| int64_t native_length, |
| UBool forward, |
| UBool& is_accessible) { |
| if (forward) { |
| if (native_index >= text->chunkNativeStart && |
| native_index < text->chunkNativeLimit) { |
| int64_t offset = native_index - text->chunkNativeStart; |
| // Ensure chunk offset is well formed if computed offset exceeds int32_t |
| // range. |
| DCHECK_LE(offset, std::numeric_limits<int32_t>::max()); |
| text->chunkOffset = offset <= std::numeric_limits<int32_t>::max() |
| ? static_cast<int32_t>(offset) |
| : 0; |
| is_accessible = TRUE; |
| return true; |
| } |
| if (native_index >= native_length && |
| text->chunkNativeLimit == native_length) { |
| text->chunkOffset = text->chunkLength; |
| is_accessible = FALSE; |
| return true; |
| } |
| } else { |
| if (native_index > text->chunkNativeStart && |
| native_index <= text->chunkNativeLimit) { |
| int64_t offset = native_index - text->chunkNativeStart; |
| // Ensure chunk offset is well formed if computed offset exceeds int32_t |
| // range. |
| DCHECK_LE(offset, std::numeric_limits<int32_t>::max()); |
| text->chunkOffset = offset <= std::numeric_limits<int32_t>::max() |
| ? static_cast<int32_t>(offset) |
| : 0; |
| is_accessible = TRUE; |
| return true; |
| } |
| if (native_index <= 0 && !text->chunkNativeStart) { |
| text->chunkOffset = 0; |
| is_accessible = FALSE; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| static UBool TextLatin1Access(UText* text, |
| int64_t native_index, |
| UBool forward) { |
| if (!text->context) |
| return FALSE; |
| int64_t native_length = TextNativeLength(text); |
| UBool is_accessible; |
| if (TextInChunkOrOutOfRange(text, native_index, native_length, forward, |
| is_accessible)) |
| return is_accessible; |
| native_index = TextPinIndex(native_index, native_length - 1); |
| TextContext current_context = TextLatin1GetCurrentContext(text); |
| TextContext new_context = TextGetContext(text, native_index, forward); |
| DCHECK_NE(new_context, kNoContext); |
| if (new_context == current_context) { |
| if (current_context == kPrimaryContext) { |
| TextLatin1MoveInPrimaryContext(text, native_index, native_length, |
| forward); |
| } else { |
| TextLatin1MoveInPriorContext(text, native_index, native_length, forward); |
| } |
| } else if (new_context == kPrimaryContext) { |
| TextLatin1SwitchToPrimaryContext(text, native_index, native_length, |
| forward); |
| } else { |
| DCHECK_EQ(new_context, kPriorContext); |
| TextLatin1SwitchToPriorContext(text, native_index, native_length, forward); |
| } |
| return TRUE; |
| } |
| |
| static const struct UTextFuncs kTextLatin1Funcs = { |
| sizeof(UTextFuncs), |
| 0, |
| 0, |
| 0, |
| TextClone, |
| TextNativeLength, |
| TextLatin1Access, |
| TextExtract, |
| nullptr, |
| nullptr, |
| nullptr, |
| nullptr, |
| TextClose, |
| nullptr, |
| nullptr, |
| nullptr, |
| }; |
| |
| static void TextInit(UText* text, |
| const UTextFuncs* funcs, |
| const void* string, |
| unsigned length, |
| const UChar* prior_context, |
| int prior_context_length) { |
| text->pFuncs = funcs; |
| text->providerProperties = 1 << UTEXT_PROVIDER_STABLE_CHUNKS; |
| text->context = string; |
| text->p = string; |
| text->a = length; |
| text->q = prior_context; |
| text->b = prior_context_length; |
| } |
| |
| static UText* TextOpenLatin1(UTextWithBuffer* ut_with_buffer, |
| const LChar* string, |
| unsigned length, |
| const UChar* prior_context, |
| int prior_context_length, |
| UErrorCode* status) { |
| if (U_FAILURE(*status)) |
| return nullptr; |
| |
| if (!string || |
| length > static_cast<unsigned>(std::numeric_limits<int32_t>::max())) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return nullptr; |
| } |
| UText* text = utext_setup(&ut_with_buffer->text, |
| sizeof(ut_with_buffer->buffer), status); |
| if (U_FAILURE(*status)) { |
| DCHECK(!text); |
| return nullptr; |
| } |
| TextInit(text, &kTextLatin1Funcs, string, length, prior_context, |
| prior_context_length); |
| return text; |
| } |
| |
| static inline TextContext TextUTF16GetCurrentContext(const UText* text) { |
| if (!text->chunkContents) |
| return kNoContext; |
| return text->chunkContents == text->p ? kPrimaryContext : kPriorContext; |
| } |
| |
| static void TextUTF16MoveInPrimaryContext(UText* text, |
| int64_t native_index, |
| int64_t native_length, |
| UBool forward) { |
| DCHECK_EQ(text->chunkContents, text->p); |
| DCHECK(forward ? native_index >= text->b : native_index > text->b); |
| DCHECK(forward ? native_index < native_length |
| : native_index <= native_length); |
| text->chunkNativeStart = text->b; |
| text->chunkNativeLimit = native_length; |
| int64_t length = text->chunkNativeLimit - text->chunkNativeStart; |
| // Ensure chunk length is well defined if computed length exceeds int32_t |
| // range. |
| DCHECK_LE(length, std::numeric_limits<int32_t>::max()); |
| text->chunkLength = length <= std::numeric_limits<int32_t>::max() |
| ? static_cast<int32_t>(length) |
| : 0; |
| text->nativeIndexingLimit = text->chunkLength; |
| int64_t offset = native_index - text->chunkNativeStart; |
| // Ensure chunk offset is well defined if computed offset exceeds int32_t |
| // range or chunk length. |
| DCHECK_LE(offset, std::numeric_limits<int32_t>::max()); |
| text->chunkOffset = std::min(offset <= std::numeric_limits<int32_t>::max() |
| ? static_cast<int32_t>(offset) |
| : 0, |
| text->chunkLength); |
| } |
| |
| static void TextUTF16SwitchToPrimaryContext(UText* text, |
| int64_t native_index, |
| int64_t native_length, |
| UBool forward) { |
| DCHECK(!text->chunkContents || text->chunkContents == text->q); |
| text->chunkContents = static_cast<const UChar*>(text->p); |
| TextUTF16MoveInPrimaryContext(text, native_index, native_length, forward); |
| } |
| |
| static void TextUTF16MoveInPriorContext(UText* text, |
| int64_t native_index, |
| int64_t native_length, |
| UBool forward) { |
| DCHECK_EQ(text->chunkContents, text->q); |
| DCHECK(forward ? native_index < text->b : native_index <= text->b); |
| DCHECK(forward ? native_index < native_length |
| : native_index <= native_length); |
| DCHECK(forward ? native_index < native_length |
| : native_index <= native_length); |
| text->chunkNativeStart = 0; |
| text->chunkNativeLimit = text->b; |
| text->chunkLength = text->b; |
| text->nativeIndexingLimit = text->chunkLength; |
| int64_t offset = native_index - text->chunkNativeStart; |
| // Ensure chunk offset is well defined if computed offset exceeds int32_t |
| // range or chunk length. |
| DCHECK_LE(offset, std::numeric_limits<int32_t>::max()); |
| text->chunkOffset = std::min(offset <= std::numeric_limits<int32_t>::max() |
| ? static_cast<int32_t>(offset) |
| : 0, |
| text->chunkLength); |
| } |
| |
| static void TextUTF16SwitchToPriorContext(UText* text, |
| int64_t native_index, |
| int64_t native_length, |
| UBool forward) { |
| DCHECK(!text->chunkContents || text->chunkContents == text->p); |
| text->chunkContents = static_cast<const UChar*>(text->q); |
| TextUTF16MoveInPriorContext(text, native_index, native_length, forward); |
| } |
| |
| static UBool TextUTF16Access(UText* text, int64_t native_index, UBool forward) { |
| if (!text->context) |
| return FALSE; |
| int64_t native_length = TextNativeLength(text); |
| UBool is_accessible; |
| if (TextInChunkOrOutOfRange(text, native_index, native_length, forward, |
| is_accessible)) |
| return is_accessible; |
| native_index = TextPinIndex(native_index, native_length - 1); |
| TextContext current_context = TextUTF16GetCurrentContext(text); |
| TextContext new_context = TextGetContext(text, native_index, forward); |
| DCHECK_NE(new_context, kNoContext); |
| if (new_context == current_context) { |
| if (current_context == kPrimaryContext) { |
| TextUTF16MoveInPrimaryContext(text, native_index, native_length, forward); |
| } else { |
| TextUTF16MoveInPriorContext(text, native_index, native_length, forward); |
| } |
| } else if (new_context == kPrimaryContext) { |
| TextUTF16SwitchToPrimaryContext(text, native_index, native_length, forward); |
| } else { |
| DCHECK_EQ(new_context, kPriorContext); |
| TextUTF16SwitchToPriorContext(text, native_index, native_length, forward); |
| } |
| return TRUE; |
| } |
| |
| static const struct UTextFuncs kTextUTF16Funcs = { |
| sizeof(UTextFuncs), |
| 0, |
| 0, |
| 0, |
| TextClone, |
| TextNativeLength, |
| TextUTF16Access, |
| TextExtract, |
| nullptr, |
| nullptr, |
| nullptr, |
| nullptr, |
| TextClose, |
| nullptr, |
| nullptr, |
| nullptr, |
| }; |
| |
| static UText* TextOpenUTF16(UText* text, |
| const UChar* string, |
| unsigned length, |
| const UChar* prior_context, |
| int prior_context_length, |
| UErrorCode* status) { |
| if (U_FAILURE(*status)) |
| return nullptr; |
| |
| if (!string || |
| length > static_cast<unsigned>(std::numeric_limits<int32_t>::max())) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return nullptr; |
| } |
| |
| text = utext_setup(text, 0, status); |
| if (U_FAILURE(*status)) { |
| DCHECK(!text); |
| return nullptr; |
| } |
| TextInit(text, &kTextUTF16Funcs, string, length, prior_context, |
| prior_context_length); |
| return text; |
| } |
| |
| static UText g_empty_text = UTEXT_INITIALIZER; |
| |
| static TextBreakIterator* WordBreakIterator(const LChar* string, int length) { |
| UErrorCode error_code = U_ZERO_ERROR; |
| static TextBreakIterator* break_iter = nullptr; |
| if (!break_iter) { |
| break_iter = icu::BreakIterator::createWordInstance( |
| icu::Locale(CurrentTextBreakLocaleID()), error_code); |
| DCHECK(U_SUCCESS(error_code)) |
| << "ICU could not open a break iterator: " << u_errorName(error_code) |
| << " (" << error_code << ")"; |
| if (!break_iter) |
| return nullptr; |
| } |
| |
| UTextWithBuffer text_local; |
| text_local.text = g_empty_text; |
| text_local.text.extraSize = sizeof(text_local.buffer); |
| text_local.text.pExtra = text_local.buffer; |
| |
| UErrorCode open_status = U_ZERO_ERROR; |
| UText* text = |
| TextOpenLatin1(&text_local, string, length, nullptr, 0, &open_status); |
| if (U_FAILURE(open_status)) { |
| DLOG(ERROR) << "textOpenLatin1 failed with status " << open_status; |
| return nullptr; |
| } |
| |
| UErrorCode set_text_status = U_ZERO_ERROR; |
| break_iter->setText(text, set_text_status); |
| if (U_FAILURE(set_text_status)) |
| DLOG(ERROR) << "BreakIterator::seText failed with status " |
| << set_text_status; |
| |
| utext_close(text); |
| |
| return break_iter; |
| } |
| |
| static void SetText16(TextBreakIterator* iter, |
| const UChar* string, |
| int length) { |
| UErrorCode error_code = U_ZERO_ERROR; |
| UText u_text = UTEXT_INITIALIZER; |
| utext_openUChars(&u_text, string, length, &error_code); |
| if (U_FAILURE(error_code)) |
| return; |
| iter->setText(&u_text, error_code); |
| } |
| |
| TextBreakIterator* WordBreakIterator(const UChar* string, int length) { |
| UErrorCode error_code = U_ZERO_ERROR; |
| static TextBreakIterator* break_iter = nullptr; |
| if (!break_iter) { |
| break_iter = icu::BreakIterator::createWordInstance( |
| icu::Locale(CurrentTextBreakLocaleID()), error_code); |
| DCHECK(U_SUCCESS(error_code)) |
| << "ICU could not open a break iterator: " << u_errorName(error_code) |
| << " (" << error_code << ")"; |
| if (!break_iter) |
| return nullptr; |
| } |
| SetText16(break_iter, string, length); |
| return break_iter; |
| } |
| |
| TextBreakIterator* WordBreakIterator(const String& string, |
| int start, |
| int length) { |
| if (string.IsEmpty()) |
| return nullptr; |
| if (string.Is8Bit()) |
| return WordBreakIterator(string.Characters8() + start, length); |
| return WordBreakIterator(string.Characters16() + start, length); |
| } |
| |
| TextBreakIterator* AcquireLineBreakIterator(const LChar* string, |
| int length, |
| const AtomicString& locale, |
| const UChar* prior_context, |
| unsigned prior_context_length) { |
| TextBreakIterator* iterator = |
| LineBreakIteratorPool::SharedPool().Take(locale); |
| if (!iterator) |
| return nullptr; |
| |
| UTextWithBuffer text_local; |
| text_local.text = g_empty_text; |
| text_local.text.extraSize = sizeof(text_local.buffer); |
| text_local.text.pExtra = text_local.buffer; |
| |
| UErrorCode open_status = U_ZERO_ERROR; |
| UText* text = TextOpenLatin1(&text_local, string, length, prior_context, |
| prior_context_length, &open_status); |
| if (U_FAILURE(open_status)) { |
| DLOG(ERROR) << "textOpenLatin1 failed with status " << open_status; |
| return nullptr; |
| } |
| |
| UErrorCode set_text_status = U_ZERO_ERROR; |
| iterator->setText(text, set_text_status); |
| if (U_FAILURE(set_text_status)) { |
| DLOG(ERROR) << "ubrk_setUText failed with status " << set_text_status; |
| return nullptr; |
| } |
| |
| utext_close(text); |
| |
| return iterator; |
| } |
| |
| TextBreakIterator* AcquireLineBreakIterator(const UChar* string, |
| int length, |
| const AtomicString& locale, |
| const UChar* prior_context, |
| unsigned prior_context_length) { |
| TextBreakIterator* iterator = |
| LineBreakIteratorPool::SharedPool().Take(locale); |
| if (!iterator) |
| return nullptr; |
| |
| UText text_local = UTEXT_INITIALIZER; |
| |
| UErrorCode open_status = U_ZERO_ERROR; |
| UText* text = TextOpenUTF16(&text_local, string, length, prior_context, |
| prior_context_length, &open_status); |
| if (U_FAILURE(open_status)) { |
| DLOG(ERROR) << "textOpenUTF16 failed with status " << open_status; |
| return nullptr; |
| } |
| |
| UErrorCode set_text_status = U_ZERO_ERROR; |
| iterator->setText(text, set_text_status); |
| if (U_FAILURE(set_text_status)) { |
| DLOG(ERROR) << "ubrk_setUText failed with status " << set_text_status; |
| return nullptr; |
| } |
| |
| utext_close(text); |
| |
| return iterator; |
| } |
| |
| void ReleaseLineBreakIterator(TextBreakIterator* iterator) { |
| DCHECK(iterator); |
| LineBreakIteratorPool::SharedPool().Put(iterator); |
| } |
| |
| static TextBreakIterator* GetNonSharedCharacterBreakIterator() { |
| DEFINE_THREAD_SAFE_STATIC_LOCAL( |
| ThreadSpecific<std::unique_ptr<TextBreakIterator>>, thread_specific, ()); |
| |
| std::unique_ptr<TextBreakIterator>& iterator = *thread_specific; |
| |
| if (!iterator) { |
| ICUError error_code; |
| iterator = base::WrapUnique(icu::BreakIterator::createCharacterInstance( |
| icu::Locale(CurrentTextBreakLocaleID()), error_code)); |
| CHECK(U_SUCCESS(error_code) && iterator) |
| << "ICU could not open a break iterator: " << u_errorName(error_code) |
| << " (" << error_code << ")"; |
| } |
| |
| DCHECK(iterator); |
| return iterator.get(); |
| } |
| |
| NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator( |
| const StringView& string) |
| : is8_bit_(true), |
| charaters8_(nullptr), |
| offset_(0), |
| length_(0), |
| iterator_(nullptr) { |
| if (string.IsEmpty()) |
| return; |
| |
| is8_bit_ = string.Is8Bit(); |
| |
| if (is8_bit_) { |
| charaters8_ = string.Characters8(); |
| offset_ = 0; |
| length_ = string.length(); |
| return; |
| } |
| |
| CreateIteratorForBuffer(string.Characters16(), string.length()); |
| } |
| |
| NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator( |
| const UChar* buffer, |
| unsigned length) |
| : is8_bit_(false), |
| charaters8_(nullptr), |
| offset_(0), |
| length_(0), |
| iterator_(nullptr) { |
| CreateIteratorForBuffer(buffer, length); |
| } |
| |
| void NonSharedCharacterBreakIterator::CreateIteratorForBuffer( |
| const UChar* buffer, |
| unsigned length) { |
| iterator_ = GetNonSharedCharacterBreakIterator(); |
| SetText16(iterator_, buffer, length); |
| } |
| |
| NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator() { |
| if (is8_bit_) |
| return; |
| } |
| |
| int NonSharedCharacterBreakIterator::Next() { |
| if (!is8_bit_) |
| return iterator_->next(); |
| |
| if (offset_ >= length_) |
| return kTextBreakDone; |
| |
| offset_ += ClusterLengthStartingAt(offset_); |
| return offset_; |
| } |
| |
| int NonSharedCharacterBreakIterator::Current() { |
| if (!is8_bit_) |
| return iterator_->current(); |
| return offset_; |
| } |
| |
| bool NonSharedCharacterBreakIterator::IsBreak(int offset) const { |
| if (!is8_bit_) |
| return iterator_->isBoundary(offset); |
| return !IsLFAfterCR(offset); |
| } |
| |
| int NonSharedCharacterBreakIterator::Preceding(int offset) const { |
| if (!is8_bit_) |
| return iterator_->preceding(offset); |
| if (offset <= 0) |
| return kTextBreakDone; |
| if (IsLFAfterCR(offset)) |
| return offset - 2; |
| return offset - 1; |
| } |
| |
| int NonSharedCharacterBreakIterator::Following(int offset) const { |
| if (!is8_bit_) |
| return iterator_->following(offset); |
| if (static_cast<unsigned>(offset) >= length_) |
| return kTextBreakDone; |
| return offset + ClusterLengthStartingAt(offset); |
| } |
| |
| TextBreakIterator* SentenceBreakIterator(const UChar* string, int length) { |
| UErrorCode open_status = U_ZERO_ERROR; |
| static TextBreakIterator* iterator = nullptr; |
| if (!iterator) { |
| iterator = icu::BreakIterator::createSentenceInstance( |
| icu::Locale(CurrentTextBreakLocaleID()), open_status); |
| DCHECK(U_SUCCESS(open_status)) |
| << "ICU could not open a break iterator: " << u_errorName(open_status) |
| << " (" << open_status << ")"; |
| if (!iterator) |
| return nullptr; |
| } |
| |
| SetText16(iterator, string, length); |
| return iterator; |
| } |
| |
| bool IsWordTextBreak(TextBreakIterator* iterator) { |
| icu::RuleBasedBreakIterator* rule_based_break_iterator = |
| static_cast<icu::RuleBasedBreakIterator*>(iterator); |
| int rule_status = rule_based_break_iterator->getRuleStatus(); |
| return rule_status != UBRK_WORD_NONE; |
| } |
| |
| TextBreakIterator* CursorMovementIterator(const UChar* string, int length) { |
| // This rule set is based on character-break iterator rules of ICU 4.0 |
| // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>. |
| // The major differences from the original ones are listed below: |
| // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with |
| // '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier; |
| // * Removed rules that prevent a cursor from moving after prepend characters |
| // (Bug 24342); |
| // * Added rules that prevent a cursor from moving after virama signs of Indic |
| // languages except Tamil (Bug 15790), and; |
| // * Added rules that prevent a cursor from moving before Japanese half-width |
| // katakara voiced marks. |
| // * Added rules for regional indicator symbols. |
| static const char* const kRules = |
| "$CR = [\\p{Grapheme_Cluster_Break = CR}];" |
| "$LF = [\\p{Grapheme_Cluster_Break = LF}];" |
| "$Control = [\\p{Grapheme_Cluster_Break = Control}];" |
| "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced |
| // marks |
| "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 " |
| "\\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];" |
| "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];" |
| "$L = [\\p{Grapheme_Cluster_Break = L}];" |
| "$V = [\\p{Grapheme_Cluster_Break = V}];" |
| "$T = [\\p{Grapheme_Cluster_Break = T}];" |
| "$LV = [\\p{Grapheme_Cluster_Break = LV}];" |
| "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];" |
| "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha |
| "$HinV = \\u094D;" // Devanagari Sign Virama |
| "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha |
| "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha |
| "$BenV = \\u09CD;" // Bengali Sign Virama |
| "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha |
| "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha |
| "$PanV = \\u0A4D;" // Gurmukhi Sign Virama |
| "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha |
| "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha |
| "$GujV = \\u0ACD;" // Gujarati Sign Virama |
| "$Guj1 = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha |
| "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha |
| "$OriV = \\u0B4D;" // Oriya Sign Virama |
| "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha |
| "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha |
| "$TelV = \\u0C4D;" // Telugu Sign Virama |
| "$Tel1 = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha |
| "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha |
| "$KanV = \\u0CCD;" // Kannada Sign Virama |
| "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter A,...,Ha |
| "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha |
| "$MalV = \\u0D4D;" // Malayalam Sign Virama |
| "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter A,...,Ha |
| "$RI = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators |
| "!!chain;" |
| "!!forward;" |
| "$CR $LF;" |
| "$L ($L | $V | $LV | $LVT);" |
| "($LV | $V) ($V | $T);" |
| "($LVT | $T) $T;" |
| "[^$Control $CR $LF] $Extend;" |
| "[^$Control $CR $LF] $SpacingMark;" |
| "$RI $RI / $RI;" |
| "$RI $RI;" |
| "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward) |
| "$Ben0 $BenV $Ben1;" // Bengali Virama (forward) |
| "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward) |
| "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward) |
| "$Ori0 $OriV $Ori1;" // Oriya Virama (forward) |
| "$Tel0 $TelV $Tel1;" // Telugu Virama (forward) |
| "$Kan0 $KanV $Kan1;" // Kannada Virama (forward) |
| "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward) |
| "!!reverse;" |
| "$LF $CR;" |
| "($L | $V | $LV | $LVT) $L;" |
| "($V | $T) ($LV | $V);" |
| "$T ($LVT | $T);" |
| "$Extend [^$Control $CR $LF];" |
| "$SpacingMark [^$Control $CR $LF];" |
| "$RI $RI / $RI $RI;" |
| "$RI $RI;" |
| "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward) |
| "$Ben1 $BenV $Ben0;" // Bengali Virama (backward) |
| "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward) |
| "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward) |
| "$Ori1 $OriV $Ori0;" // Gujarati Virama (backward) |
| "$Tel1 $TelV $Tel0;" // Telugu Virama (backward) |
| "$Kan1 $KanV $Kan0;" // Kannada Virama (backward) |
| "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward) |
| "!!safe_reverse;" |
| "!!safe_forward;"; |
| |
| if (!string) |
| return nullptr; |
| |
| DEFINE_THREAD_SAFE_STATIC_LOCAL( |
| ThreadSpecific<std::unique_ptr<icu::RuleBasedBreakIterator>>, |
| thread_specific, ()); |
| |
| std::unique_ptr<icu::RuleBasedBreakIterator>& iterator = *thread_specific; |
| |
| if (!iterator) { |
| UParseError parse_status; |
| UErrorCode open_status = U_ZERO_ERROR; |
| // break_rules is ASCII. Pick the most efficient UnicodeString ctor. |
| iterator = std::make_unique<icu::RuleBasedBreakIterator>( |
| icu::UnicodeString(kRules, -1, US_INV), parse_status, open_status); |
| DCHECK(U_SUCCESS(open_status)) |
| << "ICU could not open a break iterator: " << u_errorName(open_status) |
| << " (" << open_status << ")"; |
| } |
| |
| SetText16(iterator.get(), string, length); |
| return iterator.get(); |
| } |
| |
| } // namespace blink |