// blob: e2ce60c095c1da6e6307b12ee512157603d1ba98 [file] [log] [blame]
/*
* Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
* Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public License
* along with this library; see the file COPYING.LIB. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301, USA.
*
*/
#ifndef THIRD_PARTY_BLINK_RENDERER_PLATFORM_TEXT_TEXT_BREAK_ITERATOR_H_
#define THIRD_PARTY_BLINK_RENDERER_PLATFORM_TEXT_TEXT_BREAK_ITERATOR_H_
#include "base/macros.h"
#include "third_party/blink/renderer/platform/platform_export.h"
#include "third_party/blink/renderer/platform/wtf/text/atomic_string.h"
#include "third_party/blink/renderer/platform/wtf/text/character_names.h"
#include "third_party/blink/renderer/platform/wtf/text/unicode.h"
#include <unicode/brkiter.h>
namespace blink {
// Blink's name for ICU's break iterator type.
using TextBreakIterator = icu::BreakIterator;
// Note: An iterator returned by the functions below is valid only until you
// request another iterator. Iterators obtained from AcquireLineBreakIterator()
// are the exception.
// Break iterator for cursor movement over a UChar buffer of |length|.
// This is similar to the character break iterator in most cases, but is
// subject to platform UI conventions. One notable example where this can
// differ from the character break iterator is Thai prepend characters, see
// bug 24342.
// Use this for insertion point and selection manipulations.
PLATFORM_EXPORT TextBreakIterator* CursorMovementIterator(const UChar*,
int length);
// Word break iterators. The first overload takes a String together with a
// |start| / |length| range; the second takes a raw UChar buffer and its
// |length|.
// NOTE(review): |start| and |length| are presumably expressed in code units
// of the string - confirm against the implementation.
PLATFORM_EXPORT TextBreakIterator* WordBreakIterator(const String&,
int start,
int length);
PLATFORM_EXPORT TextBreakIterator* WordBreakIterator(const UChar*, int length);
// Line break iterators for 8-bit (LChar) and 16-bit (UChar) text respectively.
// |prior_context| / |prior_context_length| describe context that comes before
// the text itself; LazyLineBreakIterator passes its last one or two remembered
// characters here (or nullptr / 0 when it has none).
// LazyLineBreakIterator pairs every acquired iterator with a call to
// ReleaseLineBreakIterator() once it no longer uses it.
PLATFORM_EXPORT TextBreakIterator* AcquireLineBreakIterator(
const LChar*,
int length,
const AtomicString& locale,
const UChar* prior_context,
unsigned prior_context_length);
PLATFORM_EXPORT TextBreakIterator* AcquireLineBreakIterator(
const UChar*,
int length,
const AtomicString& locale,
const UChar* prior_context,
unsigned prior_context_length);
// Hands back an iterator previously obtained from AcquireLineBreakIterator().
PLATFORM_EXPORT void ReleaseLineBreakIterator(TextBreakIterator*);
// Sentence break iterator over a UChar buffer of |length|.
PLATFORM_EXPORT TextBreakIterator* SentenceBreakIterator(const UChar*,
int length);
// Precondition: the caller must first check that the iterator is not at the
// end. If it is, the result may not be what you expect.
// See https://ssl.icu-project.org/trac/ticket/13447 .
PLATFORM_EXPORT bool IsWordTextBreak(TextBreakIterator*);
// Sentinel position value.
// NOTE(review): presumably mirrors ICU's "no more boundaries" result - confirm.
constexpr int kTextBreakDone = -1;
// Selects which line breaking rules are applied.
enum class LineBreakType {
// The type LazyLineBreakIterator uses unless told otherwise.
kNormal,
// word-break:break-all allows breaks between letters/numbers, but prohibits
// breaks before/after certain punctuation.
kBreakAll,
// Allows breaks at every grapheme cluster boundary.
// Terminal style line breaks described in UAX#14: Examples of Customization
// http://unicode.org/reports/tr14/#Examples
// Adding this feature to CSS is under discussion, see crbug.com/720205.
// Used internally for word-break:break-word.
kBreakCharacter,
// word-break:keep-all doesn't allow breaks between any kind of
// letters/numbers, except for some Southeast Asian ones.
kKeepAll,
};
// Determines break opportunities around collapsible space characters (space,
// newline, and tabulation characters).
enum class BreakSpaceType {
// Break before every collapsible space character.
// This is a specialized optimization for CSS, where leading/trailing spaces
// in each line are removed, and thus breaking before spaces can save
// computing hanging spaces.
// Callers are expected to handle spaces by themselves. Because a run of
// spaces can include different types of spaces, a break opportunity is given
// for every space character.
// The pre-LayoutNG line breaker uses this type. It is also the initial value
// of LazyLineBreakIterator's break space setting.
kBeforeEverySpace,
// Break before a run of white space characters.
// This is for CSS line breaking as in |kBeforeEverySpace|, but for when
// whitespace collapsing has already been applied to the target string. In
// this case, a run of white spaces consists of preserved spaces, and there
// should not be break opportunities between white spaces.
// The LayoutNG line breaker uses this type.
kBeforeSpaceRun,
};
// Stream insertion operators for LineBreakType and BreakSpaceType.
PLATFORM_EXPORT std::ostream& operator<<(std::ostream&, LineBreakType);
PLATFORM_EXPORT std::ostream& operator<<(std::ostream&, BreakSpaceType);
// Line breaker front end that holds a string, a locale, up to two characters
// of prior context and the break settings. The underlying TextBreakIterator is
// only acquired when GetIterator() is first called, and is then cached until
// the string, start offset, locale or prior context length changes.
//
// The cached iterator is owned by this object: it is obtained with
// AcquireLineBreakIterator() and handed back with ReleaseLineBreakIterator().
class PLATFORM_EXPORT LazyLineBreakIterator final {
  STACK_ALLOCATED();

 public:
  // Creates an iterator with an empty string, no prior context and
  // LineBreakType::kNormal.
  LazyLineBreakIterator()
      : iterator_(nullptr), break_type_(LineBreakType::kNormal) {
    ResetPriorContext();
  }

  // Creates an iterator for |string| with no prior context.
  LazyLineBreakIterator(String string,
                        const AtomicString& locale = AtomicString(),
                        LineBreakType break_type = LineBreakType::kNormal)
      : string_(string),
        locale_(locale),
        iterator_(nullptr),
        break_type_(break_type) {
    ResetPriorContext();
  }

  // Copies the string, locale, prior context, start offset and break settings
  // of |other|, but never its cached iterator. The implicit member-wise copy
  // would duplicate the raw |iterator_| pointer, and both objects would then
  // call ReleaseLineBreakIterator() on it. The copy acquires its own iterator
  // on demand instead.
  LazyLineBreakIterator(const LazyLineBreakIterator& other)
      : string_(other.string_),
        locale_(other.locale_),
        iterator_(nullptr),
        start_offset_(other.start_offset_),
        break_type_(other.break_type_),
        break_space_(other.break_space_) {
    SetPriorContext(other.LastCharacter(), other.SecondToLastCharacter());
  }

  // Copy assignment with the same semantics as the copy constructor. The
  // iterator cached by |this| is released first, so that it is neither leaked
  // nor left describing the previous string.
  LazyLineBreakIterator& operator=(const LazyLineBreakIterator& other) {
    if (this == &other)
      return *this;
    ReleaseIterator();
    string_ = other.string_;
    locale_ = other.locale_;
    SetPriorContext(other.LastCharacter(), other.SecondToLastCharacter());
    start_offset_ = other.start_offset_;
    break_type_ = other.break_type_;
    break_space_ = other.break_space_;
    return *this;
  }

  // Releases the cached iterator, if one was acquired.
  ~LazyLineBreakIterator() {
    if (iterator_)
      ReleaseLineBreakIterator(iterator_);
  }

  const String& GetString() const { return string_; }

  // The most recent prior-context character (0 if none is set).
  UChar LastCharacter() const {
    static_assert(arraysize(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    return prior_context_[1];
  }

  // The prior-context character before LastCharacter() (0 if none is set).
  UChar SecondToLastCharacter() const {
    static_assert(arraysize(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    return prior_context_[0];
  }

  // Overwrites both prior-context characters. Does not touch the cached
  // iterator; GetIterator() re-acquires it if the context length changed.
  void SetPriorContext(UChar last, UChar second_to_last) {
    static_assert(arraysize(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    prior_context_[0] = second_to_last;
    prior_context_[1] = last;
  }

  // Shifts the current last character into the second-to-last slot and stores
  // |last| as the new last character.
  void UpdatePriorContext(UChar last) {
    static_assert(arraysize(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    prior_context_[0] = prior_context_[1];
    prior_context_[1] = last;
  }

  // Clears both prior-context characters to 0.
  void ResetPriorContext() {
    static_assert(arraysize(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    prior_context_[0] = 0;
    prior_context_[1] = 0;
  }

  // A view of the non-empty part of the prior context. |text| points into the
  // owning LazyLineBreakIterator's storage, or is nullptr when |length| is 0.
  struct PriorContext {
    const UChar* text = nullptr;
    unsigned length = 0;
  };

  // Returns the usable prior context:
  //  - both characters, if both are non-zero;
  //  - only the last character, if the second-to-last is 0;
  //  - empty (nullptr, 0), if the last character is 0, regardless of the
  //    second-to-last one.
  PriorContext GetPriorContext() const {
    static_assert(arraysize(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    if (prior_context_[1]) {
      if (prior_context_[0])
        return PriorContext{&prior_context_[0], 2};
      return PriorContext{&prior_context_[1], 1};
    }
    return PriorContext{nullptr, 0};
  }

  unsigned PriorContextLength() const { return GetPriorContext().length; }

  // Replaces the string and locale, resets the start offset to 0 and drops
  // the cached iterator. The prior context is left unchanged.
  void ResetStringAndReleaseIterator(String string,
                                     const AtomicString& locale) {
    string_ = string;
    start_offset_ = 0;
    locale_ = locale;
    ReleaseIterator();
  }

  // Set the start offset. Text before this offset is disregarded. Properly
  // setting the start offset improves the performance significantly, because
  // ICU break iterator computes all the text from the beginning.
  // |offset| must not exceed the string length (enforced with CHECK_LE).
  void SetStartOffset(unsigned offset) {
    CHECK_LE(offset, string_.length());
    start_offset_ = offset;
    ReleaseIterator();
  }

  // Changes the locale and drops the cached iterator. Does nothing if
  // |locale| equals the current one.
  void SetLocale(const AtomicString& locale) {
    if (locale == locale_)
      return;
    locale_ = locale;
    ReleaseIterator();
  }

  LineBreakType BreakType() const { return break_type_; }
  void SetBreakType(LineBreakType break_type) { break_type_ = break_type; }
  BreakSpaceType BreakSpace() const { return break_space_; }
  void SetBreakSpace(BreakSpaceType break_space) { break_space_ = break_space; }

  // Returns whether |pos| is a breakable position under |line_break_type|.
  // |next_breakable| is a caller-owned cache of the last computed breakable
  // position. It is only recomputed once |pos| has moved past it, so a caller
  // scanning forward can reuse one lookup for several positions.
  inline bool IsBreakable(int pos,
                          int& next_breakable,
                          LineBreakType line_break_type) const {
    if (pos > next_breakable) {
      next_breakable = NextBreakablePosition(pos, line_break_type);
    }
    return pos == next_breakable;
  }

  // Same as above, using the break type configured on this object.
  inline bool IsBreakable(int pos, int& next_breakable) const {
    return IsBreakable(pos, next_breakable, break_type_);
  }

  // Returns whether |pos| is a breakable position under the configured break
  // type, without any caller-side caching.
  inline bool IsBreakable(int pos) const {
    // Only the answer for |pos| itself is needed, but this still performs a
    // full NextBreakablePosition() lookup; no length limit is applied here.
    // TODO(layout-dev): We should probably try to break out an actual
    // IsBreakable method from NextBreakablePosition and get rid of this hack.
    int next_breakable = NextBreakablePosition(pos, break_type_);
    return pos == next_breakable;
  }

  // Returns the break opportunity at or after |offset|.
  unsigned NextBreakOpportunity(unsigned offset) const;
  unsigned NextBreakOpportunity(unsigned offset, unsigned len) const;

  // Returns the break opportunity at or before |offset|.
  unsigned PreviousBreakOpportunity(unsigned offset, unsigned min = 0) const;

  // True for the space, tabulation and newline characters.
  static bool IsBreakableSpace(UChar ch) {
    return ch == kSpaceCharacter || ch == kTabulationCharacter ||
           ch == kNewlineCharacter;
  }

 private:
  // Releases the cached iterator (if any) and forgets the prior context it
  // was created with. Const because the cache members are mutable.
  void ReleaseIterator() const {
    if (iterator_)
      ReleaseLineBreakIterator(iterator_);
    iterator_ = nullptr;
    cached_prior_context_.text = nullptr;
    cached_prior_context_.length = 0;
  }

  // Obtain text break iterator, possibly previously cached, where this iterator
  // is (or has been) initialized to use the previously stored string as the
  // primary breaking context and using previously stored prior context if
  // non-empty.
  TextBreakIterator* GetIterator(const PriorContext& prior_context) const {
    DCHECK(prior_context.length <= kPriorContextCapacity);
    if (iterator_) {
      // The cached iterator is reusable only if it was created with a prior
      // context of the same length. For equal lengths GetPriorContext()
      // yields the same pointer, which the DCHECK verifies.
      if (prior_context.length == cached_prior_context_.length) {
        DCHECK_EQ(prior_context.text, cached_prior_context_.text);
        return iterator_;
      }
      ReleaseIterator();
    }

    // Create the iterator, or get one from the cache, for the text after
    // |start_offset_|. Because ICU TextBreakIterator computes all characters
    // from the beginning of the given text, using |start_offset_| improves the
    // performance significantly.
    //
    // For this reason, the offset for the TextBreakIterator must be adjusted by
    // |start_offset_|.
    cached_prior_context_ = prior_context;
    CHECK_LE(start_offset_, string_.length());
    if (string_.Is8Bit()) {
      iterator_ =
          AcquireLineBreakIterator(string_.Characters8() + start_offset_,
                                   string_.length() - start_offset_, locale_,
                                   prior_context.text, prior_context.length);
    } else {
      iterator_ =
          AcquireLineBreakIterator(string_.Characters16() + start_offset_,
                                   string_.length() - start_offset_, locale_,
                                   prior_context.text, prior_context.length);
    }
    return iterator_;
  }

  // Implementation helpers, defined out of line. The templated forms are
  // specialized on character type, line break type and break space type.
  template <typename CharacterType, LineBreakType, BreakSpaceType>
  int NextBreakablePosition(int pos, const CharacterType* str) const;
  template <typename CharacterType, LineBreakType>
  int NextBreakablePosition(int pos, const CharacterType* str) const;
  template <LineBreakType>
  int NextBreakablePosition(int pos) const;
  int NextBreakablePositionBreakCharacter(int pos) const;
  int NextBreakablePosition(int pos, LineBreakType) const;

  // Number of prior-context characters this class can remember.
  static const unsigned kPriorContextCapacity = 2;

  String string_;
  AtomicString locale_;
  // Lazily acquired iterator; nullptr until GetIterator() needs it.
  mutable TextBreakIterator* iterator_;
  // [0] is the second-to-last character, [1] the last; 0 means "not set".
  UChar prior_context_[kPriorContextCapacity];
  // The prior context |iterator_| was acquired with.
  mutable PriorContext cached_prior_context_;
  unsigned start_offset_ = 0;
  LineBreakType break_type_;
  BreakSpaceType break_space_ = BreakSpaceType::kBeforeEverySpace;
};
// Walks a string one "extended grapheme cluster" (UAX #29) at a time.
// Platform implementations may be less sophisticated than the spec: ICU
// before version 4.0, for example, only knows "legacy grapheme clusters".
// This is the iterator to use for general text processing such as string
// truncation.
class PLATFORM_EXPORT NonSharedCharacterBreakIterator final {
  STACK_ALLOCATED();

 public:
  explicit NonSharedCharacterBreakIterator(const StringView&);
  NonSharedCharacterBreakIterator(const UChar*, unsigned length);
  ~NonSharedCharacterBreakIterator();

  int Next();
  int Current();
  bool IsBreak(int offset) const;
  int Preceding(int offset) const;
  int Following(int offset) const;

  // True only when the text is not 8-bit and no TextBreakIterator is held.
  bool operator!() const { return !(is8_bit_ || iterator_); }

 private:
  void CreateIteratorForBuffer(const UChar*, unsigned length);

  // 8-bit path only: length in code units of the cluster starting at |offset|.
  unsigned ClusterLengthStartingAt(unsigned offset) const {
    DCHECK(is8_bit_);
    // CR LF is the one and only multi-character Extended Grapheme Cluster
    // that can occur in Latin-1 text.
    if (IsCRBeforeLF(offset))
      return 2;
    return 1;
  }

  // 8-bit path only: is |offset| a '\r' immediately followed by a '\n'?
  bool IsCRBeforeLF(unsigned offset) const {
    DCHECK(is8_bit_);
    if (charaters8_[offset] != '\r')
      return false;
    // Only look at the following character if it is inside the buffer.
    return offset + 1 < length_ && charaters8_[offset + 1] == '\n';
  }

  // 8-bit path only: is |offset| a '\n' immediately preceded by a '\r'?
  bool IsLFAfterCR(unsigned offset) const {
    DCHECK(is8_bit_);
    if (charaters8_[offset] != '\n')
      return false;
    // Only look at the preceding character if there is one.
    return offset >= 1 && charaters8_[offset - 1] == '\r';
  }

  bool is8_bit_;

  // State for 8-bit strings, which this class iterates by itself.
  const LChar* charaters8_;
  unsigned offset_;
  unsigned length_;

  // 16-bit strings are delegated to a TextBreakIterator.
  TextBreakIterator* iterator_;

  DISALLOW_COPY_AND_ASSIGN(NonSharedCharacterBreakIterator);
};
// Counts the number of grapheme clusters in the string. A surrogate pair, or
// a non-combining character together with the combining characters that
// follow it, is counted as 1 grapheme cluster.
PLATFORM_EXPORT unsigned NumGraphemeClusters(const String&);
// Returns the number of code units that the next grapheme cluster is made of.
// NOTE(review): the unnamed unsigned parameter (default 0) is presumably the
// offset in the string at which that cluster starts - confirm.
PLATFORM_EXPORT unsigned LengthOfGraphemeCluster(const String&, unsigned = 0);
// Produces, through the |graphemes| out-parameter, a list of grapheme clusters
// at each character of |text|, using character break rules.
// NOTE(review): the exact meaning of each stored unsigned value is not visible
// in this header - confirm against the implementation.
PLATFORM_EXPORT void GraphemesClusterList(const StringView& text,
Vector<unsigned>* graphemes);
} // namespace blink
#endif