blob: 1e1718173f7b1ffd7195966f423aa4222f147141 [file] [log] [blame]
/*
* Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
* Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public License
* along with this library; see the file COPYING.LIB. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301, USA.
*
*/
#ifndef TextBreakIterator_h
#define TextBreakIterator_h
#include "platform/PlatformExport.h"
#include "wtf/text/AtomicString.h"
#include "wtf/text/Unicode.h"
#include <unicode/brkiter.h>
namespace blink {
// Every break iterator handed out by this header is an ICU BreakIterator;
// TextBreakIterator is simply the Blink-side name for that type.
using TextBreakIterator = icu::BreakIterator;
// Note: The returned iterator is good only until you get another iterator, with
// the exception of acquireLineBreakIterator.

// Returns an iterator for cursor movement over the given UTF-16 buffer.
// This is similar to character break iterator in most cases, but is subject to
// platform UI conventions. One notable example where this can be different
// from character break iterator is Thai prepend characters, see bug 24342.
// Use this for insertion point and selection manipulations.
PLATFORM_EXPORT TextBreakIterator* cursorMovementIterator(const UChar*,
int length);
// Returns a word break iterator for part of a String.
// NOTE(review): the parameter names suggest the iterated range is the `length`
// code units beginning at `start` - confirm against the implementation.
PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const String&,
int start,
int length);
// Returns a word break iterator for a raw UTF-16 buffer of `length` code units.
PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const UChar*, int length);
// Acquires a line break iterator for 8-bit (LChar) text. `locale` selects the
// line breaking locale; `priorContext`/`priorContextLength` describe the
// characters that precede the text (LazyLineBreakIterator passes a null
// pointer and 0 when there is no prior context). Unlike the other iterators in
// this header, the result must be handed back with releaseLineBreakIterator(),
// as LazyLineBreakIterator does in its destructor.
PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(
const LChar*,
int length,
const AtomicString& locale,
const UChar* priorContext,
unsigned priorContextLength);
// Same as above, for 16-bit (UChar) text.
PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(
const UChar*,
int length,
const AtomicString& locale,
const UChar* priorContext,
unsigned priorContextLength);
// Hands back an iterator obtained from acquireLineBreakIterator().
PLATFORM_EXPORT void releaseLineBreakIterator(TextBreakIterator*);
// Returns a sentence break iterator for a raw UTF-16 buffer of `length` code
// units.
PLATFORM_EXPORT TextBreakIterator* sentenceBreakIterator(const UChar*,
int length);
// NOTE(review): presumably reports whether the iterator's current boundary
// ends an actual word (as opposed to whitespace/punctuation) - confirm against
// the implementation.
PLATFORM_EXPORT bool isWordTextBreak(TextBreakIterator*);
// Sentinel offset. NOTE(review): the value matches ICU's BreakIterator::DONE
// (-1), so this presumably signals "no further boundary" - confirm at callers.
const int TextBreakDone = -1;
// Line breaking behaviour selected by the CSS word-break property.
enum class LineBreakType {
  // Default line breaking rules.
  Normal,

  // word-break:break-all allows breaks between letters/numbers.
  BreakAll,

  // word-break:keep-all doesn't allow breaks between any kind of
  // letters/numbers, except for some South East Asian ones.
  KeepAll,
};
// Wrapper that defers acquiring a line break iterator until get() is first
// called and releases it again on destruction.
//
// The object stores the string to break, the locale, and up to two UTF-16 code
// units of "prior context" (the characters preceding the string). A stored
// value of 0 in a prior-context slot means "no character".
//
// The acquired iterator is released in the destructor, so the class is
// non-copyable: a copy would release the same iterator a second time.
class PLATFORM_EXPORT LazyLineBreakIterator final {
  STACK_ALLOCATED();

 public:
  // Creates an iterator over an empty string with no prior context.
  LazyLineBreakIterator()
      : m_iterator(nullptr),
        m_cachedPriorContext(nullptr),
        m_cachedPriorContextLength(0) {
    resetPriorContext();
  }

  // Creates an iterator over |string| using |locale| for line breaking. The
  // underlying iterator is not acquired until get() is called.
  LazyLineBreakIterator(String string,
                        const AtomicString& locale = AtomicString())
      : m_string(string),
        m_locale(locale),
        m_iterator(nullptr),
        m_cachedPriorContext(nullptr),
        m_cachedPriorContextLength(0) {
    resetPriorContext();
  }

  // Copying would make two objects release the same acquired iterator.
  LazyLineBreakIterator(const LazyLineBreakIterator&) = delete;
  LazyLineBreakIterator& operator=(const LazyLineBreakIterator&) = delete;

  ~LazyLineBreakIterator() {
    if (m_iterator)
      releaseLineBreakIterator(m_iterator);
  }

  // Returns the string this iterator breaks.
  String getString() const { return m_string; }

  // Returns the code unit immediately preceding the string (0 if none).
  UChar lastCharacter() const {
    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                  "TextBreakIterator has unexpected prior context length");
    return m_priorContext[1];
  }

  // Returns the code unit preceding lastCharacter() (0 if none).
  UChar secondToLastCharacter() const {
    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                  "TextBreakIterator has unexpected prior context length");
    return m_priorContext[0];
  }

  // Replaces both prior-context code units.
  void setPriorContext(UChar last, UChar secondToLast) {
    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                  "TextBreakIterator has unexpected prior context length");
    m_priorContext[0] = secondToLast;
    m_priorContext[1] = last;
  }

  // Shifts the prior context by one: the old last character becomes the
  // second-to-last one and |last| becomes the new last character.
  void updatePriorContext(UChar last) {
    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                  "TextBreakIterator has unexpected prior context length");
    m_priorContext[0] = m_priorContext[1];
    m_priorContext[1] = last;
  }

  // Clears the prior context (both slots become 0).
  void resetPriorContext() {
    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                  "TextBreakIterator has unexpected prior context length");
    m_priorContext[0] = 0;
    m_priorContext[1] = 0;
  }

  // Returns how many prior-context code units are set, counting backwards
  // from the last character: 0 if the last character is unset, 1 if only the
  // last character is set, 2 if both are set.
  unsigned priorContextLength() const {
    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                  "TextBreakIterator has unexpected prior context length");
    if (!m_priorContext[1])
      return 0;
    return m_priorContext[0] ? 2u : 1u;
  }

  // Obtain text break iterator, possibly previously cached, where this iterator
  // is (or has been) initialized to use the previously stored string as the
  // primary breaking context and using previously stored prior context if
  // non-empty.
  //
  // |priorContextLength| is the number of trailing prior-context code units to
  // hand to the iterator; it must not exceed priorContextCapacity.
  TextBreakIterator* get(unsigned priorContextLength) {
    ASSERT(priorContextLength <= priorContextCapacity);
    // Points at the last |priorContextLength| entries of m_priorContext, or is
    // null when no prior context is requested.
    const UChar* priorContext =
        priorContextLength
            ? &m_priorContext[priorContextCapacity - priorContextLength]
            : nullptr;

    // The cache key is the (pointer, length) pair, not the characters
    // themselves. A cached iterator acquired for a different pair is dropped
    // here and re-acquired below.
    if (m_iterator && (priorContext != m_cachedPriorContext ||
                       priorContextLength != m_cachedPriorContextLength)) {
      resetStringAndReleaseIterator(m_string, m_locale);
    }

    if (!m_iterator) {
      if (m_string.is8Bit()) {
        m_iterator = acquireLineBreakIterator(m_string.characters8(),
                                              m_string.length(), m_locale,
                                              priorContext, priorContextLength);
      } else {
        m_iterator = acquireLineBreakIterator(m_string.characters16(),
                                              m_string.length(), m_locale,
                                              priorContext, priorContextLength);
      }
      m_cachedPriorContext = priorContext;
      m_cachedPriorContextLength = priorContextLength;
    }
    return m_iterator;
  }

  // Releases any acquired iterator and switches to a new string and locale.
  // The prior context characters are left untouched; only the cached
  // (pointer, length) key is cleared.
  void resetStringAndReleaseIterator(String string,
                                     const AtomicString& locale) {
    if (m_iterator)
      releaseLineBreakIterator(m_iterator);
    m_string = string;
    m_locale = locale;
    m_iterator = nullptr;
    m_cachedPriorContext = nullptr;
    m_cachedPriorContextLength = 0;
  }

  // Returns true if |pos| is a break opportunity. |nextBreakable| is an in/out
  // cache of the next breakable position: it is recomputed (using the lookup
  // matching |lineBreakType|) only when |pos| has moved past it.
  inline bool isBreakable(int pos,
                          int& nextBreakable,
                          LineBreakType lineBreakType = LineBreakType::Normal) {
    if (pos > nextBreakable) {
      switch (lineBreakType) {
        case LineBreakType::BreakAll:
          nextBreakable = nextBreakablePositionBreakAll(pos);
          break;
        case LineBreakType::KeepAll:
          nextBreakable = nextBreakablePositionKeepAll(pos);
          break;
        default:
          nextBreakable = nextBreakablePositionIgnoringNBSP(pos);
      }
    }
    return pos == nextBreakable;
  }

 private:
  // Lookups for the next breakable position at or after |pos|, one per
  // LineBreakType (defined out of line).
  int nextBreakablePositionIgnoringNBSP(int pos);
  int nextBreakablePositionBreakAll(int pos);
  int nextBreakablePositionKeepAll(int pos);

  // Number of prior-context code units that can be stored.
  static const unsigned priorContextCapacity = 2;

  String m_string;
  AtomicString m_locale;
  // Acquired lazily by get(); null while no iterator is held.
  TextBreakIterator* m_iterator;
  // [0] is the second-to-last character, [1] the last; 0 means unset.
  UChar m_priorContext[priorContextCapacity];
  // Prior-context pointer and length that m_iterator was acquired with.
  const UChar* m_cachedPriorContext;
  unsigned m_cachedPriorContextLength;
};
// Break iterator over "extended grapheme clusters", as defined in UAX #29.
// Platform implementations may be less sophisticated - e.g. ICU prior to
// version 4.0 only supports "legacy grapheme clusters". Use this for general
// text processing, e.g. string truncation.
//
// 8-bit text is handled by this class directly; 16-bit text is delegated to a
// TextBreakIterator.
class PLATFORM_EXPORT NonSharedCharacterBreakIterator final {
  STACK_ALLOCATED();
  WTF_MAKE_NONCOPYABLE(NonSharedCharacterBreakIterator);

 public:
  explicit NonSharedCharacterBreakIterator(const String&);
  NonSharedCharacterBreakIterator(const UChar*, unsigned length);
  ~NonSharedCharacterBreakIterator();

  int next();
  int current();
  bool isBreak(int offset) const;
  int preceding(int offset) const;
  int following(int offset) const;

  // True when the object is neither in 8-bit mode nor holding an iterator.
  bool operator!() const { return !(m_is8Bit || m_iterator); }

 private:
  void createIteratorForBuffer(const UChar*, unsigned length);

  // Length, in code units, of the cluster that begins at |offset| (8-bit mode
  // only).
  unsigned clusterLengthStartingAt(unsigned offset) const {
    ASSERT(m_is8Bit);
    // The only Latin-1 Extended Grapheme Cluster is CR LF
    if (isCRBeforeLF(offset))
      return 2;
    return 1;
  }

  // True if |offset| holds a CR that is directly followed by an LF inside the
  // buffer (8-bit mode only).
  bool isCRBeforeLF(unsigned offset) const {
    ASSERT(m_is8Bit);
    if (m_charaters8[offset] != '\r')
      return false;
    const unsigned following = offset + 1;
    return following < m_length && m_charaters8[following] == '\n';
  }

  // True if |offset| holds an LF that is directly preceded by a CR (8-bit mode
  // only).
  bool isLFAfterCR(unsigned offset) const {
    ASSERT(m_is8Bit);
    if (m_charaters8[offset] != '\n')
      return false;
    return offset >= 1 && m_charaters8[offset - 1] == '\r';
  }

  bool m_is8Bit;

  // For 8 bit strings, we implement the iterator ourselves.
  // (The member name's spelling is kept as-is: out-of-line definitions that
  // are not visible in this header presumably refer to it.)
  const LChar* m_charaters8;
  unsigned m_offset;
  unsigned m_length;

  // For 16 bit strings, we use a TextBreakIterator.
  TextBreakIterator* m_iterator;
};
// Counts the number of grapheme clusters in the given String and returns that
// count. A surrogate pair, or a non-combining character together with the
// combining characters that follow it, is counted as 1 grapheme cluster.
PLATFORM_EXPORT unsigned numGraphemeClusters(const String&);
} // namespace blink
#endif