// third_party/WebKit/Source/platform/text/TextBreakIterator.h - chromium/src - Git at Google

 /*
  * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
  * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Library General Public
  * License as published by the Free Software Foundation; either
  * version 2 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Library General Public License for more details.
  *
  * You should have received a copy of the GNU Library General Public License
  * along with this library; see the file COPYING.LIB.  If not, write to
  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  * Boston, MA 02110-1301, USA.
  *
  */

 #ifndef TextBreakIterator_h
 #define TextBreakIterator_h

 #include "platform/PlatformExport.h"
 #include "wtf/text/AtomicString.h"
 #include "wtf/text/Unicode.h"

 #include <unicode/brkiter.h>

 namespace blink {

 // Every break iterator exposed by this header is an ICU BreakIterator.
 using TextBreakIterator = icu::BreakIterator;

 // Note: The returned iterator is good only until you get another iterator, with
 // the exception of acquireLineBreakIterator.

 // This is similar to character break iterator in most cases, but is subject to
 // platform UI conventions. One notable example where this can be different
 // from character break iterator is Thai prepend characters, see bug 24342.
 // Use this for insertion point and selection manipulations.
 PLATFORM_EXPORT TextBreakIterator* cursorMovementIterator(
     const UChar* characters,
     int length);

 // Word break iterators.
 // NOTE(review): |start| and |length| presumably select the part of |string|
 // to iterate over - confirm against the implementation.
 PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const String& string,
                                                      int start,
                                                      int length);
 PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const UChar* characters,
                                                      int length);

 // Line break iterators for 8-bit (LChar) and 16-bit (UChar) text. Both take a
 // locale and a run of prior-context characters, which may be null with a
 // length of zero. LazyLineBreakIterator in this header pairs every acquire
 // with a call to releaseLineBreakIterator().
 PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(
     const LChar* characters,
     int length,
     const AtomicString& locale,
     const UChar* priorContext,
     unsigned priorContextLength);
 PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(
     const UChar* characters,
     int length,
     const AtomicString& locale,
     const UChar* priorContext,
     unsigned priorContextLength);
 PLATFORM_EXPORT void releaseLineBreakIterator(TextBreakIterator* iterator);

 PLATFORM_EXPORT TextBreakIterator* sentenceBreakIterator(
     const UChar* characters,
     int length);

 PLATFORM_EXPORT bool isWordTextBreak(TextBreakIterator* iterator);

 // Sentinel position value.
 // NOTE(review): presumably mirrors icu::BreakIterator::DONE (no further
 // boundary) - confirm against the ICU headers.
 constexpr int TextBreakDone = -1;

 // Selects how permissive line breaking is; the matching word-break value is
 // noted on each enumerator.
 enum class LineBreakType {
   // Used when no other type is requested (see the default argument of
   // LazyLineBreakIterator::isBreakable).
   Normal,
   // word-break:break-all allows breaks between letters/numbers.
   BreakAll,
   // word-break:keep-all doesn't allow breaks between any kind of
   // letters/numbers, except for some South East Asian ones.
   KeepAll,
 };

 // Wraps a line-break TextBreakIterator that is only acquired when get() is
 // first called. The object stores the string and locale to break, together
 // with up to two UChars of prior context (the last and second-to-last
 // characters), and gives the iterator back through releaseLineBreakIterator()
 // when it is reset or destroyed.
 //
 // NOTE(review): the destructor releases m_iterator but copying is not
 // disabled in this class; unless STACK_ALLOCATED() already prevents it,
 // confirm that no caller copies an instance that holds an iterator.
 class PLATFORM_EXPORT LazyLineBreakIterator final {
   STACK_ALLOCATED();

  public:
   // Starts with a default-constructed string and locale, no iterator and a
   // zeroed prior context.
   LazyLineBreakIterator()
       : m_iterator(nullptr),
         m_cachedPriorContext(nullptr),
         m_cachedPriorContextLength(0) {
     resetPriorContext();
   }

   // Stores |string| and |locale| for later use by get(). No iterator is
   // acquired here and the prior context starts out zeroed.
   LazyLineBreakIterator(String string,
                         const AtomicString& locale = AtomicString())
       : m_string(string),
         m_locale(locale),
         m_iterator(nullptr),
         m_cachedPriorContext(nullptr),
         m_cachedPriorContextLength(0) {
     resetPriorContext();
   }

   // Hands a still-held iterator back via releaseLineBreakIterator().
   ~LazyLineBreakIterator() {
     if (m_iterator)
       releaseLineBreakIterator(m_iterator);
   }

   // Returns the stored string.
   String getString() const { return m_string; }

   // Returns the most recent prior-context character (slot 1); it is 0 after
   // resetPriorContext().
   UChar lastCharacter() const {
     static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                   "TextBreakIterator has unexpected prior context length");
     return m_priorContext[1];
   }

   // Returns the prior-context character before the last one (slot 0).
   UChar secondToLastCharacter() const {
     static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                   "TextBreakIterator has unexpected prior context length");
     return m_priorContext[0];
   }

   // Overwrites both prior-context slots. Note the argument order: the most
   // recent character comes first.
   void setPriorContext(UChar last, UChar secondToLast) {
     static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                   "TextBreakIterator has unexpected prior context length");
     m_priorContext[0] = secondToLast;
     m_priorContext[1] = last;
   }

   // Shifts the current last character into the second-to-last slot and
   // records |last| as the new last character.
   void updatePriorContext(UChar last) {
     static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                   "TextBreakIterator has unexpected prior context length");
     setPriorContext(last, m_priorContext[1]);
   }

   // Zeroes both prior-context slots.
   void resetPriorContext() {
     static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                   "TextBreakIterator has unexpected prior context length");
     setPriorContext(0, 0);
   }

   // Returns how many prior-context characters are set, counting back from the
   // last one. A zero last character yields 0 regardless of the slot before
   // it.
   unsigned priorContextLength() const {
     static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                   "TextBreakIterator has unexpected prior context length");
     if (!m_priorContext[1])
       return 0;
     return m_priorContext[0] ? 2 : 1;
   }

   // Returns the line break iterator for the stored string, using the trailing
   // |priorContextLength| prior-context characters (none when it is zero). The
   // iterator is acquired on the first call and reused afterwards; when a call
   // asks for a different prior context than the one the held iterator was
   // built with, that iterator is released and a new one is acquired.
   TextBreakIterator* get(unsigned priorContextLength) {
     ASSERT(priorContextLength <= priorContextCapacity);
     const UChar* priorContext = nullptr;
     if (priorContextLength) {
       priorContext =
           &m_priorContext[priorContextCapacity - priorContextLength];
     }

     // A held iterator only matches the prior context it was acquired with.
     const bool cacheIsStale =
         m_iterator && (priorContext != m_cachedPriorContext ||
                        priorContextLength != m_cachedPriorContextLength);
     if (cacheIsStale)
       resetStringAndReleaseIterator(m_string, m_locale);

     if (!m_iterator) {
       m_iterator =
           m_string.is8Bit()
               ? acquireLineBreakIterator(m_string.characters8(),
                                          m_string.length(), m_locale,
                                          priorContext, priorContextLength)
               : acquireLineBreakIterator(m_string.characters16(),
                                          m_string.length(), m_locale,
                                          priorContext, priorContextLength);
       m_cachedPriorContext = priorContext;
       m_cachedPriorContextLength = priorContextLength;
     }
     return m_iterator;
   }

   // Releases any held iterator, then stores |string| and |locale| and clears
   // the cached prior-context bookkeeping. The prior-context characters
   // themselves are left untouched.
   void resetStringAndReleaseIterator(String string,
                                      const AtomicString& locale) {
     if (m_iterator)
       releaseLineBreakIterator(m_iterator);
     m_iterator = nullptr;
     m_cachedPriorContext = nullptr;
     m_cachedPriorContextLength = 0;

     m_string = string;
     m_locale = locale;
   }

   // Reports whether |pos| equals |nextBreakable|. |nextBreakable| is an
   // in/out value: it is recomputed, using the lookup that matches
   // |lineBreakType|, only when |pos| has moved past it.
   inline bool isBreakable(int pos,
                           int& nextBreakable,
                           LineBreakType lineBreakType = LineBreakType::Normal) {
     if (pos > nextBreakable) {
       if (lineBreakType == LineBreakType::BreakAll)
         nextBreakable = nextBreakablePositionBreakAll(pos);
       else if (lineBreakType == LineBreakType::KeepAll)
         nextBreakable = nextBreakablePositionKeepAll(pos);
       else
         nextBreakable = nextBreakablePositionIgnoringNBSP(pos);
     }
     return pos == nextBreakable;
   }

  private:
   // Per-LineBreakType lookups used by isBreakable(); defined out of line.
   int nextBreakablePositionIgnoringNBSP(int pos);
   int nextBreakablePositionBreakAll(int pos);
   int nextBreakablePositionKeepAll(int pos);

   static const unsigned priorContextCapacity = 2;
   String m_string;
   AtomicString m_locale;
   // Null until get() acquires an iterator.
   TextBreakIterator* m_iterator;
   // Slot 0 is the second-to-last character, slot 1 the last one.
   UChar m_priorContext[priorContextCapacity];
   // Prior-context pointer and length that m_iterator was acquired with.
   const UChar* m_cachedPriorContext;
   unsigned m_cachedPriorContextLength;
 };

 // Iterates over "extended grapheme clusters", as defined in UAX #29.
 // Note that platform implementations may be less sophisticated - e.g. ICU prior
 // to version 4.0 only supports "legacy grapheme clusters".  Use this for
 // general text processing, e.g. string truncation.
 //
 // 8-bit text is iterated by this class itself, where CR LF is treated as the
 // only multi-character cluster; 16-bit text goes through a TextBreakIterator.
 class PLATFORM_EXPORT NonSharedCharacterBreakIterator final {
   STACK_ALLOCATED();
   WTF_MAKE_NONCOPYABLE(NonSharedCharacterBreakIterator);

  public:
   explicit NonSharedCharacterBreakIterator(const String&);
   NonSharedCharacterBreakIterator(const UChar*, unsigned length);
   ~NonSharedCharacterBreakIterator();

   int next();
   int current();

   bool isBreak(int offset) const;
   int preceding(int offset) const;
   int following(int offset) const;

   // True when this object is neither in 8-bit mode nor holding a
   // TextBreakIterator.
   bool operator!() const { return !(m_is8Bit || m_iterator); }

  private:
   void createIteratorForBuffer(const UChar*, unsigned length);

   // Length of the cluster that begins at |offset| in the 8-bit buffer.
   unsigned clusterLengthStartingAt(unsigned offset) const {
     ASSERT(m_is8Bit);
     // The only Latin-1 Extended Grapheme Cluster is CR LF
     if (isCRBeforeLF(offset))
       return 2;
     return 1;
   }

   // True when the 8-bit character at |offset| is CR and the next character,
   // if it lies inside the buffer, is LF. |offset| itself is not range
   // checked.
   bool isCRBeforeLF(unsigned offset) const {
     ASSERT(m_is8Bit);
     if (m_charaters8[offset] != '\r')
       return false;
     if (offset + 1 >= m_length)
       return false;
     return m_charaters8[offset + 1] == '\n';
   }

   // True when the 8-bit character at |offset| is LF and it is directly
   // preceded by CR. |offset| itself is not range checked.
   bool isLFAfterCR(unsigned offset) const {
     ASSERT(m_is8Bit);
     if (m_charaters8[offset] != '\n')
       return false;
     if (offset < 1)
       return false;
     return m_charaters8[offset - 1] == '\r';
   }

   bool m_is8Bit;

   // For 8 bit strings, we implement the iterator ourselves.
   const LChar* m_charaters8;
   unsigned m_offset;
   unsigned m_length;

   // For 16 bit strings, we use a TextBreakIterator.
   TextBreakIterator* m_iterator;
 };

 // Returns the number of grapheme clusters in |string|. A surrogate pair, or a
 // non-combining character together with the combining characters that follow
 // it, counts as a single grapheme cluster.
 PLATFORM_EXPORT unsigned numGraphemeClusters(const String& string);

 }  // namespace blink

 #endif
	/*
	* Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
	* Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
	*
	* This library is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Library General Public
	* License as published by the Free Software Foundation; either
	* version 2 of the License, or (at your option) any later version.
	*
	* This library is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Library General Public License for more details.
	*
	* You should have received a copy of the GNU Library General Public License
	* along with this library; see the file COPYING.LIB. If not, write to
	* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
	* Boston, MA 02110-1301, USA.
	*
	*/

	#ifndef TextBreakIterator_h
	#define TextBreakIterator_h

	#include "platform/PlatformExport.h"
	#include "wtf/text/AtomicString.h"
	#include "wtf/text/Unicode.h"

	#include <unicode/brkiter.h>

	namespace blink {

	// Every break iterator exposed by this header is an ICU BreakIterator.
	using TextBreakIterator = icu::BreakIterator;

	// Note: The returned iterator is good only until you get another iterator, with
	// the exception of acquireLineBreakIterator.

	// This is similar to character break iterator in most cases, but is subject to
	// platform UI conventions. One notable example where this can be different
	// from character break iterator is Thai prepend characters, see bug 24342.
	// Use this for insertion point and selection manipulations.
	PLATFORM_EXPORT TextBreakIterator* cursorMovementIterator(
	    const UChar* characters,
	    int length);

	// Word break iterators.
	// NOTE(review): |start| and |length| presumably select the part of |string|
	// to iterate over - confirm against the implementation.
	PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const String& string,
	                                                     int start,
	                                                     int length);
	PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const UChar* characters,
	                                                     int length);

	// Line break iterators for 8-bit (LChar) and 16-bit (UChar) text. Both take a
	// locale and a run of prior-context characters, which may be null with a
	// length of zero. LazyLineBreakIterator in this header pairs every acquire
	// with a call to releaseLineBreakIterator().
	PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(
	    const LChar* characters,
	    int length,
	    const AtomicString& locale,
	    const UChar* priorContext,
	    unsigned priorContextLength);
	PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(
	    const UChar* characters,
	    int length,
	    const AtomicString& locale,
	    const UChar* priorContext,
	    unsigned priorContextLength);
	PLATFORM_EXPORT void releaseLineBreakIterator(TextBreakIterator* iterator);

	PLATFORM_EXPORT TextBreakIterator* sentenceBreakIterator(
	    const UChar* characters,
	    int length);

	PLATFORM_EXPORT bool isWordTextBreak(TextBreakIterator* iterator);

	// Sentinel position value.
	// NOTE(review): presumably mirrors icu::BreakIterator::DONE (no further
	// boundary) - confirm against the ICU headers.
	constexpr int TextBreakDone = -1;

	// Selects how permissive line breaking is; the matching word-break value is
	// noted on each enumerator.
	enum class LineBreakType {
	  // Used when no other type is requested (see the default argument of
	  // LazyLineBreakIterator::isBreakable).
	  Normal,
	  // word-break:break-all allows breaks between letters/numbers.
	  BreakAll,
	  // word-break:keep-all doesn't allow breaks between any kind of
	  // letters/numbers, except for some South East Asian ones.
	  KeepAll,
	};

	// Wraps a line-break TextBreakIterator that is only acquired when get() is
	// first called. The object stores the string and locale to break, together
	// with up to two UChars of prior context (the last and second-to-last
	// characters), and gives the iterator back through releaseLineBreakIterator()
	// when it is reset or destroyed.
	//
	// NOTE(review): the destructor releases m_iterator but copying is not
	// disabled in this class; unless STACK_ALLOCATED() already prevents it,
	// confirm that no caller copies an instance that holds an iterator.
	class PLATFORM_EXPORT LazyLineBreakIterator final {
	  STACK_ALLOCATED();

	 public:
	  // Starts with a default-constructed string and locale, no iterator and a
	  // zeroed prior context.
	  LazyLineBreakIterator()
	      : m_iterator(nullptr),
	        m_cachedPriorContext(nullptr),
	        m_cachedPriorContextLength(0) {
	    resetPriorContext();
	  }

	  // Stores |string| and |locale| for later use by get(). No iterator is
	  // acquired here and the prior context starts out zeroed.
	  LazyLineBreakIterator(String string,
	                        const AtomicString& locale = AtomicString())
	      : m_string(string),
	        m_locale(locale),
	        m_iterator(nullptr),
	        m_cachedPriorContext(nullptr),
	        m_cachedPriorContextLength(0) {
	    resetPriorContext();
	  }

	  // Hands a still-held iterator back via releaseLineBreakIterator().
	  ~LazyLineBreakIterator() {
	    if (m_iterator)
	      releaseLineBreakIterator(m_iterator);
	  }

	  // Returns the stored string.
	  String getString() const { return m_string; }

	  // Returns the most recent prior-context character (slot 1); it is 0 after
	  // resetPriorContext().
	  UChar lastCharacter() const {
	    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
	                  "TextBreakIterator has unexpected prior context length");
	    return m_priorContext[1];
	  }

	  // Returns the prior-context character before the last one (slot 0).
	  UChar secondToLastCharacter() const {
	    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
	                  "TextBreakIterator has unexpected prior context length");
	    return m_priorContext[0];
	  }

	  // Overwrites both prior-context slots. Note the argument order: the most
	  // recent character comes first.
	  void setPriorContext(UChar last, UChar secondToLast) {
	    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
	                  "TextBreakIterator has unexpected prior context length");
	    m_priorContext[0] = secondToLast;
	    m_priorContext[1] = last;
	  }

	  // Shifts the current last character into the second-to-last slot and
	  // records |last| as the new last character.
	  void updatePriorContext(UChar last) {
	    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
	                  "TextBreakIterator has unexpected prior context length");
	    setPriorContext(last, m_priorContext[1]);
	  }

	  // Zeroes both prior-context slots.
	  void resetPriorContext() {
	    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
	                  "TextBreakIterator has unexpected prior context length");
	    setPriorContext(0, 0);
	  }

	  // Returns how many prior-context characters are set, counting back from the
	  // last one. A zero last character yields 0 regardless of the slot before
	  // it.
	  unsigned priorContextLength() const {
	    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
	                  "TextBreakIterator has unexpected prior context length");
	    if (!m_priorContext[1])
	      return 0;
	    return m_priorContext[0] ? 2 : 1;
	  }

	  // Returns the line break iterator for the stored string, using the trailing
	  // |priorContextLength| prior-context characters (none when it is zero). The
	  // iterator is acquired on the first call and reused afterwards; when a call
	  // asks for a different prior context than the one the held iterator was
	  // built with, that iterator is released and a new one is acquired.
	  TextBreakIterator* get(unsigned priorContextLength) {
	    ASSERT(priorContextLength <= priorContextCapacity);
	    const UChar* priorContext = nullptr;
	    if (priorContextLength) {
	      priorContext =
	          &m_priorContext[priorContextCapacity - priorContextLength];
	    }

	    // A held iterator only matches the prior context it was acquired with.
	    const bool cacheIsStale =
	        m_iterator && (priorContext != m_cachedPriorContext ||
	                       priorContextLength != m_cachedPriorContextLength);
	    if (cacheIsStale)
	      resetStringAndReleaseIterator(m_string, m_locale);

	    if (!m_iterator) {
	      m_iterator =
	          m_string.is8Bit()
	              ? acquireLineBreakIterator(m_string.characters8(),
	                                         m_string.length(), m_locale,
	                                         priorContext, priorContextLength)
	              : acquireLineBreakIterator(m_string.characters16(),
	                                         m_string.length(), m_locale,
	                                         priorContext, priorContextLength);
	      m_cachedPriorContext = priorContext;
	      m_cachedPriorContextLength = priorContextLength;
	    }
	    return m_iterator;
	  }

	  // Releases any held iterator, then stores |string| and |locale| and clears
	  // the cached prior-context bookkeeping. The prior-context characters
	  // themselves are left untouched.
	  void resetStringAndReleaseIterator(String string,
	                                     const AtomicString& locale) {
	    if (m_iterator)
	      releaseLineBreakIterator(m_iterator);
	    m_iterator = nullptr;
	    m_cachedPriorContext = nullptr;
	    m_cachedPriorContextLength = 0;

	    m_string = string;
	    m_locale = locale;
	  }

	  // Reports whether |pos| equals |nextBreakable|. |nextBreakable| is an
	  // in/out value: it is recomputed, using the lookup that matches
	  // |lineBreakType|, only when |pos| has moved past it.
	  inline bool isBreakable(int pos,
	                          int& nextBreakable,
	                          LineBreakType lineBreakType = LineBreakType::Normal) {
	    if (pos > nextBreakable) {
	      if (lineBreakType == LineBreakType::BreakAll)
	        nextBreakable = nextBreakablePositionBreakAll(pos);
	      else if (lineBreakType == LineBreakType::KeepAll)
	        nextBreakable = nextBreakablePositionKeepAll(pos);
	      else
	        nextBreakable = nextBreakablePositionIgnoringNBSP(pos);
	    }
	    return pos == nextBreakable;
	  }

	 private:
	  // Per-LineBreakType lookups used by isBreakable(); defined out of line.
	  int nextBreakablePositionIgnoringNBSP(int pos);
	  int nextBreakablePositionBreakAll(int pos);
	  int nextBreakablePositionKeepAll(int pos);

	  static const unsigned priorContextCapacity = 2;
	  String m_string;
	  AtomicString m_locale;
	  // Null until get() acquires an iterator.
	  TextBreakIterator* m_iterator;
	  // Slot 0 is the second-to-last character, slot 1 the last one.
	  UChar m_priorContext[priorContextCapacity];
	  // Prior-context pointer and length that m_iterator was acquired with.
	  const UChar* m_cachedPriorContext;
	  unsigned m_cachedPriorContextLength;
	};

	// Iterates over "extended grapheme clusters", as defined in UAX #29.
	// Note that platform implementations may be less sophisticated - e.g. ICU prior
	// to version 4.0 only supports "legacy grapheme clusters". Use this for
	// general text processing, e.g. string truncation.
	//
	// 8-bit text is iterated by this class itself, where CR LF is treated as the
	// only multi-character cluster; 16-bit text goes through a TextBreakIterator.
	class PLATFORM_EXPORT NonSharedCharacterBreakIterator final {
	  STACK_ALLOCATED();
	  WTF_MAKE_NONCOPYABLE(NonSharedCharacterBreakIterator);

	 public:
	  explicit NonSharedCharacterBreakIterator(const String&);
	  NonSharedCharacterBreakIterator(const UChar*, unsigned length);
	  ~NonSharedCharacterBreakIterator();

	  int next();
	  int current();

	  bool isBreak(int offset) const;
	  int preceding(int offset) const;
	  int following(int offset) const;

	  // True when this object is neither in 8-bit mode nor holding a
	  // TextBreakIterator.
	  bool operator!() const { return !(m_is8Bit || m_iterator); }

	 private:
	  void createIteratorForBuffer(const UChar*, unsigned length);

	  // Length of the cluster that begins at |offset| in the 8-bit buffer.
	  unsigned clusterLengthStartingAt(unsigned offset) const {
	    ASSERT(m_is8Bit);
	    // The only Latin-1 Extended Grapheme Cluster is CR LF
	    if (isCRBeforeLF(offset))
	      return 2;
	    return 1;
	  }

	  // True when the 8-bit character at |offset| is CR and the next character,
	  // if it lies inside the buffer, is LF. |offset| itself is not range
	  // checked.
	  bool isCRBeforeLF(unsigned offset) const {
	    ASSERT(m_is8Bit);
	    if (m_charaters8[offset] != '\r')
	      return false;
	    if (offset + 1 >= m_length)
	      return false;
	    return m_charaters8[offset + 1] == '\n';
	  }

	  // True when the 8-bit character at |offset| is LF and it is directly
	  // preceded by CR. |offset| itself is not range checked.
	  bool isLFAfterCR(unsigned offset) const {
	    ASSERT(m_is8Bit);
	    if (m_charaters8[offset] != '\n')
	      return false;
	    if (offset < 1)
	      return false;
	    return m_charaters8[offset - 1] == '\r';
	  }

	  bool m_is8Bit;

	  // For 8 bit strings, we implement the iterator ourselves.
	  const LChar* m_charaters8;
	  unsigned m_offset;
	  unsigned m_length;

	  // For 16 bit strings, we use a TextBreakIterator.
	  TextBreakIterator* m_iterator;
	};

	// Returns the number of grapheme clusters in |string|. A surrogate pair, or a
	// non-combining character together with the combining characters that follow
	// it, counts as a single grapheme cluster.
	PLATFORM_EXPORT unsigned numGraphemeClusters(const String& string);

	} // namespace blink

	#endif