/*
 * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
 * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public License
 * along with this library; see the file COPYING.LIB.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 */

#ifndef THIRD_PARTY_BLINK_RENDERER_PLATFORM_TEXT_TEXT_BREAK_ITERATOR_H_
#define THIRD_PARTY_BLINK_RENDERER_PLATFORM_TEXT_TEXT_BREAK_ITERATOR_H_

#include "base/macros.h"
#include "third_party/blink/renderer/platform/platform_export.h"
#include "third_party/blink/renderer/platform/wtf/text/atomic_string.h"
#include "third_party/blink/renderer/platform/wtf/text/character_names.h"
#include "third_party/blink/renderer/platform/wtf/text/unicode.h"

#include <unicode/brkiter.h>

namespace blink {

typedef icu::BreakIterator TextBreakIterator;

// Note: The returned iterator is good only until you get another iterator, with
// the exception of acquireLineBreakIterator.

// This is similar to character break iterator in most cases, but is subject to
// platform UI conventions. One notable example where this can be different
// from character break iterator is Thai prepend characters, see bug 24342.
// Use this for insertion point and selection manipulations.
PLATFORM_EXPORT TextBreakIterator* CursorMovementIterator(const UChar*,
                                                          int length);

PLATFORM_EXPORT TextBreakIterator* WordBreakIterator(const String&,
                                                     int start,
                                                     int length);
PLATFORM_EXPORT TextBreakIterator* WordBreakIterator(const UChar*, int length);
PLATFORM_EXPORT TextBreakIterator* AcquireLineBreakIterator(
    const LChar*,
    int length,
    const AtomicString& locale,
    const UChar* prior_context,
    unsigned prior_context_length);
PLATFORM_EXPORT TextBreakIterator* AcquireLineBreakIterator(
    const UChar*,
    int length,
    const AtomicString& locale,
    const UChar* prior_context,
    unsigned prior_context_length);
PLATFORM_EXPORT void ReleaseLineBreakIterator(TextBreakIterator*);
PLATFORM_EXPORT TextBreakIterator* SentenceBreakIterator(const UChar*,
                                                         int length);

// Before calling this, check if the iterator is not at the end. Otherwise,
// it may not work as expected.
// See https://ssl.icu-project.org/trac/ticket/13447 .
PLATFORM_EXPORT bool IsWordTextBreak(TextBreakIterator*);

const int kTextBreakDone = -1;

enum class LineBreakType {
  kNormal,

  // word-break:break-all allows breaks between letters/numbers, but prohibits
  // break before/after certain punctuation.
  kBreakAll,

  // Allows breaks at every grapheme cluster boundary.
  // Terminal style line breaks described in UAX#14: Examples of Customization
  // http://unicode.org/reports/tr14/#Examples
  // CSS is discussing to add this feature crbug.com/720205
  // Used internally for word-break:break-word.
  kBreakCharacter,

  // word-break:keep-all doesn't allow breaks between all kind of
  // letters/numbers except some south east asians'.
  kKeepAll,
};

// Determines break opportunities around collapsible space characters (space,
// newline, and tabulation characters.)
enum class BreakSpaceType {
  // Break before every collapsible space character.
  // This is a specialized optimization for CSS, where leading/trailing spaces
  // in each line are removed, and thus breaking before spaces can save
  // computing hanging spaces.
  // Callers are expected to handle spaces by themselves. Because a run of
  // spaces can include different types of spaces, break opportunity is given
  // for every space character.
  // Pre-LayoutNG line breaker uses this type.
  kBeforeEverySpace,

  // Break before a run of white space characters.
  // This is for CSS line breaking as in |kBeforeEverySpace|, but when
  // whitespace collapsing is already applied to the target string. In this
  // case, a run of white spaces are preserved spaces. There should not be break
  // opportunities between white spaces.
  // LayoutNG line breaker uses this type.
  kBeforeSpaceRun,
};

PLATFORM_EXPORT std::ostream& operator<<(std::ostream&, LineBreakType);
PLATFORM_EXPORT std::ostream& operator<<(std::ostream&, BreakSpaceType);

class PLATFORM_EXPORT LazyLineBreakIterator final {
  STACK_ALLOCATED();

 public:
  LazyLineBreakIterator()
      : iterator_(nullptr),
        break_type_(LineBreakType::kNormal) {
    ResetPriorContext();
  }

  LazyLineBreakIterator(String string,
                        const AtomicString& locale = AtomicString(),
                        LineBreakType break_type = LineBreakType::kNormal)
      : string_(string),
        locale_(locale),
        iterator_(nullptr),
        break_type_(break_type) {
    ResetPriorContext();
  }

  ~LazyLineBreakIterator() {
    if (iterator_)
      ReleaseLineBreakIterator(iterator_);
  }

  const String& GetString() const { return string_; }

  UChar LastCharacter() const {
    static_assert(arraysize(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    return prior_context_[1];
  }

  UChar SecondToLastCharacter() const {
    static_assert(arraysize(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    return prior_context_[0];
  }

  void SetPriorContext(UChar last, UChar second_to_last) {
    static_assert(arraysize(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    prior_context_[0] = second_to_last;
    prior_context_[1] = last;
  }

  void UpdatePriorContext(UChar last) {
    static_assert(arraysize(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    prior_context_[0] = prior_context_[1];
    prior_context_[1] = last;
  }

  void ResetPriorContext() {
    static_assert(arraysize(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    prior_context_[0] = 0;
    prior_context_[1] = 0;
  }

  struct PriorContext {
    const UChar* text = nullptr;
    unsigned length = 0;
  };

  PriorContext GetPriorContext() const {
    static_assert(arraysize(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    if (prior_context_[1]) {
      if (prior_context_[0])
        return PriorContext{&prior_context_[0], 2};
      return PriorContext{&prior_context_[1], 1};
    }
    return PriorContext{nullptr, 0};
  }

  unsigned PriorContextLength() const { return GetPriorContext().length; }

  void ResetStringAndReleaseIterator(String string,
                                     const AtomicString& locale) {
    string_ = string;
    start_offset_ = 0;
    locale_ = locale;

    ReleaseIterator();
  }

  // Set the start offset. Text before this offset is disregarded. Properly
  // setting the start offset improves the performance significantly, because
  // ICU break iterator computes all the text from the beginning.
  void SetStartOffset(unsigned offset) {
    CHECK_LE(offset, string_.length());
    start_offset_ = offset;
    ReleaseIterator();
  }

  void SetLocale(const AtomicString& locale) {
    if (locale == locale_)
      return;
    locale_ = locale;
    ReleaseIterator();
  }

  LineBreakType BreakType() const { return break_type_; }
  void SetBreakType(LineBreakType break_type) { break_type_ = break_type; }
  BreakSpaceType BreakSpace() const { return break_space_; }
  void SetBreakSpace(BreakSpaceType break_space) { break_space_ = break_space; }

  inline bool IsBreakable(int pos,
                          int& next_breakable,
                          LineBreakType line_break_type) const {
    if (pos > next_breakable) {
      next_breakable = NextBreakablePosition(pos, line_break_type);
    }
    return pos == next_breakable;
  }

  inline bool IsBreakable(int pos, int& next_breakable) const {
    return IsBreakable(pos, next_breakable, break_type_);
  }

  inline bool IsBreakable(int pos) const {
    // No need to scan the entire string for the next breakable position when
    // all we need to determine is whether the current position is breakable.
    // Limit length to pos + 1.
    // TODO(layout-dev): We should probably try to break out an actual
    // IsBreakable method from NextBreakablePosition and get rid of this hack.
    int len = std::min(pos + 1, static_cast<int>(string_.length()));
    int next_breakable = NextBreakablePosition(pos, break_type_, len);
    return pos == next_breakable;
  }

  // Returns the break opportunity at or after |offset|.
  unsigned NextBreakOpportunity(unsigned offset) const;
  unsigned NextBreakOpportunity(unsigned offset, unsigned len) const;

  // Returns the break opportunity at or before |offset|.
  unsigned PreviousBreakOpportunity(unsigned offset, unsigned min = 0) const;

  static bool IsBreakableSpace(UChar ch) {
    return ch == kSpaceCharacter || ch == kTabulationCharacter ||
           ch == kNewlineCharacter;
  }

 private:
  void ReleaseIterator() const {
    if (iterator_)
      ReleaseLineBreakIterator(iterator_);
    iterator_ = nullptr;
    cached_prior_context_.text = nullptr;
    cached_prior_context_.length = 0;
  }

  // Obtain text break iterator, possibly previously cached, where this iterator
  // is (or has been) initialized to use the previously stored string as the
  // primary breaking context and using previously stored prior context if
  // non-empty.
  TextBreakIterator* GetIterator(const PriorContext& prior_context) const {
    DCHECK(prior_context.length <= kPriorContextCapacity);
    if (iterator_) {
      if (prior_context.length == cached_prior_context_.length) {
        DCHECK_EQ(prior_context.text, cached_prior_context_.text);
        return iterator_;
      }
      ReleaseIterator();
    }

    // Create the iterator, or get one from the cache, for the text after
    // |start_offset_|. Because ICU TextBreakIterator computes all characters
    // from the beginning of the given text, using |start_offset_| improves the
    // performance significantly.
    //
    // For this reason, the offset for the TextBreakIterator must be adjusted by
    // |start_offset_|.
    cached_prior_context_ = prior_context;
    CHECK_LE(start_offset_, string_.length());
    if (string_.Is8Bit()) {
      iterator_ =
          AcquireLineBreakIterator(string_.Characters8() + start_offset_,
                                   string_.length() - start_offset_, locale_,
                                   prior_context.text, prior_context.length);
    } else {
      iterator_ =
          AcquireLineBreakIterator(string_.Characters16() + start_offset_,
                                   string_.length() - start_offset_, locale_,
                                   prior_context.text, prior_context.length);
    }
    return iterator_;
  }

  template <typename CharacterType, LineBreakType, BreakSpaceType>
  int NextBreakablePosition(int pos, const CharacterType* str, int len) const;
  template <typename CharacterType, LineBreakType>
  int NextBreakablePosition(int pos, const CharacterType* str, int len) const;
  template <LineBreakType>
  int NextBreakablePosition(int pos, int len) const;
  int NextBreakablePositionBreakCharacter(int pos) const;
  int NextBreakablePosition(int pos, LineBreakType, int len) const;
  int NextBreakablePosition(int pos, LineBreakType) const;

  static const unsigned kPriorContextCapacity = 2;
  String string_;
  AtomicString locale_;
  mutable TextBreakIterator* iterator_;
  UChar prior_context_[kPriorContextCapacity];
  mutable PriorContext cached_prior_context_;
  unsigned start_offset_ = 0;
  LineBreakType break_type_;
  BreakSpaceType break_space_ = BreakSpaceType::kBeforeEverySpace;
};

// Iterates over "extended grapheme clusters", as defined in UAX #29.
// Note that platform implementations may be less sophisticated - e.g. ICU prior
// to version 4.0 only supports "legacy grapheme clusters".  Use this for
// general text processing, e.g. string truncation.

class PLATFORM_EXPORT NonSharedCharacterBreakIterator final {
  STACK_ALLOCATED();

 public:
  explicit NonSharedCharacterBreakIterator(const StringView&);
  NonSharedCharacterBreakIterator(const UChar*, unsigned length);
  ~NonSharedCharacterBreakIterator();

  int Next();
  int Current();

  bool IsBreak(int offset) const;
  int Preceding(int offset) const;
  int Following(int offset) const;

  bool operator!() const { return !is8_bit_ && !iterator_; }

 private:
  void CreateIteratorForBuffer(const UChar*, unsigned length);

  unsigned ClusterLengthStartingAt(unsigned offset) const {
    DCHECK(is8_bit_);
    // The only Latin-1 Extended Grapheme Cluster is CR LF
    return IsCRBeforeLF(offset) ? 2 : 1;
  }

  bool IsCRBeforeLF(unsigned offset) const {
    DCHECK(is8_bit_);
    return charaters8_[offset] == '\r' && offset + 1 < length_ &&
           charaters8_[offset + 1] == '\n';
  }

  bool IsLFAfterCR(unsigned offset) const {
    DCHECK(is8_bit_);
    return charaters8_[offset] == '\n' && offset >= 1 &&
           charaters8_[offset - 1] == '\r';
  }

  bool is8_bit_;

  // For 8 bit strings, we implement the iterator ourselves.
  const LChar* charaters8_;
  unsigned offset_;
  unsigned length_;

  // For 16 bit strings, we use a TextBreakIterator.
  TextBreakIterator* iterator_;

  DISALLOW_COPY_AND_ASSIGN(NonSharedCharacterBreakIterator);
};

// Counts the number of grapheme clusters. A surrogate pair or a sequence
// of a non-combining character and following combining characters is
// counted as 1 grapheme cluster.
PLATFORM_EXPORT unsigned NumGraphemeClusters(const String&);

// Returns the number of code units that the next grapheme cluster is made of.
PLATFORM_EXPORT unsigned LengthOfGraphemeCluster(const String&, unsigned = 0);

// Returns a list of graphemes cluster at each character using character break
// rules.
PLATFORM_EXPORT void GraphemesClusterList(const StringView& text,
                                          Vector<unsigned>* graphemes);

}  // namespace blink

#endif
