blob: 9f3c4eb386fc93fa897893818573c9e0ceabcf5e [file] [log] [blame]
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "third_party/blink/renderer/core/editing/state_machines/state_machine_util.h"

#include <algorithm>
#include <cstddef>
#include <iterator>

#include "base/stl_util.h"
#include "third_party/blink/renderer/platform/text/character.h"
#include "third_party/blink/renderer/platform/wtf/assertions.h"
#include "third_party/blink/renderer/platform/wtf/text/character_names.h"
#include "third_party/blink/renderer/platform/wtf/text/unicode.h"
namespace blink {
namespace {

// Code points that carry the Indic_Syllabic_Category=Virama property.
// See http://www.unicode.org/Public/9.0.0/ucd/IndicSyllabicCategory-9.0.0d2.txt
//
// The table is searched with std::binary_search, so the entries must stay in
// strictly ascending order; the static_assert below enforces this at compile
// time.
//
// U+0BCD TAMIL SIGN VIRAMA is deliberately left out because Tamil works
// differently from other Indic languages. See crbug.com/693687.
constexpr uint32_t kIndicSyllabicCategoryViramaList[] = {
    // Basic Multilingual Plane.
    0x094D, 0x09CD, 0x0A4D, 0x0ACD, 0x0B4D, 0x0C4D, 0x0CCD, 0x0D4D,
    0x0DCA, 0x1B44, 0xA8C4, 0xA9C0,
    // Supplementary planes.
    0x11046, 0x110B9, 0x111C0, 0x11235, 0x1134D, 0x11442, 0x114C2, 0x115BF,
    0x1163F, 0x116B6, 0x11C3F,
};

// Returns true if every element of |values| is strictly greater than the one
// before it. Usable in constant expressions.
template <size_t N>
constexpr bool IsStrictlyAscending(const uint32_t (&values)[N]) {
  for (size_t i = 1; i < N; ++i) {
    if (values[i - 1] >= values[i])
      return false;
  }
  return true;
}

static_assert(IsStrictlyAscending(kIndicSyllabicCategoryViramaList),
              "kIndicSyllabicCategoryViramaList must be sorted and free of "
              "duplicates because it is looked up with binary search.");

// Returns true if |code_point| has the Indic_Syllabic_Category=Virama
// property, i.e. it is listed in kIndicSyllabicCategoryViramaList.
// See http://www.unicode.org/Public/9.0.0/ucd/IndicSyllabicCategory-9.0.0d2.txt
bool IsIndicSyllabicCategoryVirama(uint32_t code_point) {
  return std::binary_search(std::begin(kIndicSyllabicCategoryViramaList),
                            std::end(kIndicSyllabicCategoryViramaList),
                            code_point);
}

}  // namespace
// Reports whether a grapheme cluster boundary falls between |prev_code_point|
// and |next_code_point|, judged from this pair of code points alone.
//
// The checks follow the boundary rules of Unicode Standard Annex #29, Unicode
// Text Segmentation (http://www.unicode.org/reports/tr29/), plus one extra
// rule that keeps Indic syllables together. Start/end of text (GB1, GB2) and
// runs of regional indicators (GB8a / GB12) are not handled here; callers are
// expected to deal with them.
//
// Returns false when the two code points belong to the same cluster and true
// when a break is allowed between them.
bool IsGraphemeBreak(UChar32 prev_code_point, UChar32 next_code_point) {
  const int before =
      u_getIntPropertyValue(prev_code_point, UCHAR_GRAPHEME_CLUSTER_BREAK);
  const int after =
      u_getIntPropertyValue(next_code_point, UCHAR_GRAPHEME_CLUSTER_BREAK);

  // GB1 (sot ÷) and GB2 (÷ eot) are the caller's responsibility.

  // GB3: CR x LF
  const bool is_crlf_pair = before == U_GCB_CR && after == U_GCB_LF;
  if (is_crlf_pair)
    return false;

  // GB4: (Control | CR | LF) ÷
  // GB5: ÷ (Control | CR | LF)
  const auto is_control_or_newline = [](int gcb) {
    return gcb == U_GCB_CONTROL || gcb == U_GCB_CR || gcb == U_GCB_LF;
  };
  if (is_control_or_newline(before) || is_control_or_newline(after))
    return true;

  // Hangul syllable sequences, dispatched on the preceding code point.
  switch (before) {
    case U_GCB_L:
      // GB6: L x (L | V | LV | LVT)
      if (after == U_GCB_L || after == U_GCB_V || after == U_GCB_LV ||
          after == U_GCB_LVT) {
        return false;
      }
      break;
    case U_GCB_LV:
    case U_GCB_V:
      // GB7: (LV | V) x (V | T)
      if (after == U_GCB_V || after == U_GCB_T)
        return false;
      break;
    case U_GCB_LVT:
    case U_GCB_T:
      // GB8: (LVT | T) x T
      if (after == U_GCB_T)
        return false;
      break;
    default:
      break;
  }

  // GB8a:
  //   sot   (RI RI)* RI x RI
  //   [^RI] (RI RI)* RI x RI
  //                  RI ÷ RI
  // Deciding this needs the number of preceding regional indicators, which a
  // single pair cannot provide, so such a pair must never be passed in.
  if (Character::IsRegionalIndicator(prev_code_point) &&
      Character::IsRegionalIndicator(next_code_point)) {
    NOTREACHED() << "Do not use this function for regional indicators.";
  }

  // GB9:  x (Extend | ZWJ)
  // GB9a: x SpacingMark
  const bool next_attaches_to_prev =
      after == U_GCB_EXTEND || next_code_point == kZeroWidthJoinerCharacter ||
      after == U_GCB_SPACING_MARK;
  if (next_attaches_to_prev)
    return false;

  // GB9b: Prepend x
  if (before == U_GCB_PREPEND)
    return false;

  // Not part of UAX #29: a virama followed by a letter of general category
  // "Other Letter" stays in one cluster so that Indic syllables are kept
  // together.
  const bool is_indic_conjunct =
      IsIndicSyllabicCategoryVirama(prev_code_point) &&
      u_getIntPropertyValue(next_code_point, UCHAR_GENERAL_CATEGORY) ==
          U_OTHER_LETTER;
  if (is_indic_conjunct)
    return false;

  // GB11: ZWJ x Emoji
  const bool is_zwj_emoji_sequence =
      prev_code_point == kZeroWidthJoinerCharacter &&
      Character::IsEmoji(next_code_point);

  // GB12 for regional indicators is handled elsewhere because it requires
  // counting the number of consecutive RIs.
  // GB999: any ÷ any
  return !is_zwj_emoji_sequence;
}
} // namespace blink