| /* |
| * Copyright (C) 2014 Google Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are |
| * met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above |
| * copyright notice, this list of conditions and the following disclaimer |
| * in the documentation and/or other materials provided with the |
| * distribution. |
| * * Neither the name of Google Inc. nor the names of its |
| * contributors may be used to endorse or promote products derived from |
| * this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "platform/text/Character.h" |
| |
| #include "wtf/StdLibExtras.h" |
| #include "wtf/text/StringBuilder.h" |
| #include <algorithm> |
| #include <unicode/uobject.h> |
| #include <unicode/uscript.h> |
| |
| #if defined(USING_SYSTEM_ICU) |
| #include "platform/text/CharacterPropertyDataGenerator.h" |
| #include <unicode/uniset.h> |
| #else |
| #define MUTEX_H // Prevent compile failure of utrie2.h on Windows |
| #include <utrie2.h> |
| #endif |
| |
| using namespace WTF; |
| using namespace Unicode; |
| |
| namespace blink { |
| |
| #if defined(USING_SYSTEM_ICU) |
| static icu::UnicodeSet* createUnicodeSet(const UChar32* characters, |
| size_t charactersCount, |
| const UChar32* ranges, |
| size_t rangesCount) { |
| icu::UnicodeSet* unicodeSet = new icu::UnicodeSet(); |
| for (size_t i = 0; i < charactersCount; i++) |
| unicodeSet->add(characters[i]); |
| for (size_t i = 0; i < rangesCount; i += 2) |
| unicodeSet->add(ranges[i], ranges[i + 1]); |
| unicodeSet->freeze(); |
| return unicodeSet; |
| } |
| |
// Expands to a createUnicodeSet() call for the tables |name|Array and
// |name|Ranges, passing each array together with its WTF_ARRAY_LENGTH.
#define CREATE_UNICODE_SET(name)                                             \
  createUnicodeSet(name##Array, WTF_ARRAY_LENGTH(name##Array), name##Ranges, \
                   WTF_ARRAY_LENGTH(name##Ranges))
| |
// Expands to the whole tail of a property predicate: a function-local static
// UnicodeSet pointer is filled in via CREATE_UNICODE_SET(name) the first time
// it is found to be null, and the macro then returns whether the set contains
// |c|. The expansion consists of several statements, already ends with a
// semicolon and contains a return, so any statement written after it in the
// enclosing function is unreachable.
#define RETURN_HAS_PROPERTY(c, name)            \
  static icu::UnicodeSet* unicodeSet = nullptr; \
  if (!unicodeSet)                              \
    unicodeSet = CREATE_UNICODE_SET(name);      \
  return unicodeSet->contains(c);
| #else |
// Frozen, serialized trie; see CharacterDataGenerator.cpp. Both symbols are
// defined outside this file and are handed to utrie2_openFromSerialized() by
// createTrie().
extern int32_t serializedCharacterDataSize;
extern uint8_t serializedCharacterData[];
| |
| static UTrie2* createTrie() { |
| // Create a Trie from the value array. |
| UErrorCode error = U_ZERO_ERROR; |
| UTrie2* trie = utrie2_openFromSerialized( |
| UTrie2ValueBits::UTRIE2_16_VALUE_BITS, serializedCharacterData, |
| serializedCharacterDataSize, nullptr, &error); |
| ASSERT(error == U_ZERO_ERROR); |
| return trie; |
| } |
| |
| static bool hasProperty(UChar32 c, CharacterProperty property) { |
| static UTrie2* trie = nullptr; |
| if (!trie) |
| trie = createTrie(); |
| return UTRIE2_GET16(trie, c) & static_cast<CharacterPropertyType>(property); |
| } |
| |
// Expands to a complete return statement (semicolon included) that forwards
// |c| to hasProperty() together with CharacterProperty::|name|.
#define RETURN_HAS_PROPERTY(c, name) \
  return hasProperty(c, CharacterProperty::name);
| #endif |
| |
// Reports whether |value| lies inside any interval of |intervalList|, an
// ascending array of closed intervals flattened as
// {first0, last0, first1, last1, ...}.
template <class T, size_t size>
bool valueInIntervalList(const T (&intervalList)[size], const T& value) {
  const T* const listBegin = intervalList;
  const T* const listEnd = intervalList + size;

  // Number of entries that are not greater than |value|.
  const auto entriesNotGreater =
      std::upper_bound(listBegin, listEnd, value) - listBegin;

  // An odd count means the last such entry opened an interval whose closing
  // bound is still ahead, so |value| is strictly inside that interval.
  if (entriesNotGreater % 2 != 0)
    return true;

  // With an even count, |value| is covered only when it coincides with the
  // closing bound that sits directly before the insertion point.
  return entriesNotGreater != 0 && listBegin[entriesNotGreater - 1] == value;
}
| |
| CodePath Character::characterRangeCodePath(const UChar* characters, |
| unsigned len) { |
| static const UChar complexCodePathRanges[] = { |
| // U+02E5 through U+02E9 (Modifier Letters : Tone letters) |
| 0x2E5, 0x2E9, |
| // U+0300 through U+036F Combining diacritical marks |
| 0x300, 0x36F, |
| // U+0591 through U+05CF excluding U+05BE Hebrew combining marks, ... |
| 0x0591, 0x05BD, |
| // ... Hebrew punctuation Paseq, Sof Pasuq and Nun Hafukha |
| 0x05BF, 0x05CF, |
| // U+0600 through U+109F Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, |
| // Devanagari, Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada, |
| // Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar |
| 0x0600, 0x109F, |
| // U+1100 through U+11FF Hangul Jamo (only Ancient Korean should be left |
| // here if you precompose; Modern Korean will be precomposed as a result |
| // of step A) |
| 0x1100, 0x11FF, |
| // U+135D through U+135F Ethiopic combining marks |
| 0x135D, 0x135F, |
| // U+1780 through U+18AF Tagalog, Hanunoo, Buhid, Taghanwa, Khmer, |
| // Mongolian |
| 0x1700, 0x18AF, |
| // U+1900 through U+194F Limbu (Unicode 4.0) |
| 0x1900, 0x194F, |
| // U+1980 through U+19DF New Tai Lue |
| 0x1980, 0x19DF, |
| // U+1A00 through U+1CFF Buginese, Tai Tham, Balinese, Batak, Lepcha, |
| // Vedic |
| 0x1A00, 0x1CFF, |
| // U+1DC0 through U+1DFF Comining diacritical mark supplement |
| 0x1DC0, 0x1DFF, |
| // U+20D0 through U+20FF Combining marks for symbols |
| 0x20D0, 0x20FF, |
| // U+2CEF through U+2CF1 Combining marks for Coptic |
| 0x2CEF, 0x2CF1, |
| // U+302A through U+302F Ideographic and Hangul Tone marks |
| 0x302A, 0x302F, |
| // Combining Katakana-Hiragana Voiced/Semi-voiced Sound Mark |
| 0x3099, 0x309A, |
| // U+A67C through U+A67D Combining marks for old Cyrillic |
| 0xA67C, 0xA67D, |
| // U+A6F0 through U+A6F1 Combining mark for Bamum |
| 0xA6F0, 0xA6F1, |
| // U+A800 through U+ABFF Nagri, Phags-pa, Saurashtra, Devanagari Extended, |
| // Hangul Jamo Ext. A, Javanese, Myanmar Extended A, Tai Viet, |
| // Meetei Mayek |
| 0xA800, 0xABFF, |
| // U+D7B0 through U+D7FF Hangul Jamo Ext. B |
| 0xD7B0, 0xD7FF, |
| // U+E000..U+F8FF BMP Private Use Area |
| 0xE000, 0xF8FF, |
| // U+FE00 through U+FE0F Unicode variation selectors |
| 0xFE00, 0xFE0F, |
| // U+FE20 through U+FE2F Combining half marks |
| 0xFE20, 0xFE2F}; |
| |
| CodePath result = SimplePath; |
| for (unsigned i = 0; i < len; i++) { |
| const UChar c = characters[i]; |
| |
| // Shortcut for common case |
| if (c < 0x2E5) |
| continue; |
| |
| // Surrogate pairs |
| if (c > 0xD7FF && c <= 0xDBFF) { |
| if (i == len - 1) |
| continue; |
| |
| UChar next = characters[++i]; |
| if (!U16_IS_TRAIL(next)) |
| continue; |
| |
| UChar32 supplementaryCharacter = U16_GET_SUPPLEMENTARY(c, next); |
| |
| if (supplementaryCharacter < |
| 0x1F1E6) // U+1F1E6 through U+1F1FF Regional Indicator Symbols |
| continue; |
| if (supplementaryCharacter <= 0x1F1FF) |
| return ComplexPath; |
| |
| // Emoji Fitzpatrick modifiers trigger upgrade to complex path for shaping |
| // them. |
| if (supplementaryCharacter < 0x1F3FB) |
| continue; |
| if (supplementaryCharacter <= 0x1F3FF) |
| return ComplexPath; |
| |
| if (supplementaryCharacter == eyeCharacter) |
| return ComplexPath; |
| |
| // Man and Woman Emojies, in order to support emoji joiner combinations |
| // for family and couple pictographs. |
| // Compare http://unicode.org/reports/tr51/#Emoji_ZWJ_Sequences |
| if (supplementaryCharacter < 0x1F468) |
| continue; |
| if (supplementaryCharacter <= 0x1F469) |
| return ComplexPath; |
| |
| if (supplementaryCharacter == leftSpeechBubbleCharacter) |
| return ComplexPath; |
| |
| if (supplementaryCharacter < |
| 0xE0100) // U+E0100 through U+E01EF Unicode variation selectors. |
| continue; |
| if (supplementaryCharacter <= 0xE01EF) |
| return ComplexPath; |
| |
| // Supplemental Private Use Area-A |
| if (supplementaryCharacter < 0xF0000) |
| continue; |
| if (supplementaryCharacter <= 0xFFFFD) |
| return ComplexPath; |
| |
| // Supplemental Private Use Area-B |
| if (supplementaryCharacter < 0x100000) |
| continue; |
| if (supplementaryCharacter <= 0x10FFFD) |
| return ComplexPath; |
| |
| // FIXME: Check for Brahmi (U+11000 block), Kaithi (U+11080 block) and |
| // other complex scripts in plane 1 or higher. |
| |
| continue; |
| } |
| |
| // Search for other Complex cases |
| if (valueInIntervalList(complexCodePathRanges, c)) |
| return ComplexPath; |
| } |
| |
| return result; |
| } |
| |
// Returns whether |character| has the isUprightInMixedVertical property. The
// lookup itself is supplied by RETURN_HAS_PROPERTY, which expands to the
// return statement.
bool Character::isUprightInMixedVertical(UChar32 character) {
  RETURN_HAS_PROPERTY(character, isUprightInMixedVertical)
}
| |
// Returns whether |c| has the isCJKIdeographOrSymbol property. Code points
// below 0x2C7 are rejected up front without consulting the property data.
bool Character::isCJKIdeographOrSymbol(UChar32 c) {
  // Likely common case
  if (c < 0x2C7)
    return false;

  // Expands to the return statement that performs the property lookup.
  RETURN_HAS_PROPERTY(c, isCJKIdeographOrSymbol)
}
| |
| bool Character::isPotentialCustomElementNameChar(UChar32 character) { |
| RETURN_HAS_PROPERTY(character, isPotentialCustomElementNameChar); |
| } |
| |
| unsigned Character::expansionOpportunityCount(const LChar* characters, |
| size_t length, |
| TextDirection direction, |
| bool& isAfterExpansion, |
| const TextJustify textJustify) { |
| unsigned count = 0; |
| if (textJustify == TextJustifyDistribute) { |
| isAfterExpansion = true; |
| return length; |
| } |
| |
| if (direction == LTR) { |
| for (size_t i = 0; i < length; ++i) { |
| if (treatAsSpace(characters[i])) { |
| count++; |
| isAfterExpansion = true; |
| } else { |
| isAfterExpansion = false; |
| } |
| } |
| } else { |
| for (size_t i = length; i > 0; --i) { |
| if (treatAsSpace(characters[i - 1])) { |
| count++; |
| isAfterExpansion = true; |
| } else { |
| isAfterExpansion = false; |
| } |
| } |
| } |
| |
| return count; |
| } |
| |
| unsigned Character::expansionOpportunityCount(const UChar* characters, |
| size_t length, |
| TextDirection direction, |
| bool& isAfterExpansion, |
| const TextJustify textJustify) { |
| unsigned count = 0; |
| if (direction == LTR) { |
| for (size_t i = 0; i < length; ++i) { |
| UChar32 character = characters[i]; |
| if (treatAsSpace(character)) { |
| count++; |
| isAfterExpansion = true; |
| continue; |
| } |
| if (U16_IS_LEAD(character) && i + 1 < length && |
| U16_IS_TRAIL(characters[i + 1])) { |
| character = U16_GET_SUPPLEMENTARY(character, characters[i + 1]); |
| i++; |
| } |
| if (textJustify == TextJustify::TextJustifyAuto && |
| isCJKIdeographOrSymbol(character)) { |
| if (!isAfterExpansion) |
| count++; |
| count++; |
| isAfterExpansion = true; |
| continue; |
| } |
| isAfterExpansion = false; |
| } |
| } else { |
| for (size_t i = length; i > 0; --i) { |
| UChar32 character = characters[i - 1]; |
| if (treatAsSpace(character)) { |
| count++; |
| isAfterExpansion = true; |
| continue; |
| } |
| if (U16_IS_TRAIL(character) && i > 1 && U16_IS_LEAD(characters[i - 2])) { |
| character = U16_GET_SUPPLEMENTARY(characters[i - 2], character); |
| i--; |
| } |
| if (textJustify == TextJustify::TextJustifyAuto && |
| isCJKIdeographOrSymbol(character)) { |
| if (!isAfterExpansion) |
| count++; |
| count++; |
| isAfterExpansion = true; |
| continue; |
| } |
| isAfterExpansion = false; |
| } |
| } |
| return count; |
| } |
| |
| bool Character::canReceiveTextEmphasis(UChar32 c) { |
| CharCategory category = Unicode::category(c); |
| if (category & (Separator_Space | Separator_Line | Separator_Paragraph | |
| Other_NotAssigned | Other_Control | Other_Format)) |
| return false; |
| |
| // Additional word-separator characters listed in CSS Text Level 3 Editor's |
| // Draft 3 November 2010. |
| if (c == ethiopicWordspaceCharacter || |
| c == aegeanWordSeparatorLineCharacter || |
| c == aegeanWordSeparatorDotCharacter || |
| c == ugariticWordDividerCharacter || |
| c == tibetanMarkIntersyllabicTshegCharacter || |
| c == tibetanMarkDelimiterTshegBstarCharacter) |
| return false; |
| |
| return true; |
| } |
| |
| template <typename CharacterType> |
| static inline String normalizeSpacesInternal(const CharacterType* characters, |
| unsigned length) { |
| StringBuilder normalized; |
| normalized.reserveCapacity(length); |
| |
| for (unsigned i = 0; i < length; ++i) |
| normalized.append(Character::normalizeSpaces(characters[i])); |
| |
| return normalized.toString(); |
| } |
| |
| String Character::normalizeSpaces(const LChar* characters, unsigned length) { |
| return normalizeSpacesInternal(characters, length); |
| } |
| |
| String Character::normalizeSpaces(const UChar* characters, unsigned length) { |
| return normalizeSpacesInternal(characters, length); |
| } |
| |
| bool Character::isCommonOrInheritedScript(UChar32 character) { |
| UErrorCode status = U_ZERO_ERROR; |
| UScriptCode script = uscript_getScript(character, &status); |
| return U_SUCCESS(status) && |
| (script == USCRIPT_COMMON || script == USCRIPT_INHERITED); |
| } |
| |
| } // namespace blink |